# Setup and Installation

In [None]:
!pip install transformers
!pip install fastBPE
!pip install fairseq
!pip install emoji==1.7

In [None]:
!pip install vncorenlp

!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/!

In [None]:
# Download the PhoBERT base archive and unpack it in the current working
# directory; later cells read bpe.codes, dict.txt, config.json and model.bin
# from /kaggle/working/PhoBERT_base_transformers.
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz

In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

# fastBPE is configured through an argparse-style namespace, so build one that
# carries the path to the PhoBERT BPE codes.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--bpe-codes',
    type=str,
    required=False,
    default="/kaggle/working/PhoBERT_base_transformers/bpe.codes",
    help='path to fastBPE BPE',
)
# parse_known_args() keeps the arguments injected by the notebook kernel from
# raising an "unrecognized arguments" error; they end up in `unknown`.
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Vocabulary used to map BPE sub-words to integer ids.
vocab = Dictionary()
vocab.add_from_file("/kaggle/working/PhoBERT_base_transformers/dict.txt")

In [None]:
import tensorflow
import pandas as pd
import numpy as np
import re
import regex

In [None]:
!pip install underthesea
import underthesea
from underthesea import word_tokenize

# Import data

In [None]:
# Input files from the Kaggle dataset: the labelled training reviews, the
# unlabelled test reviews, and four word lists read by the next cell
# (positive words, negative words, negation words, intensifiers).
train_path = "/kaggle/input/newdatafoody/train-set.csv"
test_path = "/kaggle/input/newdatafoody/test-set.csv"
pos_path = "/kaggle/input/newdatafoody/pos.txt"
neg_path = "/kaggle/input/newdatafoody/neg.txt"
not_path = "/kaggle/input/newdatafoody/not.txt"
intensifier_path = "/kaggle/input/newdatafoody/intensifier.txt"

In [None]:
# Placeholders kept from the original cell; they are rebound further down
# (train_text) or in the prediction section (test_text).
train_text, train_label = [], []
test_id, test_text = [], []


def _load_lexicon(path):
    """Return the first column of the word-list file at ``path`` as a plain list.

    NOTE(review): ``pd.read_table`` is called with its default header handling,
    so the first line of each file is consumed as a column name and is NOT part
    of the returned list - confirm the lexicon files really start with a header.
    """
    return pd.read_table(path).iloc[:, 0].tolist()


pos_list = _load_lexicon(pos_path)
neg_list = _load_lexicon(neg_path)
not_list = _load_lexicon(not_path)
intensifier_list = _load_lexicon(intensifier_path)

df = pd.read_csv(
    train_path,
    names=["#", "RevId", "UserId", "Comment", "image_urls", "Rating"])

# Keep only rows whose rating is the string '0' or '1'. Because explicit column
# names are supplied, a header line in the file (if any) is read as data and is
# dropped by this filter as well.
df = df[(df['Rating'] == '1') | (df['Rating'] == '0')].reset_index(drop=True)

df = df[['Comment', 'Rating']]

train_text = df['Comment']
train_labels = df['Rating']

# Augment the training set with the lexicon entries themselves: every negative
# word becomes a sample labelled '0', every positive word a sample labelled '1'.
new_data = [[word, '0'] for word in neg_list]
new_data.extend([word, '1'] for word in pos_list)

new_data.append(["th·∫≠m t·ªá", '0'])
new_data.append(["xu·∫•t s·∫Øc", '1'])

aug_df_text = pd.Series([sample[0] for sample in new_data])
aug_df_labels = pd.Series([sample[1] for sample in new_data])

# Series.append was removed in pandas 2.0; pd.concat gives the same result on
# every pandas version.
train_text = pd.concat([train_text, aug_df_text]).reset_index(drop=True)
train_labels = pd.concat([train_labels, aug_df_labels]).reset_index(drop=True)

# Text preprocessing

In [None]:
def emojis_abbreviations(text):
    """Replace emoji, emoticons, abbreviations and slang in ``text``.

    Every key of ``replace_list`` is substituted by its value with
    ``str.replace``, i.e. as a plain substring match on the whole text, one key
    after another in the order the keys are written in the literal. Because of
    that:

    * the order of the entries matters - an earlier, shorter key (e.g. ``':('``)
      is applied before a later, longer key that contains it (e.g. ``':(('``);
    * keys that are not padded with spaces (e.g. ``'j'``, ``'wa'``, ``'not'``)
      also match inside longer words;
    * a few keys are written more than once (e.g. ``'<3'``, ``':3'``); the
      repeated entries carry the same value.

    Returns the text after all replacements.
    """
    replace_list = {
        "üëπ": "t·ªá", "üëª": "t·ªët", "üíÉ": "t·ªët",'ü§ô': ' t·ªët ', 'üëç': ' t·ªët ',
        "üíÑ": "t·ªët", "üíé": "t·ªët", "üí©": "t·ªët","üòï": "t·ªá", "üò±": "t·ªá", "üò∏": "t·ªët",
        "üòæ": "t·ªá", "üö´": "t·ªá",  "ü§¨": "t·ªá","üßö": "t·ªët", "üß°": "t·ªët",'üê∂':' t·ªët ',
        'üëé': ' t·ªá ', 'üò£': ' t·ªá ','‚ú®': ' t·ªët ', '‚ù£': ' t·ªët ','‚òÄ': ' t·ªët ',
        '‚ô•': ' t·ªët ', 'ü§©': ' t·ªët ', 'like': ' t·ªët ', 'üíå': ' t·ªët ',
        'ü§£': ' t·ªët ', 'üñ§': ' t·ªët ', 'ü§§': ' t·ªët ', ':(': ' t·ªá ', 'üò¢': ' t·ªá ',
        '‚ù§': ' t·ªët ', 'üòç': ' t·ªët ', 'üòò': ' t·ªët ', 'üò™': ' t·ªá ', 'üòä': ' t·ªët ',
        '?': ' ? ', 'üòÅ': ' t·ªët ', 'üíñ': ' t·ªët ', 'üòü': ' t·ªá ', 'üò≠': ' t·ªá ',
        'üíØ': ' t·ªët ', 'üíó': ' t·ªët ', '‚ô°': ' t·ªët ', 'üíú': ' t·ªët ', 'ü§ó': ' t·ªët ',
        '^^': ' t·ªët ', 'üò®': ' t·ªá ', '‚ò∫': ' t·ªët ', 'üíã': ' t·ªët ', 'üëå': ' t·ªët ',
        'üòñ': ' t·ªá ', 'üòÄ': ' t·ªët ', ':((': ' t·ªá ', 'üò°': ' t·ªá ', 'üò†': ' t·ªá ',
        'üòí': ' t·ªá ', 'üôÇ': ' t·ªët ', 'üòè': ' t·ªá ', 'üòù': ' t·ªët ', 'üòÑ': ' t·ªët ',
        'üòô': ' t·ªët ', 'üò§': ' t·ªá ', 'üòé': ' t·ªët ', 'üòÜ': ' t·ªët ', 'üíö': ' t·ªët ',
        '‚úå': ' t·ªët ', 'üíï': ' t·ªët ', 'üòû': ' t·ªá ', 'üòì': ' t·ªá ', 'Ô∏èüÜóÔ∏è': ' t·ªët ',
        'üòâ': ' t·ªët ', 'üòÇ': ' t·ªët ', ':v': '  t·ªët ', '=))': '  t·ªët ', 'üòã': ' t·ªët ',
        'üíì': ' t·ªët ', 'üòê': ' t·ªá ', ':3': ' t·ªët ', 'üò´': ' t·ªá ', 'üò•': ' t·ªá ',
        'üòÉ': ' t·ªët ', 'üò¨': ' üò¨ ', 'üòå': ' üòå ', 'üíõ': ' t·ªët ', 'ü§ù': ' t·ªët ', 'üéà': ' t·ªët ',
        'üòó': ' t·ªët ', 'ü§î': ' t·ªá ', 'üòë': ' t·ªá ', 'üî•': ' t·ªá ', 'üôè': ' t·ªá ',
        'üÜó': ' t·ªët ', 'üòª': ' t·ªët ', 'üíô': ' t·ªët ', 'üíü': ' t·ªët ',
        'üòö': ' t·ªët ', '‚ùå': ' t·ªá ', 'üëè': ' t·ªët ', ';)': ' t·ªët ', '<3': ' t·ªët ',
        'üåù': ' t·ªët ',  'üå∑': ' t·ªët ', 'üå∏': ' t·ªët ', 'üå∫': ' t·ªët ',
        'üåº': ' t·ªët ', 'üçì': ' t·ªët ', 'üêÖ': ' t·ªët ', 'üêæ': ' t·ªët ', 'üëâ': ' t·ªët ',
        'üíê': ' t·ªët ', 'üíû': ' t·ªët ', 'üí•': ' t·ªët ', 'üí™': ' t·ªët ',
        'üí∞': ' t·ªët ',  'üòá': ' t·ªët ', 'üòõ': ' t·ªët ', 'üòú': ' t·ªët ',
        'üôÉ': ' t·ªët ', 'ü§ë': ' t·ªët ', 'ü§™': ' t·ªët ','‚òπ': ' t·ªá ',  'üíÄ': ' t·ªá ',
        'üòî': ' t·ªá ', 'üòß': ' t·ªá ', 'üò©': ' t·ªá ', 'üò∞': ' t·ªá ', 'üò≥': ' t·ªá ',
        'üòµ': ' t·ªá ', 'üò∂': ' t·ªá ', 'üôÅ': ' t·ªá ',
        ':))': '  t·ªët ', '=)': ' t·ªët ',':>': ' t·ªët ','<3': ' t·ªët ', ':3': ' t·ªët ', '√¥ k√™i': ' ok ', 'okie': ' ok ', ' o k√™ ': ' ok ',
        'okey': ' ok ', '√¥k√™': ' ok ', 'oki': ' ok ',' √¥ k√™ ': ' ok ', ' oke ':  ' ok ',' okay':' ok ','ok√™':' ok ',
        ' tks ': u' c√°m ∆°n ', 'thks': u' c√°m ∆°n ', 'thanks': u' c√°m ∆°n ', 'ths': u' c√°m ∆°n ', 'thank': u' c√°m ∆°n ',
        '‚≠ê': 'star ', '*': 'star ', 'üåü': 'star ', 'üéâ': u' t·ªët ',
        'kg ': u' kh√¥ng ', ' mk ': u' m√¨nh ', ' mik ': u' m√¨nh ', ' thik ' : u' th√≠ch ', 'not': u' kh√¥ng ',' nma ': u' nh∆∞ng m√† ', u' kg ': u' kh√¥ng ', '"k ': u' kh√¥ng ',' kh ':u' kh√¥ng ','k√¥':u' kh√¥ng ','hok':u' kh√¥ng ',' kp ': u' kh√¥ng ph·∫£i ',u' k√¥ ': u' kh√¥ng ', '"ko ': u' kh√¥ng ', u' ko ': u' kh√¥ng ', u' k ': u' kh√¥ng ', 'khong': u' kh√¥ng ', u' hok ': u' kh√¥ng ',
        'he he': ' t·ªët ','hehe': ' t·ªët ','hihi': ' t·ªët ', 'haha': ' t·ªët ', 'hjhj': ' t·ªët ',
        ' lol ': ' t·ªá ',' cc ': ' t·ªá ','cute': u' d·ªÖ th∆∞∆°ng ','huhu': ' t·ªá ', ' vs ': u' v·ªõi ', 'wa': ' qu√° ', 'w√°': u' qu√°', 'j': u' g√¨ ', '‚Äú': ' ',
        ' sz ': u' c·ª° ', 'size': u' c·ª° ', u' ƒëx ': u' ƒë∆∞·ª£c ', 'dk': u' ƒë∆∞·ª£c ', 'dc': u' ƒë∆∞·ª£c ', 'ƒëk': u' ƒë∆∞·ª£c ',
        'ƒëc': u' ƒë∆∞·ª£c ', ' qly ': u' qu·∫£n l√Ω ','authentic': u' chu·∫©n ch√≠nh h√£ng ',u' aut ': u' chu·∫©n ch√≠nh h√£ng ', u' auth ': u' chu·∫©n ch√≠nh h√£ng ', 'thick': u' t·ªët ', 'store': u' c·ª≠a h√†ng ',
        'shop': u' c·ª≠a h√†ng ', ' sp ': u' s·∫£n ph·∫©m ',u' nv ': u' nh√¢n vi√™n ',' l·ªõm ': u' l·∫Øm ', 'gud': u' t·ªët ','god': u' t·ªët ','wel done':' t·ªët ', 'good': u' t·ªët ', 'g√∫t': u' t·ªët ',
        's·∫•u': u' x·∫•u ','gut': u' t·ªët ', u' tot ': u' t·ªët ', u' nice ': u' t·ªët ', 'perfect': 'r·∫•t t·ªët', 'bt': u' b√¨nh th∆∞·ªùng ',
        'time': u' th·ªùi gian ', 'd·ªÖ t√¨m': 'd·ªÖ t√¨m t·ªët', 'q√°': u' qu√° ', u' ship ': u' giao h√†ng ', u' m ': u' m√¨nh ', u' mik ': u' m√¨nh ',
        '√™Ãâ': '·ªÉ', 'product': 's·∫£n ph·∫©m', 'quality': 'ch·∫•t l∆∞·ª£ng','chat':' ch·∫•t ', 'excelent': 'ho√†n h·∫£o', 'bad': 't·ªá','fresh': ' t∆∞∆°i ','sad': ' t·ªá ',
        ' date ': u' h·∫°n s·ª≠ d·ª•ng ', 'hsd': u' h·∫°n s·ª≠ d·ª•ng ','quickly': u' nhanh ', 'quick': u' nhanh ','fast': u' nhanh ','delivery': u' giao h√†ng ',u' s√≠p ': u' giao h√†ng ',
        ' beautiful ': u' ƒë·∫πp t·ªët v·ªùi ', u' tl ': u' tr·∫£ l·ªùi ', u' r ': u' r·ªìi ', u' shopE ': u' c·ª≠a h√†ng ',
        'ch·∫•t lg': u' ch·∫•t l∆∞·ª£ng ',u' sd ': u' s·ª≠ d·ª•ng ',u' dt ': u' ƒëi·ªán tho·∫°i ',u' nt ': u' nh·∫Øn tin ',u' tl ': u' tr·∫£ l·ªùi ',u' s√†i ': u' x√†i ',u'bjo':u' bao gi·ªù ',
        'thik': u' th√≠ch ',u' sop ': u' c·ª≠a h√†ng ', ' fb ': ' facebook ', ' face ': ' facebook ', ' very ': u' r·∫•t ',u'qu·∫£ ng ':u' qu·∫£ng  ',
        'dep': u' ƒë·∫πp ',u' xau ': u' x·∫•u ','delicious': u' ngon ', u'h√†g': u' h√†ng ', u'q·ªßa': u' qu·∫£ ',
        ' iu ': u' y√™u ',' fake ': u' gi·∫£ m·∫°o ', 'trl': 'tr·∫£ l·ªùi', ' >< ': u' t·ªët ',
        ' por ': u' t·ªá ',' poor ': u' t·ªá ', ' ib ':u' nh·∫Øn tin ', 'rep':u' tr·∫£ l·ªùi ',u'fback':' feedback ','fedback':' feedback ',
        '6 sao': ' 5star ','6 star': ' 5star ', '5star': ' 5star ','5 sao': ' 5star ','5sao': ' 5star ',
        'starstarstarstarstar': ' 5star ', '1 sao': ' 1star ', '1sao': ' 1star ','2 sao':' 1star ','2sao':' 1star ',
        '2 starstar':' 1star ','1star': ' 1star ', '0 sao': ' 1star ', '0star': ' 1star ',}

    # Apply the substitutions sequentially; the output of one replacement is
    # the input of the next, so later keys can match text produced earlier.
    for k, v in replace_list.items():
        text = text.replace(k, v)

    return text

def elongated_words(text):
    """Collapse runs of a repeated ASCII letter into one upper-case letter.

    The match is case-insensitive, so "goood" becomes "gOd" and "aA" becomes
    "A". Text without repeated letters is returned unchanged.
    """
    def _collapse(match):
        # Keep a single copy of the repeated letter, upper-cased.
        return match.group(1).upper()

    return re.sub(r'([A-Z])\1+', _collapse, text, flags=re.IGNORECASE)

def lowercasing(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def punctuation(text):
    """Replace each character that is neither a word character nor whitespace with a space."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub(' ', text)

def sentiment_words(text):
    """Rewrite sentiment-bearing words of ``text`` into fixed polarity tokens.

    ``text`` is split with ``word_tokenize`` and every '_' inside a token is
    replaced by a space. Each token is then compared with the module-level
    lists ``not_list``, ``intensifier_list``, ``pos_list`` and ``neg_list``:

    * a ``not_list`` token followed, inside the look-ahead window, by a
      ``pos_list`` / ``neg_list`` word is overwritten with the opposite
      polarity token and the sentiment word is blanked;
    * an ``intensifier_list`` token followed by a ``pos_list`` / ``neg_list``
      word is overwritten with the strong polarity token and the sentiment word
      is blanked;
    * otherwise a ``pos_list`` / ``neg_list`` token causes one extra polarity
      token to be appended at the end of the token list.

    Returns the tokens joined by single spaces. Blanked tokens stay in the list
    as empty strings, so the result may contain consecutive spaces.
    """
    texts = word_tokenize(text)
    # Fixed at the original token count: tokens appended below are never
    # visited by the loop and never fall inside a look-ahead window.
    len_text = len(texts)

    texts = [t.replace('_', ' ') for t in texts]
    for i in range(len_text):
        # Snapshot of the token before this iteration rewrites texts[i]; an
        # earlier iteration may already have blanked it to ''.
        cp_text = texts[i]
        if cp_text in not_list: 
            # Look-ahead window: 2 tokens when at least 4 tokens follow,
            # otherwise every remaining token (so up to 3).
            # NOTE(review): the window shrinking from 3 to 2 once more tokens
            # are available looks unintended - confirm the intended size.
            numb_word = 2 if len_text - i - 1 >= 4 else len_text - i - 1

            for j in range(numb_word):
                # Negation followed by a positive word.
                if texts[i + j + 1] in pos_list:
                    texts[i] = 't·ªá'
                    texts[i + j + 1] = ''

                # Negation followed by a negative word.
                if texts[i + j + 1] in neg_list:
                    texts[i] = 't·ªët'
                    texts[i + j + 1] = ''

        # NOTE(review): this is a separate `if`, not an `elif`, so the `else`
        # below also runs for tokens handled by the negation branch above
        # (unless they are intensifiers too) - confirm this is intended.
        if cp_text in intensifier_list: 
            # Same look-ahead window rule as in the negation branch.
            numb_word = 2 if len_text - i - 1 >= 4 else len_text - i - 1

            for j in range(numb_word):
                # Intensifier followed by a positive word.
                if texts[i + j + 1] in pos_list:
                    texts[i] = 'xu·∫•t_s·∫Øc'
                    texts[i + j + 1] = ''

                # Intensifier followed by a negative word.
                if texts[i + j + 1] in neg_list:
                    texts[i] = 'th·∫≠m_t·ªá'
                    texts[i + j + 1] = ''
                
        else: 
            # Non-intensifier token that is itself a sentiment word: keep it
            # and append one extra polarity token at the end of the list.
            if cp_text in pos_list:
                texts.append('t·ªët')
            elif cp_text in neg_list:
                texts.append('t·ªá')

    text = " ".join(texts)
    return text

def irrelevant_symbol(text):
    """Return ``text`` with every double-quote character replaced by a space.

    The previous version computed the replacement but had no ``return``
    statement, so every call evaluated to ``None``.
    """
    return text.replace(u'"', u' ')

def standardize(text):
    """Run the normalisation pipeline on ``text`` and return the result.

    The steps run in this fixed order: collapse elongated letters, lower-case,
    replace emoji/abbreviations, underthesea text normalisation, strip
    punctuation, rewrite sentiment words.
    """
    pipeline = (
        elongated_words,
        lowercasing,
        emojis_abbreviations,
        underthesea.text_normalize,
        punctuation,
        sentiment_words,
    )
    for step in pipeline:
        text = step(text)
    return text

def preprocessing(text):
    """Standardize ``text`` and return it word-segmented in underthesea's "text" format."""
    return word_tokenize(standardize(text), format="text")

In [None]:
# Preprocess every training sample. astype(str) mirrors what the test-set cell
# does before preprocessing, so a missing comment (NaN) no longer crashes the
# string functions; apply() builds a new Series instead of assigning element
# by element.
train_text = train_text.astype(str).apply(preprocessing)

# Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation. A fixed random_state makes the
# split (and therefore the reported validation metrics) reproducible across runs.
train_sents, val_sents, train_labels, val_labels = train_test_split(
    train_text, train_labels, test_size=0.1, random_state=42)

# Byte Pair Encoding

In [None]:
from collections import defaultdict
# Fixed sequence length (in token ids) that every encoded sentence is padded
# or truncated to before being fed to the model.
MAX_LEN = 250

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _encode_sentences(sentences):
    """Encode each sentence as a list of vocabulary ids.

    A sentence is BPE-encoded, wrapped in '<s> ' / ' </s>' and mapped to ids
    with ``vocab.encode_line``.

    NOTE(review): ' </s>' is added to the string and ``append_eos=True`` is
    passed as well, so each sequence presumably ends with two end-of-sentence
    ids - confirm this is intended.
    """
    encoded = []
    for sentence in sentences:
        bpe_tokens = '<s> ' + bpe.encode(sentence) + ' </s>'
        token_ids = vocab.encode_line(bpe_tokens, append_eos=True, add_if_not_exist=False)
        encoded.append(token_ids.long().tolist())
    return encoded


# Pad on the right with id 0 and, for over-long inputs, drop ids from the front.
train_ids = pad_sequences(
    _encode_sentences(train_sents),
    maxlen=MAX_LEN, dtype="long", value=0, truncating="pre", padding="post")
val_ids = pad_sequences(
    _encode_sentences(val_sents),
    maxlen=MAX_LEN, dtype="long", value=0, truncating="pre", padding="post")

# Mask for transformers

In [None]:
# Attention masks: 1 for every position whose token id is greater than 0,
# 0 otherwise (the padding value used above is 0).
# NOTE(review): any real token whose vocabulary id is 0 is masked out as well -
# verify which token the dictionary assigns id 0 to.
train_masks = [[int(token_id > 0) for token_id in sent] for sent in train_ids]
val_masks = [[int(token_id > 0) for token_id in sent] for sent in val_ids]

# Convert to tensors

In [None]:
# Cast the labels to integers and materialise them as NumPy arrays.
train_labels = np.array(train_labels.astype(int))
val_labels = np.array(val_labels.astype(int))

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# Convert ids, masks and labels to tensors.
train_inputs = torch.tensor(train_ids)
val_inputs = torch.tensor(val_ids)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

# Training batches are drawn in a fresh random order every epoch; the previous
# SequentialSampler replayed the exact same batch order in each epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Validation keeps the original order so predictions line up with val_labels.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=32)

# Load RobertaForSequenceClassification model

In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW

# Configuration for a two-class sequence classifier built on the PhoBERT weights.
config = RobertaConfig.from_pretrained(
    "/kaggle/working/PhoBERT_base_transformers/config.json",
    from_tf=False,
    num_labels=2,
    output_hidden_states=False,
)
# Load the pretrained weights into the classification model.
BERT_SA = RobertaForSequenceClassification.from_pretrained(
    "/kaggle/working/PhoBERT_base_transformers/model.bin", config=config
)
# Move the model to the GPU.
BERT_SA.cuda()

# Train model

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

def flat_accuracy(preds, labels):
    """Return ``(accuracy, macro_f1)`` for a batch of class scores.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is its arg-max along axis 1.
        labels: array of true class ids; it is flattened before scoring.

    Returns:
        Tuple of the accuracy and the macro-averaged F1 score.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    # scikit-learn metrics take (y_true, y_pred) in that order; the arguments
    # were previously passed the other way round.
    macro_f1 = f1_score(labels_flat, pred_flat, average='macro')

    return accuracy_score(labels_flat, pred_flat), macro_f1

In [None]:
import random
from tqdm import notebook
device = 'cuda'
epochs = 7

# Parameters whose name contains one of the `no_decay` fragments are optimised
# without weight decay; every other parameter uses a decay of 0.01.
param_optimizer = list(BERT_SA.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)
# Best validation ROC-AUC so far; a checkpoint is written whenever it improves.
max_auc = 0


for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_loss = 0
    BERT_SA.train()
    train_accuracy = 0
    nb_train_steps = 0
    train_f1 = 0

    # Wrap the dataloader itself so the progress bar knows the batch count.
    for batch in notebook.tqdm(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        BERT_SA.zero_grad()
        outputs = BERT_SA(b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels)
        # With labels supplied, the first output is the loss and the second
        # the logits.
        loss = outputs[0]
        total_loss += loss.item()

        # Training metrics are running averages of per-batch scores.
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
        train_accuracy += tmp_train_accuracy
        train_f1 += tmp_train_f1
        nb_train_steps += 1

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimiser step.
        torch.nn.utils.clip_grad_norm_(BERT_SA.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
    print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))

    print("Running Validation...")
    BERT_SA.eval()
    epoch_logits, epoch_labels = [], []
    for batch in notebook.tqdm(val_dataloader):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        with torch.no_grad():
            outputs = BERT_SA(b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask)

        # Without labels, the first output is the logits.
        epoch_logits.append(outputs[0].detach().cpu())
        epoch_labels.append(b_labels.cpu())

    epoch_logits = torch.cat(epoch_logits)
    epoch_labels = torch.cat(epoch_labels).numpy()

    # Probability of class 1 for every validation sample, computed in one
    # batched softmax.
    preds = torch.softmax(epoch_logits, dim=1)[:, 1].numpy()

    # Score the whole validation set at once. Averaging per-batch macro-F1
    # values (as before) is not the F1 of the full set and weights a smaller
    # final batch the same as a full one.
    eval_accuracy, f1 = flat_accuracy(epoch_logits.numpy(), epoch_labels)
    auc = roc_auc_score(epoch_labels, preds)

    print(" Accuracy: {0:.4f}".format(eval_accuracy))
    print(" F1 score: {0:.4f}".format(f1))
    print(" ROC-AUC score: {0:.4f}".format(auc))
    # Keep only the checkpoint with the best validation ROC-AUC; the printed
    # number is the zero-based index of the epoch that produced it.
    if auc > max_auc:
        max_auc = auc
        BERT_SA.save_pretrained("/kaggle/working/bert-sa")
        print(epoch_i)

print("Training complete!")

# Prediction on test set

In [None]:
df_ts = pd.read_csv(test_path)

# Cast to str first so missing comments do not break the string functions,
# then run the same preprocessing as for the training text.
test_text = df_ts['Comment'].astype(str).apply(preprocessing)

# Encode every sentence exactly as the training cell does.
test_ids = []
for sent in test_text:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    test_ids.append(encoded_sent)

# Pad and truncate on the same sides as the training and validation inputs
# (padding="post", truncating="pre"). The test inputs were previously padded at
# the front and truncated at the back, so the model was scored on a sequence
# layout it had never been trained on.
test_ids = pad_sequences(test_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="pre", padding="post")

# Attention masks: 1 where the token id is greater than 0, 0 on padding.
test_masks = [[int(token_id > 0) for token_id in sent] for sent in test_ids]

In [None]:
# Tensors for the test ids and their attention masks.
test_inputs = torch.tensor(test_ids)
test_masks = torch.tensor(test_masks)

# Sequential batches of 32 keep the predictions in the order of the test file.
test_data = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=32,
    sampler=test_sampler,
)

In [None]:
# Probability of class 1 for every test sample, in dataloader order.
prediction = []
# Reload the checkpoint saved by the training loop.
BERT = RobertaForSequenceClassification.from_pretrained("/kaggle/working/bert-sa").to(device)
BERT.eval()
for batch in notebook.tqdm(test_dataloader):
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = BERT(b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)

    # One batched softmax on the logits replaces the per-row TensorFlow call
    # (and the loop variable that shadowed the builtin `input`).
    probs = torch.softmax(outputs[0], dim=1)[:, 1]
    prediction.extend(probs.cpu().numpy())

In [None]:
# Turn the collected per-sample probabilities into a single NumPy array.
prediction = np.array(prediction)

In [None]:
# Pair every review id from the test file with its predicted probability and
# write the result to harry.csv.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra first column - confirm the expected submission format.
RevId_ts = df_ts['RevId']
d = dict(RevId=RevId_ts, Rating=prediction)
f = pd.DataFrame(d)
f.to_csv("harry.csv")