# Settings and Installing

In [None]:
# Install third-party packages with pip; only emoji is pinned to a version (1.7),
# the other three are installed at whatever version pip resolves.
!pip install transformers
!pip install fastBPE
!pip install fairseq
!pip install emoji==1.7

In [None]:
!pip install vncorenlp

!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/!

In [None]:
# Download the PhoBERT base archive and extract it into the current working directory.
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz

In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

# Location of the PhoBERT files (BPE codes and dictionary) read below.
PHOBERT_DIR = "/kaggle/working/PhoBERT_base_transformers"

# fastBPE takes an argparse-style namespace, so the BPE codes path is supplied
# through a parser whose only option defaults to the extracted file.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--bpe-codes',
    type=str,
    required=False,
    default=PHOBERT_DIR + "/bpe.codes",
    help='path to fastBPE BPE',
)
# parse_known_args collects unrecognised command-line arguments in `unknown`
# instead of failing on them.
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Vocabulary used later to map BPE sub-words to integer ids.
vocab = Dictionary()
vocab.add_from_file(PHOBERT_DIR + "/dict.txt")

In [None]:
import tensorflow
import pandas as pd
import numpy as np
import re
import regex

In [None]:
# Install underthesea; word_tokenize and underthesea.text_normalize are used by
# the text preprocessing functions defined further down.
!pip install underthesea
import underthesea
from underthesea import word_tokenize

# Import data

In [None]:
# Every input file is read from the same dataset directory.
DATA_DIR = "/kaggle/input/newdatafoody"

# Labelled training reviews and unlabelled test reviews.
train_path = DATA_DIR + "/train-set.csv"
test_path = DATA_DIR + "/test-set.csv"

# Word lists: positive, negative, negation and intensifier terms.
pos_path = DATA_DIR + "/pos.txt"
neg_path = DATA_DIR + "/neg.txt"
not_path = DATA_DIR + "/not.txt"
intensifier_path = DATA_DIR + "/intensifier.txt"

In [None]:
def _load_lexicon(path):
    """Return the first column of a tab-separated word-list file as a Python list.

    NOTE(review): pd.read_table treats the first line of the file as a header,
    so that line is not part of the returned list -- confirm the word-list
    files really start with a header row.
    """
    return pd.read_table(path).iloc[:, 0].tolist()


train_text, train_label = [], []
test_id, test_text = [], []

# Word lists used both for data augmentation (below) and by sentiment_words().
pos_list = _load_lexicon(pos_path)
neg_list = _load_lexicon(neg_path)
not_list = _load_lexicon(not_path)
intensifier_list = _load_lexicon(intensifier_path)

# Column names are passed explicitly, so every line of the CSV is read as data
# (a header line, if the file has one, becomes an ordinary row).
df = pd.read_csv(
    train_path,
    names=["#", "RevId", "UserId", "Comment", "image_urls", "Rating"])

# Keep only rows whose Rating is the string '0' or '1'; any other row
# (e.g. a header line read as data) is dropped.
df = df[df['Rating'].isin(['0', '1'])].reset_index(drop=True)

df = df[['Comment', 'Rating']]

train_text = df['Comment']
train_labels = df['Rating']

# Augment the training set: each negative word becomes a sample labelled '0',
# each positive word a sample labelled '1', plus the two extreme markers that
# sentiment_words() can emit.
new_data = [[word, '0'] for word in neg_list]
new_data.extend([word, '1'] for word in pos_list)

new_data.append(["thậm tệ", '0'])
new_data.append(["xuất sắc", '1'])

aug_df_text = pd.Series([sample[0] for sample in new_data])
aug_df_labels = pd.Series([sample[1] for sample in new_data])

# Series.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same result as append(...).reset_index(drop=True).
train_text = pd.concat([train_text, aug_df_text], ignore_index=True)
train_labels = pd.concat([train_labels, aug_df_labels], ignore_index=True)

# Text preprocessing

In [None]:
def emojis_abbreviations(text):
    """Replace emojis, emoticons, abbreviations and slang in `text` via a fixed table.

    Every key of `replace_list` is replaced by its value with plain substring
    replacement (`str.replace`), one key after another in the order the keys
    appear in the dict. Each replacement works on the output of the previous
    one, so earlier entries can change what later keys see.

    Args:
        text: input string.

    Returns:
        The string after all replacements have been applied.

    NOTE(review): because of the sequential substring replacement, ':(' is
    replaced before ':((' is tried, so the ':((' key cannot match as written.
    NOTE(review): some keys appear more than once in the literal ('<3', ':3',
    ' tl ', ' mik '); the later occurrence is the one kept in the dict.
    NOTE(review): keys without surrounding spaces (e.g. 'j', 'not', 'wa')
    also match inside longer words -- confirm this is intended.
    """
    replace_list = {
        "👹": "tệ", "👻": "tốt", "💃": "tốt",'🤙': ' tốt ', '👍': ' tốt ',
        "💄": "tốt", "💎": "tốt", "💩": "tốt","😕": "tệ", "😱": "tệ", "😸": "tốt",
        "😾": "tệ", "🚫": "tệ",  "🤬": "tệ","🧚": "tốt", "🧡": "tốt",'🐶':' tốt ',
        '👎': ' tệ ', '😣': ' tệ ','✨': ' tốt ', '❣': ' tốt ','☀': ' tốt ',
        '♥': ' tốt ', '🤩': ' tốt ', 'like': ' tốt ', '💌': ' tốt ',
        '🤣': ' tốt ', '🖤': ' tốt ', '🤤': ' tốt ', ':(': ' tệ ', '😢': ' tệ ',
        '❤': ' tốt ', '😍': ' tốt ', '😘': ' tốt ', '😪': ' tệ ', '😊': ' tốt ',
        '?': ' ? ', '😁': ' tốt ', '💖': ' tốt ', '😟': ' tệ ', '😭': ' tệ ',
        '💯': ' tốt ', '💗': ' tốt ', '♡': ' tốt ', '💜': ' tốt ', '🤗': ' tốt ',
        '^^': ' tốt ', '😨': ' tệ ', '☺': ' tốt ', '💋': ' tốt ', '👌': ' tốt ',
        '😖': ' tệ ', '😀': ' tốt ', ':((': ' tệ ', '😡': ' tệ ', '😠': ' tệ ',
        '😒': ' tệ ', '🙂': ' tốt ', '😏': ' tệ ', '😝': ' tốt ', '😄': ' tốt ',
        '😙': ' tốt ', '😤': ' tệ ', '😎': ' tốt ', '😆': ' tốt ', '💚': ' tốt ',
        '✌': ' tốt ', '💕': ' tốt ', '😞': ' tệ ', '😓': ' tệ ', '️🆗️': ' tốt ',
        '😉': ' tốt ', '😂': ' tốt ', ':v': '  tốt ', '=))': '  tốt ', '😋': ' tốt ',
        '💓': ' tốt ', '😐': ' tệ ', ':3': ' tốt ', '😫': ' tệ ', '😥': ' tệ ',
        '😃': ' tốt ', '😬': ' 😬 ', '😌': ' 😌 ', '💛': ' tốt ', '🤝': ' tốt ', '🎈': ' tốt ',
        '😗': ' tốt ', '🤔': ' tệ ', '😑': ' tệ ', '🔥': ' tệ ', '🙏': ' tệ ',
        '🆗': ' tốt ', '😻': ' tốt ', '💙': ' tốt ', '💟': ' tốt ',
        '😚': ' tốt ', '❌': ' tệ ', '👏': ' tốt ', ';)': ' tốt ', '<3': ' tốt ',
        '🌝': ' tốt ',  '🌷': ' tốt ', '🌸': ' tốt ', '🌺': ' tốt ',
        '🌼': ' tốt ', '🍓': ' tốt ', '🐅': ' tốt ', '🐾': ' tốt ', '👉': ' tốt ',
        '💐': ' tốt ', '💞': ' tốt ', '💥': ' tốt ', '💪': ' tốt ',
        '💰': ' tốt ',  '😇': ' tốt ', '😛': ' tốt ', '😜': ' tốt ',
        '🙃': ' tốt ', '🤑': ' tốt ', '🤪': ' tốt ','☹': ' tệ ',  '💀': ' tệ ',
        '😔': ' tệ ', '😧': ' tệ ', '😩': ' tệ ', '😰': ' tệ ', '😳': ' tệ ',
        '😵': ' tệ ', '😶': ' tệ ', '🙁': ' tệ ',
        ':))': '  tốt ', '=)': ' tốt ',':>': ' tốt ','<3': ' tốt ', ':3': ' tốt ', 'ô kêi': ' ok ', 'okie': ' ok ', ' o kê ': ' ok ',
        'okey': ' ok ', 'ôkê': ' ok ', 'oki': ' ok ',' ô kê ': ' ok ', ' oke ':  ' ok ',' okay':' ok ','okê':' ok ',
        ' tks ': u' cám ơn ', 'thks': u' cám ơn ', 'thanks': u' cám ơn ', 'ths': u' cám ơn ', 'thank': u' cám ơn ',
        '⭐': 'star ', '*': 'star ', '🌟': 'star ', '🎉': u' tốt ',
        'kg ': u' không ', ' mk ': u' mình ', ' mik ': u' mình ', ' thik ' : u' thích ', 'not': u' không ',' nma ': u' nhưng mà ', u' kg ': u' không ', '"k ': u' không ',' kh ':u' không ','kô':u' không ','hok':u' không ',' kp ': u' không phải ',u' kô ': u' không ', '"ko ': u' không ', u' ko ': u' không ', u' k ': u' không ', 'khong': u' không ', u' hok ': u' không ',
        'he he': ' tốt ','hehe': ' tốt ','hihi': ' tốt ', 'haha': ' tốt ', 'hjhj': ' tốt ',
        ' lol ': ' tệ ',' cc ': ' tệ ','cute': u' dễ thương ','huhu': ' tệ ', ' vs ': u' với ', 'wa': ' quá ', 'wá': u' quá', 'j': u' gì ', '“': ' ',
        ' sz ': u' cỡ ', 'size': u' cỡ ', u' đx ': u' được ', 'dk': u' được ', 'dc': u' được ', 'đk': u' được ',
        'đc': u' được ', ' qly ': u' quản lý ','authentic': u' chuẩn chính hãng ',u' aut ': u' chuẩn chính hãng ', u' auth ': u' chuẩn chính hãng ', 'thick': u' tốt ', 'store': u' cửa hàng ',
        'shop': u' cửa hàng ', ' sp ': u' sản phẩm ',u' nv ': u' nhân viên ',' lớm ': u' lắm ', 'gud': u' tốt ','god': u' tốt ','wel done':' tốt ', 'good': u' tốt ', 'gút': u' tốt ',
        'sấu': u' xấu ','gut': u' tốt ', u' tot ': u' tốt ', u' nice ': u' tốt ', 'perfect': 'rất tốt', 'bt': u' bình thường ',
        'time': u' thời gian ', 'dễ tìm': 'dễ tìm tốt', 'qá': u' quá ', u' ship ': u' giao hàng ', u' m ': u' mình ', u' mik ': u' mình ',
        'ể': 'ể', 'product': 'sản phẩm', 'quality': 'chất lượng','chat':' chất ', 'excelent': 'hoàn hảo', 'bad': 'tệ','fresh': ' tươi ','sad': ' tệ ',
        ' date ': u' hạn sử dụng ', 'hsd': u' hạn sử dụng ','quickly': u' nhanh ', 'quick': u' nhanh ','fast': u' nhanh ','delivery': u' giao hàng ',u' síp ': u' giao hàng ',
        ' beautiful ': u' đẹp tốt vời ', u' tl ': u' trả lời ', u' r ': u' rồi ', u' shopE ': u' cửa hàng ',
        'chất lg': u' chất lượng ',u' sd ': u' sử dụng ',u' dt ': u' điện thoại ',u' nt ': u' nhắn tin ',u' tl ': u' trả lời ',u' sài ': u' xài ',u'bjo':u' bao giờ ',
        'thik': u' thích ',u' sop ': u' cửa hàng ', ' fb ': ' facebook ', ' face ': ' facebook ', ' very ': u' rất ',u'quả ng ':u' quảng  ',
        'dep': u' đẹp ',u' xau ': u' xấu ','delicious': u' ngon ', u'hàg': u' hàng ', u'qủa': u' quả ',
        ' iu ': u' yêu ',' fake ': u' giả mạo ', 'trl': 'trả lời', ' >< ': u' tốt ',
        ' por ': u' tệ ',' poor ': u' tệ ', ' ib ':u' nhắn tin ', 'rep':u' trả lời ',u'fback':' feedback ','fedback':' feedback ',
        '6 sao': ' 5star ','6 star': ' 5star ', '5star': ' 5star ','5 sao': ' 5star ','5sao': ' 5star ',
        'starstarstarstarstar': ' 5star ', '1 sao': ' 1star ', '1sao': ' 1star ','2 sao':' 1star ','2sao':' 1star ',
        '2 starstar':' 1star ','1star': ' 1star ', '0 sao': ' 1star ', '0star': ' 1star ',}

    # Apply the replacements one by one, in dict insertion order.
    for k, v in replace_list.items():
        text = text.replace(k, v)

    return text

def elongated_words(text):
    """Collapse each run of a repeated ASCII letter into one upper-case letter.

    A run is two or more consecutive occurrences of the same letter, matched
    case-insensitively; the whole run is replaced by the upper-cased first
    letter of the run (e.g. "ngonnnn" -> "ngoN").
    """
    repeated_letter = re.compile(r'([A-Z])\1+', flags=re.IGNORECASE)

    def _collapse(match):
        return match.group(1).upper()

    return repeated_letter.sub(_collapse, text)

def lowercasing(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

def punctuation(text):
    """Replace every character that is neither a word character nor whitespace with a space."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub(' ', text)

def sentiment_words(text):
    """Rewrite negated / intensified sentiment words and append sentiment markers.

    The text is tokenized with `word_tokenize`, underscores inside tokens are
    turned into spaces, and each original token is then examined in order:

    * a token found in `not_list` followed (within a short look-ahead window)
      by a word from `pos_list` is replaced by 'tệ', or by 'tốt' when followed
      by a word from `neg_list`; the matched sentiment word is blanked out;
    * a token found in `intensifier_list` is handled the same way but is
      replaced by 'xuất_sắc' (positive) or 'thậm_tệ' (negative);
    * any token that is not an intensifier and is itself in `pos_list` /
      `neg_list` causes 'tốt' / 'tệ' to be appended at the end of the list.

    Args:
        text: input string.

    Returns:
        The tokens joined with single spaces. Blanked tokens remain in the
        list as empty strings, so the result can contain consecutive spaces.

    Relies on the module-level lists `not_list`, `intensifier_list`,
    `pos_list` and `neg_list`.
    """
    texts = word_tokenize(text)
    # Fixed before any append below, so appended markers are never re-scanned.
    len_text = len(texts)

    texts = [t.replace('_', ' ') for t in texts]
    for i in range(len_text):
        # Snapshot of the token: texts[i] may be overwritten further down.
        cp_text = texts[i]
        if cp_text in not_list: 
            # Look-ahead window: 2 tokens when at least 4 tokens follow,
            # otherwise every remaining token (which can be 3).
            # NOTE(review): the 3-token case looks unintended -- confirm.
            numb_word = 2 if len_text - i - 1 >= 4 else len_text - i - 1

            for j in range(numb_word):
                # Negation + positive word -> negative marker.
                if texts[i + j + 1] in pos_list:
                    texts[i] = 'tệ'
                    texts[i + j + 1] = ''

                # Negation + negative word -> positive marker.
                if texts[i + j + 1] in neg_list:
                    texts[i] = 'tốt'
                    texts[i + j + 1] = ''

        if cp_text in intensifier_list: 
            # Same window rule as for negation words.
            numb_word = 2 if len_text - i - 1 >= 4 else len_text - i - 1

            for j in range(numb_word):
                # Intensifier + positive word -> strong positive marker.
                if texts[i + j + 1] in pos_list:
                    texts[i] = 'xuất_sắc'
                    texts[i + j + 1] = ''

                # Intensifier + negative word -> strong negative marker.
                if texts[i + j + 1] in neg_list:
                    texts[i] = 'thậm_tệ'
                    texts[i + j + 1] = ''
                
        # This else pairs with the intensifier check only, so it also runs for
        # tokens that matched not_list above.
        # NOTE(review): possibly meant to be an if/elif/else chain -- confirm.
        else: 
            if cp_text in pos_list:
                texts.append('tốt')
            elif cp_text in neg_list:
                texts.append('tệ')

    text = " ".join(texts)
    return text

def irrelevant_symbol(text):
    """Replace every double-quote character in `text` with a space.

    Args:
        text: input string.

    Returns:
        The string with each '"' replaced by ' '.
    """
    # The replaced string must be returned; without the return the function
    # always yielded None.
    return text.replace(u'"', u' ')

def standardize(text):
    """Run the text-cleaning steps on `text`, in a fixed order, and return the result.

    Order: collapse elongated letters, lower-case, replace emojis and
    abbreviations, underthesea text normalization, strip punctuation,
    then rewrite sentiment words.
    """
    steps = (
        elongated_words,
        lowercasing,
        emojis_abbreviations,
        underthesea.text_normalize,
        punctuation,
        sentiment_words,
    )
    # Each step receives the output of the previous one.
    for step in steps:
        text = step(text)
    return text

def preprocessing(text):
    """Standardize `text`, then word-segment it with `word_tokenize(..., format="text")`."""
    cleaned = standardize(text)
    return word_tokenize(cleaned, format="text")

In [None]:
# Run the full preprocessing pipeline on every training sample.
train_text = train_text.map(preprocessing)

# Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation. The fixed random_state makes the
# split identical on every run, so results are comparable between runs.
train_sents, val_sents, train_labels, val_labels = train_test_split(
    train_text, train_labels, test_size=0.1, random_state=42)

# Byte Pair Encoding

In [None]:
from collections import defaultdict
# Number of token ids per sentence after padding/truncation
# (passed as `maxlen` to pad_sequences).
MAX_LEN = 250

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _encode_sentences(sentences):
    """BPE-encode each sentence and convert it to a list of vocabulary ids."""
    encoded = []
    for sentence in sentences:
        # Wrap the BPE sub-words in the sentence start/end markers.
        bpe_tokens = '<s> ' + bpe.encode(sentence) + ' </s>'
        ids = vocab.encode_line(bpe_tokens, append_eos=True, add_if_not_exist=False)
        encoded.append(ids.long().tolist())
    return encoded


def _pad_to_max_len(id_lists):
    """Pad (at the end, with 0) or truncate (from the start) every list to MAX_LEN ids."""
    return pad_sequences(id_lists, maxlen=MAX_LEN, dtype="long", value=0, truncating="pre", padding="post")


train_ids = _pad_to_max_len(_encode_sentences(train_sents))
val_ids = _pad_to_max_len(_encode_sentences(val_sents))

# Mask for transformers

In [None]:
def _build_masks(padded_ids):
    """Return, per sentence, a list with 1 where the token id is > 0 and 0 elsewhere."""
    return [[int(token_id > 0) for token_id in row] for row in padded_ids]


train_masks = _build_masks(train_ids)
val_masks = _build_masks(val_ids)

# Convert to tensors

In [None]:
# Labels are cast to int and then converted to NumPy arrays.
train_labels = np.array(train_labels.astype(int))
val_labels = np.array(val_labels.astype(int))

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# Convert ids, masks and labels to tensors.
train_inputs = torch.tensor(train_ids)
val_inputs = torch.tensor(val_ids)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

# Training batches are drawn in a new random order every epoch; with a
# sequential sampler the model would see identical batches in identical order
# on each of the epochs.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Validation keeps the original order: the training loop compares the
# collected predictions position-by-position with val_labels.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=32)

# Load RobertaForSequenceClassification model

In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW

# Directory holding the PhoBERT config and weights.
phobert_dir = "/kaggle/working/PhoBERT_base_transformers"

# Two output labels for the classification head.
config = RobertaConfig.from_pretrained(
    phobert_dir + "/config.json",
    from_tf=False,
    num_labels=2,
    output_hidden_states=False,
)
BERT_SA = RobertaForSequenceClassification.from_pretrained(phobert_dir + "/model.bin", config=config)
# Move the model to the GPU.
BERT_SA.cuda()

# Train model

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

def flat_accuracy(preds, labels):
    """Compute accuracy and macro-averaged F1 for one batch.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the index of its largest value (argmax along axis 1).
        labels: array of true class ids; it is flattened before scoring.

    Returns:
        A tuple `(accuracy, macro_f1)`.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    # scikit-learn metrics take (y_true, y_pred) in that order.
    macro_f1 = f1_score(labels_flat, pred_flat, average='macro')

    return accuracy_score(labels_flat, pred_flat), macro_f1

In [None]:
import random
from tqdm import notebook
# Fine-tune BERT_SA for `epochs` epochs. After each epoch the model is
# evaluated on the validation set and saved whenever the validation ROC-AUC
# improves on the best value seen so far.
device = 'cuda'
epochs = 7

# Two parameter groups: weight decay 0.01 for everything except parameters
# whose name contains one of the `no_decay` substrings, which get no decay.
param_optimizer = list(BERT_SA.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)
# Best validation ROC-AUC so far; decides when a checkpoint is written.
max_auc = 0


for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Per-epoch accumulators.
    total_loss = 0
    BERT_SA.train()
    train_accuracy = 0
    nb_train_steps = 0
    train_f1 = 0
    # Filled during validation with the softmax probability of class 1.
    preds = []
    
    for step, batch in notebook.tqdm(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        BERT_SA.zero_grad()
        # Labels are passed in, so outputs[0] is the loss and outputs[1] the logits.
        outputs = BERT_SA(b_input_ids,
            token_type_ids=None, 
            attention_mask=b_input_mask, 
            labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        
        # Batch-level training metrics (summed here, averaged after the loop).
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
        train_accuracy += tmp_train_accuracy
        train_f1 += tmp_train_f1
        nb_train_steps += 1
        
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(BERT_SA.parameters(), 1.0)
        optimizer.step()
        
    # NOTE: accuracy and F1 below are means of per-batch values, not metrics
    # computed over the whole set at once.
    avg_train_loss = total_loss / len(train_dataloader)
    print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
    print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))

    print("Running Validation...")
    BERT_SA.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_f1 = 0
    for batch in notebook.tqdm(val_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # No labels are passed, so outputs[0] is the logits.
            outputs = BERT_SA(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            # Softmax each row and keep the probability of class 1 for ROC-AUC.
            for i in range(len(logits)):
                input = logits[i]
                predict_train = tensorflow.nn.softmax(input)
                predict_train = predict_train[1].numpy()
                preds.append(predict_train)
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

            eval_accuracy += tmp_eval_accuracy
            eval_f1 += tmp_eval_f1
            nb_eval_steps += 1
            
    f1 = eval_f1/nb_eval_steps
    # preds is compared position-by-position with val_labels, so this relies on
    # the validation dataloader yielding samples in dataset order.
    auc = roc_auc_score(val_labels.numpy(), preds)

    print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
    print(" F1 score: {0:.4f}".format(f1))
    print(" ROC-AUC score: {0:.4f}".format(auc))
    # Save the model only when the validation ROC-AUC improves, then print the
    # zero-based index of the epoch that produced the checkpoint.
    if auc > max_auc:
        max_auc = auc
        BERT_SA.save_pretrained("/kaggle/working/bert-sa")
        print(epoch_i)

print("Training complete!")

# Prediction on test set

In [None]:
df_ts = pd.read_csv(test_path)

# Cast to str so that every entry can go through the string-based preprocessing.
test_text = df_ts['Comment']
test_text = test_text.astype(str)

for i in range(len(test_text)):
    test_text[i] = preprocessing(test_text[i])

# Same encoding as for the training data: BPE sub-words wrapped in <s> ... </s>,
# then mapped to vocabulary ids.
test_ids = []
for sent in test_text:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    test_ids.append(encoded_sent)

# Pad and truncate exactly as the training/validation data (padding="post",
# truncating="pre"). The model was fine-tuned on post-padded sequences, so
# pre-padding the test set would shift every token to different positions
# from the ones seen during training.
test_ids = pad_sequences(test_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="pre", padding="post")

# Attention mask: 1 for ids greater than 0, 0 otherwise.
test_masks = []
for sent in test_ids:
    mask = [int(token_id > 0) for token_id in sent]
    test_masks.append(mask)

In [None]:
# Tensors for the test ids and their masks.
test_inputs = torch.tensor(test_ids)
test_masks = torch.tensor(test_masks)

# Sequential sampling keeps the predictions in the same order as the test rows.
test_data = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=32,
    sampler=test_sampler,
)

In [None]:
# Reload the saved checkpoint and collect, for every test sample, the softmax
# probability of class 1.
prediction = []
BERT = RobertaForSequenceClassification.from_pretrained("/kaggle/working/bert-sa").to(device)
BERT.eval()
for batch in notebook.tqdm(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch

    with torch.no_grad():
        outputs = BERT(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
        )
        # No labels are passed, so the first output holds the logits.
        logits = outputs[0].detach().cpu().numpy()
        for row_logits in logits:
            class_probs = tensorflow.nn.softmax(row_logits)
            prediction.append(class_probs[1].numpy())

In [None]:
# Collect the per-sample probabilities into a single NumPy array.
prediction = np.array(list(prediction))

In [None]:
# Pair every review id with its predicted class-1 probability and write the
# table to harry.csv (the DataFrame index is written as the first column).
RevId_ts = df_ts['RevId']
d = dict(RevId=RevId_ts, Rating=prediction)
f = pd.DataFrame(data=d)
f.to_csv("harry.csv")