In [None]:
# Fetch the GSHSD project sources and move into its inner package directory;
# presumably this is what makes the project-local imports in the next cell
# (model, HSDDataset, utils) resolvable -- confirm against the repo layout.
!git clone https://github.com/HuangRihChang/GSHSD.git
%cd GSHSD/GSHSD
# List the working directory as a quick sanity check of the checkout.
!ls

In [None]:
import functools

import torch
from torch import nn
from model.model import TransformerEncoder
from model.metrices import F1Score
from model.losses.softmax import CenterLoss
from HSDDataset import ViHSDData

from torch.utils.data import DataLoader, RandomSampler
from tqdm.notebook import tqdm
# from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter
from utils import get_device
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import emoji
import re
import math

In [None]:
# Run-wide hyper-parameters and settings used by the cells below.
LR = 1e-4  # learning rate given to the optimizer; also used when rescaling center-loss gradients in train()
n_epochs = 25  # number of passes over the training loader
classes_num = 3  # number of classes, forwarded to TransformerEncoder and CenterLoss
batch_size = 64  # samples per batch for the train, dev and test DataLoaders
checkpoint_batch_size = 1024  # forwarded to TransformerEncoder; its exact meaning is defined there -- TODO confirm
max_len = 128  # passed to the encoder as max_seq_length and to make_batch as the padding/truncation length
device = get_device()  # device object returned by the project helper in utils

# TensorBoard summary writer created with default arguments.
writer = SummaryWriter()

In [None]:
def proprocess(x):
    """Normalise one raw utterance before it is fed to the tokenizer.

    The value is coerced to ``str``, emoji are removed, every run of space
    characters is collapsed to a single space, and the result is lower-cased
    and stripped of leading/trailing whitespace.

    Args:
        x: Raw cell value; any object accepted by ``str()``.

    Returns:
        The cleaned, lower-cased text.
    """
    without_emoji = emoji.replace_emoji(str(x), replace='')
    # Only literal spaces are collapsed; tabs and newlines are left as they are.
    single_spaced = re.sub(r" +", " ", without_emoji)
    return single_spaced.lower().strip()

In [None]:
import random

# Placeholder token written over the randomly masked span.
unk_token = '[UNK]'


def get_item(row, mask_rate=0.5):
    """Return a randomly masked copy of a row's text together with its label.

    The text is split on whitespace and one contiguous span of tokens is
    replaced by ``unk_token``. The span start is drawn uniformly over the
    token positions; its length is capped by a fraction of the token count,
    and that fraction is itself drawn uniformly from ``[0, mask_rate]``.

    Args:
        row: Mapping providing ``"free_text"`` (str) and ``"label_id"``.
        mask_rate: Upper bound of the masked fraction of tokens.

    Returns:
        Tuple ``(masked_text, label)``; tokens are re-joined with one space.
    """
    text = row["free_text"]
    label = row["label_id"]
    tokens = text.split()
    n_tokens = len(tokens)

    # The three draws below happen in a fixed order so that a seeded run
    # produces the same masks.
    sampled_rate = random.uniform(0., mask_rate)
    span_start = random.randint(0, n_tokens)
    span_stop = random.randint(span_start, n_tokens)

    # Never mask more than the sampled fraction of the utterance.
    longest_stop = span_start + int(n_tokens * sampled_rate)
    if longest_stop < span_stop:
        span_stop = longest_stop

    masked = list(tokens)
    masked[span_start:span_stop] = [unk_token] * (span_stop - span_start)
    return " ".join(masked), label

In [None]:
def model_collate_fn(batch):
    """Identity collate: hand the DataLoader's raw list of samples through unchanged.

    Tensor construction is done later by ``make_batch``. A named function is
    used instead of a lambda so the loader stays picklable if worker
    processes are ever enabled.
    """
    return batch


train_df = pd.read_csv("./data/vihsd/train.csv")

# Balance the classes by replication: every class is repeated until it is at
# least as large as the biggest class. Counts are taken from the labels that
# actually occur, so non-contiguous label ids cannot produce an empty class
# (and a division by zero), and all copies are concatenated in a single call
# instead of re-copying the growing frame inside a loop.
label_counts = train_df["label_id"].value_counts().sort_index()
if not label_counts.empty:
    largest_class = int(label_counts.max())
    oversampled_parts = [train_df]
    for label, count in label_counts.items():
        class_rows = train_df[train_df["label_id"] == label]
        n_copies = math.ceil(largest_class / count) - 1
        oversampled_parts.extend([class_rows] * n_copies)
    # The original index values are kept (duplicates included), exactly as
    # repeated pd.concat calls would leave them.
    train_df = pd.concat(oversampled_parts)

# Training dataset: text is cleaned by `proprocess` and randomly masked by
# `get_item` (passed as the augmentation function).
train_data = ViHSDData(train_df,
                 utterance_feild = "free_text",
                 label_feild="label_id",
                 text_preprocessor=proprocess,
                 augment_fnct=get_item,
                )
train_sampler = RandomSampler(train_data)
data_loader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, collate_fn=model_collate_fn)

def _load_eval_split(csv_path):
    """Build the frame, dataset, sampler and loader for one evaluation split.

    The split is read from ``csv_path``, wrapped in ``ViHSDData`` with the
    same text pre-processing as training but without augmentation, and served
    through a ``RandomSampler``-backed ``DataLoader``.

    Returns:
        Tuple ``(frame, dataset, sampler, loader)``.
    """
    frame = pd.read_csv(csv_path)
    dataset = ViHSDData(
        frame,
        utterance_feild="free_text",
        label_feild="label_id",
        text_preprocessor=proprocess,
    )
    sampler = RandomSampler(dataset)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        collate_fn=model_collate_fn,
    )
    return frame, dataset, sampler, loader


# Development split, evaluated after every epoch.
val_df, val_data, val_sampler, val_loader = _load_eval_split("./data/vihsd/dev.csv")

# Held-out test split, evaluated once training has finished.
test_df, test_data, test_sampler, test_loader = _load_eval_split("./data/vihsd/test.csv")

# Total number of optimizer steps for the run: batches per epoch times the
# number of epochs. The LR scheduler is sized from this value.
steps_per_epoch = len(data_loader)
training_step = steps_per_epoch * n_epochs
print(f"Total {training_step} training steps for this dataset")

In [None]:
# Model definition.
# Checkpoint used to initialise the encoder. Other checkpoints / locations
# mentioned for this cell: "vinai/phobert-base" and
# "../../../input/mbert-model/bert-base-multilingual-uncased".
model_path = "./weights/multiBERTuncased"

model = TransformerEncoder(model_path, classes_num = classes_num,
                            max_seq_length = max_len,
                            checkpoint_batch_size = checkpoint_batch_size,
                            dropout_rate = 0.5,
                            model_args = {"output_hidden_states":False}
                            )

# Classification loss plus a center loss computed on the encoder features.
cross_entropy_loss = nn.CrossEntropyLoss()
center_loss = CenterLoss(num_classes=classes_num,
                         feat_dim = model.get_word_embedding_dimension()
                        )

model.to(device)
cross_entropy_loss.to(device)
center_loss.to(device)

# Two parameter groups for the model: weights get a small weight decay,
# biases and LayerNorm parameters get none.
named_model_params = list(model.named_parameters())  # all params from pooler and transformer
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
model_params = [
    {'params': [p for n, p in named_model_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0001},
    {'params': [p for n, p in named_model_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

# The center-loss parameters go into their own group dict. An optimizer's
# parameter list must be either all tensors or all group dicts; appending bare
# tensors to the dict groups above is rejected by torch.optim.Optimizer.
# The group carries no explicit weight decay, so the optimizer default applies.
optimize_params = list(model_params)
center_params = list(center_loss.parameters())
if center_params:
    optimize_params.append({'params': center_params})

optimizer = AdamW(optimize_params, lr=LR)
# Linear warm-up over the first half of training, then linear decay to zero.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=training_step*0.5,
                                            num_training_steps=training_step
                                            )

# Optional: freeze the embedding layer of the encoder.
# for param in model.sent_encoder.embeddings.parameters():
#     param.requires_grad = False

In [None]:
def make_batch(batch, tokenizer, max_len="dynamic", device="cuda:0"):
    """Turn a list of ``(text, label)`` pairs into model inputs and a label tensor.

    Args:
        batch: Sequence of ``(text, label)`` pairs.
        tokenizer: Object exposing ``tokenize`` and ``batch_encode_plus``.
        max_len: Padding/truncation length, or ``"dynamic"`` to use the
            longest tokenised text in the batch plus two extra positions.
        device: Device the resulting tensors are moved to.

    Returns:
        Tuple ``(inputs, labels)`` where ``inputs`` is a dict holding the
        ``"input_ids"`` and ``"attention_mask"`` LongTensors and ``labels``
        is a LongTensor of the batch labels.
    """
    text_list = []
    label_list = []
    for text, label in batch:
        text_list.append(text)
        label_list.append(label)

    if max_len == "dynamic":
        # +2 leaves room for the two special tokens added around each text.
        token_counts = [len(tokenizer.tokenize(text)) + 2 for text in text_list]
        max_len = int(np.max(token_counts))

    label_tensor = torch.LongTensor(label_list).to(device)
    encoded = tokenizer.batch_encode_plus(
        text_list,
        max_length=max_len,
        padding='max_length',
        truncation=True,
    )
    inputs = {
        key: torch.LongTensor(encoded[key]).to(device)
        for key in ("input_ids", "attention_mask")
    }
    return inputs, label_tensor

In [None]:
def train(data_loader, model, cross_entropy_loss, optimizer, scheduler, train_step, epoch=None):
    """Run one pass over ``data_loader``, updating the model and the center loss.

    Besides its arguments the function reads these notebook-level names:
    ``center_loss``, ``make_batch``, ``max_len``, ``device`` and ``LR``.

    Args:
        data_loader: Iterable of raw batches accepted by ``make_batch``.
        model: Callable returning ``(logits, features)``; must expose
            ``tokenizer``.
        cross_entropy_loss: Classification loss applied to the logits.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch.
        train_step: Number of steps taken before this call.
        epoch: Epoch number shown in the progress bar. When omitted, the
            notebook-level loop variable ``e`` is used if it exists, so
            existing callers keep working; ``None`` is shown otherwise.

    Returns:
        The updated step count (``train_step`` plus the number of batches).
    """
    if epoch is None:
        # Backward compatibility: older callers rely on a global loop variable.
        epoch = globals().get("e")

    center_weight = 0.5  # weight of the center loss in the total loss
    center_lr = 0.01     # learning rate intended for the class centres

    f1 = F1Score()
    tk = tqdm(data_loader)
    # Training mode is set once; nothing inside the loop needs eval mode
    # because the metric is computed from logits that already exist. The
    # model is left in training mode on return.
    model.train()
    for x in tk:
        inputs, labels = make_batch(x, model.tokenizer, max_len=max_len, device=device)
        optimizer.zero_grad()
        logits, features = model(inputs)
        celoss = cross_entropy_loss(logits, labels)
        loss = center_weight*center_loss(features, labels) + celoss
        loss.backward()

        # Rescale the centre gradients by lr_cent / (weight * lr): this undoes
        # the loss weight and swaps the base learning rate for `center_lr`.
        for param in center_loss.parameters():
            if param.grad is not None:
                param.grad.data *= (center_lr / (center_weight * LR))

        optimizer.step()
        scheduler.step()

        with torch.no_grad():
            # argmax over logits equals argmax over their softmax.
            predict = torch.argmax(logits, dim=-1)
            # Labels are shifted by one before being handed to F1Score,
            # as in the evaluation routine.
            macro_f1 = f1(predict+1, labels+1, "macro")
            tk.set_postfix(Epoch=epoch, step=train_step, loss=loss.item(), f1=macro_f1[0].item())
        train_step += 1
    return train_step


def evaluation(dev_loader, model, cross_entropy_loss, epoch=None):
    """Evaluate ``model`` on every batch of ``dev_loader`` and print a summary.

    Besides its arguments the function reads these notebook-level names:
    ``make_batch``, ``max_len`` and ``device``.

    Args:
        dev_loader: Iterable of raw batches accepted by ``make_batch``.
        model: Callable returning ``(logits, features)``; must expose
            ``tokenizer``.
        cross_entropy_loss: Loss applied to the concatenated logits.
        epoch: Epoch number used in the printed summary. When omitted, the
            notebook-level loop variable ``e`` is used if it exists, so
            existing callers keep working; ``None`` is printed otherwise.

    Returns:
        Tuple ``(logits, labels, loss, macro_f1)`` computed over the whole
        loader; ``macro_f1`` is whatever ``F1Score`` returns.
    """
    if epoch is None:
        # Backward compatibility: older callers rely on a global loop variable.
        epoch = globals().get("e")

    f1 = F1Score()
    logits_list, labels_list = [], []
    model.eval()
    with torch.no_grad():
        for x in dev_loader:
            inputs, labels = make_batch(x, model.tokenizer, max_len=max_len, device=device)
            logits, _ = model(inputs)
            logits_list.append(logits)
            labels_list.append(labels)
        # Metrics are computed once over the full split rather than averaged
        # per batch.
        labels = torch.cat(labels_list, dim=0)
        logits = torch.cat(logits_list, dim=0)
        # argmax over logits equals argmax over their softmax.
        predicts = torch.argmax(logits, dim=-1)
        # Labels are shifted by one before being handed to F1Score,
        # as in the training routine.
        macro_f1 = f1(predicts+1, labels+1, "macro")
        loss = cross_entropy_loss(logits, labels)
        print(f"Epoch {epoch} - loss: {loss.item()} - F1: {macro_f1[0].item()}")
    return logits, labels, loss, macro_f1

In [None]:
# Running count of optimizer steps, threaded through train() across epochs.
train_step = 0
# NOTE: the loop variable must stay named `e` -- train() and evaluation()
# read it as a global for their progress-bar / log output.
for e in range(n_epochs):
    train_step = train(data_loader, model, cross_entropy_loss, optimizer, scheduler, train_step)
    # Report dev-set loss and macro-F1 after every epoch.
    evaluation(val_loader, model, cross_entropy_loss)
# One pass over the test split once all epochs have finished.
evaluation(test_loader, model, cross_entropy_loss)

In [None]:
# Evaluate on the test split and keep the raw outputs for further analysis.
logits, labels, loss, macro_f1 = evaluation(test_loader, model, cross_entropy_loss)

# Convert to NumPy arrays on the CPU: predicted class ids and true labels.
class_probs = torch.softmax(logits, dim=-1)
predicts = class_probs.argmax(dim=-1).detach().cpu().numpy()
labels = labels.detach().cpu().numpy()

# Display the test loss and macro-F1 as the cell output.
loss, macro_f1

In [None]:
# Confusion matrix for the test-split predictions computed in the previous
# cell; with scikit-learn's (y_true, y_pred) argument order, rows are true
# labels and columns are predicted labels.
confusion_matrix(labels, predicts)