<a href="https://colab.research.google.com/github/mynhungg/Datamining/blob/nguyen-anh-kiet/PhoBERT_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download package

In [None]:
!pip3 install fairseq

In [None]:
!pip3 install fastbpe

In [None]:
!pip3 install vncorenlp

In [None]:
!pip3 install transformers

In [None]:
!pip3 install sentencepiece

In [None]:
# Stop-word list read later by get_stopwords_list() from the working directory.
!wget https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords-dash.txt

In [None]:
# Pretrained PhoBERT (fairseq format). The notebook later reads dict.txt, bpe.codes and
# model.pt from the PhoBERT_base_fairseq/ directory.
!wget https://public.vinai.io/PhoBERT_base_fairseq.tar.gz
!tar -xzvf PhoBERT_base_fairseq.tar.gz

In [None]:
# Target directory for the VnCoreNLP jar and the word-segmenter model files moved in below.
!mkdir -p vncorenlp/models/wordsegmenter

In [None]:
# VnCoreNLP jar; loaded later from vncorenlp/VnCoreNLP-1.1.1.jar.
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 

In [None]:
# Word-segmenter vocabulary file; presumably located by VnCoreNLP relative to the jar — verify.
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!mv vi-vocab vncorenlp/models/wordsegmenter/

In [None]:
# Word-segmenter rule file; presumably located by VnCoreNLP relative to the jar — verify.
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

In [None]:
# Fetch the VNTC repository and list the 10-topic dataset folder that holds the archives
# extracted in the next cell.
!git clone https://github.com/duyvuleo/VNTC.git
!ls VNTC/Data/10Topics/Ver1.1

In [None]:
# Extract both archives into the working directory; presumably this yields the Train_Full/
# and Test_Full/ trees matched by train_path / test_path below.
!unrar x VNTC/Data/10Topics/Ver1.1/Train_Full.rar
!unrar x VNTC/Data/10Topics/Ver1.1/Test_Full.rar

# Read data

## Import libraries

In [None]:
import glob2
import torch
import numpy as np
from tqdm import tqdm
from vncorenlp import VnCoreNLP
from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE

## Vietnamese word segmentation

In [None]:
# Start VnCoreNLP from the downloaded jar with only the "wseg" (word segmentation) annotator,
# passing -Xmx500m as the maximum Java heap size.
# NOTE(review): `rdrsegmenter` is not referenced again in the visible notebook — confirm whether
# the texts were meant to be word-segmented before BPE encoding.
rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size="-Xmx500m")

## Path definition

In [None]:
# Glob patterns matching every .txt file exactly one directory below Train_Full/ and Test_Full/
# (relative to the working directory).
train_path = 'Train_Full/*/*.txt'
test_path = 'Test_Full/*/*.txt'

## Function definition

In [None]:
def read_txt(path):
  """Return the entire content of the UTF-16 encoded text file at `path`."""
  with open(path, 'r', encoding='utf-16') as handle:
    return handle.read()

In [None]:
def get_stopwords_list():
  """Load the stop-word file from the working directory.

  Each line of 'vietnamese-stopwords-dash.txt' is stripped of surrounding
  whitespace and duplicates are dropped.

  Returns:
    list of str: the distinct stripped lines, in no particular order.
  """
  unique_words = set()
  with open('vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as handle:
    for raw_line in handle:
      unique_words.add(raw_line.strip())
  return list(unique_words)

In [None]:
# Stop words are loaded once when this cell runs and shared by the helpers defined below.
stopwords = get_stopwords_list()
# Running counter that the stop-word helpers update through `global cnt`.
cnt = 0

def remove_stopwords(texts):
  """Remove every stop word from a space-separated string.

  The previous implementation used `list.remove`, which drops only the first
  occurrence of each stop word, so repeated stop words stayed in the text.

  Args:
    texts (str): tokens separated by single spaces.

  Returns:
    str: the remaining tokens joined by single spaces, in their original order.

  Side effects:
    Adds the number of removed tokens to the module-level counter `cnt`.
  """
  global cnt
  stop_set = set(stopwords)  # set membership instead of scanning the token list per stop word
  tokens = texts.split(' ')
  kept = [tok for tok in tokens if tok not in stop_set]
  cnt += len(tokens) - len(kept)
  return ' '.join(kept)

In [None]:
def remove_stopwords2(words_arr):
  """Remove every stop word from each token list, in place.

  The previous implementation used `list.remove`, which drops only the first
  occurrence of each stop word per list.

  Args:
    words_arr (iterable of list): one mutable token list per document.

  Returns:
    The same `words_arr` object, whose inner lists have been filtered in place.

  Side effects:
    Adds the number of removed tokens to the module-level counter `cnt`.
  """
  global cnt
  stop_set = set(stopwords)
  for words in words_arr:
    kept = [tok for tok in words if tok not in stop_set]
    cnt += len(words) - len(kept)
    words[:] = kept  # slice assignment keeps the caller's list object, as before
  return words_arr

In [None]:
def make_data(path):
  """Read every file matching a glob pattern and label it by its parent directory.

  The previous version took the label from `file_path.split('/')[9]`. For the
  relative patterns used in this notebook that index does not exist, and the
  bare `except` hid the IndexError, so every file was silently dropped.

  Args:
    path (str): glob pattern; each match is read with read_txt().

  Returns:
    tuple(list of str, list of str): file contents and, aligned with them, the
    name of the directory that directly contains each file.

  Files that cannot be opened or decoded are skipped (best effort) and the
  number skipped is printed.
  """
  import os  # local import: this function is called before the notebook's later `import os` cell

  texts = []
  labels = []
  skipped = 0
  for file_path in tqdm(glob2.glob(path)):
    try:
      content = read_txt(file_path)
    except (OSError, UnicodeError):
      skipped += 1
      continue
    texts.append(content)
    labels.append(os.path.basename(os.path.dirname(file_path)))
  if skipped:
    print(f'make_data: skipped {skipped} unreadable file(s) matching {path!r}')
  return texts, labels

## Save and Load function

In [None]:
import pickle

In [None]:
def _save_pkl(path, obj):
  with open(path, 'wb') as f:
    pickle.dump(obj, f)

In [None]:
def _load_pkl(path):
  with open(path, 'rb') as f:
    obj = pickle.load(f)
  return obj

## Split text and label

In [None]:
# Read every training and test document together with its label.
text_train, label_train = make_data(train_path)
text_test, label_test = make_data(test_path)

In [None]:
# Persist the raw texts and labels as pickles in the working directory.
_save_pkl('text_train.pkl', text_train)
_save_pkl('label_train.pkl', label_train)
_save_pkl('text_test.pkl', text_test)
_save_pkl('label_test.pkl', label_test)

# Tokenize content

## Tokenize

In [None]:
max_seq_len = 256

def convert_lines(lines, vocab, bpe):
  """Encode texts into a fixed-width matrix of token ids.

  Each text is wrapped as '<s> ... </s>', BPE-encoded with `bpe.encode` and
  mapped to ids with `vocab.encode_line` (unknown tokens are not added to the
  vocabulary). Rows shorter than `max_seq_len` are right-padded with id 1.
  Longer rows are cut to `max_seq_len` and their last id is overwritten with 2.

  Args:
    lines: sequence of str.
    vocab: object exposing `encode_line(...)` whose result supports `.long().tolist()`.
    bpe: object exposing `encode(str)`.

  Returns:
    np.ndarray of shape (len(lines), max_seq_len) and dtype int32.
  """
  eos_id, pad_id = 2, 1
  n_rows = len(lines)
  # Start from an all-padding matrix so only the real ids need to be written.
  outputs = np.full((n_rows, max_seq_len), pad_id, dtype=np.int32)

  for idx, text in tqdm(enumerate(lines), total=n_rows):
    pieces = bpe.encode('<s> ' + text + ' </s>')
    ids = vocab.encode_line(pieces, append_eos=False, add_if_not_exist=False).long().tolist()
    if len(ids) > max_seq_len:
      # Truncate and force the final position to the end-of-sequence id.
      ids = ids[:max_seq_len - 1] + [eos_id]
    outputs[idx, :len(ids)] = ids
  return outputs

## Load Dictionary

In [None]:
# Load the vocabulary file shipped with the pretrained model into a fairseq Dictionary.
vocab = Dictionary()
vocab.add_from_file("PhoBERT_base_fairseq/dict.txt")

# Minimal argument holder: its only attribute is `bpe_codes`, the path to the BPE codes file.
# An instance of it is what gets passed to fastBPE.
class BPE():
  bpe_codes = 'PhoBERT_base_fairseq/bpe.codes'
args = BPE()
bpe = fastBPE(args)

## Encoding

# Training

## Import libraries

In [None]:
import os
import time
import random

from torch import nn

from transformers import *
from transformers.modeling_utils import * 

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

## Argument definition

In [None]:
# Training configuration shared by the cells below.
EPOCHS = 10  # the training loop iterates over range(EPOCHS + 1)
BATCH_SIZE = 16  # batch size of the train/validation DataLoaders
ACCUMULATION_STEPS = 5  # gradient-accumulation interval used by train_on_epoch()
FOLD = 4  # NOTE(review): not referenced in the visible notebook
LR = 2e-5  # learning rate passed to AdamW
LR_DC_STEP = 80  # NOTE(review): not referenced in the visible notebook
LR_DC = 0.1  # NOTE(review): not referenced in the visible notebook
CUR_DIR = os.path.dirname(os.getcwd())  # parent of the working directory; NOTE(review): not referenced below
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # GPU when available, else CPU
CKPT_PATH = 'model_ckpt_full_both'  # directory the training loop writes model.bin into
SEED = 69  # value passed to seed_everything()

## Preparing

In [None]:
# Encode the training texts into a (len(text_train), max_seq_len) array of token ids.
X = convert_lines(text_train, vocab, bpe)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map the label strings to integer class ids. fit_transform() already fits the encoder,
# so the separate fit() call that used to precede it was redundant.
lb = LabelEncoder()
y = lb.fit_transform(label_train)

In [None]:
# Persist the encoded inputs and integer labels under an absolute Google Drive path.
# NOTE(review): nothing in the visible notebook mounts Drive or creates this folder —
# presumably both must exist before this cell runs.
_save_pkl('/content/drive/MyDrive/Colab Notebooks/pkl/X_train_both.pkl', X)
_save_pkl('/content/drive/MyDrive/Colab Notebooks/pkl/y_train_both.pkl', y)

In [None]:
def seed_everything(SEED):
  """Seed every random number generator this notebook relies on.

  Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices), then
  asks cuDNN for deterministic kernels and disables its autotuner, whose
  algorithm choice can vary from run to run.

  Args:
    SEED (int): seed value applied to all generators.
  """
  random.seed(SEED)
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

In [None]:
# Seed the RNGs before the model, optimizer and data split are created.
seed_everything(SEED)

In [None]:
# Create the checkpoint directory if needed; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(CKPT_PATH, exist_ok=True)

In [None]:
# Restore the encoded training data. The working directory is tried first (the location this
# cell has always read from), then the Google Drive folder the save cell writes to. If neither
# holds both pickles, reuse the arrays built earlier in this session so a fresh
# Restart & Run All does not stop on a missing file.
for _pkl_dir in ('.', '/content/drive/MyDrive/Colab Notebooks/pkl'):
  _x_path = os.path.join(_pkl_dir, 'X_train_both.pkl')
  _y_path = os.path.join(_pkl_dir, 'y_train_both.pkl')
  if os.path.exists(_x_path) and os.path.exists(_y_path):
    X_train = _load_pkl(_x_path)
    y = _load_pkl(_y_path)
    break
else:
  # No cached copy: `X` and `y` come from the "Preparing" cells above.
  X_train = X

## Load the pretrained PhoBERT model

In [None]:
from fairseq.models.roberta import RobertaModel

# Load the checkpoint file model.pt from the extracted PhoBERT_base_fairseq/ directory.
phoBERT = RobertaModel.from_pretrained('PhoBERT_base_fairseq', checkpoint='model.pt')
# Attach the fastBPE encoder created in "Load Dictionary" to the loaded model object.
phoBERT.bpe = bpe
# Start in evaluation mode; train_on_epoch() calls .train() on the model it is given.
phoBERT.eval()

In [None]:
# Add a 10-class classification head under the name 'new_task'; the same name is passed to
# predict() during training, validation and testing.
phoBERT.register_classification_head('new_task', num_classes=10)
# The optimizer setup and the training loop refer to the network as `model`, which was never
# defined (NameError on a fresh kernel). Bind it to the same object here.
model = phoBERT
phoBERT.to(DEVICE)

## Creating optimizer and learning rate schedulers

In [None]:
# Split the parameters into two AdamW groups: those whose name contains one of the
# `no_decay` markers get weight_decay 0.0, all others 0.01.
# NOTE(review): the 'LayerNorm.*' markers look like HuggingFace-style names — confirm they
# match this model's parameter names, otherwise only 'bias' has any effect.
param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

_decay_params = []
_no_decay_params = []
for _name, _param in param_optimizer:
  if any(marker in _name for marker in no_decay):
    _no_decay_params.append(_param)
  else:
    _decay_params.append(_param)

optimizer_grouped_parameters = [
  {'params': _decay_params, 'weight_decay': 0.01},
  {'params': _no_decay_params, 'weight_decay': 0.0},
]

# Number of optimizer updates the linear schedule is stretched over.
num_train_optimization_steps = int(EPOCHS*len(X_train)/BATCH_SIZE/ACCUMULATION_STEPS)

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LR)

# `scheduler`: linear warmup over 100 steps followed by linear decay.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=100,
  num_training_steps=num_train_optimization_steps,
)

# `scheduler0`: constant learning rate, used by the training loop while `frozen` is True.
scheduler0 = get_constant_schedule(optimizer)

# Negative log-likelihood loss, applied to the outputs of model.predict().
criteria = nn.NLLLoss()

## Function definition

In [None]:
def evaluate(logits, targets):
  """Compute accuracy and weighted F1 for one set of class scores.

  Args:
    logits (torch.Tensor): per-class scores; the predicted class is the argmax over axis 1.
    targets (torch.Tensor): true class ids.

  Returns:
    tuple: (accuracy, weighted F1) as returned by the scikit-learn metric functions.
  """
  scores = logits.detach().cpu().numpy()
  labels_true = targets.detach().cpu().numpy()
  labels_pred = scores.argmax(axis=1)

  accuracy = accuracy_score(labels_true, labels_pred)
  weighted_f1 = f1_score(labels_true, labels_pred, average='weighted')
  return accuracy, weighted_f1

In [None]:
def validate(valid_loader, model, device):
  """Evaluate `model` on a validation loader and return dataset-level metrics.

  The previous version averaged per-batch accuracy, F1 and loss. That gives a
  smaller final batch the same weight as a full one, and the mean of per-batch
  weighted F1 scores is not the F1 of the whole set. Metrics are now computed
  once over all predictions, and the loss is averaged per sample.

  Args:
    valid_loader: iterable of (x_batch, y_batch) tensor pairs.
    model: object exposing eval() and predict('new_task', x_batch).
    device: torch device the batches are moved to.

  Returns:
    tuple: (accuracy, f1, mean loss). All three are NaN when the loader is empty.

  Uses the module-level `criteria` loss and the `evaluate` helper.
  """
  model.eval()
  prob_chunks = []
  target_chunks = []
  loss_sum = 0.0
  n_samples = 0

  with torch.no_grad():
    for x_batch, y_batch in valid_loader:
      x_batch = x_batch.to(device)
      y_batch = y_batch.to(device)

      outputs = model.predict('new_task', x_batch)

      batch_size = len(y_batch)
      # Assumes `criteria` returns the batch mean (the default reduction of the NLLLoss
      # created in this notebook), so multiplying by the batch size recovers the batch sum.
      loss_sum += criteria(outputs, y_batch).item() * batch_size
      n_samples += batch_size

      # The outputs are exponentiated before scoring, exactly as before.
      prob_chunks.append(torch.exp(outputs))
      target_chunks.append(y_batch)

  if n_samples == 0:
    nan = float('nan')
    return nan, nan, nan

  mean_acc, mean_f1 = evaluate(torch.cat(prob_chunks), torch.cat(target_chunks))
  mean_loss = loss_sum / n_samples
  return mean_acc, mean_f1, mean_loss

In [None]:
def train_on_epoch(train_loader, model, optimizer, scheduler, epoch, num_epochs, criteria, device, log_aggr = 100):
  """Train `model` for one pass over `train_loader` with gradient accumulation.

  Gradients are accumulated over ACCUMULATION_STEPS consecutive batches (and
  over whatever remains at the end of the loader) before the optimizer and
  scheduler step. The previous condition `i % ACCUMULATION_STEPS == 0` fired
  at i == 0, so the first update happened after a single batch.

  Args:
    train_loader: sized iterable of (x_batch, y_batch) tensor pairs.
    model: object exposing train() and predict('new_task', x_batch).
    optimizer: object exposing step() and zero_grad().
    scheduler: object exposing step(); advanced once per optimizer step.
    epoch (int): zero-based epoch index, used only for logging.
    num_epochs (int): total number of epochs, used only for logging.
    criteria: loss callable applied to (predictions, targets).
    device: torch device the batches are moved to.
    log_aggr (int): a progress line is printed every `log_aggr` batches and on the last one.

  Returns:
    None. Progress is printed. Uses the module-level `evaluate` helper and
    the ACCUMULATION_STEPS constant.
  """
  model.train()
  sum_epoch_loss = 0
  sum_acc = 0
  sum_f1 = 0
  start = time.time()
  train_size = len(train_loader)

  for i, (x_batch, y_batch) in enumerate(train_loader):
    x_batch = x_batch.to(device)
    y_batch = y_batch.to(device)

    y_pred = model.predict('new_task', x_batch)
    logits = torch.exp(y_pred)
    acc, f1 = evaluate(logits, y_batch)
    loss = criteria(y_pred, y_batch)

    loss.backward()
    # Step once every ACCUMULATION_STEPS batches, plus once for a trailing partial group.
    if (i + 1) % ACCUMULATION_STEPS == 0 or i == train_size - 1:
      optimizer.step()
      optimizer.zero_grad()
      scheduler.step()

    loss_val = loss.item()
    sum_epoch_loss += loss_val
    sum_acc += acc
    sum_f1 += f1

    if i % log_aggr == 0 or i == train_size - 1:
      # max(..., 1e-9) guards against a zero elapsed time on coarse clocks.
      print(f'[TRAIN] epoch {epoch + 1}/{num_epochs}, \
        observation {i}/{train_size} batch, \
        loss: {loss_val:>.4f} (avg {(sum_epoch_loss / (i + 1)):>.4f}), \
        avg acc: {(sum_acc / (i + 1)):>.4f}, \
        avg f1: {(sum_f1 / (i + 1)):>.4f}, \
        ({(len(x_batch) / max(time.time() - start, 1e-9)):>.2f} im/s)')
    start = time.time()

In [None]:
# A single stratified shuffle split holding out 10% of the training data (fixed random_state),
# materialised as a list of (train_idx, val_idx) pairs.
splits = list(StratifiedShuffleSplit(n_splits=1, random_state=123, test_size=0.1).split(X_train, y))

In [None]:
# Train on each split, validate after every epoch and keep the checkpoint with the best F1.
for fold, (train_idx, val_idx) in enumerate(splits):
  best_score = 0
  print("Training for fold {}".format(fold))

  train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train[train_idx],dtype=torch.long), torch.tensor(y[train_idx],dtype=torch.long))
  valid_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train[val_idx],dtype=torch.long), torch.tensor(y[val_idx],dtype=torch.long))
  print('train size:', len(train_dataset))
  print('valid size:', len(valid_dataset))

  # The loaders do not change between epochs, so build them once per fold. A shuffling
  # DataLoader still draws a new order each time it is iterated.
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

  # While `frozen` is True (first epoch only) the constant schedule `scheduler0` is used.
  frozen = True

  for epoch in tqdm(range(EPOCHS + 1)):

    if epoch > 0 and frozen:
      # NOTE(review): nothing in the visible notebook sets requires_grad=False beforehand,
      # so this "unfreeze" presumably changes nothing — confirm whether the encoder was
      # meant to be frozen during the first epoch.
      for child in model.children():
        for param in child.parameters():
          param.requires_grad = True
      frozen = False
      # `scheduler0` is intentionally no longer deleted here: deleting it made this cell
      # raise NameError when run a second time (and on any additional fold).
      torch.cuda.empty_cache()

    print('\nEpoch: ', epoch + 1)
    optimizer.zero_grad()
    train_on_epoch(train_loader=train_loader,
                  model=model,
                  optimizer=optimizer, 
                  epoch=epoch, 
                  num_epochs=EPOCHS + 1, 
                  criteria=criteria, 
                  device=DEVICE, 
                  scheduler=(scheduler0 if frozen else scheduler))
    
    acc, f1, loss = validate(valid_loader, model, device=DEVICE)
    print('Epoch {} validation acc: {:.4f}, f1: {:.4f}, loss: {:.4f} \n'.format(epoch + 1, acc, f1, loss))
    # Keep the weights of the epoch with the highest validation F1 (ties favour the later epoch).
    if f1 >= best_score:
      torch.save(model.state_dict(), os.path.join(CKPT_PATH, 'model.bin'))
      best_score = f1

# Testing

In [None]:
# Use the cached test arrays when both pickles exist. Nothing in this notebook writes them,
# so otherwise build them from the raw test texts and labels with the same tokenizer and
# label encoder used for the training data.
_X_TEST_PKL = 'PhoBERT_pretrain/X_test.pkl'
_Y_TEST_PKL = 'PhoBERT_pretrain/y_test.pkl'
if os.path.exists(_X_TEST_PKL) and os.path.exists(_Y_TEST_PKL):
  X_test = _load_pkl(_X_TEST_PKL)
  y_test = _load_pkl(_Y_TEST_PKL)
else:
  X_test = convert_lines(text_test, vocab, bpe)
  y_test = lb.transform(label_test)

In [None]:
# Run the best checkpoint over the test set and collect the predicted class ids in `preds`.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

phoBERT.to(DEVICE)
# Load the checkpoint from CKPT_PATH, the directory the training loop saves into (this cell
# used to read a different folder, 'model_ckpt_full'). map_location lets a checkpoint saved
# on GPU be restored on a CPU-only runtime.
phoBERT.load_state_dict(torch.load(os.path.join(CKPT_PATH, 'model.bin'), map_location=DEVICE))
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)
phoBERT.eval()

_pred_chunks = []
# no_grad: inference only, so no autograd graph is kept for each batch.
with torch.no_grad():
  for i, (x_batch,) in tqdm(enumerate(test_loader), total=len(test_loader)):
    logits = phoBERT.predict('new_task', x_batch.to(DEVICE))
    probs = torch.exp(logits)
    _pred_chunks.append(np.argmax(probs.detach().cpu().numpy(), axis=1))

# Concatenate once at the end (integer class ids) instead of growing a float array per batch.
preds = np.concatenate(_pred_chunks) if _pred_chunks else np.array([], dtype=np.int64)

In [None]:
from sklearn import metrics

# Score the test predictions. Note that F1 is micro-averaged here, whereas evaluate()
# (used during training and validation) reports the weighted average.
f1 = metrics.f1_score(y_test, preds, average="micro")
acc = metrics.accuracy_score(y_test, preds)
print('f1 score: ', f1)
print('accuracy: ', acc)