# 1. Importing libraries and datasets

## 1.1 Importing torch

In [None]:
# Checkpoint identifier handed to every `from_pretrained` call in this notebook.
# NOTE(review): this is a BERT checkpoint name, yet the notebook loads it through
# GPT-2 tokenizer/model classes — confirm which architecture is intended.
model_name = "bert-base-multilingual-cased"

In [None]:
import torch

# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))

## 2.6 Read datasets

In [None]:
import pandas as pd

# Tab-separated file, decoded as ISO-8859-1 (Latin-1)
raw_df = pd.read_csv(r'../../data/combined_data.csv', sep='\t', encoding='ISO-8859-1')

# Keep only rows that have a text, renumber them, and retain the two columns
# the rest of the notebook works with.
has_text = raw_df['text'].notnull()
combined_df = raw_df[has_text].reset_index()[['text', 'sentiment']]
combined_df

# 3. Get feature and targets

## 3.1 Load tokenizer

In [None]:
import torch
from transformers import GPT2Tokenizer

# Load the tokenizer stored under `model_name`.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token; the tokenization cell
# below pads every sequence to a fixed length.
tokenizer.pad_token = tokenizer.eos_token

## 3.3 Prepare features and target

In [None]:
# Model inputs are the raw texts; the labels come from the sentiment column
features = combined_df['text'].values.tolist()
target = combined_df['sentiment'].values.tolist()

# Peek at the first five texts
print(features[:5])

In [None]:
# Every sequence is padded or truncated to exactly this many tokens
MAX_LEN = 128

# Encoding options, gathered in one place
encode_options = dict(
    add_special_tokens=True,     # let the tokenizer insert its special tokens
    padding='max_length',        # pad texts shorter than MAX_LEN
    truncation=True,             # cut texts longer than MAX_LEN
    max_length=MAX_LEN,
    return_attention_mask=True,  # also return the attention mask
    return_tensors='pt',         # return PyTorch tensors
)

# Encode all texts in a single batch call
tokenized_feature = tokenizer.batch_encode_plus(features, **encode_options)

## 3.5 Preparing target

The target is cloned from `input_ids`; tokens in `input_ids` are then replaced by the mask token with a probability of 15%.

In [None]:
# Keep an untouched copy of the token ids before any of them are masked.
# It is stored under its own name: `target` already holds the sentiment labels
# (set in the features/target cell) and later cells still need those labels for
# stratified splitting and for counting the classes, so it must not be rebound.
mlm_target = tokenized_feature['input_ids'].detach().clone()

mlm_target

In [None]:
# Fraction of eligible tokens that get replaced by the mask token
probability = 0.15

# Hard-coded vocabulary ids: the two special-token ids that must never be
# masked, and the id written in place of a masked token.
# NOTE(review): these ids are not read from `tokenizer` — confirm they match
# the vocabulary of the tokenizer actually loaded above.
SPECIAL_TOKEN_IDS = (101, 102)
MASK_TOKEN_ID = 103

# One uniform random draw in [0, 1) per token position
rand = torch.rand(tokenized_feature['input_ids'].shape)

# A position is masked when its draw falls below `probability`, it does not
# hold a special token, and it is a real token rather than padding.
mask_arr = rand < probability
for special_id in SPECIAL_TOKEN_IDS:
    mask_arr &= tokenized_feature['input_ids'] != special_id
mask_arr &= tokenized_feature['attention_mask'].bool()

# Write the mask token into every selected position of every row.
# (Indexing with the full boolean matrix covers the whole batch; selecting
# through row 0 only would leave all other samples unmasked.)
tokenized_feature['input_ids'][mask_arr] = MASK_TOKEN_ID

In [None]:
# Display the token ids as they stand after the masking cell above
tokenized_feature['input_ids']

## 3.4 Add language embeddings

In [None]:
import sys
import os

# Put the directory two levels up on the import path (once) so that the local
# `language_tokens` module can be imported.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from language_tokens import get_lang_tokens

# Id scheme noted alongside the helper (presumably what get_lang_tokens emits — confirm):
#   { 'special_token': 0, 'english': 1, 'malay': 2, 'other': 3 }

# For each encoded row: decode its ids with the tokenizer and let the helper
# turn the decoded strings into language ids; stack all rows into one tensor.
language_ids = torch.tensor([
    get_lang_tokens(tokenizer.batch_decode(row.tolist()))
    for row in tokenized_feature['input_ids']
])

# Store the language ids next to the other encoded features
tokenized_feature['language_ids'] = language_ids

tokenized_feature['language_ids']

# 4 Prepare train, validation and test set

In [None]:
# Split the data 60% train / 20% validation / 20% test, stratified by class.
# (20% is held out for testing first; 25% of the remaining 80% — i.e. 20% of
# the whole — then becomes the validation set.)
import pandas as pd
from sklearn.model_selection import train_test_split

# Integer class ids (0..k-1, in sorted label order), derived directly from the
# dataframe so this cell does not depend on names defined or rebound elsewhere.
target_num = pd.factorize(combined_df['sentiment'], sort=True)[0].tolist()
# factorize marks missing values with -1, which is not a valid class id
assert -1 not in target_num, "rows without a sentiment label must be removed first"

# First split: carve off the test set
(train_inputs, test_inputs,
 train_labels, test_labels,
 train_masks, test_masks,
 train_langs, test_langs) = train_test_split(
    tokenized_feature['input_ids'],
    target_num,
    tokenized_feature['attention_mask'],
    tokenized_feature['language_ids'],
    random_state=42,
    test_size=0.2,
    stratify=target_num,
)

# Labels of the remaining 80%, used to stratify the second split
train_target = list(train_labels)

# Second split: carve the validation set out of the remaining data
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks,
 train_langs, validation_langs) = train_test_split(
    train_inputs,
    train_labels,
    train_masks,
    train_langs,
    random_state=42,
    test_size=0.25,
    stratify=train_target,
)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Mini-batch size shared by all three loaders
batch_size = 16

# Each dataset item is (input ids, attention mask, language ids, label)
train_data = TensorDataset(train_inputs, train_masks, train_langs, torch.tensor(train_labels))
validation_data = TensorDataset(validation_inputs, validation_masks, validation_langs, torch.tensor(validation_labels))
test_data = TensorDataset(test_inputs, test_masks, test_langs, torch.tensor(test_labels))

# Training and validation batches are shuffled; the test set keeps its order
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
validation_dataloader = DataLoader(validation_data, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# 5 Setting up the GPT-2 Model

In [None]:
from modeling_gpt2 import GPT2ForSequenceClassification, GPT2LMHeadModel

# One output class per distinct sentiment label. The count is taken from the
# dataframe itself: `target` is rebound to a tensor of token ids earlier in the
# notebook, and a set built from tensor rows would count samples, not classes.
num_labels = combined_df['sentiment'].nunique()

model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    # Specify number of classes
    num_labels=num_labels,
    # Whether the model returns attentions weights
    output_attentions=False,
    # Whether the model returns all hidden-states
    output_hidden_states=False,
)
# Use the end-of-sequence token id as the padding id, mirroring the tokenizer setup
model.config.pad_token_id = model.config.eos_token_id

## 5.2 Optimizer Setup

In [None]:
from transformers import GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup

# Reload the tokenizer and size the model's token embeddings to its vocabulary.
# NOTE(review): no cell shown in this notebook adds tokens to the vocabulary,
# so the resize presumably leaves the embedding size unchanged — confirm.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# AdamW over all model parameters; the learning-rate schedule is created in the next cell
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
# Number of full passes over the training set
epochs = 20

# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps in total.
batches_per_epoch = len(train_dataloader)
total_steps = batches_per_epoch * epochs

# Linear learning-rate schedule with no warm-up steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Move the model onto the device selected at the top of the notebook
model.to(device)

## 5.3 Training Model

In [None]:
%%time
import random
import numpy as np

# One seed value shared by every random number generator used in this notebook
seed_val = 1024

# Seed Python's RNG, NumPy, PyTorch (CPU) and all CUDA devices, in that order
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Per-epoch average training loss, appended to by the training loop
loss_values = list()

In [None]:
import time

import numpy as np
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

# Train for `epochs` epochs; after each epoch, evaluate on the validation set
# and write a checkpoint (model + optimizer state + loss history).

# len(train_dataloader) already is the number of batches, i.e. steps, per epoch
print('total steps per epoch: ', len(train_dataloader))

for epoch_i in range(epochs):

    # ====== Training ======
    print('training on epoch: ', epoch_i)
    model.train()
    total_train_loss = 0.0

    for batch in tqdm(train_dataloader, desc='train'):
        # batch layout: (input ids, attention mask, language ids, labels)
        b_input_ids, b_input_mask, b_input_langs, b_labels = (t.to(device) for t in batch)

        # clear gradients left over from the previous step
        optimizer.zero_grad()

        outputs = model(input_ids=b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        language_ids=b_input_langs,
                        labels=b_labels)
        # with labels supplied, the first output is the loss
        loss = outputs[0]
        total_train_loss += loss.item()

        # back-propagate; without this call no parameter ever receives a
        # gradient and optimizer.step() changes nothing
        loss.backward()
        # clip the norm of the gradients to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update the weights, then the learning rate
        optimizer.step()
        scheduler.step()

    # average loss over the training batches, stored for the learning curve
    avg_train_loss = total_train_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)
    print("average training loss: {0:.2f}".format(avg_train_loss))

    # ====== Validating ======
    print('validating on epoch: ', epoch_i)
    model.eval()
    total_val_loss = 0.0
    val_logits = []
    targets_list = []

    # no_grad is scoped to this block, so autograd is active again when the
    # next epoch starts training (a global set_grad_enabled(False) would stay
    # in force and break every following backward pass)
    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc='validation'):
            b_input_ids, b_input_mask, b_input_langs, b_labels = (t.to(device) for t in batch)

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            language_ids=b_input_langs,
                            labels=b_labels)

            total_val_loss += outputs[0].item()
            # second output: per-class scores for each sample
            val_logits.append(outputs[1].detach().cpu().numpy())
            targets_list.extend(b_labels.to('cpu').numpy())

    # average over the validation batches (not the training batches)
    avg_val_loss = total_val_loss / len(validation_dataloader)
    # accuracy over the whole validation set, computed once per epoch
    y_pred = np.argmax(np.vstack(val_logits), axis=1)
    val_acc = accuracy_score(targets_list, y_pred)
    print("average validation loss: {0:.2f}".format(avg_val_loss))
    print('validation accuracy: ', val_acc)

    # checkpoint after every epoch (the file is overwritten each time)
    torch.save({
        'epoch': epoch_i,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'losses': loss_values
    }, '/content/drive/MyDrive/model_state.bin')
    

## 5.4 Evaluate Model

In [None]:
# Test: evaluate the trained model on the held-out test set
import time

import sklearn.metrics as metrics
import numpy as np

t0 = time.time()
# model in evaluation mode
model.eval()
# `losses`, `acc` and `counter` are pickled by the saving cell below:
losses = []   # loss of each batch
acc = 0       # running sum of the per-batch accuracies
counter = 0   # number of batches seen

# one pass over the test set (the validation set was already scored after
# every training epoch; the test loader is otherwise never consumed)
for batch in test_dataloader:
    # move the batch to the model's device
    batch = tuple(t.to(device) for t in batch)
    # batch layout: (input ids, attention mask, language ids, labels)
    b_input_ids, b_input_mask, b_input_langs, b_labels = batch
    with torch.no_grad():
        outputs = model(input_ids=b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        language_ids=b_input_langs,
                        labels=b_labels)
    # with labels supplied: loss first, per-class scores second
    loss = outputs[0]
    logits = outputs[1]
    # take the arg-max on the tensor, then move to the CPU; NumPy cannot read
    # a tensor that still lives on the GPU
    labels = b_labels.cpu().numpy().flatten()
    prediction = torch.argmax(logits, dim=1).cpu().numpy().flatten()

    acc += metrics.accuracy_score(labels, prediction)
    losses.append(loss.item())
    counter += 1

print('total time used is: {0:.2f} s'.format(time.time() - t0))
# acc / counter is a fraction in [0, 1]; scale it to match the '%' sign
print('accuracy: {0:.2f}%'.format(100 * acc / counter))
# the loss is not a percentage
print('losses: {0:.2f}'.format(np.mean(losses)))

## 5.5 Saving Model

In [None]:
import os

# Serialize the whole model object to disk.
model_path = '../../saved_model/GPT2+LI/gpt2+li_mlm.bin'
# Create exactly the directory the file is written to; exist_ok makes the cell
# safe to re-run, and os.makedirs works on every platform.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

torch.save(model, model_path)

In [None]:
import os
import pickle

# Persist the evaluation results: [per-batch losses, summed batch accuracy, batch count]
predictions_path = '../../saved_model/GPT2+LI/gpt2+li_mlm_predictions.bin'
# Create exactly the directory the file is written to; exist_ok makes the cell
# safe to re-run, and os.makedirs works on every platform.
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)

with open(predictions_path, 'wb') as fp:
    pickle.dump([losses, acc, counter], fp)

# 6 Confusion Matrix