# 1. Importing libraries and datasets

## 1.1 Importing torch

In [1]:
model_name = "bert-base-multilingual-cased"

In [2]:
import torch
# check if we have cuda installed
if torch.cuda.is_available():
    # to use GPU
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


## 2.6 Read datasets

In [None]:
import pandas as pd
combined_df = pd.read_csv(r'../../data/combined_data.csv', sep='\t', encoding='ISO-8859-1')

combined_df = combined_df[combined_df['text'].notnull()].reset_index()[['text', 'sentiment']]
combined_df

# 3. Get feature and targets

## 3.1 Load tokenizer

In [72]:
import torch
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

## 3.3 Prepare features and target

In [28]:
# identify features and target
features = combined_df.text.values.tolist()
target = combined_df.sentiment.values.tolist()

print(features[:5])

['looks like scum kepada me', 'perhatian status aras air semasa di sarawak miri marudi aras air sungai adalah iait', 'change the heart of china kemudian', 'can rebate further dengan gc and shopback', 'aku baru habis tengok padman']


In [29]:
# tokenize features 
MAX_LEN = 128
tokenized_feature = tokenizer.batch_encode_plus(
    # Sentences to encode
    features, 
    # Add '[CLS]' and '[SEP]'
    add_special_tokens = True,
    # Add empty tokens if len(text)<MAX_LEN
    padding = 'max_length',
    # Truncate all sentences to max length
    truncation=True,
    # Set the maximum length
    max_length = MAX_LEN, 
    # Return attention mask
    return_attention_mask = True,
    # Return pytorch tensors
    return_tensors = 'pt'       
)

## 3.5 Preparing target

The target will be cloned from input_ids and probability of 15% masked changing tokens.

In [18]:
# cloning target from input_ids
target = tokenized_feature['input_ids'].detach().clone()

target

tensor([[   101,  59148,  11850,  ...,      0,      0,      0],
        [   101, 103601,  14042,  ...,      0,      0,      0],
        [   101,  15453,  10105,  ...,      0,      0,      0],
        ...,
        [   101,  90567,  10592,  ...,      0,      0,      0],
        [   101,  10659,  16113,  ...,      0,      0,      0],
        [   101,  24604,  14657,  ...,      0,      0,      0]])

In [5]:
# 15% masked probability
probability = 0.15

# create random array of floats in equal dimension to input_ids
rand = torch.rand(tokenized_feature['input_ids'].shape)

# where the random array is less than 0.15, we set true
# mask_arr = rand < probability
# prevent placing mask token on special tokens
# (tokenized_feature['input_ids'] != 101) * (tokenized_feature['input_ids'] != 102)

mask_arr = (rand < probability) * (tokenized_feature['input_ids'] != 101) * (tokenized_feature['input_ids'] != 102)

# create selection from mask_arr
selection = torch.flatten((mask_arr[0]).nonzero()).tolist()
selection

# apply selection index to inputs.input_ids, adding MASK tokens
tokenized_feature['input_ids'][0, selection] = 103

In [6]:
tokenized_feature['input_ids']

tensor([[   101,  59148,  11850,  ...,      0,    103,      0],
        [   101, 103601,  14042,  ...,      0,      0,      0],
        [   101,  15453,  10105,  ...,      0,      0,      0],
        ...,
        [   101,  90567,  10592,  ...,      0,      0,      0],
        [   101,  10659,  16113,  ...,      0,      0,      0],
        [   101,  24604,  14657,  ...,      0,      0,      0]])

## 3.4 Add language embeddings

In [35]:
import sys
import os

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from language_tokens import get_lang_tokens

{ 'special_token': 0, 'english': 1, 'malay': 2, 'other': 3 }

language_ids = []
for input_id in tokenized_feature['input_ids']:
    input_tokens = input_id.tolist()
    
    language_ids.append(get_lang_tokens(tokenizer.batch_decode(input_tokens)))

language_ids = torch.tensor(language_ids)

tokenized_feature['language_ids'] = language_ids

tokenized_feature['language_ids']

tensor([[0, 0, 0,  ..., 2, 2, 0],
        [0, 0, 0,  ..., 2, 2, 0],
        [0, 0, 0,  ..., 2, 1, 0],
        ...,
        [0, 0, 0,  ..., 2, 2, 0],
        [0, 0, 0,  ..., 2, 2, 0],
        [0, 0, 0,  ..., 2, 2, 0]])

# 4 Prepare train and validation set

In [7]:
# Use 80% for training and 20% for validation
from sklearn.model_selection import train_test_split

train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks, train_langs, validation_langs = train_test_split(tokenized_feature['input_ids'], 
                                                                                                                                                  target,
                                                                                                                                                  tokenized_feature['attention_mask'],
                                                                                                                                                  tokenized_feature['language_ids'],
                                                                                                                                                  random_state=2018,
                                                                                                                                                  test_size=0.2)

In [8]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler

# define batch_size
batch_size = 16

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_langs, torch.tensor(train_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our test set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_langs, torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  train_data = TensorDataset(train_inputs, train_masks, train_langs, torch.tensor(train_labels))
  validation_data = TensorDataset(validation_inputs, validation_masks, validation_langs, torch.tensor(validation_labels))


# 5 Settings up BERT Model

In [9]:
from modeling_gpt2 import GPT2ForSequenceClassification, GPT2LMHeadModel

model = GPT2ForSequenceClassification.from_pretrained(
    model_name, 
    # Specify number of classes
    num_labels = len(set(target)), 
    # Whether the model returns attentions weights
    output_attentions = False,
    # Whether the model returns all hidden-states 
    output_hidden_states = False
)
model.config.pad_token_id = model.config.eos_token_id

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['bert.embeddings.language_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5.2 Optimizer Setup

In [10]:
from transformers import GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Because we add two words [‘covid’, ‘coronavirus’] into the vocabulary
# we will need to resize the token to make sure the model pick it up as whole words.

# Receive the full size of the new word
model.resize_token_embeddings(len(tokenizer))

# Optimizer & Learning Rate Scheduler
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 
                )



In [11]:
# Number of training epochs
epochs = 20
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [12]:
# use cuda if existing
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (language_embeddings): Embedding(119547, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

## 5.3 Training Model

In [13]:
# Training
import time
# Store the average loss after each epoch 
loss_values = []
# number of total steps for each epoch
print('total steps per epoch: ',  len(train_dataloader) / batch_size)
# looping over epochs
for epoch_i in range(0, epochs):
    
    print('training on epoch: ', epoch_i)
    # set start time 
    t0 = time.time()
    # reset total loss
    total_loss = 0
    # model in training 
    model.train()
    # loop through batch 
    for step, batch in enumerate(train_dataloader):
        # Progress update every 50 step 
        if step % 50 == 0 and not step == 0:
            print('training on step: ', step)
            print('total time used is: {0:.2f} s'.format(time.time() - t0))
        # load data from dataloader 
        b_input_ids = batch[0].to(device).long()
        b_input_mask = batch[1].to(device).long()
        b_input_langs = batch[2].to(device).long()
        b_labels = batch[3].to(device).long()
        # clear any previously calculated gradients 
        model.zero_grad()
        # get outputs
        outputs = model(input_ids=b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        language_ids=b_input_langs,
                        labels=b_labels)
        # get loss
        loss = outputs[0]
        # total loss
        total_loss += loss.item()
        # clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update optimizer
        optimizer.step()
        # update learning rate 
        scheduler.step()
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("average training loss: {0:.2f}".format(avg_train_loss))

total steps per epoch:  156.25
training on epoch:  0
training on step:  50
total time used is: 336.68 s
training on step:  100
total time used is: 676.86 s
training on step:  150
total time used is: 1019.46 s
training on step:  200
total time used is: 1352.80 s
training on step:  250
total time used is: 1685.69 s
training on step:  300
total time used is: 2018.14 s
training on step:  350
total time used is: 2344.39 s
training on step:  400
total time used is: 2678.25 s
training on step:  450
total time used is: 3011.39 s
training on step:  500
total time used is: 3344.70 s
training on step:  550
total time used is: 3673.67 s
training on step:  600
total time used is: 4005.16 s
training on step:  650
total time used is: 4324.02 s
training on step:  700
total time used is: 4633.43 s
training on step:  750
total time used is: 4945.56 s
training on step:  800
total time used is: 5257.95 s
training on step:  850
total time used is: 5574.25 s
training on step:  900
total time used is: 5898.3

training on step:  150
total time used is: 969.11 s
training on step:  200
total time used is: 1289.35 s
training on step:  250
total time used is: 1605.61 s
training on step:  300
total time used is: 1923.60 s
training on step:  350
total time used is: 2241.73 s
training on step:  400
total time used is: 2560.01 s
training on step:  450
total time used is: 2877.87 s
training on step:  500
total time used is: 3196.69 s
training on step:  550
total time used is: 3513.20 s
training on step:  600
total time used is: 3830.38 s
training on step:  650
total time used is: 4147.81 s
training on step:  700
total time used is: 4467.68 s
training on step:  750
total time used is: 4787.31 s
training on step:  800
total time used is: 5110.69 s
training on step:  850
total time used is: 5431.79 s
training on step:  900
total time used is: 5752.04 s
training on step:  950
total time used is: 6088.04 s
training on step:  1000
total time used is: 6406.38 s
training on step:  1050
total time used is: 67

## 5.4 Evaluate Model

In [None]:
# Test
import sklearn.metrics as metrics
import numpy as np
t0 = time.time()
# model in validation mode
model.eval()
# save prediction
losses = []
acc = 0
counter = 0
# evaluate data for one epoch
for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_input_langs, b_labels = batch
    # validation
    with torch.no_grad():
        outputs = model(input_ids=b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        language_ids=b_input_langs,
                        labels=b_labels)
    # get output
    loss = outputs[0]
    logits = outputs[1]
    # move logits and labels to CPU
    _, prediction = torch.max(logits, dim=1)
    labels = b_labels.cpu().detach().numpy().flatten()
    prediction = np.argmax(logits, axis=-1).flatten()
    accuracy = metrics.accuracy_score(labels, prediction)
    
    acc += accuracy
    losses.append(loss.item())
    counter += 1
    
print('total time used is: {0:.2f} s'.format(time.time() - t0))
print('accuracy: {0:.2f}%'.format(acc / counter))
print('losses: {0:.2f}%'.format(np.mean(losses)))

## 5.5 Saving Model

In [14]:
!mkdir ..\saved_model\GPT2+LI

torch.save(model, '../../saved_model/GPT2+LI/gpt2+li_mlm.bin')

A subdirectory or file ..\saved_model\bert+li already exists.
Error occurred while processing: ..\saved_model\bert+li.


In [169]:
import pickle
!mkdir ..\saved_model\GPT2+LI
    
with open('../../saved_model/GPT2+LI/gpt2+li_mlm_predictions.bin', 'wb') as fp:
    pickle.dump([losses, acc, counter], fp)

A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file ..\saved_model\bert+li already exists.
Error occurred while processing: ..\saved_model\bert+li.


# 6 Confusion Matrix