In [1]:
!nvidia-smi

Fri Jun  3 00:51:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Install libraries

In [3]:
pip install transformers sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 43.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.

In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoConfig,get_linear_schedule_with_warmup,RobertaModel,RobertaPreTrainedModel
from torch.utils.data import DataLoader
import torch
from create_dataloaders import create_dataloader
import numpy as np
from os.path import join
from tqdm.auto import tqdm
import os
from os.path import join
from torch import nn
from sklearn.metrics import *
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import SequenceClassifierOutput

In [5]:
pre_dataset_directory="/content/drive/MyDrive/en-vi-nli/pre_dataset"

In [6]:
train=pd.read_csv(f'{pre_dataset_directory}/train/train_60_100k.csv')
valid=pd.read_csv(f'{pre_dataset_directory}/valid/valid.csv')

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

In [8]:
len(train),len(valid)


(100000, 4980)

In [9]:
max_length=60

In [10]:
train_batch_size=32
train_premises=train['norm_premise'].tolist()
train_hypothesises=train['norm_hypothesis'].tolist()
train_labels=train['label'].tolist()

valid_batch_size=32
valid_premises=valid['norm_premise'].tolist()
valid_hypothesises=valid['norm_hypothesis'].tolist()
valid_labels=valid['label'].tolist()

In [11]:
class Classifier(nn.Module):
  def __init__(self, dense_dim_in,dense_dim_out,num_labels,drop_rate):
    super(Classifier,self).__init__()

    self.dense_dim_in=dense_dim_in
    self.dense_dim_out=dense_dim_out
    self.drop_rate=drop_rate
    self.num_labels=num_labels

    self.dense = nn.Linear(self.dense_dim_in, self.dense_dim_out)
    
    self.dropout = nn.Dropout(self.drop_rate)
    self.out_proj = nn.Linear(self.dense_dim_out, self.num_labels)
  
  def forward(self,inputs):
    #inputs: (batch_size,dense_dim_in)
    inputs=self.dense(inputs)
    inputs = self.dense(inputs)
    inputs = torch.tanh(inputs)
    inputs = self.dropout(inputs)
    logits = self.out_proj(inputs)

    return logits

In [21]:
model_directory='/content/drive/MyDrive/en-vi-nli/models'

##XLM_ROBERTA_BASE CONFIG

In [16]:
labels=[0,1,2]
num_labels=len(labels)
classifier_dropout=0.1
config=AutoConfig.from_pretrained('xlm-roberta-base')
config.classifier_dropout=classifier_dropout
config.num_labels=num_labels

In [17]:
config_directory='/content/drive/MyDrive/en-vi-nli/models/xlm-roberta-base'
os.makedirs(config_directory,exist_ok=True)

In [18]:
config.save_pretrained(config_directory)

#Create sentence pairs dataloader 

In [44]:
name='xlm-roberta-base'

In [45]:
tokenizer = AutoTokenizer.from_pretrained(name)

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [46]:
train_dataloader=create_dataloader(tokenizer,train_premises,train_hypothesises,train_labels,max_length,train_batch_size,device=device,shuffle=False)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [47]:
valid_dataloader=create_dataloader(tokenizer,valid_premises,valid_hypothesises,valid_labels,max_length,valid_batch_size,device=device,shuffle=False)

  0%|          | 0/4980 [00:00<?, ?it/s]

In [48]:
for i in valid_dataloader:
  print(i['inputs'].keys())
  break

dict_keys(['input_ids', 'attention_mask'])


#Approaches

##Approach 1: Fine-tuning last layer XLM-ROBERTA-BASE

Note: Init fixed weights for classifier layer for the first time

In [None]:
# last_layer_classifier=Classifier(config.hidden_size,config.hidden_size,num_labels,config.classifier_dropout)
# torch.save({'last_layer_classifier':last_layer_classifier.state_dict()},
#            join(model_directory,'last_layer_classifier.pt'))

In [None]:
model_directory='/content/drive/MyDrive/en-vi-nli/models/last_layer_classifier'
os.makedirs(model_directory,exist_ok=True)

Define last layer xlm-robert-base

In [104]:
class XLMRoBERTa4LastLayersForClassification(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        
        self.classifier=self.load_weight_classifier()

        self.init_weights()

    def load_weight_classifier(self):
        classifier_ckpt=torch.load('/content/drive/MyDrive/en-vi-nli/models/last_layer_classifier/last_layer_classifier.pt')
        classifier=Classifier(self.config.hidden_size,self.config.hidden_size,self.config.num_labels,self.config.classifier_dropout)
        classifier.load_state_dict(classifier_ckpt['last_layer_classifier'])
        return classifier
    
    def forward(self,input_ids,attention_mask,labels=None):

        outputs = self.roberta(input_ids=input_ids,attention_mask=attention_mask,output_hidden_states=False)
        
        cls_embedding=outputs[0][:,0,:]
        
        logits=self.classifier(cls_embedding)
        

        loss=0
        if labels!=None:
          loss_fct = CrossEntropyLoss()
          loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
                loss=loss,
                logits=logits)

##Init weights for four last layers approaches

Note: Init fixed weights for classifier layer for the first time. Remember to multiply hidden size to 4

In [23]:
config=AutoConfig.from_pretrained(config_directory)

In [26]:
model_directory='/content/drive/MyDrive/en-vi-nli/models/four_last_layer_classifier'
os.makedirs(model_directory,exist_ok=True)

In [27]:
four_last_layer_classifier=Classifier(config.hidden_size*4,config.hidden_size*4,num_labels,config.classifier_dropout)
torch.save({'four_last_layer_classifier':four_last_layer_classifier.state_dict()},
           join(model_directory,'four_last_layer_classifier.pt'))

##Approach 2: Fine-tuning 4 last concatenating layers of XLM-ROBERTA-BASE

In [28]:
class XLMRoBERTa4LastLayersForClassification(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        
        self.classifier=self.load_weight_classifier()

        self.init_weights()

    def load_weight_classifier(self):
        classifier_ckpt=torch.load('/content/drive/MyDrive/en-vi-nli/models/four_last_layer_classifier/four_last_layer_classifier.pt')
        classifier=Classifier(self.config.hidden_size*4,self.config.hidden_size*4,self.config.num_labels,self.config.classifier_dropout)
        classifier.load_state_dict(classifier_ckpt['four_last_layer_classifier'])
        return classifier
    
    def forward(self,input_ids,attention_mask,labels=None):

        outputs = self.roberta(input_ids=input_ids,attention_mask=attention_mask,output_hidden_states=True)
        last_layers=torch.stack(outputs.hidden_states[-4:])
        last_layers=last_layers.permute(1,2,0,3)
        features=torch.flatten(last_layers,start_dim=2)
        cls_embedding=features[:,0,:]
        
        logits=self.classifier(cls_embedding)

        loss=0
        if labels!=None:
          loss_fct = CrossEntropyLoss()
          loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
                loss=loss,
                logits=logits)

##Approach 3: Fine-tuning 4 last concatenating layers of XLM-ROBERTA-BASE + MEAN-MAX-Pooling

In [50]:
class XLMRoBERTa4LastLayersForClassification(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        
        self.classifier=self.load_weight_classifier()

        self.init_weights()

    def load_weight_classifier(self):
        classifier_ckpt=torch.load('/content/drive/MyDrive/en-vi-nli/models/four_last_layer_classifier/four_last_layer_classifier.pt')
        classifier=Classifier(self.config.hidden_size*4,self.config.hidden_size*4,self.config.num_labels,self.config.classifier_dropout)
        classifier.load_state_dict(classifier_ckpt['four_last_layer_classifier'])
        return classifier
    
    def mean_pooling(self,token_embeddings, attention_mask):
      input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
      return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def max_pooling(self,token_embeddings, attention_mask):
      input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
      token_embeddings[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
      return torch.max(token_embeddings, 1)[0]

    def forward(self,input_ids,attention_mask,labels=None):

        outputs = self.roberta(input_ids=input_ids,attention_mask=attention_mask,output_hidden_states=True)
        last_layers=torch.stack(outputs.hidden_states[-4:])
        last_layers=last_layers.permute(1,2,0,3)
        features=torch.flatten(last_layers,start_dim=2)
        
        avg_pool=self.mean_pooling(features,attention_mask)
        max_pool=self.max_pooling(features,attention_mask)
        pool=avg_pool+max_pool
        
        logits=self.classifier(pool)

        loss=0
        if labels!=None:
          loss_fct = CrossEntropyLoss()
          loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
                loss=loss,
                logits=logits)

##Approach 4: Fine-tuning 4 last concatenating layers of XLM-ROBERTA-BASE + BiLSTM + MEAN-MAX-Pooling

In [None]:
model_directory='/content/drive/MyDrive/en-vi-nli/models/bilstm_layer'
os.makedirs(model_directory,exist_ok=True)

In [None]:
bilstm_layer=torch.nn.LSTM(input_size=config.hidden_size*4,hidden_size=config.hidden_size,bidirectional=True,batch_first=True)        
torch.save({'bilstm_layer':bilstm_layer.state_dict()},
           join(model_directory,'bilstm_layer.pt'))

In [None]:
class XLMRoBERTa4LastLayersForClassification(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
      super().__init__(config)
      self.config = config
      
      self.roberta = RobertaModel(config, add_pooling_layer=False)
      
      self.bilstm_layer=self.load_weights_bilstm()

      self.classifier=self.load_weight_classifier()

      self.init_weights()

    def load_weight_classifier(self):
        classifier_ckpt=torch.load('/content/drive/MyDrive/en-vi-nli/models/four_last_layer_classifier/four_last_layer_classifier.pt')
        classifier=Classifier(self.config.hidden_size*4,self.config.hidden_size*4,self.config.num_labels,self.config.classifier_dropout)
        classifier.load_state_dict(classifier_ckpt['four_last_layer_classifier'])
        return classifier

    def load_weights_bilstm(self):
      bilstm_layer_ckpt=torch.load('/content/drive/MyDrive/en-vi-nli/models/bilstm_layer/bilstm_layer.pt')
      bilstm_layer=torch.nn.LSTM(input_size=self.config.hidden_size*4,hidden_size=self.config.hidden_size,bidirectional=True,batch_first=True)        
      bilstm_layer.load_state_dict(bilstm_layer_ckpt['bilstm_layer'])
      return bilstm_layer

    
    def mean_pooling(self,token_embeddings, attention_mask):
      input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
      return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def max_pooling(self,token_embeddings, attention_mask):
      input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
      token_embeddings[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
      return torch.max(token_embeddings, 1)[0]
    
    def init_hidden(self,batch_size):
      return (torch.randn(2, batch_size, self.config.hidden_size ).to(device),
              torch.randn(2, batch_size, self.config.hidden_size ).to(device))
        
    def forward(self,input_ids,attention_mask,labels=None):
      bs=input_ids.shape[0]
      outputs = self.roberta(input_ids=input_ids,attention_mask=attention_mask,output_hidden_states=True)
      last_layers=torch.stack(outputs.hidden_states[-4:])
      last_layers=last_layers.permute(1,2,0,3)
      features=torch.flatten(last_layers,start_dim=2)
      
      hidden = self.init_hidden(bs)
    
      _features,hidden=self.bilstm_layer(features,hidden)

      _avg_pool = torch.mean(_features, 1)
      _max_pool, _ = torch.max(_features, 1)
      concat = torch.cat(( _avg_pool, _max_pool), 1)
      
      logits=self.classifier(concat)

      loss=0
      if labels!=None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
      
      return SequenceClassifierOutput(
              loss=loss,
              logits=logits)

##Approach 5: Fine-tuning 4 last concatenating layers of XLM-ROBERTA-BASE + Siamese strategy

In [None]:
class XLMRoBERTa4LastLayersForClassification(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier=self.load_weight_classifier()

        self.init_weights()

    def load_weight_classifier(self):
        classifier_ckpt=torch.load('/content/drive/MyDrive/en-vi-nli/models/four_last_layer_classifier/four_last_layer_classifier.pt')
        classifier=Classifier(self.config.hidden_size*4,self.config.hidden_size*4,self.config.num_labels,self.config.classifier_dropout)
        classifier.load_state_dict(classifier_ckpt['four_last_layer_classifier'])
        return classifier
    
    def forward(self,input_ids_1,attention_mask_1,input_ids_2,attention_mask_2,labels=None):

        outputs_1 = self.roberta(input_ids=input_ids_1,attention_mask=attention_mask_1,output_hidden_states=True)
        last_layers_1=torch.stack(outputs_1.hidden_states[-4:])
        last_layers_1=last_layers_1.permute(1,2,0,3)
        features_1=torch.flatten(last_layers_1,start_dim=2)
        cls_embedding_1=features_1[:,0,:]

        outputs_2 = self.roberta(input_ids=input_ids_2,attention_mask=attention_mask_2,output_hidden_states=True)
        last_layers_2=torch.stack(outputs_2.hidden_states[-4:])
        last_layers_2=last_layers_2.permute(1,2,0,3)
        features_2=torch.flatten(last_layers_2,start_dim=2)
        cls_embedding_2=features_2[:,0,:]
        
        concat=torch.concat([cls_embedding_1,cls_embedding_2,torch.abs(cls_embedding_1-cls_embedding_2)],dim=-1)
        # add dense layer with size hidden_size*12,hidden_size*12 ?????
        logits=self.classifier(concat)

        loss=0
        if labels!=None:
          loss_fct = CrossEntropyLoss()
          loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
                loss=loss,
                logits=logits)

##Train

In [105]:
name=config.name_or_path.split('/')[-1]
model=XLMRoBERTa4LastLayersForClassification.from_pretrained(name,config=config).to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRoBERTa4LastLayersForClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRoBERTa4LastLayersForClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRoBERTa4LastLayersForClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRoBERTa4LastLayersForClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['clas

In [106]:
import gc
gc.collect()

592

Note: Currently, I'm not using schedulers 

In [107]:
epochs=30
init_lr=1e-5
eps =1e-8

In [108]:
optimizer=torch.optim.AdamW(model.parameters(),lr=init_lr,weight_decay =0.01,eps=eps)

In [109]:
def save_model(epoch,model,optimizer,scheduler,training_loss,valid_loss,train_acc,valid_acc,path,step=None):
  torch.save({
          'epoch': epoch,
          'model_state_dict': model,
          'optimizer_state_dict': optimizer,
          'scheduler_state_dict': scheduler,
          'training_loss': training_loss,
          'valid_loss': valid_loss,
          'train_acc': train_acc,
          'valid_acc': valid_acc,
          'step':step
          }, join(path,f'ckpt{epoch}.pt'))

In [110]:
def run_train(optimizer,dataloader,lr_scheduler=None):
  model.train(True)
  loop = tqdm(dataloader)
  loss=0
  true=[]
  pred=[]
  for batch in dataloader:
    optimizer.zero_grad()
    inputs=batch['inputs']
    inputs.update({"labels":batch['labels']})
    true+=batch['labels'].tolist()
    outputs=model(**inputs)
    outputs.loss.backward()

    loss+=outputs.loss.item()
    probs=torch.nn.functional.softmax(outputs.logits,dim=-1)
    _pred=torch.argmax(probs,dim=-1).tolist()
    pred+=_pred
    optimizer.step()
    if lr_scheduler!=None:
      lr_scheduler.step()
    loop.set_postfix({f'train_loss_per_batch':outputs.loss.item()})
    loop.update()      
  acc=accuracy_score(true,pred)
  return loss/len(dataloader),acc

In [111]:
def run_valid(dataloader):
  model.eval()
  loss=0
  pred=[]
  true=[]
  with torch.no_grad():
    for batch in dataloader:
      inputs=batch['inputs']
      inputs.update({"labels":batch['labels']})
      true+=batch['labels'].tolist()
      
      outputs=model(**inputs)
      probs=torch.nn.functional.softmax(outputs.logits,dim=-1)
      _pred=torch.argmax(probs,dim=-1).tolist()
      pred+=_pred
      loss+=outputs.loss.item()

  acc=accuracy_score(true,pred)

  return loss/len(dataloader),acc

In [112]:
import gc
gc.collect()

112

In [113]:
lr_scheduler=None

In [114]:
model_directory='/content/drive/MyDrive/en-vi-nli/models/last-layer'
os.makedirs(model_directory,exist_ok=True)
model_directory

'/content/drive/MyDrive/en-vi-nli/models/last-layer'

In [115]:
train_loss_per_step=[]
valid_loss_per_step=[]
train_acc_per_step=[]
valid_acc_per_step=[]
# min_loss=np.inf
best_acc=0
count_stopping=0
for epoch in range(epochs):

  train_loss,train_acc=run_train(optimizer,train_dataloader,lr_scheduler)
  
  valid_loss,valid_acc=run_valid(valid_dataloader)

  train_loss_per_step.append(train_loss)
  valid_loss_per_step.append(valid_loss)

  train_acc_per_step.append(train_acc)
  valid_acc_per_step.append(valid_acc)
  
  print(f'epoch: {epoch} train_loss: {train_loss} valid_loss: {valid_loss} train_acc: {train_acc} valid_acc: {valid_acc}')
  # if valid_loss<min_loss:
  if best_acc<valid_acc:
    # min_loss=valid_loss
    best_acc=valid_acc
    save_model(epoch,model.state_dict(),optimizer.state_dict(),lr_scheduler,train_loss,valid_loss,train_acc,valid_acc,model_directory)
    count_stopping=0
  else:
    count_stopping+=1
  
  if count_stopping>5:
    break


  0%|          | 0/3446 [00:00<?, ?it/s]

epoch: 0 train_loss: 0.9696788734740734 valid_loss: 0.8932984310981126 train_acc: 0.5478783701674995 valid_acc: 0.6480395749358739


  0%|          | 0/3446 [00:00<?, ?it/s]

epoch: 1 train_loss: 0.8636592005271934 valid_loss: 0.8286810482454579 train_acc: 0.6769354941098586 valid_acc: 0.7101502381824845


  0%|          | 0/3446 [00:00<?, ?it/s]

epoch: 2 train_loss: 0.8276395619572835 valid_loss: 0.8262792316793698 train_acc: 0.7152780926643028 valid_acc: 0.7165628435324295


  0%|          | 0/3446 [00:00<?, ?it/s]

epoch: 3 train_loss: 0.8031574924602387 valid_loss: 0.8260011073441533 train_acc: 0.7417859960641704 valid_acc: 0.7163796262367167


  0%|          | 0/3446 [00:00<?, ?it/s]

epoch: 4 train_loss: 0.7878416894448059 valid_loss: 0.828908143684878 train_acc: 0.7577651017058284 valid_acc: 0.7138145840967387


  0%|          | 0/3446 [00:00<?, ?it/s]

KeyboardInterrupt: ignored