<a href="https://colab.research.google.com/github/manelmech/Evaluation-of-transformer-based-methods/blob/main/Fine_tuning_Roberta_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment setup

In [2]:
!nvidia-smi


Thu Jan  5 18:20:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install transformers

!pip install sentencepiece transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, htt

In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

# Dataset

In [5]:
from google.colab import drive
drive.mount('/content/drive')


df = pd.read_csv('/content/drive/My Drive/GCDC_train.csv')

df_test = pd.read_csv('/content/drive/My Drive/GCDC_test.csv')

Mounted at /content/drive


In [6]:
from sklearn.utils import shuffle
df = shuffle(df)
df['labelA']= df['labelA'].astype(int)
df['labelA'] =df['labelA'] - 1
df_test['labelA']= df_test['labelA'].astype(int)
df_test['labelA'] =df_test['labelA'] - 1
print(df['labelA'])

711     2
2230    2
3420    0
1638    2
990     0
       ..
2910    2
308     2
290     2
2913    1
2664    2
Name: labelA, Length: 4000, dtype: int64


In [7]:
text = df.text.values
labels = df.labelA.values

text_eval = df_test.text.values
labels_eval = df_test.labelA.values


# Preprocessing

In [8]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case = True)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [9]:
token_id = []
attention_masks = []
token_id_eval = []
attention_masks_eval = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 400,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)


for sample in text_eval:
  encoding_dict_eval = preprocessing(sample, tokenizer)
  token_id_eval.append(encoding_dict_eval['input_ids']) 
  attention_masks_eval.append(encoding_dict_eval['attention_mask'])


token_id_eval = torch.cat(token_id_eval, dim = 0)
attention_masks_eval = torch.cat(attention_masks_eval, dim = 0)
labels_eval = torch.tensor(labels_eval)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Data split

In [10]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 8

# Indices of the train and validation splits stratified by labels


# Train and validation sets
train_set = TensorDataset(token_id, 
                          attention_masks, 
                          labels)

val_set = TensorDataset(token_id_eval, 
                        attention_masks_eval, 
                        labels_eval)

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

# Train

In [11]:
# Load the BertForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 1e-5,
                              eps = 1e-08
                              )


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
model = model.to(device)

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [12]:
from sklearn import metrics
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
predictions , true_labels = [], []

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    acc = 0
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)  
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    
    # ========== Validation ==========
    nb_tr_eval = 0
    # Set model to evaluation mode
    model.eval()

    
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        b_labels = b_labels.cpu().detach().numpy()
        logits = eval_output.logits.detach().cpu().numpy()
       
        preds = np.argmax(logits, axis = 1).flatten()
        b_labels= b_labels.flatten()
        
        predictions.extend(preds)
       # print(predictions)
       # Store predictions and true labels
       # predictions.append(logits)
        true_labels.extend(b_labels)

        accuracy = metrics.accuracy_score(b_labels,preds )
        acc += accuracy
       
        nb_tr_eval += 1

    
    print('\n\t - Accuracy: {:.4f}'.format(acc / len(validation_dataloader)))
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))

Epoch:  25%|██▌       | 1/4 [05:29<16:27, 329.17s/it]


	 - Accuracy: 0.6075

	 - Train loss: 0.9415


Epoch:  50%|█████     | 2/4 [10:55<10:54, 327.22s/it]


	 - Accuracy: 0.6112

	 - Train loss: 0.7955


Epoch:  75%|███████▌  | 3/4 [16:20<05:26, 326.45s/it]


	 - Accuracy: 0.5925

	 - Train loss: 0.6407


Epoch: 100%|██████████| 4/4 [21:45<00:00, 326.48s/it]


	 - Accuracy: 0.5775

	 - Train loss: 0.4705





In [15]:
print(true_labels)
print(predictions)

[2, 2, 2, 2, 1, 0, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 0, 2, 2, 0, 1, 2, 0, 2, 0, 0, 2, 1, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 0, 1, 2, 2, 1, 0, 0, 1, 2, 2, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2, 0, 2, 0, 2, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 0, 2, 2, 0, 0, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 1, 0, 2, 1, 1, 1, 2, 2, 1, 0, 1, 2, 0, 0, 2, 0, 2, 2, 2, 1, 2, 0, 2, 2, 2, 0, 1, 2, 1, 1, 2, 0, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 1, 0, 1, 0, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 1, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 1, 1, 2, 2, 2, 1, 2, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 0, 0, 2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 0, 1, 2, 2, 2, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1, 1, 0, 2, 2, 0, 0, 0, 2, 0, 1, 2, 0, 2, 0, 2, 2, 0, 0, 1, 2, 0, 1, 0, 2, 2, 0, 1, 1, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 0, 2, 1, 0, 2, 0, 2, 1, 2, 1, 0, 0, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 2, 1, 1, 1, 1, 2, 2, 

In [16]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

matrix = metrics.confusion_matrix(true_labels, predictions, labels=[0, 1, 2])
print(matrix)
sum = np.sum(matrix)
acc_low = (matrix[0][0] + matrix[1][1] + matrix[1][2] + matrix[2][1] + matrix [2][2])/(sum) #exactitude relative à la classe low
acc_medium = (matrix[0][0] + matrix[1][1] + matrix[0][2] + matrix[2][0] + matrix [2][2])/(sum) #exactitude relative à la classe medium
acc_high = (matrix[0][0] + matrix[1][1] + matrix[0][1] + matrix[1][0] + matrix [2][2])/(sum) #exactitude relative à la classe high

accuracy = accuracy_score(predictions, true_labels)
print("Accuracy :")
print(accuracy)
print("Accuracy low :")
print(acc_low)  
print("Accuracy medium :")
print(acc_medium) 
print("Accuracy high :")
print(acc_high) 
print("Average accuracy :")
print((acc_low + acc_medium + acc_high)/3) 
print("Classification report :")
print(metrics.classification_report(true_labels, predictions, labels=[0, 1, 2], zero_division=1))

[[ 451   76  453]
 [ 132   47  505]
 [  68   55 1413]]
Accuracy :
0.5971875
Accuracy low :
0.7721875
Accuracy medium :
0.76
Accuracy high :
0.6621875
Average accuracy :
0.7314583333333333
Classification report :
              precision    recall  f1-score   support

           0       0.69      0.46      0.55       980
           1       0.26      0.07      0.11       684
           2       0.60      0.92      0.72      1536

    accuracy                           0.60      3200
   macro avg       0.52      0.48      0.46      3200
weighted avg       0.55      0.60      0.54      3200

