<a href="https://colab.research.google.com/github/manelmech/Evaluation-of-transformer-based-methods/blob/main/Fine_tuning_XLNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvidia-smi


Mon Jan  2 17:24:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    26W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install transformers

!pip install sentencepiece transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 15.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 81.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K   

In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

# Dataset

In [5]:
df = pd.read_csv('/content/drive/My Drive/GCDC_train.csv')

df_test = pd.read_csv('/content/drive/My Drive/GCDC_test.csv')

In [6]:
from sklearn.utils import shuffle
df = shuffle(df)
df['labelA']= df['labelA'].astype(int)
df['labelA'] =df['labelA'] - 1
df_test['labelA']= df_test['labelA'].astype(int)
df_test['labelA'] =df_test['labelA'] - 1
print(df['labelA'])

2489    0
2301    2
100     2
1006    1
3365    0
       ..
2399    2
3504    2
2532    0
319     2
2363    1
Name: labelA, Length: 4000, dtype: int64


In [7]:
text = df.text.values
labels = df.labelA.values

text_eval = df_test.text.values
labels_eval = df_test.labelA.values


# Preprocessing

In [8]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case = True)

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [9]:
token_id = []
attention_masks = []
token_id_eval = []
attention_masks_eval = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 400,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)


for sample in text_eval:
  encoding_dict_eval = preprocessing(sample, tokenizer)
  token_id_eval.append(encoding_dict_eval['input_ids']) 
  attention_masks_eval.append(encoding_dict_eval['attention_mask'])


token_id_eval = torch.cat(token_id_eval, dim = 0)
attention_masks_eval = torch.cat(attention_masks_eval, dim = 0)
labels_eval = torch.tensor(labels_eval)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Data split

In [10]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels


# Train and validation sets
train_set = TensorDataset(token_id, 
                          attention_masks, 
                          labels)

val_set = TensorDataset(token_id_eval, 
                        attention_masks_eval, 
                        labels_eval)

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

# Train

In [11]:
# Load the BertForSequenceClassification model
model = XLNetForSequenceClassification.from_pretrained(
    'xlnet-base-cased',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 1e-5,
                              eps = 1e-08
                              )


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
model = model.to(device)

Downloading:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [12]:
from sklearn import metrics
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    acc = 0
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)  
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    
    # ========== Validation ==========
    nb_tr_eval = 0
    # Set model to evaluation mode
    model.eval()

    
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        b_labels = b_labels.cpu().detach().numpy()
        logits = eval_output.logits.detach().cpu().numpy()
        preds = np.argmax(logits, axis = 1).flatten()
        b_labels= b_labels.flatten()

        accuracy = metrics.accuracy_score(b_labels,preds )
        acc += accuracy
       
        nb_tr_eval += 1

    
    print('\n\t - Accuracy: {:.4f}'.format(acc / len(validation_dataloader)))
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))

Epoch:  25%|██▌       | 1/4 [09:46<29:18, 586.01s/it]


	 - Accuracy: 0.5637

	 - Train loss: 0.9748


Epoch:  50%|█████     | 2/4 [19:30<19:30, 585.34s/it]


	 - Accuracy: 0.5650

	 - Train loss: 0.8653


Epoch:  75%|███████▌  | 3/4 [29:15<09:45, 585.23s/it]


	 - Accuracy: 0.5587

	 - Train loss: 0.7959


Epoch: 100%|██████████| 4/4 [39:01<00:00, 585.28s/it]


	 - Accuracy: 0.5813

	 - Train loss: 0.7019



