# Environment setup

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [42]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

# Dataset

In [26]:
df = pd.read_csv('/content/drive/My Drive/GCDC_train.csv')

df_test = pd.read_csv('/content/drive/My Drive/GCDC_test.csv')

In [27]:
from sklearn.utils import shuffle
df = shuffle(df)
df['labelA']= df['labelA'].astype(int)
df['labelA'] =df['labelA'] - 1
df_test['labelA']= df_test['labelA'].astype(int)
df_test['labelA'] =df_test['labelA'] - 1
print(df['labelA'])

308     2
556     2
2720    2
3107    2
3617    0
       ..
1795    1
1992    1
2991    1
3185    2
2476    2
Name: labelA, Length: 4000, dtype: int64


In [28]:
text = df.text.values
labels = df.labelA.values

text_eval = df_test.text.values
labels_eval = df_test.labelA.values


# Preprocessing

In [44]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case = True)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [45]:
token_id = []
attention_masks = []
token_id_eval = []
attention_masks_eval = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 400,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)


for sample in text_eval:
  encoding_dict_eval = preprocessing(sample, tokenizer)
  token_id_eval.append(encoding_dict_eval['input_ids']) 
  attention_masks_eval.append(encoding_dict_eval['attention_mask'])


token_id_eval = torch.cat(token_id_eval, dim = 0)
attention_masks_eval = torch.cat(attention_masks_eval, dim = 0)
labels_eval = torch.tensor(labels_eval)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  labels = torch.tensor(labels)
  labels_eval = torch.tensor(labels_eval)


# Data split

In [46]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels


# Train and validation sets
train_set = TensorDataset(token_id, 
                          attention_masks, 
                          labels)

val_set = TensorDataset(token_id_eval, 
                        attention_masks_eval, 
                        labels_eval)

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

# Train

In [47]:
# Load the BertForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 1e-5,
                              eps = 1e-08
                              )


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
model = model.to(device)

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [48]:
from sklearn import metrics
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    acc = 0
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)  
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    
    # ========== Validation ==========
    nb_tr_eval = 0
    # Set model to evaluation mode
    model.eval()

    
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        b_labels = b_labels.cpu().detach().numpy()
        logits = eval_output.logits.detach().cpu().numpy()
        preds = np.argmax(logits, axis = 1).flatten()
        b_labels= b_labels.flatten()

        accuracy = metrics.accuracy_score(b_labels,preds )
        acc += accuracy
       
        nb_tr_eval += 1

    
    print('\n\t - Accuracy: {:.4f}'.format(acc / len(validation_dataloader)))
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))

Epoch:  25%|██▌       | 1/4 [04:41<14:04, 281.49s/it]


	 - Accuracy: 0.6138

	 - Train loss: 0.9376


Epoch:  50%|█████     | 2/4 [09:23<09:23, 281.70s/it]


	 - Accuracy: 0.5962

	 - Train loss: 0.8202


Epoch:  75%|███████▌  | 3/4 [14:05<04:41, 281.79s/it]


	 - Accuracy: 0.5863

	 - Train loss: 0.7348


Epoch: 100%|██████████| 4/4 [18:47<00:00, 281.79s/it]


	 - Accuracy: 0.5600

	 - Train loss: 0.6114



