In [None]:
"""
Set the following paths appropriately.

two_sentences_dataset_path : Path of the csv file containing dataset for two sentences - commonsense_data.csv
saved_model_output_directory : Directory to store trained output models
pretrained_model : Can select any of the following - {bert-large-uncased, albert-xxlarge-v2, roberta-large, xlnet-large-cased}
epochs : number of epochs (full passes over the training data) to train the model

Note: albert-xxlarge-v2 takes around 1hr 40mins for one epoch. Set the parameter appropriately

"""
# CSV file with the two-sentence dataset; later cells read its `sent0`, `sent1` and `label` columns.
two_sentences_dataset_path = "/content/drive/MyDrive/IIT_Bombay/CS 626/commonsense_data.csv"
# Directory that receives the fine-tuned model and tokenizer files.
saved_model_output_directory = '/content/drive/MyDrive/IIT_Bombay/CS 626/Two_Sentences_Models/BERT/'
# Checkpoint name handed to the Auto* model/tokenizer loaders.
pretrained_model = "bert-large-uncased"
# Number of passes the training loop makes over the training set.
epochs = 4

In [None]:
"""
Uncomment the code below if you want to load the files from drive
"""
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#installing all the packages and importing the required libraries
!pip install transformers -q

import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
import os
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    # Make the slow path visible to whoever runs the notebook.
    print('Using CPU')

In [None]:
# The checkpoint comes from the configuration cell; change `pretrained_model`
# there to try a different model.
model_name = pretrained_model

# Tokenizer for the chosen checkpoint, created with lower-casing requested.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Sequence-classification model initialised from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-xxlarge-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.bias', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xxlarge-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
Y

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

In [None]:
# Load the two-sentence dataset from the path set in the configuration cell.
df = pd.read_csv(two_sentences_dataset_path)

# Column arrays consumed by the train/validation split.
sentence0 = df["sent0"].values
sentence1 = df["sent1"].values
labels = df["label"].values

In [None]:
# Every encoded sentence pair is truncated or padded to exactly this many token ids.
MAX_LEN = 64
input_ids = []
input_type_ids = []  # declared for later use; nothing in this cell fills it
attention_masks = []

# Padding side is a tokenizer-wide setting, so set it once rather than per row.
tokenizer.padding_side = 'right'

# Encode each (sent0, sent1) row as a single sentence-pair input.
# Iterating the columns directly is positional, so it also works when the
# DataFrame index is not 0..n-1.
for first_sentence, second_sentence in zip(df['sent0'], df['sent1']):
    encoded_sent = tokenizer.encode_plus(
        first_sentence,
        second_sentence,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,        # explicit: avoids the "Truncation was not explicitly activated" warning
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
    )

    input_ids.append(encoded_sent['input_ids'])
    attention_masks.append(encoded_sent['attention_mask'])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
seed_val = 42

# Seed every random number generator in use so the split is repeatable.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Use 90% for training and 10% for validation.
# train_test_split returns a (train, test) pair for each input array, in the
# order the arrays were passed: s0_train, s0_val, s1_train, s1_val, y_train, y_val.
# The targets must be unpacked in that same order.
(train_sentences0, validation_sentences0,
 train_sentences1, validation_sentences1,
 _, _) = train_test_split(sentence0, sentence1, labels,
                          random_state=seed_val, test_size=0.1)

# Same random_state and test_size as above, so the encoded inputs are split
# into the same rows as the raw sentences.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(input_ids, labels, attention_masks,
                                                   random_state=seed_val, test_size=0.1)

In [None]:
# Convert each train/validation pair from the split into torch tensors,
# the form the TensorDataset cells below expect.
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [None]:
batch_size = 16

# Each dataset item is an (input ids, attention mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation batches are read in dataset order.
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)

In [None]:
# Move the model to the device selected earlier. A bare model.cuda() ignores
# that choice and fails on machines without a GPU.
model.to(device);
# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
# The training loop steps the scheduler once per batch, so the schedule
# spans (batches per epoch) * (number of epochs) steps.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds  -- 2-D array-like of per-class scores, one row per example.
    labels -- NumPy array of class ids; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = (predicted_classes == expected_classes).sum()
    return n_correct / len(expected_classes)

In [None]:
def format_time(time_elapsed):
    """Format a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(time_elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
seed_val = 42

# Re-seed every random number generator so a training run is repeatable.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, in epoch order.
loss_values = []

for epoch_i in range(0, epochs):

    #Training the model

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Time spent in this epoch so far, formatted as h:mm:ss.
            elapsed = format_time(time.time() - t0)

            # Report progress. get_last_lr() is the supported way to read the
            # current learning rate; get_lr() is deprecated for this use.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}    LR . {:.2E}'.format(step, len(train_dataloader), elapsed, scheduler.get_last_lr()[0]))

        # Each batch is (input ids, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    #Running the Validation

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    # eval_accuracy accumulates the number of correct predictions,
    # nb_eval_examples the number of examples seen.
    eval_accuracy = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        # Validation batches hold the same three tensors as training batches;
        # unpacking a fourth value would raise ValueError.
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask)
        # Without labels, the first output is the logits.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch by its size so a short final batch does not skew
        # the overall accuracy.
        eval_accuracy += flat_accuracy(logits, label_ids) * len(label_ids)
        nb_eval_examples += len(label_ids)
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...




  Batch    40  of    789.    Elapsed: 0:04:45    LR . 2.12E-06
  Batch    80  of    789.    Elapsed: 0:09:34    LR . 4.23E-06
  Batch   120  of    789.    Elapsed: 0:14:24    LR . 6.35E-06
  Batch   160  of    789.    Elapsed: 0:19:12    LR . 8.47E-06
  Batch   200  of    789.    Elapsed: 0:24:00    LR . 9.96E-06
  Batch   240  of    789.    Elapsed: 0:28:48    LR . 9.83E-06
  Batch   280  of    789.    Elapsed: 0:33:37    LR . 9.69E-06
  Batch   320  of    789.    Elapsed: 0:38:25    LR . 9.56E-06
  Batch   360  of    789.    Elapsed: 0:43:14    LR . 9.42E-06
  Batch   400  of    789.    Elapsed: 0:48:02    LR . 9.29E-06
  Batch   440  of    789.    Elapsed: 0:52:51    LR . 9.15E-06
  Batch   480  of    789.    Elapsed: 0:57:39    LR . 9.02E-06
  Batch   520  of    789.    Elapsed: 1:02:27    LR . 8.88E-06
  Batch   560  of    789.    Elapsed: 1:07:15    LR . 8.75E-06
  Batch   600  of    789.    Elapsed: 1:12:04    LR . 8.61E-06
  Batch   640  of    789.    Elapsed: 1:16:52    LR . 8

Let's take a look at our training loss over all batches:

In [None]:
# Reuse the directory from the configuration cell instead of a second
# hard-coded copy that drifts when the configuration changes.
output_dir = saved_model_output_directory

In [None]:
# Create the output directory, including missing parents. exist_ok=True makes
# this a no-op when the directory is already there.
os.makedirs(saved_model_output_directory, exist_ok=True)

# Write the fine-tuned model and its tokenizer side by side.
model.save_pretrained(saved_model_output_directory)
tokenizer.save_pretrained(saved_model_output_directory)

Saving model to /content/drive/MyDrive/IIT_Bombay/CS 626/Models/ALBERT/


('/content/drive/MyDrive/IIT_Bombay/CS 626/Models/ALBERT/tokenizer_config.json',
 '/content/drive/MyDrive/IIT_Bombay/CS 626/Models/ALBERT/special_tokens_map.json',
 '/content/drive/MyDrive/IIT_Bombay/CS 626/Models/ALBERT/tokenizer.json')

In [None]:
print("Predicting...")
model.eval()

# Per-batch logits and gold labels, collected in validation order.
predictions, true_labels = [], []

for batch in validation_dataloader:
    # Validation batches hold exactly three tensors (input ids, attention
    # mask, labels); unpacking a fourth value would raise ValueError.
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # Without labels, the first output is the logits.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    #store the predictions
    predictions.append(logits)
    true_labels.append(label_ids)

# Stack the per-batch logits and keep the highest-scoring class per example.
predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
# Flatten the per-batch label arrays into one list aligned with `predictions`.
true_labels = [item for sublist in true_labels for item in sublist]
print("DONE")

Predicting...
DONE


In [None]:
# Confusion matrix over the validation set. `true_labels` is the gold-label
# list built by the prediction cell, in the same order as `predictions`.
# The previous name `valid_labels` is not defined in this notebook.
create_confusion_matrix = confusion_matrix(list(true_labels), predictions, labels=[0,1])
# Rows and columns are labelled 0 and 1, matching labels=[0,1] above.
confusion_matrix_df = pd.DataFrame(create_confusion_matrix, index=[0,1], columns=[0,1])
display(confusion_matrix_df)

Unnamed: 0,0,1
0,335,345
1,317,405


In [None]:
# Precision, recall and F1 score per class, plus the accuracy and averaged rows.
# `true_labels` is the gold-label list built by the prediction cell, in the same
# order as `predictions`. The previous name `valid_labels` is not defined in
# this notebook.
eval_metrics = classification_report(list(true_labels), predictions, target_names=[0,1], output_dict=True)
print("---------------------------------Evaluation Metrics------------------------------------")
# One row per report entry; drop the last column (support) before display.
eval_metrics_df = pd.DataFrame(eval_metrics).transpose()
eval_metrics_df = eval_metrics_df.iloc[: , :-1]
display(eval_metrics_df)

---------------------------------Evaluation Metrics------------------------------------


Unnamed: 0,precision,recall,f1-score
0,0.513804,0.492647,0.503003
1,0.54,0.560942,0.550272
accuracy,0.527817,0.527817,0.527817
macro avg,0.526902,0.526794,0.526637
weighted avg,0.527294,0.527817,0.527345
