**CSI5138F - Intro to DL and RL - Project**

This notebook is heavily influenced by the following Tutorial:
  https://mccormickml.com/2019/07/22/BERT-fine-tuning/ 

# Set up the notebook

In [0]:
# install pytorch transformers
!pip install pytorch-transformers



## Imports and mount drive

In [0]:
import torch
import os
import string
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_transformers import *
import numpy as np
import json
import collections
from google.colab import drive
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import copy

# Import nltk WordNet
import nltk
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()

# Mount google drive containing the datasets
drive.mount('/content/drive', force_remount=True)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Mounted at /content/drive


## Get RoBERTa tokenizer

In [0]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

## Utility functions

In [0]:
# Create a function to import json objects from jsonl files
def load_json_objects_from_file(filename):
  # Array for json objects
  json_objects = []
  # Read file line by line
  with open(filename, mode = "r") as jsonl_file:
      for line in jsonl_file:
          json_objects.append(json.loads(line))
  return json_objects

#Takes a list of words (strings) and a sentence (as a RoBERTa tokenized ID list) and returns a list
#of pairs indicating the tokens' start and end positions in the sentence for each word
#NOTE: it can also apply to a group of words separated by spaces
#NOTE: It is important that the list of words given describes a sentence, because the order is relevant to do the matching properly
      
def find_word_in_tokenized_sentence(word,token_ids):
  decomposedWord = tokenizer.encode(word)
  # Iterate through to find a matching sublist of the token_ids
  for i in range(len(token_ids)):
    if token_ids[i] == decomposedWord[0] and token_ids[i:i+len(decomposedWord)] == decomposedWord:
      return (i,i+len(decomposedWord)-1)
  # This is the ouput when no matching pattern is found
  return (-1,-1)
  
def find_words_in_tokenized_sentences(wordList,token_ids):
  intList = []
  for word in wordList:
    if len(intList) == 0:
      intList.append(find_word_in_tokenized_sentence(word,token_ids))
    else:
      afterLastInterval = intList[-1][1]+1
      interv = find_word_in_tokenized_sentence(word,token_ids[afterLastInterval:])
      actualPositions = (interv[0] + afterLastInterval,interv[1]+afterLastInterval)
      intList.append(actualPositions)
  return intList

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels, return_predict_correctness = False):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  if return_predict_correctness:
    return np.sum(pred_flat == labels_flat) / len(labels_flat), pred_flat == labels_flat
  else:
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def flat_predictions(preds):
  pred_flat = np.argmax(preds, axis=1).flatten()
  return pred_flat == 1

## Constants

In [0]:
BATCH_SIZE = 64
EPOCHS = 60
PATIENCE = 10
# Prepare Torch to use GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

## Import and Process the WordInContext Training data

In [0]:
# Create a function to preprocess the WiC data
def wic_preprocessing(json_objects, training = True, shuffle_data = False, verbose = False):
  wic_sentences = []
  wic_encoded = []
  wic_labels = []
  wic_word_locs = []
  wic_indexes = []
  for index, example in enumerate(json_objects):
    #wic_indexes.append(example['idx']) # Is it the index??
    wic_indexes.append(index)
    sentence = f"<s>{example['sentence1']}</s><s>{example['sentence2']}</s>"
    wic_sentences.append(sentence)
    # Then encode the sentences
    wic_encoded.append(tokenizer.encode(sentence, add_special_tokens=False))
    # Find the word in each sentences
    word = example['word']
    word_locs = (-1, -1)
    # Split the 2 sentences on space. (Also, lemmatize and uncapitilize each word)
    sent1_split = example['sentence1'].split(' ')
    sent2_split = example['sentence2'].split(' ')
    # Get the index of word in both sentences
    sent1_word_char_loc = (example['start1'], example['end1'])
    sent2_word_char_loc = (example['start2'], example['end2'])
    # Create a variable to keep track of the number of characters parsed in each sentence as we loop
    sent_chars = 0
    # Loop over the words in the first sentence
    i, j = 0, 0
    word1_not_found, word2_not_found = True, True
    while word1_not_found and i < len(sent1_split):
      word_len = len(sent1_split[i])
      if sent_chars >= sent1_word_char_loc[0] or sent_chars + word_len >= sent1_word_char_loc[1]:
        word_locs = (i, -1) # Found the word in the sentence
        word1_not_found = False
      elif sent_chars > sent1_word_char_loc[1]:
        # If we somehow got past the word. Assume it was the previous word
        word_locs = (i - 1, -1) # Found the word in the sentence
        word1_not_found = False
      else:
        # Look at the next word
        sent_chars += word_len + 1 # Plus one for the space
        i += 1
    # Loop over the words in the second
    sent_chars = 0 # Reset
    while word2_not_found and j < len(sent2_split):
      word_len = len(sent2_split[j])
      if sent_chars >= sent2_word_char_loc[0] or sent_chars + word_len >= sent2_word_char_loc[1]:
        word_locs = (i, j) # Found the word in the sentence
        word2_not_found = False
      elif sent_chars > sent2_word_char_loc[1]:
        # If we somehow got past the word. Assume it was the previous word
        word_locs = (i, j - 1) # Found the word in the sentence
        word2_not_found = False
      else:
        # Look at the next word
        sent_chars += word_len + 1 # Plus one for the space
        j += 1
    # For testing
    if verbose:
      print(word)
      print(sent1_split)
      print(sent2_split)
      print(word_locs)
    # Now to find the word in the tokenized sentences
    word1 = sent1_split[word_locs[0]].translate(str.maketrans('', '', string.punctuation)) #Remove punctuation (See https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string)
    word2 = sent2_split[word_locs[1]].translate(str.maketrans('', '', string.punctuation)) #Remove punctuation
    token_word_locs = find_words_in_tokenized_sentences([word1, word2], wic_encoded[-1])
    wic_word_locs.append(token_word_locs)
    # Get the label if we expect it to be there
    if training:
      if example['label']:
        wic_labels.append(1)
      else:
        wic_labels.append(0)
  # Pad the sequences and find the encoded word location in the combined input
  max_len = np.array([len(ex) for ex in wic_encoded]).max()
  wic_padded = {"input_ids" : [], "attention_mask" : [], "token_type_ids" : [], "word1_locs": [], "word2_locs" : [], "index" : wic_indexes}
  for i in range(0, len(wic_encoded)):
    enc_sentence = wic_encoded[i]
    word_locs = wic_word_locs[i]
    # Pad the sequences
    ex_len = len(enc_sentence)
    padded_sentence = enc_sentence.copy()
    padded_sentence.extend([0]*(max_len - ex_len))
    wic_padded["input_ids"].append(padded_sentence)
    padded_mask = [1] * ex_len
    padded_mask.extend([0]*(max_len - ex_len))
    wic_padded["attention_mask"].append(padded_mask)
    # Create the vector to get back the words after RoBERTa
    token_word_locs = wic_word_locs[i]
    first_word_loc = []
    second_word_loc = []
    len_first_word = token_word_locs[0][1] - token_word_locs[0][0] + 1
    len_second_word = token_word_locs[1][1] - token_word_locs[1][0] + 1
    for j in range(0, max_len):
      if j >= token_word_locs[0][0] and j <= token_word_locs[0][1]:
        #Part of the first word
        first_word_loc.append(1.0 / len_first_word)
      else:
        first_word_loc.append(0.0)
      if j >= token_word_locs[1][0] and j <= token_word_locs[1][1]:
        #Part of the second word
        second_word_loc.append(1.0 / len_second_word)
      else:
        second_word_loc.append(0.0)
    # We want to append a [1, max_len] vector instead of a [max_len] vector so wrap in an array
    wic_padded["word1_locs"].append([first_word_loc])
    wic_padded["word2_locs"].append([second_word_loc])
    # token_type_ids is a mask that tells where the first and second sentences are
    token_type_id = []
    first_sentence = True
    sentence_start = True
    for token in padded_sentence:
      if first_sentence and sentence_start and token == 0:
        # Allows 0 at the start of the first sentence
        token_type_id.append(0)
      elif first_sentence and token > 0:
        if sentence_start:
          sentence_start = False
        token_type_id.append(0)
      elif first_sentence and not sentence_start and token == 0:
        first_sentence = False
        # Start of second sentence
        token_type_id.append(1)
      else:
        # Second sentence
        token_type_id.append(1)
    wic_padded["token_type_ids"].append(token_type_id)
  if training:
    if shuffle_data:
      # Shuffle the data
      raw_set = {"input_ids": [], "token_type_ids": [], "attention_mask": [], "labels": [], "word1_locs": [], "word2_locs" : [], "index" : []}
      raw_set["input_ids"], raw_set["token_type_ids"], raw_set["attention_mask"], raw_set["labels"], raw_set["word1_locs"], raw_set["word2_locs"], raw_set["index"] = shuffle(
          wic_padded["input_ids"], wic_padded["token_type_ids"], wic_padded["attention_mask"], wic_labels, wic_padded["word1_locs"], wic_padded["word2_locs"], wic_padded["index"])
    else:
      raw_set = {"input_ids": wic_padded["input_ids"], "token_type_ids": wic_padded["token_type_ids"],
                 "attention_mask": wic_padded["attention_mask"], "labels": wic_labels, "index" : wic_padded["index"],
                 "word1_locs": wic_padded["word1_locs"], "word2_locs" : wic_padded["word2_locs"]}
  else: # No labels present (Testing set)
    # Do not shuffle the testing set
    raw_set = {"input_ids": wic_padded["input_ids"], "token_type_ids": wic_padded["token_type_ids"], 
               "attention_mask": wic_padded["attention_mask"], "index" : wic_padded["index"], 
               "word1_locs": wic_padded["word1_locs"], "word2_locs" : wic_padded["word2_locs"]}
  # Return the raw data (Need to put them in a PyTorch tensor and dataset)
  return raw_set

In [0]:
# Process the data
train_json_objs = load_json_objects_from_file("/content/drive/My Drive/CSI5138_Project/WiC/train.jsonl")
raw_train_set = wic_preprocessing(train_json_objs, shuffle_data=True, verbose = False) # We do not want to shuffle for now.
print(train_json_objs[raw_train_set["index"][15]])
print(raw_train_set["input_ids"][15]),
print(raw_train_set["token_type_ids"][15]),
print(raw_train_set["attention_mask"][15]),
print(raw_train_set["labels"][15])
print(raw_train_set["word1_locs"][15]) #right, so it's better to use one-hot vectors to do a selection afterwards, I can compute it more precisely with the token intervals however
print(raw_train_set["word2_locs"][15])

{'word': 'title', 'sentence1': "He had no documents confirming his title to his father's estate.", 'sentence2': 'I can never remember movie titles.', 'idx': 330, 'label': False, 'start1': 35, 'start2': 27, 'end1': 40, 'end2': 33, 'version': 1.1}
[0, 91, 56, 117, 2339, 13958, 39, 1270, 7, 39, 1150, 18, 2587, 4, 2, 0, 38, 64, 393, 2145, 1569, 4867, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
0
[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

### Load into a PyTorch dataset

In [0]:
# Create a PyTorch dataset for it
train_data = TensorDataset(
    torch.tensor(raw_train_set["input_ids"]),
    torch.tensor(raw_train_set["token_type_ids"]),
    torch.tensor(raw_train_set["attention_mask"]),
    torch.tensor(raw_train_set["labels"]),
    torch.tensor(raw_train_set["word1_locs"]),
    torch.tensor(raw_train_set["word2_locs"]),
    torch.tensor(raw_train_set["index"])
)
# Create a sampler and loader
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

### Repeat the procedure to load the WiC validation and testing sets

In [0]:
# Load the json objects from each file
test_json_objs = load_json_objects_from_file("/content/drive/My Drive/CSI5138_Project/WiC/test.jsonl")
print(test_json_objs)
valid_json_objs = load_json_objects_from_file("/content/drive/My Drive/CSI5138_Project/WiC/val.jsonl")
# Process the objects
raw_test_set = wic_preprocessing(test_json_objs, training = False) # The labels for the testing set are unknown
raw_valid_set = wic_preprocessing(valid_json_objs)
# Create PyTorch datasets
test_data = TensorDataset(
    torch.tensor(raw_test_set["input_ids"]),
    torch.tensor(raw_test_set["token_type_ids"]),
    torch.tensor(raw_test_set["attention_mask"]),
    torch.tensor(raw_test_set["word1_locs"]),
    torch.tensor(raw_test_set["word2_locs"]),
    torch.tensor(raw_test_set["index"])
)
validation_data = TensorDataset(
    torch.tensor(raw_valid_set["input_ids"]),
    torch.tensor(raw_valid_set["token_type_ids"]),
    torch.tensor(raw_valid_set["attention_mask"]),
    torch.tensor(raw_valid_set["labels"]),
    torch.tensor(raw_valid_set["word1_locs"]),
    torch.tensor(raw_valid_set["word2_locs"]),
    torch.tensor(raw_valid_set["index"])
)
# Create a sampler and loader for each
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

[{'word': 'water', 'sentence1': 'The smell of fried onions makes my mouth water.', 'sentence2': 'His eyes were watering.', 'idx': 0, 'start1': 41, 'start2': 14, 'end1': 46, 'end2': 22, 'version': 1.1}, {'word': 'value', 'sentence1': 'The value assigned was 16 milliseconds.', 'sentence2': 'I establish the colors and principal values by organizing the painting into three values -- dark, medium ... and light -- Joe Hing Lowe.', 'idx': 1, 'start1': 4, 'start2': 37, 'end1': 9, 'end2': 43, 'version': 1.1}, {'word': 'presence', 'sentence1': 'He felt the presence of an evil force.', 'sentence2': 'He sensed the presence of danger.', 'idx': 2, 'start1': 12, 'start2': 14, 'end1': 20, 'end2': 22, 'version': 1.1}, {'word': 'vision', 'sentence1': 'He had a vision of the Virgin Mary.', 'sentence2': 'He had a vision of his own death.', 'idx': 3, 'start1': 9, 'start2': 9, 'end1': 15, 'end2': 15, 'version': 1.1}, {'word': 'rim', 'sentence1': 'Rim a hat.', 'sentence2': 'Sugar rimmed the dessert plate.', 

# RoBERTa

## Loading the model

In [0]:
# Variable for the path to the model
NAME = 'William'
#PATH = f'/content/drive/My Drive/CSI5138_Project/Models/{NAME}/WiCBaseline'
#if not os.path.exists(PATH):
#  os.mkdir(PATH) # Make the directory

# Load the base RoBERTa model
#model = RobertaModel.from_pretrained('roberta-base')
#model = RobertaForMaskedLM.from_pretrained('roberta-base')

# This is where the model weights would be loaded for our fine-tuned models
Model_name = "RoBERTa_0.0_0.0_1.0_Epochs3"
PATH = f'/content/drive/My Drive/CSI5138_Project/Models/{NAME}/{Model_name}'
#model = RobertaModel.from_pretrained(PATH)
model = RobertaForMaskedLM.from_pretrained(PATH)

## Create a custom head for WiC instead of a classification head
Based on https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html and https://huggingface.co/transformers/_modules/transformers/modeling_roberta.html#RobertaModel



In [0]:
class WiC_Head(torch.nn.Module):
    def __init__(self, embedding_size = 768):
        """
        Keeps a reference to the provided RoBERTa model. 
        It then adds a linear layer that takes the distance between two 
        """
        super(WiC_Head, self).__init__()
        self.embedding_size = embedding_size
        #self.embedder = roberta_based_model
        self.linear_diff = torch.nn.Linear(embedding_size, 250, bias = True)
        self.linear_seperator = torch.nn.Linear(250, 2, bias = True)
        self.loss = torch.nn.CrossEntropyLoss()
        self.activation = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax()

    def forward(self, roberta_based_model, input_ids=None, attention_mask=None, labels=None,
                word1_locs = None, word2_locs = None):
        """
        Takes in the same argument as RoBERTa forward plus two tensors for the location of the 2 words to compare
        """
        if word1_locs is None or word2_locs is None:
          raise ValueError("The tensors (word1_locs, word1_locs) containing the location of the words to compare in the input vector must be provided.")
        elif input_ids is None:
          raise ValueError("The input_ids tensor must be provided.")
        elif word1_locs.shape[0] != input_ids.shape[0] or word2_locs.shape[0] != input_ids.shape[0]:
          raise ValueError("All provided vectors should have the same batch size.")
        batch_size = word1_locs.shape[0]
        # Get the embeddings
        #embs, _ = self.embedder.roberta(input_ids=input_ids, attention_mask=attention_mask)
        embs, _ = roberta_based_model.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Get the words
        word1s = torch.matmul(word1_locs, embs).view(batch_size, self.embedding_size)
        word2s = torch.matmul(word2_locs, embs).view(batch_size, self.embedding_size)
        diff = word1s - word2s
        # Calculate outputs using activation
        layer1_results = self.activation(self.linear_diff(diff))
        logits = self.softmax(self.linear_seperator(layer1_results))
        outputs = logits
        # Calculate the loss
        if labels is not None:
            #  We want seperation like a SVM so use Hinge loss
            loss = self.loss(logits.view(-1, 2), labels.view(-1))
            outputs = (loss, logits)
        return outputs

## Create the WiC model

In [0]:
class_model = WiC_Head(embedding_size = 768)

## The training loop

In [0]:
initial_roberta_weights = copy.deepcopy(model.state_dict())

In [0]:
# Variable for minimal accuracy
MIN_ACCURACY = 0.800 # Based on the average accuracy
REACHED_MIN_ACCURACY = False
best_weights = class_model.state_dict()
# Want to maximize accuracy
max_val_acc = (0, 0)
# Put the model in GPU
model.cuda()
class_model.cuda()
# Create the optimizer
param_optimizer = list(class_model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
# Freeze RoBERTa weights
#class_model.embedder.roberta.require_grad_ = False
# I use the one that comes with the models, but any other optimizer could be used
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-2)
# Store our loss and accuracy for plotting
fit_history = {"loss": [],  "accuracy": [], "val_loss": [], "val_accuracy": []}
epoch_number = 0
epoch_since_max = 0
continue_learning = True
while epoch_number < EPOCHS and continue_learning:
  epoch_number += 1
  print(f"Training epoch #{epoch_number}")
  # Tracking variables
  tr_loss, tr_accuracy = 0, 0
  nb_tr_examples, nb_tr_steps = 0, 0
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  # Training
  # Set our model to training mode (as opposed to evaluation mode)
  class_model.train()
  # Freeze RoBERTa weights
  model.eval()
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.cuda() for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)   
    loss, logits = class_model(model, b_input_ids, attention_mask=b_input_mask, 
                               labels=b_labels, word1_locs = b_word1, word2_locs = b_word2) 
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.cpu().numpy()
    # Calculate the accuracy
    b_accuracy = flat_accuracy(logits, label_ids) # For RobertaForClassification
    # Append to fit history
    fit_history["loss"].append(loss.item()) 
    fit_history["accuracy"].append(b_accuracy) 
    # Update tracking variables
    tr_loss += loss.item()
    tr_accuracy += b_accuracy
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
    if nb_tr_steps%10 == 0:
      print("\t\tTraining Batch {}: Loss: {}; Accuracy: {}".format(nb_tr_steps, loss.item(), b_accuracy))
  print("Training:\n\tLoss: {}; Accuracy: {}".format(tr_loss/nb_tr_steps, tr_accuracy/nb_tr_steps))
  # Validation
  # Put model in evaluation mode to evaluate loss on the validation set
  class_model.eval()
  model.eval()
  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.cuda() for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)
      loss, logits = class_model(model, b_input_ids, attention_mask=b_input_mask, 
                                 labels=b_labels, word1_locs = b_word1, word2_locs = b_word2)
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.cpu().numpy()
    # Calculate the accuracy
    b_accuracy = flat_accuracy(logits, label_ids) # For RobertaForClassification
    # Append to fit history
    fit_history["val_loss"].append(loss.item()) 
    fit_history["val_accuracy"].append(b_accuracy) 
    # Update tracking variables
    eval_loss += loss.item()
    eval_accuracy += b_accuracy
    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1
    if nb_eval_steps%10 == 0:
      print("\t\tValidation Batch {}: Loss: {}; Accuracy: {}".format(nb_eval_steps, loss.item(), b_accuracy))
  eval_acc = eval_accuracy/nb_eval_steps
  if eval_acc >= max_val_acc[0]:
    max_val_acc = (eval_acc, epoch_number)
    continue_learning = True
    epoch_since_max = 0 # New max
    best_weights = copy.deepcopy(class_model.state_dict()) # Keep the best weights
    # See if we have reached min_accuracy
    if eval_acc >= MIN_ACCURACY:
      REACHED_MIN_ACCURACY = True
    # Save to file only if it has reached min acc
    if REACHED_MIN_ACCURACY:
      # Save the best weights to file
      torch.save(best_weights, os.path.join(PATH,'WiCHead.pt'))
      continue_learning = False # Stop learning. Reached baseline acc for this model
  else:
    epoch_since_max += 1
    if epoch_since_max > PATIENCE:
      continue_learning = False # Stop learning, starting to overfit
  print("Validation:\n\tLoss={}; Accuracy: {}".format(eval_loss/nb_eval_steps, eval_accuracy/nb_eval_steps))
print(f"Best accuracy ({max_val_acc[0]}) obtained at epoch #{max_val_acc[1]}.")
# Reload the best weights (from memory)
class_model.load_state_dict(best_weights)

Training epoch #1




		Training Batch 10: Loss: 0.8095329403877258; Accuracy: 0.4375
		Training Batch 20: Loss: 0.7327826023101807; Accuracy: 0.53125
		Training Batch 30: Loss: 0.6956872940063477; Accuracy: 0.59375
		Training Batch 40: Loss: 0.7099709510803223; Accuracy: 0.59375
		Training Batch 50: Loss: 0.5918506383895874; Accuracy: 0.703125
		Training Batch 60: Loss: 0.6430878639221191; Accuracy: 0.59375
		Training Batch 70: Loss: 0.8158175349235535; Accuracy: 0.46875
		Training Batch 80: Loss: 0.6577164530754089; Accuracy: 0.640625
Training:
	Loss: 0.6914185173371259; Accuracy: 0.5900876696832579
		Validation Batch 10: Loss: 0.6940237283706665; Accuracy: 0.6129032258064516
Validation:
	Loss=0.7314184665679931; Accuracy: 0.5628528225806452
Training epoch #2
		Training Batch 10: Loss: 0.63173508644104; Accuracy: 0.6875
		Training Batch 20: Loss: 0.6198910474777222; Accuracy: 0.703125
		Training Batch 30: Loss: 0.725109875202179; Accuracy: 0.546875
		Training Batch 40: Loss: 0.6716289520263672; Accuracy: 

<All keys matched successfully>

In [0]:
final_roberta_weights = copy.deepcopy(model.cpu().state_dict())
print(final_roberta_weights.keys())

odict_keys(['roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight', 'roberta.encoder.layer.0.output.dense.bias', 'roberta.encoder.la

In [0]:
key = 'roberta.encoder.layer.4.attention.self.value.weight'
print(final_roberta_weights[key])
print(initial_roberta_weights[key])

tensor([[ 0.0470, -0.0254, -0.0300,  ..., -0.0408,  0.0015,  0.0086],
        [-0.0948,  0.0016, -0.0334,  ..., -0.0375, -0.0263, -0.0076],
        [ 0.0603,  0.0454, -0.0301,  ..., -0.0142,  0.0405, -0.0022],
        ...,
        [-0.0499, -0.0351, -0.0352,  ...,  0.0083,  0.0323, -0.0006],
        [ 0.0114,  0.0489,  0.0172,  ...,  0.0143,  0.0889, -0.0473],
        [-0.0166, -0.0042,  0.0314,  ...,  0.0176,  0.1166,  0.0699]])
tensor([[ 0.0470, -0.0254, -0.0300,  ..., -0.0408,  0.0015,  0.0086],
        [-0.0948,  0.0016, -0.0334,  ..., -0.0375, -0.0263, -0.0076],
        [ 0.0603,  0.0454, -0.0301,  ..., -0.0142,  0.0405, -0.0022],
        ...,
        [-0.0499, -0.0351, -0.0352,  ...,  0.0083,  0.0323, -0.0006],
        [ 0.0114,  0.0489,  0.0172,  ...,  0.0143,  0.0889, -0.0473],
        [-0.0166, -0.0042,  0.0314,  ...,  0.0176,  0.1166,  0.0699]],
       device='cuda:0')


### Save a model that was close to the baseline (Manual choice)

In [0]:
# Save the best weights to file
# torch.save(best_weights, os.path.join(PATH,'WiCHead.pt'))

## Load an already trained WiC model from file

In [0]:
# Load the model
#class_model.load_state_dict(torch.load(os.path.join(PATH,'WiCHead.pt')))
# Put the model in GPU
#class_model.cuda()

## Get the predictions on the validation set

In [0]:
validation_predictions_correctness = {}
# Validation
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Put model in evaluation mode
class_model.eval()
# Evaluate data for one epoch
for batch in validation_dataloader:
  # Add batch to GPU
  batch = tuple(t.cuda() for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up validation
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)
    loss, logits = class_model(model, b_input_ids, attention_mask=b_input_mask, 
                                labels=b_labels, word1_locs = b_word1, word2_locs = b_word2)
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.cpu().numpy()
  # Calculate the accuracy
  b_accuracy, b_pred_correctness = flat_accuracy(logits, label_ids, return_predict_correctness = True) # For RobertaForClassification
  indexes = b_index.detach().cpu().numpy() # Get the indexes
  # Add to predictions
  for index, pred in zip(indexes, b_pred_correctness):
    validation_predictions_correctness[index] = pred
  # Update tracking variables
  eval_loss += loss.item()
  eval_accuracy += b_accuracy
  nb_eval_examples += b_input_ids.size(0)
  nb_eval_steps += 1
  if nb_eval_steps%10 == 0:
    print("\t\tValidation Batch {}: Loss: {}; Accuracy: {}".format(nb_eval_steps, loss.item(), b_accuracy))
print("Validation:\n\tLoss={}; Accuracy: {}".format(eval_loss/nb_eval_steps, eval_accuracy/nb_eval_steps))
validation_predictions_correctness = collections.OrderedDict(sorted(validation_predictions_correctness.items()))
print(validation_predictions_correctness)



		Validation Batch 10: Loss: 0.5690038204193115; Accuracy: 0.7419354838709677
Validation:
	Loss=0.5920733273029327; Accuracy: 0.7163810483870968
OrderedDict([(0, True), (1, True), (2, True), (3, True), (4, True), (5, True), (6, True), (7, False), (8, True), (9, False), (10, True), (11, True), (12, True), (13, False), (14, False), (15, True), (16, True), (17, False), (18, True), (19, True), (20, True), (21, False), (22, True), (23, True), (24, True), (25, True), (26, True), (27, False), (28, False), (29, False), (30, True), (31, True), (32, True), (33, True), (34, False), (35, True), (36, False), (37, True), (38, True), (39, False), (40, False), (41, True), (42, True), (43, True), (44, True), (45, False), (46, True), (47, True), (48, False), (49, True), (50, True), (51, False), (52, True), (53, True), (54, True), (55, True), (56, True), (57, False), (58, False), (59, True), (60, True), (61, True), (62, False), (63, True), (64, True), (65, True), (66, True), (67, False), (68, True), (69,

## Get the testing results
**Works for now. Would need to be modified to submit for competition reward.**

See https://competitions.codalab.org/competitions/20010#learn_the_details-evaluation

**Update:** The jsonl file from superglue is not in the same order as the text file taken from WiC. It doesn't match the idx parameter either. That is weird.

In [0]:
test_predictions = {}
nb_test_examples, nb_test_steps = 0, 0
# Testing
# Put model in evaluation mode to evaluate loss on the validation set
class_model.eval()
# Evaluate data for one epoch
for batch in test_dataloader:
  # Add batch to GPU
  batch = tuple(t.cuda() for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_token_ids, b_input_mask, b_word1, b_word2, b_index = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up validation
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = class_model(model, b_input_ids, attention_mask=b_input_mask, word1_locs = b_word1, word2_locs = b_word2)
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  # Get the predictions
  b_preds = flat_predictions(logits)
  indexes = b_index.detach().cpu().numpy() # Get the indexes
  for index, pred in zip(indexes, b_preds):
    test_predictions[index] = pred
  # Update tracking variables
  nb_test_examples += b_input_ids.size(0)
  nb_test_steps += 1
  if nb_test_steps%10 == 0:
    print("\t\tTest Batch {}".format(nb_test_steps))
# Print final results
print("Testing Done!")
test_predictions = collections.OrderedDict(sorted(test_predictions.items()))
print(test_predictions)