

# Setting up environment

#### here we import and prepare the working environment.



In [None]:
# general  
import matplotlib.pyplot as plt
import numpy as np

from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from typing import *

# torch
import torch

from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from torch.optim import Adam

# auxiliar
import os
import json
import pickle


In [None]:
! rm -rf sample_data

In [None]:
%cd '/content/drive/MyDrive/NLP/nlp2021-hw1-main/data'

/content/drive/MyDrive/NLP/nlp2021-hw1-main/data




# One time run

#### here we download embeddings such as glove.



In [None]:
#download and unzip word embedding , I used GloVe
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip


# Create the dictionaries word_vectors and word_index




In [None]:
# Build the vocabulary: associate each word with its embedding vector (the 300-dim GloVe file is used here).
word_vectors = dict()
# maximum number of lines to read; also used as the progress-bar total
words_limit = 400_000
with open('/content/drive/MyDrive/NLP/nlp2021-hw1-main/data/glove.6B.300d.txt',encoding='utf8') as f:

    for i, line in tqdm(enumerate(f), total=words_limit):
      
        # stop once words_limit lines have been consumed
        if i == words_limit:
            break

        # the first space-separated field is the word, the remaining fields are the vector components
        word, *vector = line.strip().split(' ')
        vector = torch.tensor([float(c) for c in vector])
        
        word_vectors[word] = vector

In [None]:
# Save word_vectors (word -> embedding tensor) as a pickle file.
# The path is relative, so the file lands in the current working directory.
with open('words_vectors.pickle', 'wb') as handle:
    pickle.dump(word_vectors, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Pair each word in the vocabulary with an integer index and collect the
# vectors in index order (the scheme proposed during the lectures).
word_index = dict()
vectors_store = []

# pad token, index = 0 (random 300-dim vector)
vectors_store.append(torch.rand(300))

# unk token, index = 1 (random 300-dim vector)
vectors_store.append(torch.rand(300))

for word, vector in word_vectors.items():

    # the next free position in vectors_store becomes this word's index (first word gets 2)
    word_index[word]=len(vectors_store)
    vectors_store.append(vector)

word_index = defaultdict(lambda: 1, word_index)  # default dict returns 1 (unk token) when unknown word
# stack into one tensor with one row per index: rows 0 and 1 are pad/unk, the rest follow word_vectors order
vectors_store = torch.stack(vectors_store)

In [None]:
# Save the word -> index mapping as a pickle file.
# word_index is a defaultdict whose default factory is a lambda, and pickle
# cannot serialize lambdas, so dumping it directly raises a PicklingError.
# Only the plain-dict contents are stored; after loading, re-wrap them with
# `defaultdict(lambda: 1, loaded)` to restore the unknown-word fallback (index 1).
with open('words_index.pickle', 'wb') as handle:
    pickle.dump(dict(word_index), handle, protocol=pickle.HIGHEST_PROTOCOL)



# Functions

#### Here we define the general functions needed for preprocessing and for training and testing the LSTM and MLP models.



In [None]:
# aggregation function which uses simple mean of vectors associated to the words composing the sentences

def sentence2vector(sentence: str) -> Optional[torch.Tensor]:
    """Return the mean embedding of the known words in *sentence*.

    The sentence is split on single spaces; tokens missing from the global
    ``word_vectors`` mapping are ignored.

    Returns:
        The element-wise mean of the retained word vectors, or ``None`` when
        no token of the sentence has an embedding.
    """
    known_vectors = []
    for token in sentence.split(' '):
        if token in word_vectors:
            known_vectors.append(word_vectors[token])

    if not known_vectors:
        return None

    return torch.stack(known_vectors).mean(dim=0)

In [None]:
# weighted sum of vectors associated with the words composing the sentence
# the weights depend on the frequency of the word in our dataset : 
# words with few occurencies contribute more than words which occur more frequently 
# we use the words_organized dictionary in order to distinguish such words

def sentence2vector0(sentence: str) -> Optional[torch.Tensor]:
    """Return a frequency-weighted sum of the known word vectors of *sentence*.

    The sentence is split on single spaces and tokens missing from the global
    ``word_vectors`` are skipped.  Each remaining vector is scaled by a weight
    chosen from the global ``words_organized`` buckets:

    * 0.75 if the word is listed in ``words_organized['few_occ']``
    * 0.6  if it is listed in ``words_organized['mean_occ']``
    * 0.45 otherwise

    Returns:
        The element-wise sum of the weighted vectors, or ``None`` when no
        token of the sentence has an embedding.
    """
    weighted_vectors = []
    for token in sentence.split(' '):
        if token not in word_vectors:
            continue

        if token in words_organized['few_occ']:
            weight = 0.75
        elif token in words_organized['mean_occ']:
            weight = 0.6
        else:
            weight = 0.45
        weighted_vectors.append(word_vectors[token] * weight)

    if not weighted_vectors:
        return None

    return torch.stack(weighted_vectors).sum(dim=0)

In [None]:
# pairs words in the sentence with indices from 0 to size of vocabulary

def sentence2indices(sentence: str) -> torch.Tensor:
    """Map every space-separated token of *sentence* to its vocabulary index.

    Tokens are looked up in the global ``word_index`` mapping; tokens that are
    not in the vocabulary get index 1, the unknown-word index used when the
    mapping is built.

    The lookup uses ``.get`` rather than ``word_index[word]``: indexing a
    ``defaultdict`` inserts every unknown token into the mapping (so the
    vocabulary silently grows with each processed sentence), and indexing a
    plain ``dict`` would raise ``KeyError``.  ``.get`` returns the same
    indices in both cases without mutating the mapping.

    Returns:
        A 1-D ``torch.long`` tensor with one index per token.
    """
    unk_index = 1  # same fallback as the defaultdict factory of word_index
    return torch.tensor(
        [word_index.get(word, unk_index) for word in sentence.split(' ')],
        dtype=torch.long,
    )

In [None]:
# build sequences taking in account sequence length, I used the solution proposed during the lectures

def rnn_collate_fn(
    data_elements: List[Tuple[torch.Tensor, torch.Tensor]]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Collate (sequence, label) pairs into a padded batch.

    Args:
        data_elements: pairs whose first item is a 1-D sequence tensor and
            whose second item is the label of that sequence.

    Returns:
        A tuple ``(X, X_lengths, y)``: the sequences right-padded with 0 to
        the longest one (batch first), the original length of every sequence
        as a ``torch.int`` tensor, and the labels gathered in one tensor.
    """
    sequences = []
    labels = []
    for element in data_elements:
        sequences.append(element[0])
        labels.append(element[1])

    # lengths must be read before padding hides them
    lengths = torch.tensor([seq.size(0) for seq in sequences], dtype=torch.int)
    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)

    return padded, lengths, torch.tensor(labels)

In [None]:
# remove punctuation and some special character , I decided to avoid using tokenizers and build a function from scratch

def rm_punctuation(sentence: str) -> str:
    """Return *sentence* with punctuation and a few special characters deleted.

    The characters are removed (not replaced by a space), so the words around
    them are joined: ``"a-b"`` becomes ``"ab"``.  The ASCII apostrophe is kept;
    only the typographic one is removed.
    """
    removed_chars = '",:.;—”“-()[]{}+’/'
    # a translation table that maps each listed character to "delete"
    deletion_table = str.maketrans('', '', removed_chars)
    return sentence.translate(deletion_table)
  

In [None]:
# remove a list of words from a sentence
 
def rm_words(sentence:str,rm_words:list) -> str:
    """Return *sentence* without the words contained in *rm_words*.

    The sentence is split on any whitespace and the surviving words are
    re-joined with single spaces, so extra whitespace is normalised too.
    """
    kept = []
    for token in sentence.split():
        if token in rm_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# replace the portion of the sentence that holds the target word with the lemma itself (i.e. replace plurals and inflections with the plain lemma)

def search_replace_word(sentence:str,replace_word:str,start:int,end:int) -> str:
    """Replace the characters ``[start, end)`` of *sentence* with *replace_word*.

    The offsets refer to the sentence with surrounding whitespace stripped.
    Only the addressed span is rewritten: using ``str.replace`` on the span
    text would also rewrite every other occurrence of it, including
    occurrences inside longer words ("ran" inside "ranch"), and with an
    empty span it would insert *replace_word* between all characters.

    Args:
        sentence: the text to edit.
        replace_word: the text put in place of the span (the lemma).
        start: index of the first character of the span.
        end: index one past the last character of the span.

    Returns:
        The edited sentence; the sentence unchanged when the span is empty
        (``start >= end``).

    Raises:
        IndexError: if the span does not lie inside the stripped sentence.
    """
    if start >= end:
        return sentence

    if start < 0 or end > len(sentence.strip()):
        raise IndexError('span [%d, %d) is outside the sentence' % (start, end))

    # offsets are relative to the stripped sentence: shift by the leading whitespace
    offset = len(sentence) - len(sentence.lstrip())
    return sentence[:offset + start] + replace_word + sentence[offset + end:]

In [None]:
# remove numbers from the sentence

def rm_numbers(sentence : str) -> str:
    """Return *sentence* without the tokens that are integers.

    A token is dropped when ``int(token)`` succeeds.  Tokens are filtered as
    whole words: removing them with ``str.replace`` would also strip the same
    digits out of other tokens (``"1 abc10"`` would lose the ``1`` of
    ``abc10``) and would leave double spaces behind.  The remaining tokens
    are joined with single spaces, as ``rm_words`` does.
    """
    kept = []
    for token in sentence.split():
        try:
            int(token)
        except ValueError:
            # not an integer: keep the token
            kept.append(token)
    return ' '.join(kept)

In [1]:
# eliminate some words randomly according to their frequency

def refine_sentence(sentence:str,few:list,mean:list,more:list) -> str:
    """Randomly drop words from *sentence* according to their frequency bucket.

    Every word of the sentence is kept with a probability that depends on the
    bucket it belongs to:

    * words in *few*:  kept with probability 0.5
    * words in *mean*: kept with probability 0.75
    * words in *more*: kept with probability 0.9

    Words that are in none of the three buckets are always dropped.  The
    buckets are checked in the order above and a word is treated according
    to the first one that contains it.

    The buckets passed as arguments are used (rather than a global
    dictionary), so the function works on whatever lists the caller supplies.

    Returns:
        The surviving words, in their original order, joined by single spaces.
    """
    # local import: the notebook's import cell does not provide `random`
    import random

    # sets give constant-time membership tests
    few_words, mean_words, more_words = set(few), set(mean), set(more)

    kept = []
    for word in sentence.split():
        if word in few_words:
            keep_probability = 0.5
        elif word in mean_words:
            keep_probability = 0.75
        elif word in more_words:
            keep_probability = 0.9
        else:
            continue

        if random.random() < keep_probability:
            kept.append(word)

    return ' '.join(kept)

In [None]:
# build a dictionary whose values are the lists of words that occur with different frequencies

def organize_words(path:str) -> Dict[str, List[str]]:
    """Group the words of a dataset file by how often they occur.

    Every line of the file is parsed into a dict with (at least) the keys
    ``sentence1``, ``sentence2`` and ``lemma``.  Both sentences are
    lower-cased, stripped of punctuation with ``rm_punctuation`` and split on
    whitespace; the occurrences of every word are counted over the whole file.

    Words that appear as a lemma of some sample are excluded from the result.
    The remaining words are placed, in order of first appearance, in one of
    four buckets:

    * ``few_occ``:  exactly 1 occurrence
    * ``mean_occ``: 2 to 249 occurrences
    * ``more_occ``: 250 to 499 occurrences
    * ``most_occ``: 500 occurrences or more

    Args:
        path: path of the dataset file (one sample per line).

    Returns:
        A dict mapping the four bucket names to lists of words.
    """
    word_freq = Counter()  # insertion-ordered, so buckets keep first-appearance order
    lemmas = set()

    with open(path, encoding='utf8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)

            # NOTE(review): eval executes arbitrary code found in the file;
            # only use on trusted data. If the file is strict JSON lines,
            # json.loads is the safe parser - confirm the format first.
            dict_from_line = eval(line)

            lemmas.add(dict_from_line['lemma'])

            for key in ('sentence1', 'sentence2'):
                sentence = rm_punctuation(dict_from_line[key].lower())
                word_freq.update(sentence.split())

    words_organized = {'few_occ': [], 'mean_occ': [], 'more_occ': [], 'most_occ': []}
    for word, count in word_freq.items():
        if word in lemmas:
            continue

        if count < 2:
            bucket = 'few_occ'
        elif count < 250:
            bucket = 'mean_occ'
        elif count < 500:
            bucket = 'more_occ'
        else:
            bucket = 'most_occ'
        words_organized[bucket].append(word)

    return words_organized




## Loops for RNN

#### Here we define the training and testing loops for RNN (LSTM) models.



In [None]:
# loop I use for evaluating the model on the train dataset (LSTM)

def train_acc(model: nn.Module):
    """Evaluate the LSTM *model* on the global ``train_dataloader``.

    The model is switched to evaluation mode and gradients are disabled for
    the duration of the pass, so dropout is inactive and the BatchNorm
    running statistics are not modified by the evaluation; the previous
    training/eval mode is restored afterwards.

    A sample is predicted as class 0 when the first entry of ``pred`` is
    strictly greater than the second, and as class 1 otherwise.  Batches of
    any size are handled, so the last, smaller batch is counted as well.

    Returns:
        ``(accuracy, loss)`` as Python floats, where ``loss`` is the mean of
        the batch losses weighted by batch size.  For an empty dataloader the
        result is ``(0.0, nan)``.
    """
    was_training = model.training
    model.eval()

    correct_pred = 0
    num_sample = 0
    loss_sum = 0.0
    try:
        with torch.no_grad():
            for inputs, lengths, labels in train_dataloader:
                labels = labels.long().reshape(-1)
                batch_out = model(inputs, lengths, labels)

                probs = batch_out['pred']
                # class 0 only when it strictly wins; ties go to class 1
                predicted = (~(probs[:, 0] > probs[:, 1])).long()
                correct_pred += int((predicted == labels).sum())

                batch_size = labels.size(0)
                loss_sum += float(batch_out['loss']) * batch_size
                num_sample += batch_size
    finally:
        model.train(was_training)

    if num_sample == 0:
        return 0.0, float('nan')

    return correct_pred / num_sample, loss_sum / num_sample

In [None]:
# loop I use for evaluating the model on the dev dataset (LSTM)

def test_acc(model: nn.Module):
    """Evaluate the LSTM *model* on the global ``test_dataloader`` (dev set).

    The model is switched to evaluation mode and gradients are disabled for
    the duration of the pass, so dropout is inactive and the BatchNorm
    running statistics are not modified by the evaluation; the previous
    training/eval mode is restored afterwards.

    A sample is predicted as class 0 when the first entry of ``pred`` is
    strictly greater than the second, and as class 1 otherwise.  Batches of
    any size are handled, so the last, smaller batch is counted as well.

    Returns:
        ``(accuracy, loss)`` as Python floats, where ``loss`` is the mean of
        the batch losses weighted by batch size.  For an empty dataloader the
        result is ``(0.0, nan)``.
    """
    was_training = model.training
    model.eval()

    correct_pred = 0
    num_sample = 0
    loss_sum = 0.0
    try:
        with torch.no_grad():
            for inputs, lengths, labels in test_dataloader:
                labels = labels.long().reshape(-1)
                batch_out = model(inputs, lengths, labels)

                probs = batch_out['pred']
                # class 0 only when it strictly wins; ties go to class 1
                predicted = (~(probs[:, 0] > probs[:, 1])).long()
                correct_pred += int((predicted == labels).sum())

                batch_size = labels.size(0)
                loss_sum += float(batch_out['loss']) * batch_size
                num_sample += batch_size
    finally:
        model.train(was_training)

    if num_sample == 0:
        return 0.0, float('nan')

    return correct_pred / num_sample, loss_sum / num_sample

In [None]:
# loop I use for training the model on the train dataset (LSTM)
# we save only the best model according to val_acc computed on the dev dataset and plot the losses and accuracies behavoiurs

def training_loop(model: nn.Module, optimizer: torch.optim.Optimizer, epochs: int, epoch:int = 0):
    """Train the LSTM *model* and keep the weights with the best dev accuracy.

    For every epoch the model is trained on the global ``train_dataloader``,
    then evaluated with ``train_acc`` and ``test_acc``.  Whenever the dev
    accuracy improves on the best value seen in this call, the state dict is
    saved.  At the end the accuracy and loss histories are plotted.

    Args:
        model: the network; called as ``model(input, length, label)`` and
            expected to return a dict with a ``'loss'`` entry.
        optimizer: optimizer bound to the model parameters.
        epochs: number of epochs to run.
        epoch: number of the first epoch, used only to label the progress
            bar (e.g. when continuing a previous run).  The bookkeeping is
            initialised for any value, so a non-zero start no longer fails
            on undefined variables.
    """
    max_acc = 0
    history_acc_train = []
    history_acc_val = []
    history_loss_train = []
    history_loss_val = []

    first_epoch = epoch
    for epoch in range(first_epoch, first_epoch + epochs):

        # make sure dropout/BatchNorm are in training mode for the update steps
        model.train()
        progress_bar = tqdm()

        for inputs, lengths, labels in train_dataloader:

            labels = labels.long()
            optimizer.zero_grad()
            batch_out = model(inputs, lengths, labels)
            loss = batch_out['loss']
            loss.backward()
            optimizer.step()

            progress_bar.update()

        t_acc, t_loss = train_acc(model)
        v_acc, v_loss = test_acc(model)
        # plain floats: the histories must not keep autograd graphs alive,
        # and they have to be plottable
        t_loss, v_loss = float(t_loss), float(v_loss)

        progress_bar.set_postfix(epoch=epoch, train_loss= t_loss ,  train_acc = t_acc , val_loss = v_loss , val_acc  =  v_acc )

        if v_acc > max_acc:
            print('saving...')
            torch.save(model.state_dict(), '/content/drive/MyDrive/NLP/model_state_dict.pt')
            max_acc = v_acc

        history_acc_train.append(t_acc)
        history_acc_val.append(v_acc)
        history_loss_train.append(t_loss)
        history_loss_val.append(v_loss)

        progress_bar.close()

    # one figure per metric, train and validation curves together
    figures = (
        ('Accuracy', (history_acc_train, history_acc_val)),
        ('Loss', (history_loss_train, history_loss_val)),
    )
    for y_label, curves in figures:
        for curve in curves:
            plt.plot(curve)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.show()
    
      



## Loops for MLP

#### Here we define the training and testing loops for MLP model.



In [None]:
# loop I use for evaluating the model on the train dataset (MLP)

def train_acc(model: nn.Module):
    """Evaluate the MLP *model* on the global ``train_dataloader``.

    The model is switched to evaluation mode and gradients are disabled for
    the duration of the pass, so dropout is inactive; the previous
    training/eval mode is restored afterwards.

    A sample is predicted as 0 when its ``pred`` value is below 0.5 and as 1
    otherwise.  Batches of any size are handled, so the last, smaller batch
    is counted as well.

    Returns:
        ``(accuracy, loss)`` as Python floats, where ``loss`` is the mean of
        the batch losses weighted by batch size.  For an empty dataloader the
        result is ``(0.0, nan)``.
    """
    was_training = model.training
    model.eval()

    correct_pred = 0
    num_sample = 0
    loss_sum = 0.0
    try:
        with torch.no_grad():
            for inputs, labels in train_dataloader:
                batch_out = model(inputs, labels)

                probs = batch_out['pred'].reshape(-1)
                targets = labels.reshape(-1)
                # 0 only when strictly below the threshold, 1 otherwise
                predicted = (~(probs < 0.5)).to(targets.dtype)
                correct_pred += int((predicted == targets).sum())

                batch_size = labels.size(0)
                loss_sum += float(batch_out['loss']) * batch_size
                num_sample += batch_size
    finally:
        model.train(was_training)

    if num_sample == 0:
        return 0.0, float('nan')

    return correct_pred / num_sample, loss_sum / num_sample

In [None]:
# loop I use for evaluating the model on the dev dataset (MLP)

def test_acc(model: nn.Module):
    """Evaluate the MLP *model* on the global ``test_dataloader`` (dev set).

    The model is switched to evaluation mode and gradients are disabled for
    the duration of the pass, so dropout is inactive; the previous
    training/eval mode is restored afterwards.

    A sample is predicted as 0 when its ``pred`` value is below 0.5 and as 1
    otherwise.  Batches of any size are handled, so the last, smaller batch
    is counted as well.

    Returns:
        ``(accuracy, loss)`` as Python floats, where ``loss`` is the mean of
        the batch losses weighted by batch size.  For an empty dataloader the
        result is ``(0.0, nan)``.
    """
    was_training = model.training
    model.eval()

    correct_pred = 0
    num_sample = 0
    loss_sum = 0.0
    try:
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                batch_out = model(inputs, labels)

                probs = batch_out['pred'].reshape(-1)
                targets = labels.reshape(-1)
                # 0 only when strictly below the threshold, 1 otherwise
                predicted = (~(probs < 0.5)).to(targets.dtype)
                correct_pred += int((predicted == targets).sum())

                batch_size = labels.size(0)
                loss_sum += float(batch_out['loss']) * batch_size
                num_sample += batch_size
    finally:
        model.train(was_training)

    if num_sample == 0:
        return 0.0, float('nan')

    return correct_pred / num_sample, loss_sum / num_sample

In [None]:
# loop I use for training the model on the train dataset (MLP)
# we save only the best model according to val_acc computed on the dev dataset and plot the losses and accuracies behavoiurs

def training_loop(model: nn.Module, optimizer: torch.optim.Optimizer, epochs: int):
    """Train the MLP *model* and keep the weights with the best dev accuracy.

    For every epoch the model is trained on the global ``train_dataloader``,
    then evaluated with ``train_acc`` and ``test_acc``.  Whenever the dev
    accuracy improves on the best value seen so far, the state dict is saved.
    At the end the accuracy and loss histories are plotted.

    Args:
        model: the network; called as ``model(input, label)`` and expected to
            return a dict with a ``'loss'`` entry.
        optimizer: optimizer bound to the model parameters.
        epochs: number of epochs to run.
    """
    max_acc = 0
    history_acc_train = []
    history_acc_val = []
    history_loss_train = []
    history_loss_val = []

    for epoch in range(epochs):

        # make sure dropout is active for the update steps
        model.train()
        progress_bar = tqdm()

        for inputs, labels in train_dataloader:

            optimizer.zero_grad()
            batch_out = model(inputs, labels)
            loss = batch_out['loss']
            loss.backward()
            optimizer.step()

            progress_bar.update()

        t_acc, t_loss = train_acc(model)
        v_acc, v_loss = test_acc(model)
        # plain floats: the histories must not keep autograd graphs alive,
        # and they have to be plottable
        t_loss, v_loss = float(t_loss), float(v_loss)

        progress_bar.set_postfix(epoch=epoch, train_loss = t_loss , val_loss = v_loss,train_acc = t_acc , val_acc = v_acc)

        if v_acc > max_acc:
            print('saving...')
            torch.save(model.state_dict(), '/content/drive/MyDrive/NLP/model_0_hw' )
            max_acc = v_acc

        history_acc_train.append(t_acc)
        history_acc_val.append(v_acc)
        history_loss_train.append(t_loss)
        history_loss_val.append(v_loss)

        progress_bar.close()

    # one figure per metric, train and validation curves together
    figures = (
        ('Accuracy', (history_acc_train, history_acc_val)),
        ('Loss', (history_loss_train, history_loss_val)),
    )
    for y_label, curves in figures:
        for curve in curves:
            plt.plot(curve)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.show()
        plt.close()



## Experiment training loop 

#### Here we define the training loops for the experiment.



In [None]:
def training_loop(model: nn.Module, optimizer: torch.optim.Optimizer, epochs: int, descriptor: str):
    """Train *model* for one experiment run and write its artefacts to disk.

    For every epoch the model is trained on the global ``train_dataloader``,
    then evaluated with ``train_acc`` and ``test_acc``.  All output paths are
    relative, so the files are written to the current working directory:

    * ``<descriptor>.ph``       - state dict with the best dev accuracy
    * ``<descriptor>_ACC.png``  - train/validation accuracy curves
    * ``<descriptor>_LOSS.png`` - train/validation loss curves
    * ``out.json``              - ``{"<descriptor>_ACC": best dev accuracy}``

    Args:
        model: the network; called as ``model(input, label)`` and expected to
            return a dict with a ``'loss'`` entry.
        optimizer: optimizer bound to the model parameters.
        epochs: number of epochs to run.
        descriptor: name of the run, used as prefix of the output files.
    """
    max_acc = 0
    history_acc_train = []
    history_acc_val = []
    history_loss_train = []
    history_loss_val = []

    for epoch in range(epochs):

        # make sure dropout is active for the update steps
        model.train()
        progress_bar = tqdm()

        for inputs, labels in train_dataloader:

            optimizer.zero_grad()
            batch_out = model(inputs, labels)
            loss = batch_out['loss']
            loss.backward()
            optimizer.step()

            progress_bar.update()

        t_acc, t_loss = train_acc(model)
        v_acc, v_loss = test_acc(model)
        # plain floats: the histories must not keep autograd graphs alive,
        # and they have to be plottable
        t_loss, v_loss = float(t_loss), float(v_loss)

        progress_bar.set_postfix(epoch=epoch, train_loss = t_loss , val_loss = v_loss,train_acc = t_acc , val_acc = v_acc)

        # save the model whenever the dev accuracy improves
        if v_acc > max_acc:
            print('saving...')
            torch.save(model.state_dict(),  descriptor + '.ph')
            max_acc = v_acc

        history_acc_train.append(t_acc)
        history_acc_val.append(v_acc)
        history_loss_train.append(t_loss)
        history_loss_val.append(v_loss)

        progress_bar.close()

    # save one figure per metric, train and validation curves together
    figures = (
        ('Accuracy', '_ACC.png', (history_acc_train, history_acc_val)),
        ('Loss', '_LOSS.png', (history_loss_train, history_loss_val)),
    )
    for y_label, suffix, curves in figures:
        for curve in curves:
            plt.plot(curve)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.savefig(descriptor + suffix)
        plt.close()

    # save the best dev accuracy as a dict; `with` closes the file even if
    # the dump fails
    stdout = {descriptor + '_ACC': max_acc}
    with open("out.json", "w") as out_file:
        json.dump(stdout, out_file)

    
      



# Load data and process the data


In [None]:
# Build the words_organized dict (frequency buckets computed on the training set)
# and save it as a pickle file in the current working directory.

words_organized  = organize_words('/content/drive/MyDrive/NLP/nlp2021-hw1-main/data/train.jsonl')

with open('words_organized.pickle', 'wb') as handle:
    pickle.dump(words_organized, handle, protocol=pickle.HIGHEST_PROTOCOL)



##RNN (LSTM) loading module



In [None]:
class WiCDataset(torch.utils.data.Dataset):
    """Dataset of (word-index sequence, label) pairs for the LSTM model.

    Each line of the dataset file is one sample with two sentences; both are
    preprocessed, converted to index tensors and concatenated into a single
    sequence (the "TCONCAT" variant).
    """

    def __init__(self, dataset_path: str, sentence2indices):
      # list of (input_tensor, label) tuples, filled by init_structures
      self.data_store = []
      self.init_structures(dataset_path, sentence2indices)

    def init_structures(self, dataset_path: str, sentence2indices) -> None:
      """Read *dataset_path* line by line and fill ``self.data_store``.

      ``sentence2indices`` is the callable that turns a sentence into a
      tensor of word indices.
      """
      with open(dataset_path) as f:
        for line in f:
          # create a dict from each sample of the file
          # NOTE(review): eval executes arbitrary code found in the file; only use on trusted data
          sample = eval(line)

          # extract the elements from the keys (the offsets and the lemma
          # are only needed by the disabled SCONCAT variant below)
          start1, start2 = int(sample['start1']), int(sample['start2'])
          end1, end2 = int(sample['end1']), int(sample['end2'])
          sentence1 = sample['sentence1'].lower()
          sentence2 = sample['sentence2'].lower()
          lemma = sample['lemma']
          # the label is stored as the string 'True' / anything else
          label = torch.Tensor([1]) if sample['label'] == 'True' else torch.Tensor([0])

          # Other preprocessing functions defined above (search and replace
          # lemma, remove numbers, refine sentence) could be applied here;
          # they are not shown because they made performance worse.

          # SCONCAT
          '''

          sentence1 = search_replace_word(sentence1,lemma,start1,end1)
          sentence2 = search_replace_word(sentence2,lemma,start2,end2)
          sentence_concat = sentence1 +' ~ '+sentence2+' ~ '+lemma
          sentence_concat = rm_punctuation(sentence_concat)
          sentence_concat = rm_words(sentence_concat,words_organized['few_occ'])
          sentence_concat = rm_words(sentence_concat,words_organized['most_occ'])
          
          '''

          # TCONCAT: encode each sentence separately, then concatenate the index tensors
          encoded = []
          for sentence in (sentence1, sentence2):
            cleaned = rm_words(rm_punctuation(sentence), words_organized['most_occ'])
            encoded.append(sentence2indices(cleaned))
          self.data_store.append((torch.cat(encoded, dim=0), label))

    def __len__(self) -> int:
        return len(self.data_store)

    def __getitem__(self, idx: int) -> torch.Tensor:
        # returns the stored (input_tensor, label) tuple
        return self.data_store[idx]

In [None]:
# Build the LSTM datasets (sequences of word indices) from the train and dev files.
train_dataset  = WiCDataset('/content/drive/MyDrive/NLP/nlp2021-hw1-main/data/train.jsonl',sentence2indices)
test_dataset  = WiCDataset('/content/drive/MyDrive/NLP/nlp2021-hw1-main/data/dev.jsonl',sentence2indices)

In [None]:
# Batches of 32 samples; rnn_collate_fn pads the sequences of each batch.
# No shuffle argument is passed to either loader.
train_dataloader = DataLoader(train_dataset, batch_size=32,collate_fn=rnn_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32,collate_fn=rnn_collate_fn)



## MLP loading module



In [None]:
class WiCDataset(torch.utils.data.Dataset):
    """Dataset of (sentence-pair vector, label) pairs for the MLP model.

    Each line of the dataset file is one sample with two sentences; both are
    preprocessed, aggregated into one vector each and concatenated (the
    "TCONCAT" variant).
    """

    def __init__(self, dataset_path: str, sentence2vector):
      # list of (input_tensor, label) tuples, filled by init_structures
      self.data_store = []
      self.init_structures(dataset_path, sentence2vector)

    def init_structures(self, dataset_path: str, sentence2vector) -> None:
      """Read *dataset_path* line by line and fill ``self.data_store``.

      ``sentence2vector`` is the callable that aggregates a sentence into a
      single vector; it may return ``None`` when no word of the sentence has
      an embedding.  In that case a zero vector shaped like the other
      sentence's vector is used, so the sample is kept and ``torch.cat`` does
      not fail on ``None``.

      Raises:
          ValueError: if neither sentence of a sample yields a vector.
      """
      with open(dataset_path) as f:
        for line_number, line in enumerate(f, start=1):
          # create a dict from each sample of the file
          # NOTE(review): eval executes arbitrary code found in the file; only use on trusted data
          dict_from_line  = eval(line)

          sentence1 = dict_from_line['sentence1'].lower()
          sentence2 = dict_from_line['sentence2'].lower()
          # the label is stored as the string 'True' / anything else
          if (dict_from_line['label'] == 'True'):
            label = torch.Tensor([1])
          else:
            label = torch.Tensor([0])

          # Other preprocessing functions defined above (search and replace
          # lemma, remove numbers, refine sentence) could be applied here;
          # they are not shown because they made performance worse.

          # SCONCAT (alternative variant, disabled):
          #   sentence_concat = sentence1 +' ~ '+sentence2
          #   sentence_concat = rm_punctuation(sentence_concat)
          #   #sentence_concat = rm_words(sentence_concat,words_organized['few_occ'])
          #   sentence_concat = rm_words(sentence_concat,words_organized['most_occ'])
          #   input_tensor = sentence2vector(sentence_concat)
          #   self.data_store.append((input_tensor,label))

          # TCONCAT: aggregate each sentence separately, then concatenate
          input_tensor1 = sentence2vector(self._clean(sentence1))
          input_tensor2 = sentence2vector(self._clean(sentence2))

          if input_tensor1 is None and input_tensor2 is None:
            raise ValueError(
              'sample on line %d of %s has no word with an embedding in either sentence'
              % (line_number, dataset_path)
            )
          if input_tensor1 is None:
            input_tensor1 = torch.zeros_like(input_tensor2)
          if input_tensor2 is None:
            input_tensor2 = torch.zeros_like(input_tensor1)

          input_tensor = torch.cat((input_tensor1, input_tensor2), dim=0)
          self.data_store.append((input_tensor,label))

    @staticmethod
    def _clean(sentence: str) -> str:
      # Remove punctuation, then the rarest and the most frequent words.
      sentence = rm_punctuation(sentence)
      sentence = rm_words(sentence,words_organized['few_occ'])
      sentence = rm_words(sentence,words_organized['most_occ'])
      return sentence

    def __len__(self) -> int:
        return len(self.data_store)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # returns the stored (input_tensor, label) tuple
        return self.data_store[idx]

In [None]:
# Build the MLP datasets (aggregated sentence vectors) from the train and dev files.
train_dataset  = WiCDataset('/content/drive/MyDrive/NLP/nlp2021-hw1-main/data/train.jsonl',sentence2vector)
test_dataset  = WiCDataset('/content/drive/MyDrive/NLP/nlp2021-hw1-main/data/dev.jsonl',sentence2vector)

In [None]:
# Batches of 32 samples with the default collate function.
# No shuffle argument is passed to either loader.
train_dataloader = DataLoader(train_dataset, batch_size=32)
test_dataloader = DataLoader(test_dataset, batch_size=32)



# Models Used for MLP and LSTM experiment

### Here I define the different model architectures/structures I used for the experiments:

##strct0 

####WiCClassifier(
####  (lin1): Linear(in_features=300, out_features=N, bias=True)
####  (lin2): Linear(in_features=N, out_features=1, bias=True)
####  (loss_fn): BCELoss())
------------------------------------------------------------------------
##strct1 

###WiCClassifier(
####  (lin1): Linear(in_features=300, out_features=int(n_hidden/2), bias=True)
####  (lin2): Linear(in_features=int(n_hidden/2), out_features=int(n_hidden/4), bias=True)
####  (lin3): Linear(in_features=int(n_hidden/4), out_features=1, bias=True)
####  (loss_fn): BCELoss())
------------------------------------------------------------------------
##strct2

####WiCClassifier(
####  (lin1): Linear(in_features=300, out_features=N, bias=True)
####  (lin2): Linear(in_features=N, out_features=N/8, bias=True)
####  (lin3): Linear(in_features=N/8, out_features=1, bias=True)
####  (loss_fn): BCELoss())
------------------------------------------------------------------------
##strct3

####WiCClassifier(
####  (lin1): Linear(in_features=300, out_features=N, bias=True)
####  (lin2): Linear(in_features=N, out_features=N/2, bias=True)
####  (lin3): Linear(in_features=N/2, out_features=N/16, bias=True)
####  (lin4): Linear(in_features=N/16, out_features=1, bias=True)
####  (loss_fn): BCELoss())
------------------------------------------------------------------------
##strct4

####WiCClassifier(
####  (lin1): Linear(in_features=300, out_features=N, bias=True)
####  (lin2): Linear(in_features=N, out_features=N/4, bias=True)
####  (lin3): Linear(in_features=N/4, out_features=N/16, bias=True)
####  (lin4): Linear(in_features=N/16, out_features=1, bias=True)
####  (loss_fn): BCELoss())
------------------------------------------------------------------------
##strct5

####WiCClassifier(
####  (lin1): Linear(in_features=300, out_features=N, bias=True)
####  (lin2): Linear(in_features=N, out_features=N/2, bias=True)
####  (drop1): Dropout(p=0.5, inplace=False)
####  (lin3): Linear(in_features=N/2, out_features=N/16, bias=True)
####  (drop2): Dropout(p=0.5, inplace=False)
####  (lin4): Linear(in_features=N/16, out_features=1, bias=True)
####  (loss_fn): BCELoss())

### (these architectures have been used for both MLP and LSTM)
------------------------------------------------------------------------

##strct6

####WiCClassifier(
####  (embedding): Embedding(400001, 300)
####  (rnn1): LSTM(300, N, batch_first=True)
####  (lin1): Linear(in_features=N, out_features=2*N, bias=True)
####  (lin2): Linear(in_features=2*N, out_features=2, bias=True)
####  (loss_fn): CrossEntropyLoss())

### (this last one has been used only for LSTM; it is the architecture which performed better)

 



In [None]:
# define the LSTM model

class WiCClassifier(nn.Module):
    """LSTM classifier: embedding -> LSTM -> two-layer head with 2 outputs.

    The sequence is summarised by the LSTM output at the last real
    (non-padded) position of every sequence.
    """

    def __init__(
        self,
        vectors_store: torch.Tensor,
        n_hidden: int
    ) -> None:
        """
        Args:
            vectors_store: matrix of pretrained embeddings, one row per index.
            n_hidden: hidden size of the LSTM; the first linear layer has
                twice as many units.
        """
        super().__init__()

        # embedding layer
        self.embedding = torch.nn.Embedding.from_pretrained(vectors_store)

        # recurrent layer
        self.rnn1 = torch.nn.LSTM(input_size=vectors_store.size(1), hidden_size=n_hidden, num_layers=1, batch_first=True)
        self.bn1 = torch.nn.BatchNorm1d(n_hidden)
        self.drop1 = torch.nn.Dropout(0.5)

        # classification head
        self.lin1 = torch.nn.Linear(n_hidden, int(n_hidden*2))
        self.bn2 = torch.nn.BatchNorm1d(int(n_hidden*2))
        self.drop2 = torch.nn.Dropout(0.5)

        self.lin2 = torch.nn.Linear(int(n_hidden*2), 2)

        # criterion
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(
        self,
        X: torch.Tensor,
        X_length: torch.Tensor,
        y: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        """Classify a batch of padded index sequences.

        Args:
            X: padded word indices, batch first.
            X_length: number of real (non-padded) positions of each sequence.
            y: optional class indices; when given, the loss is computed.

        Returns:
            A dict with ``'logits'`` (raw scores), ``'pred'`` (softmax of the
            logits) and, if *y* was given, ``'loss'``.
        """
        # embedding words from indices
        embedding_out = self.embedding(X)
        # recurrent encoding
        recurrent_out = self.rnn1(embedding_out)[0]

        # Pick, for every sequence, the output at its last real position.
        # The index tensors are created on the device of the LSTM output so
        # the lookup also works when the model does not live on the CPU.
        device = recurrent_out.device
        batch_indices = torch.arange(recurrent_out.size(0), device=device)
        last_word_indices = (X_length.to(device) - 1).long()
        summary_vectors = recurrent_out[batch_indices, last_word_indices]

        out = self.bn1(summary_vectors)
        out = self.drop1(out)

        out = self.lin1(out)
        out = self.bn2(out)
        out = torch.relu(out)
        out = self.drop2(out)

        out = self.lin2(out)

        pred = torch.softmax(out, -1)

        result = {'logits': out, 'pred': pred}

        if y is not None:
            # the criterion receives the logits, not the softmax output
            result['loss'] = self.loss(out, y)

        return result

    def loss(self, out, y):
        """Cross-entropy between the logits *out* and the class indices *y*."""
        return self.loss_fn(out, y)


In [None]:
# define the MLP model

class WiCClassifier(nn.Module):
    """MLP classifier: four linear layers with ReLU and dropout, sigmoid output."""

    def __init__( self,  n_features: int, n_hidden: int ):
        """
        Args:
            n_features: size of the input vectors.
            n_hidden: width of the first hidden layer; the following layers
                have ``n_hidden/4`` and ``n_hidden/16`` units.
        """
        super().__init__()

        # classification head
        self.lin1 = torch.nn.Linear(n_features,  n_hidden)

        self.lin2 = torch.nn.Linear( n_hidden,int(n_hidden/4))
        self.drop1 = torch.nn.Dropout(0.5)

        self.lin3 = torch.nn.Linear( int(n_hidden/4), int(n_hidden/16))
        self.drop2 = torch.nn.Dropout(0.5)

        # lin4 consumes the output of lin3, so its input width must be
        # n_hidden/16 (declaring it as n_hidden makes forward fail with a
        # shape mismatch)
        self.lin4 = torch.nn.Linear( int(n_hidden/16),1)

        # criterion
        self.loss_fn = torch.nn.BCELoss()

    def forward( self, x: torch.Tensor,  y: Optional[torch.Tensor] = None ) -> Dict[str, torch.Tensor]:
        """Score a batch of input vectors.

        Args:
            x: input vectors, last dimension of size ``n_features``.
            y: optional targets with the same shape as the output; when
                given, the loss is computed.

        Returns:
            A dict with ``'pred'`` (sigmoid output, one value per sample)
            and, if *y* was given, ``'loss'``.
        """
        out = self.lin1(x)
        out = torch.relu(out)

        out = self.lin2(out)
        out = torch.relu(out)
        out = self.drop1(out)

        out = self.lin3(out)
        out = torch.relu(out)
        out = self.drop2(out)

        out = self.lin4(out)

        out = torch.sigmoid(out)

        result = {'pred': out}

        if y is not None:
            result['loss'] = self.loss(out, y)

        return result

    def loss(self, out, y):
        """Binary cross-entropy between the predictions *out* and the targets *y*."""
        return self.loss_fn(out, y)




# Training loop and testing loop 


In [None]:
# Build the model; either the Adam or the SGD optimizer was used.
# The positional arguments (vectors_store, 50) match the LSTM WiCClassifier signature.

model = WiCClassifier(vectors_store, 50)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# NOTE(review): this is a bare attribute reference, not a call - it refers to the
# bound method itself; use model.state_dict() to obtain the weights.
model.state_dict

In [None]:
# Train and evaluate the model for 70 epochs.
# NOTE(review): training_loop, train_acc and test_acc are defined several times
# in this notebook; presumably the cells matching the chosen model are executed
# before this call - confirm.

training_loop(model, optimizer,epochs = 70)



# Experiment

#### I built an experiment to find out a good combination of hyperparameters and to test different architectures both for LSTM and MLP.


In [None]:
#Go in the working directory

%cd '/content/drive/MyDrive/NLP/Experiment/model_TCONCAT/model_5'

/content/drive/MyDrive/NLP/Experiment/model_Adam/model_3


In [None]:
# Experiment: grid search over the hidden size (hype_1) and the learning rate (hype_2).
# (hype_2 changes depending on the optimizer, here we use SGD)

arch = '5strct'
base_path = '/content/drive/MyDrive/NLP/Experiment/model_TCONCAT/model_5'
for hype_1 in range(32,160,8):
  hype_2 = 0.01
  for i in range(0,5):

    # every run starts from the base directory, because the run directories are relative
    os.chdir(base_path)
    hype_2 += 0.02

    descriptor = arch + '_' + str(hype_1) + '_' +str(round(hype_2,4))

    # Create the run directory and go into it: training_loop writes its
    # artefacts relative to the current working directory.
    # exist_ok=True lets an interrupted experiment be run again instead of
    # failing on the directories created by the previous attempt.
    os.makedirs(descriptor, exist_ok=True)
    os.chdir(descriptor)

    model = WiCClassifier(n_features = 300, n_hidden = hype_1 )
    optimizer = torch.optim.SGD(model.parameters(), lr=hype_2)
    training_loop(model, optimizer,epochs = 80, descriptor = descriptor)






# Tester
#### Used to quickly explore the results obtained from the experiment. The hyperparameters and the loop descriptors must be the same in the experiment and in the tester, in order to guarantee a complete exploration of the results.


In [None]:
# Scan the result files of the experiment and report the best run.
# Use the same hype_1 and hype_2 settings as the experiment: the descriptors
# built here must match the directory names created there.
arch = '5strct'
base_path = '/content/drive/MyDrive/NLP/Experiment/model_TCONCAT/model_5'
max_acc = 0
best_model = None  # stays None when no result file is found

for hype_1 in range(16,160,8):
  hype_2 = 0.01
  for i in range(0,5):
    os.chdir(base_path)
    hype_2 += 0.02

    descriptor = arch + '_' + str(hype_1) + '_' +str(round(hype_2,4))
    try:
      os.chdir(descriptor)
      # out.json is written with json.dump, so it is read back with json.load
      with open('out.json') as f:
        results = json.load(f)
    except OSError:
      # no directory or result file for this combination: it was not run
      continue
    except ValueError as err:
      # the file exists but does not contain valid JSON
      print('skipping', descriptor, '-', err)
      continue

    for k,v in results.items():
      if (v > max_acc):
        best_model = k
        max_acc = v
        print(best_model,max_acc)
        
