# Sample Code for Testing Saved Model
This file provides a sample to test the saved model. Make necessary changes so that we can test your MLP model with this file. 

## Load test data
The sample below loads the dev set for testing. In real marking, however, the markers will load a held-out test set.

In [59]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer,LancasterStemmer
from nltk import ConditionalFreqDist,FreqDist,pos_tag
from nltk.corpus import stopwords
import string
import torch as t

In [71]:
# Read the dev set from 'cw2_dev.csv' in the current working directory.
dev_data = pd.read_csv('cw2_dev.csv')
# Work on a copy so that dev_data itself keeps the sentences as loaded.
dev_dataframe=dev_data.copy()
# Notebook display of the copied frame.
dev_dataframe

Unnamed: 0.1,Unnamed: 0,Sent1,Sent2,SimScore
0,0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,1.00
1,1,A young child is riding a horse.,A child is riding a horse.,0.95
2,2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,1.00
3,3,A woman is playing the guitar.,A man is playing guitar.,0.48
4,4,A woman is playing the flute.,A man is playing a flute.,0.55
...,...,...,...,...
2995,2995,"The professor introduced the artists , and the...",The professor introduced the artists .,0.62
2996,2996,The doctors supported the judges .,The doctors supported the tourists and the jud...,0.68
2997,2997,The secretary knew the manager .,The secretary knew the manager danced .,0.37
2998,2998,The professors next to the president recommend...,The president recommended the professors .,0.29


# Data Pre-Processing

In [69]:
def pos_tagging(pos):
    """Translate a POS tag string into a single-letter WordNet POS code.

    Args:
        pos: tag string as produced by ``nltk.pos_tag`` (e.g. ``'NN'``).

    Returns:
        ``'a'`` for ``'JJ'``/``'JJR'``, ``'s'`` for ``'JJS'``, ``'n'``,
        ``'v'`` or ``'r'`` for tags starting with ``N``, ``V`` or ``R``
        respectively, and ``None`` for every other tag.
    """
    # Adjective tags are matched exactly, not by prefix.
    if pos in ('JJ', 'JJR'):
        return 'a'
    if pos == 'JJS':
        return 's'

    # Remaining categories are recognised by the first letter of the tag.
    for prefix, wordnet_code in (('N', 'n'), ('V', 'v'), ('R', 'r')):
        if pos.startswith(prefix):
            return wordnet_code
    return None
    
def remove_stopwords(sent_words):
    """Filter stop words and punctuation tokens out of a token list.

    A token is dropped when its lower-cased form is in the module-level
    ``stop_words`` collection, or when the token occurs as a substring of
    ``string.punctuation``.

    Args:
        sent_words: iterable of token strings.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    kept_tokens = []
    for token in sent_words:
        if token.lower() in stop_words or token in string.punctuation:
            continue
        kept_tokens.append(token)
    return kept_tokens


lemmatizer = WordNetLemmatizer()
lemma_result = []
stop_words = set(stopwords.words('english'))


def _lemmatize_sentence(sentence):
    """Return `sentence` tokenised, stop-word filtered and lemmatised.

    The sentence is tokenised with ``word_tokenize``, filtered with
    ``remove_stopwords`` and POS-tagged; each remaining word is lemmatised
    with the WordNet POS code given by ``pos_tagging`` (or with the
    lemmatiser's default when that returns ``None``).  The lemmas are joined
    with single spaces.
    """
    tagged_words = pos_tag(remove_stopwords(word_tokenize(sentence)))
    lemmas = []
    for word, treebank_tag in tagged_words:
        wordnet_pos = pos_tagging(str(treebank_tag))
        if wordnet_pos is None:
            lemmas.append(lemmatizer.lemmatize(word))
        else:
            lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return ' '.join(lemmas)


# Transform each sentence column cell by cell.  The previous implementation
# called DataFrame.replace() once per row, which rescans the whole frame every
# time and also rewrites any *other* cell holding the same sentence, so
# repeated sentences were later read back already processed and were
# processed a second time.
for _column in ('Sent1', 'Sent2'):
    dev_dataframe[_column] = dev_dataframe[_column].apply(_lemmatize_sentence)
    

# Pre-Processed Data

In [70]:
# Notebook display of dev_dataframe after the pre-processing cell.
# NOTE(review): the output captured below still shows the raw sentences, so
# this cell appears to have been run on an unprocessed frame — re-run the
# pre-processing cell and then this one to confirm.
dev_dataframe

Unnamed: 0.1,Unnamed: 0,Sent1,Sent2,SimScore
0,0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,1.00
1,1,A young child is riding a horse.,A child is riding a horse.,0.95
2,2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,1.00
3,3,A woman is playing the guitar.,A man is playing guitar.,0.48
4,4,A woman is playing the flute.,A man is playing a flute.,0.55
...,...,...,...,...
2995,2995,"The professor introduced the artists , and the...",The professor introduced the artists .,0.62
2996,2996,The doctors supported the judges .,The doctors supported the tourists and the jud...,0.68
2997,2997,The secretary knew the manager .,The secretary knew the manager danced .,0.37
2998,2998,The professors next to the president recommend...,The president recommended the professors .,0.29


## Load Embeddings
Clearly specify the embeddings your implementation requires. Also provide the link for downloading the embeddings. 

In [72]:
# load pre-trained glove embeddings
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np

# Dimensionality of the GloVe vectors; every name below is derived from it so
# that switching to another glove.6B size only needs this one change.
word_vec_dim = 300
embd_name = 'glove.6B.{}d'.format(word_vec_dim)
# Download location of the glove.6B archive (see the Stanford NLP GloVe
# project page, https://nlp.stanford.edu/projects/glove/).
link_to_embd = 'https://nlp.stanford.edu/data/glove.6B.zip'

# Below is a sample to load the glove embeddings. ADJUST the code according to the
# embedding you want to use.
# NOTE(review): this is a machine-specific absolute path; point it at the
# directory where the unpacked glove.6B files live before running.
path_of_downloaded_files = "/Users/nithinkyatham/Desktop/MSc AI/2nd Term/NLP CS5990j/env/NLP/cw2-files/glove.6B.{}d.txt".format(word_vec_dim)
glove_file = datapath(path_of_downloaded_files)
# Convert the GloVe text file into word2vec text format in a temporary file,
# then load that file as KeyedVectors.
word2vec_glove_file = get_tmpfile("{}.txt".format(embd_name))
glove2word2vec(glove_file, word2vec_glove_file)
word_vectors = KeyedVectors.load_word2vec_format(word2vec_glove_file)


In [58]:
# Query the loaded embeddings for the entries most similar to 'why'.
word_vectors.most_similar('why')

[('know', 0.842064619064331),
 ('what', 0.8334978222846985),
 ('how', 0.8015264272689819),
 ("n't", 0.7864867448806763),
 ('reason', 0.7832764983177185),
 ('?', 0.7644017338752747),
 ('tell', 0.7635022401809692),
 ('think', 0.759865403175354),
 ('do', 0.7519409656524658),
 ('really', 0.7497090697288513)]

## Provide Functions Needed for Evaluation
All functions used to run and evaluate your model should be provided. 

In [4]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

def get_sent_word_vecs(word_vectors, sent_words):
    """Look up one embedding per token and stack them into an array.

    Args:
        word_vectors: container supporting ``in`` and ``[]`` lookup by token.
        sent_words: sequence of token strings.

    Returns:
        ``np.ndarray`` built from the per-token vectors, in token order.
        Tokens missing from ``word_vectors`` are represented by the
        module-level ``oov_vec``.
    """
    return np.array([
        word_vectors[token] if token in word_vectors else oov_vec
        for token in sent_words
    ])

def evaluate_trained_model(trained_model, dev_data):
    """Score `trained_model` on `dev_data` using cosine similarity.

    For every row, both sentences are split on whitespace, embedded with
    ``get_sent_word_vecs`` and passed to the model; the cosine similarity of
    the two returned representations is the predicted score.  The mean
    squared error against ``SimScore`` is printed and a histogram of the
    per-row squared errors is drawn.

    Args:
        trained_model: module called as ``trained_model(sent1_vecs,
            sent2_vecs)`` and returning two tensors.  Each similarity must
            contain exactly one element (true for ``(1, dim)`` outputs such
            as those of ``BaselineModel``).
        dev_data: DataFrame with ``Sent1``, ``Sent2`` and ``SimScore`` columns.
    """
    pred_scores = []
    true_scores = []
    cos_sim = nn.CosineSimilarity()
    # Switch the model that was passed in (not a same-named global) to eval mode.
    trained_model.eval()
    with torch.no_grad():  # let pytorch know that no gradient should be computed
        for _, entry in tqdm(dev_data.iterrows(), total=len(dev_data)):
            sent1_embds = get_sent_word_vecs(word_vectors, entry['Sent1'].split())
            sent2_embds = get_sent_word_vecs(word_vectors, entry['Sent2'].split())
            sent1_repr, sent2_repr = trained_model(sent1_embds, sent2_embds)
            pred_sim = cos_sim(sent1_repr, sent2_repr)

            # Store plain Python floats so the numpy statistics and the
            # histogram below work on scalars rather than on tensors.
            pred_scores.append(pred_sim.item())
            true_scores.append(float(entry['SimScore']))

    assert len(true_scores) == len(pred_scores)
    squared_errors = [np.square(ts - ps) for (ts, ps) in zip(true_scores, pred_scores)]
    print('MSE of the method on the dev set:', np.mean(squared_errors))

    # check the distribution (histogram) of the squared errors
    plt.hist(squared_errors)

# Evaluating the trained model using dot product instead of cosine similarity

In [None]:
def evaluate_trained_model_dot(trained_model, dev_data):
    """Score `trained_model` on `dev_data` using the dot product.

    Same procedure as the cosine-similarity evaluation, except that the
    predicted score is the dot product of the two sentence representations.
    The mean squared error against ``SimScore`` is printed and a histogram of
    the per-row squared errors is drawn.

    Args:
        trained_model: module called as ``trained_model(sent1_vecs,
            sent2_vecs)`` and returning two tensors with the same number of
            elements.
        dev_data: DataFrame with ``Sent1``, ``Sent2`` and ``SimScore`` columns.
    """
    pred_scores = []
    true_scores = []
    # Switch the model that was passed in (not a same-named global) to eval mode.
    trained_model.eval()
    with torch.no_grad():  # let pytorch know that no gradient should be computed
        for _, entry in tqdm(dev_data.iterrows(), total=len(dev_data)):
            sent1_embds = get_sent_word_vecs(word_vectors, entry['Sent1'].split())
            sent2_embds = get_sent_word_vecs(word_vectors, entry['Sent2'].split())
            sent1_repr, sent2_repr = trained_model(sent1_embds, sent2_embds)
            # torch.dot only accepts 1-D tensors, while the model returns
            # (1, dim) representations, so flatten both operands first.
            pred_sim = t.dot(sent1_repr.flatten(), sent2_repr.flatten())

            # Store plain Python floats so the numpy statistics and the
            # histogram below work on scalars rather than on tensors.
            pred_scores.append(pred_sim.item())
            true_scores.append(float(entry['SimScore']))

    assert len(true_scores) == len(pred_scores)
    squared_errors = [np.square(ts - ps) for (ts, ps) in zip(true_scores, pred_scores)]
    print('MSE of the method on the dev set:', np.mean(squared_errors))

    # check the distribution (histogram) of the squared errors
    plt.hist(squared_errors)

## Provide Your Model
You should provide the implementation of your encoder model below.

In [5]:
# define the baseline model
import numpy as np
import torch
import torch.nn as nn

class BaselineModel(nn.Module):
    """Sentence encoder: mean of word vectors -> linear layer -> ReLU.

    Both sentences of a pair go through the same linear layer, so the two
    representations returned by ``forward`` live in the same space.
    """

    def __init__(self, embd_dim):
        """Create the encoder.

        Args:
            embd_dim: size of the input word vectors; the linear layer maps
                ``embd_dim`` to ``embd_dim``.
        """
        super().__init__()
        # Each sub-module is defined exactly once; the earlier duplicate
        # assignments only overwrote these same attributes.
        self.relu = nn.ReLU()
        self.fully_connected_layer = nn.Linear(embd_dim, embd_dim)

    def _encode(self, sent_vecs):
        """Average the word vectors over axis 0 and apply linear + ReLU."""
        avg_embd = torch.mean(torch.FloatTensor(sent_vecs), dim=0).unsqueeze(0)
        return self.relu(self.fully_connected_layer(avg_embd))

    def forward(self, sent1_vecs, sent2_vecs):
        """Encode both sentences.

        Args:
            sent1_vecs: word vectors of the first sentence, one row per word.
            sent2_vecs: word vectors of the second sentence, one row per word.

        Returns:
            Tuple ``(sent1_repr, sent2_repr)``; each has a leading dimension
            of size 1 added by ``unsqueeze(0)``.
        """
        return self._encode(sent1_vecs), self._encode(sent2_vecs)

## Run and Evaluate Model
The code below creates an instance of the model, loads the saved weights (sample_model.state_dict; running cw2_sample.ipynb will generate this file), and tests it.

In [74]:
import pickle

# Read the pickled checkpoint from the working directory.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load checkpoints that you produced yourself or otherwise trust.
with open('sample_model.state_dict', 'rb') as checkpoint_file:
    saved_info = pickle.load(checkpoint_file)

# The checkpoint is indexed by key: the out-of-vocabulary vector used by
# get_sent_word_vecs, and the weights of the model.
oov_vec = saved_info['oov_vec']
saved_model_state = saved_info['model_state_dict']

# Rebuild the architecture, restore the saved weights, then score the dev set.
model = BaselineModel(embd_dim=word_vec_dim)
model.load_state_dict(saved_model_state)
evaluate_trained_model(model, dev_data)


FileNotFoundError: [Errno 2] No such file or directory: 'sample_model.state_dict'