### Import Libraries

In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string 
import re

from sklearn.manifold import TSNE
from nltk.corpus import stopwords
from collections import Counter
from collections import defaultdict
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
RemoveWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/atharvadashora/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Defining necessary functions

In [26]:
def softmax(x):
	e_x = np.exp(x - np.max(x))
	return e_x / e_x.sum()

In [214]:
def preprocess_sentences(corpus, min_word_freq=10, min_sentence_len=8):
    # Remove punctuation and numbers (Roman numerals and others)
    corpus = re.sub(r'\b[MDCLXVI]+\b|\d+', '', corpus)
    corpus = corpus.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the corpus
    sentences = [sent.strip().split() for sent in corpus.lower().split('.')]
    
    # Filter out short sentences
    sentences = [sent for sent in sentences if len(sent) >= min_sentence_len]
    
    # Filter out rare words and short sentences
    word_freq = Counter([word for sent in sentences for word in sent])
    sentences = [[word for word in sent if word_freq[word] >= min_word_freq] for sent in sentences]
    
    return [sent for sent in sentences if len(sent) >= min_sentence_len]

In [193]:
class SkipGramModel(object):
    def __init__(self):
        self.Neuron = 20
        self.lr = 0.0005
        self.Context_Window = 2

        self.X_train = []
        self.y_train = []
        

        self.words = []
        self.word_index = {}
        self.vocab = {}
  
    def InitializeWeights(self,V,data):
        self.V = V
        self.W = np.random.uniform(-0.4, 0.4, (self.V, self.Neuron))
        self.W1 = np.random.uniform(-0.4, 0.4, (self.Neuron, self.V))
          
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i
            
    def train(self,mytol,maxepochs=20000):
      	#initialize loss
        self.loss =0
        self.loss1 = 1 #random number 1
        itr=1
        
        while abs(self.loss1 - self.loss)>= mytol and itr <= maxepochs:
            self.loss1 = self.loss
            self.loss = 0
            for j in range(len(self.X_train)):

             		# implementing feedforward 
                self.h = np.dot(self.W.T,self.X_train[j]).reshape(self.Neuron,1)
                self.u = np.dot(self.W1.T,self.h)
                self.y = softmax(self.u)
                
                # implementation of back propogration
                error = self.y - np.asarray(self.y_train[j]).reshape(self.V,1)
                dLdW1 = np.dot(self.h,error.T)
                X = np.array(self.X_train[j]).reshape(self.V,1)
                dLdW = np.dot(X, np.dot(self.W1,error).T)

                self.W1 = self.W1 - self.lr*dLdW1
                self.W = self.W - self.lr*dLdW

                #loss Function
                C = 0
                for m in range(self.V):
                    if(self.y_train[j][m]):
                        self.loss += -1*self.u[m][0]
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u)))
            #Print loss
            # print("epoch ",itr, " loss = ",self.loss)
            #update adaptive alpha
            self.lr *= 1/( (1+self.lr*itr) )
            itr=itr+1
             
    def predict(self,word,number_of_predictions):
        if word in self.words:
            index = self.word_index[word]
            X = [0 for i in range(self.V)]
            X[index] = 1


            #prediction = self.feed_forward(X)
            self.h = np.dot(self.W.T,X).reshape(self.Neuron,1)
            self.u = np.dot(self.W1.T,self.h)
            self.y = softmax(self.u)
            prediction=self.y


            output = {}
            for i in range(self.V):
                output[prediction[i][0]] = i
              
            top_context_words = []
            for k in sorted(output,reverse=True):
                top_context_words.append(self.words[output[k]])
                if(len(top_context_words)>=number_of_predictions):
                    break
      
            return top_context_words
        else:
            print("Word not found")
    
    def get_embedding_matrix(self):
        """
        Returns the word embedding matrix where each row corresponds
        to the embedding of a word in the vocabulary.
        """
        return self.W 
    
    def get_word_embedding(self, word):
        """
        Returns the embedding of the given word.
        """
        if word in self.word_index:
            word_idx = self.word_index[word]
            return self.W[word_idx]  # The embedding for the word
        else:
            print(f"Word '{word}' not in vocabulary.")
            return None
    

In [198]:
class SkipGramModelNeg(object):
    def __init__(self, negative_samples=5):
        self.Neuron = 20
        self.lr = 0.005
        self.negative_samples = negative_samples
        self.words = []
        self.word_index = {}
        self.V = 0
        self.W = None
        self.W1 = None

    def InitializeWeights(self, V, data):
        self.V = V
        self.W = np.random.uniform(-0.4, 0.4, (self.V, self.Neuron))
        self.W1 = np.random.uniform(-0.4, 0.4, (self.Neuron, self.V))
        self.words = data
        for i, word in enumerate(data):
            self.word_index[word] = i

    def train(self, mytol, maxepochs=20000):
        itr = 1

        while itr <= maxepochs:
            total_loss = 0
            
            for j in range(len(self.X_train)):
                # Feedforward
                h = np.dot(self.W.T, self.X_train[j]).reshape(self.Neuron, 1)
                u = np.dot(self.W1.T, h)
                y_pred = softmax(u)

                # Prepare positive samples
                positive_indices = np.where(self.y_train[j] == 1)[0]

                # Negative sampling
                negative_samples = np.random.choice(
                    [i for i in range(self.V) if i not in positive_indices],
                    self.negative_samples,
                    replace=False
                )

                # Create labels: 1 for positive, 0 for negative
                labels = np.zeros(self.V)
                labels[positive_indices] = 1
                labels[negative_samples] = 0
                
                # Calculate error
                error = y_pred - labels.reshape(self.V, 1)

                # Gradients
                dLdW1 = np.dot(h, error.T)
                X = np.array(self.X_train[j]).reshape(self.V, 1)
                dLdW = np.dot(X, np.dot(self.W1, error).T)

                # Update weights
                self.W1 -= self.lr * dLdW1
                self.W -= self.lr * dLdW

                # Loss calculation
                positive_loss = -np.sum(labels[positive_indices] * np.log(y_pred[positive_indices] + 1e-10))
                negative_loss = -np.sum((1 - labels[negative_samples]) * np.log(1 - y_pred[negative_samples] + 1e-10))
                total_loss += positive_loss + negative_loss

            # Print loss for monitoring
            print("Epoch ", itr, " Loss = ", total_loss)
            
            # Update learning rate
            self.lr *= 1 / (1 + self.lr * itr)
            itr += 1

    def predict(self, word, number_of_predictions):
        if word in self.word_index:
            index = self.word_index[word]
            X = np.zeros(self.V)
            X[index] = 1

            h = np.dot(self.W.T, X).reshape(self.Neuron, 1)
            u = np.dot(self.W1.T, h)
            y_pred = softmax(u)

            output = {y_pred[i][0]: i for i in range(self.V)}
            top_context_words = sorted(output, reverse=True)[:number_of_predictions]
            return [self.words[i] for i in top_context_words]
        else:
            print("Word not found")
            return []

    def get_embedding_matrix(self):
        return self.W 

    def get_word_embedding(self, word):
        if word in self.word_index:
            return self.W[self.word_index[word]]
        else:
            print(f"Word '{word}' not in vocabulary.")
            return None

### Importing Corpus

In [35]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

In [36]:
train_data[0]["text"]

''

In [140]:
corpus = ""
for i in range(5000):
  corpus = corpus +"."+ train_data[i]["text"]

In [1]:
# corpus

In [144]:
corpus_Set = preprocess_sentences(corpus)
print(" number of sentences :", len(corpus_Set))

 number of sentences : 1


In [2]:
# print(corpus_Set)

In [40]:
# # only if we want to export corpus
# df = pd.DataFrame (corpus_Set)
# filepath = 'CorpusSet.xlsx'
# df.to_excel(filepath, index=False)

### Creating Skipgram Model and vocabulary Set

In [44]:
def prepare_data_in_batches(sentences, window_size, vocab, batch_size):
    """
    Generator to prepare context-target word pairs in batches.
    
    Args:
    - sentences: List of tokenized sentences.
    - window_size: Size of the context window.
    - vocab: Dictionary mapping words to indices (vocabulary).
    - batch_size: Number of word-context pairs to return in each batch.
    
    Yields:
    - X_batch: A batch of one-hot encoded target words.
    - y_batch: A batch of one-hot encoded context words.
    """
    X_batch = []
    y_batch = []
    current_batch_size = 0

    for sentence in sentences:
        for i, target_word in enumerate(sentence):
            if target_word not in vocab:
                continue

            target_word_idx = vocab[target_word]
            X = np.zeros(len(vocab))
            X[target_word_idx] = 1  # One-hot encode the target word

            # Get the context words within the window size
            for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
                if j != i and sentence[j] in vocab:
                    context_word_idx = vocab[sentence[j]]
                    y = np.zeros(len(vocab))
                    y[context_word_idx] = 1  # One-hot encode the context word

                    X_batch.append(X)
                    y_batch.append(y)
                    current_batch_size += 1

                    # If the current batch size matches the batch_size, yield the batch
                    if current_batch_size == batch_size:
                        yield np.array(X_batch), np.array(y_batch)
                        X_batch = []
                        y_batch = []
                        current_batch_size = 0

    # Yield the final batch if there are any remaining pairs
    if current_batch_size > 0:
        yield np.array(X_batch), np.array(y_batch)

In [147]:
sentences = corpus_Set

vocab = {word: idx for idx, word in enumerate(set([word for sentence in sentences for word in sentence]))}


window_size = 2
batch_size = 1000
V = len(vocab)

model = SkipGramModel()
model.InitializeWeights(V, list(vocab.keys()))
model.vocab = vocab

# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    print(f"Starting epoch {epoch + 1}/{num_epochs}")
    
    # Generate word-context pairs in batches
    batch_generator = prepare_data_in_batches(sentences, window_size, vocab, batch_size)
    
    for X_batch, y_batch in batch_generator:
        model.X_train = X_batch
        model.y_train = y_batch
        
        # Perform one epoch of training on this batch
        model.train(mytol=1e-4, maxepochs=1)  # Use a small maxepochs value for batch-wise training

    print(f"Finished epoch {epoch + 1}/{num_epochs}")

Starting epoch 1/2
epoch  1  loss =  7401.1307212793445
epoch  1  loss =  7364.913050972889
epoch  1  loss =  7398.456515716947
epoch  1  loss =  7387.475791387325
epoch  1  loss =  7361.165936140445
epoch  1  loss =  7328.266333513018
epoch  1  loss =  7338.102456367388
epoch  1  loss =  7374.019476685849
epoch  1  loss =  7358.576660827658
epoch  1  loss =  7321.917123203887
epoch  1  loss =  7356.989601327109
epoch  1  loss =  7341.96478401344
epoch  1  loss =  7300.1789577300915
epoch  1  loss =  7348.121591164813
epoch  1  loss =  7316.1479212830545
epoch  1  loss =  7313.091750675399
epoch  1  loss =  7297.5055506537765
epoch  1  loss =  7288.098546901032
epoch  1  loss =  7281.7469708241115
epoch  1  loss =  7318.546181350312
epoch  1  loss =  7296.125834560097
epoch  1  loss =  7280.369161034192
epoch  1  loss =  7292.048206327745
epoch  1  loss =  7287.130019445844
epoch  1  loss =  7236.8988367271995
epoch  1  loss =  7229.177288046652
epoch  1  loss =  7303.870796909088
epoc

In [199]:
model1 = SkipGramModelNeg()
model1.InitializeWeights(V, list(vocab.keys()))
model1.vocab = vocab

# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    print(f"Starting epoch {epoch + 1}/{num_epochs}")
    
    # Generate word-context pairs in batches
    batch_generator = prepare_data_in_batches(sentences, window_size, vocab, batch_size)
    
    for X_batch, y_batch in batch_generator:
        model1.X_train = X_batch
        model1.y_train = y_batch
        
        # Perform one epoch of training on this batch
        model1.train(mytol=1e-4, maxepochs=1)  # Use a small maxepochs value for batch-wise training

    print(f"Finished epoch {epoch + 1}/{num_epochs}")

Starting epoch 1/2
Epoch  1  Loss =  7340.045339706044
Epoch  1  Loss =  7338.467535972701
Epoch  1  Loss =  7341.8162127854375
Epoch  1  Loss =  7337.174174056212
Epoch  1  Loss =  7327.5667402232475
Epoch  1  Loss =  7315.549767713357
Epoch  1  Loss =  7311.615802086002
Epoch  1  Loss =  7290.481786931636
Epoch  1  Loss =  7293.573414735636
Epoch  1  Loss =  7266.008004766331
Epoch  1  Loss =  7280.726625393017
Epoch  1  Loss =  7244.365868369978
Epoch  1  Loss =  7159.939410169584
Epoch  1  Loss =  7171.164580064096
Epoch  1  Loss =  7183.961699431784
Epoch  1  Loss =  7156.5086639119845
Epoch  1  Loss =  7112.0335904558515
Epoch  1  Loss =  7104.28547694585
Epoch  1  Loss =  7029.744269290945
Epoch  1  Loss =  7036.721240573735
Epoch  1  Loss =  7085.363359817794
Epoch  1  Loss =  6973.136697668808
Epoch  1  Loss =  6983.6949183711895
Epoch  1  Loss =  6946.501485711252
Epoch  1  Loss =  6788.179942369295
Epoch  1  Loss =  6803.631440084326
Epoch  1  Loss =  7029.500730019544
Epoch

In [120]:
def compute_similarity(embedding_matrix, target_vector):
    """
    Compute cosine similarity between the target vector and all word embeddings.
    """
    similarities = cosine_similarity(embedding_matrix, target_vector.reshape(1, -1)).flatten()
    return similarities

In [121]:
def get_rank(similarity_scores, true_index):
    """
    Get the rank of the true context word in the similarity list.
    """
    sorted_indices = np.argsort(-similarity_scores)  # Sort in descending order of similarity
    rank = np.where(sorted_indices == true_index)[0][0] + 1  # Get the rank of the true word
    return rank

In [122]:
def calculate_mrr_for_window(context_indices, target_embedding, embedding_matrix):
    """
    Calculate MRR for a single context window.
    """
    mrr = 0.0
    for context_idx in context_indices:
        similarity_scores = compute_similarity(embedding_matrix, target_embedding)
        rank = get_rank(similarity_scores, context_idx)
        mrr += 1 / rank
    mrr /= len(context_indices)  # Average over the context window
    return mrr

In [123]:
def calculate_mrr_for_dataset(test_data, embedding_matrix, word_index):
    """
    Calculate the overall MRR for the entire test dataset.
    """
    total_mrr = 0.0
    for t in test_data:
        print(t)
        target_word, context_words = t[0], t[1]
        if target_word not in word_index:
            continue  # Skip words that are not in the vocabulary
        
        target_idx = word_index[target_word]
        target_embedding = embedding_matrix[target_idx]
        
        # Get indices for all context words
        context_indices = [word_index[word] for word in context_words if word in word_index]
        
        if len(context_indices) > 0:
            mrr_window = calculate_mrr_for_window(context_indices, target_embedding, embedding_matrix)
            total_mrr += mrr_window
    
    avg_mrr = total_mrr / len(test_data)
    return avg_mrr

In [202]:
embedding_matrix = model1.get_embedding_matrix()
word_index = model1.word_index  # word to index mapping from the model

# avg_mrr = calculate_mrr_for_dataset(test_data, embedding_matrix, word_index)
# print(f"Mean Reciprocal Rank for test data: {avg_mrr:.4f}")

In [203]:
embedding_matrix

array([[-0.38516527,  0.0528593 , -0.37050841, ...,  0.4995249 ,
         0.15141531, -0.32639165],
       [-0.55062148, -0.06637911,  0.08037473, ...,  0.43831404,
         0.05264678, -0.0468442 ],
       [-0.40973828, -0.49045347, -0.37353193, ...,  0.30763087,
         0.01583437, -0.27024024],
       ...,
       [-0.61942955, -0.12203895,  0.19670987, ...,  0.20281747,
         0.11403295, -0.31951604],
       [-0.19133636,  0.1899759 , -0.40461873, ...,  0.22110652,
        -0.12172515, -0.13537633],
       [ 0.0375472 , -0.1259747 , -0.10260936, ...,  0.44124003,
        -0.0807507 ,  0.09002751]])

In [204]:
word_index

{'went': 0,
 'used': 1,
 'series': 2,
 'offer': 3,
 'debut': 4,
 'appointed': 5,
 'track': 6,
 'above': 7,
 'coin': 8,
 'organization': 9,
 'regional': 10,
 'was': 11,
 'support': 12,
 'buildings': 13,
 'few': 14,
 'normal': 15,
 'festival': 16,
 'tale': 17,
 'thomas': 18,
 'bruins': 19,
 'show': 20,
 'blue': 21,
 'because': 22,
 'minister': 23,
 'agreed': 24,
 'got': 25,
 'pope': 26,
 'london': 27,
 'young': 28,
 'deities': 29,
 'have': 30,
 'whose': 31,
 'house': 32,
 'start': 33,
 'something': 34,
 'park': 35,
 'cast': 36,
 'law': 37,
 'remaining': 38,
 'germany': 39,
 'popular': 40,
 'far': 41,
 'george': 42,
 'sexual': 43,
 'effort': 44,
 'occupation': 45,
 'deal': 46,
 'gabbar': 47,
 'ruler': 48,
 'peace': 49,
 'praised': 50,
 'college': 51,
 'light': 52,
 'genus': 53,
 'near': 54,
 'mid': 55,
 'date': 56,
 'e': 57,
 'instead': 58,
 'h': 59,
 'uk': 60,
 'average': 61,
 'jai': 62,
 'opportunities': 63,
 'over': 64,
 'typically': 65,
 'chemotherapy': 66,
 'competition': 67,
 'won':

In [205]:
#This function create One hot encoding for Input word and the context words
def prepare_test_data(sentences, window_size, vocab):
    """
    Prepare test data from sentences for MRR calculation.

    Args:
    - sentences: List of sentences (each sentence is a string).
    - window_size: The context window size (c).
    - vocab: Set of valid words (usually, your model's vocabulary).

    Returns:
    - test_data: A list of (target_word, [context_words]) tuples.
    """
    test_data = []
    print(vocab)
    for sentence in sentences:
        
        # Iterate over each word in the sentence
        for i in range(window_size, len(sentence)-window_size):
            target_word = sentence[i]
            if target_word not in vocab:
                continue  # Skip words not in the vocabulary
            
            # Get the context words within the window size
            context_words = []
            for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
                if j != i and sentence[j] in vocab:  # Exclude the target word itself
                    context_words.append(sentence[j])
            
            if len(context_words) > 0:
                print(target_word, context_words)
                test_data.append([target_word, context_words])
    
    return test_data

In [206]:
corpus_Set[:10]

[['valkyria',
  'chronicles',
  'no',
  'valkyria',
  'chronicles',
  'valkyria',
  'of',
  'the',
  'referred',
  'to',
  'as',
  'valkyria',
  'chronicles',
  'outside',
  'is',
  'a',
  'role',
  'playing',
  'video',
  'game',
  'developed',
  'by',
  'and',
  'for',
  'the',
  'released',
  'in',
  'january',
  'in',
  'it',
  'is',
  'the',
  'third',
  'game',
  'in',
  'the',
  'valkyria',
  'series',
  'the',
  'same',
  'of',
  'and',
  'real',
  'time',
  'as',
  'its',
  'the',
  'story',
  'to',
  'the',
  'first',
  'game',
  'and',
  'the',
  'a',
  'military',
  'unit',
  'serving',
  'the',
  'nation',
  'of',
  'during',
  'the',
  'second',
  'war',
  'who',
  'black',
  'operations',
  'and',
  'are',
  'against',
  'the',
  'unit',
  'the',
  'game',
  'began',
  'development',
  'in',
  'over',
  'a',
  'large',
  'of',
  'the',
  'work',
  'done',
  'on',
  'valkyria',
  'chronicles',
  'while',
  'it',
  'the',
  'standard',
  'features',
  'of',
  'the',
  'ser

In [207]:
c_t = test_data["text"][:100]

In [156]:
c_t

['',
 ' = Robert Boulter = \n',
 '',
 ' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n',
 ' In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill 

In [216]:
c_t = ""
for i in c_t:
  c_t_t= c_t_t +"."+ i
  # print(i)
# print(corpus)

In [217]:
c_t_t

'.. = Robert Boulter = \n.. Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n. In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He appeared on

In [218]:
c_t_t = preprocess_sentences(c_t_t)

In [219]:
c_t_t

[['boulter',
  'boulter',
  'is',
  'an',
  'television',
  'and',
  'theatre',
  'he',
  'had',
  'a',
  'role',
  'on',
  'the',
  'television',
  'series',
  'the',
  'in',
  'this',
  'was',
  'by',
  'a',
  'role',
  'in',
  'the',
  'by',
  'which',
  'was',
  'in',
  'at',
  'the',
  'theatre',
  'he',
  'had',
  'a',
  'role',
  'in',
  'the',
  'television',
  'series',
  'in',
  'in',
  'boulter',
  'a',
  'role',
  'as',
  'in',
  'the',
  's',
  'of',
  'the',
  'television',
  'series',
  'the',
  'he',
  'starred',
  'and',
  'he',
  'was',
  'in',
  'the',
  'theatre',
  'of',
  'the',
  'which',
  'was',
  'at',
  'the',
  'theatre',
  'in',
  'and',
  'the',
  'in',
  'he',
  'was',
  'by',
  'and',
  'starred',
  'and',
  'in',
  'boulter',
  'starred',
  'in',
  'the',
  'by',
  'he',
  'on',
  'a',
  'of',
  'the',
  'television',
  'series',
  'by',
  'a',
  'role',
  'in',
  'the',
  'theatre',
  'of',
  'to',
  'by',
  'to',
  'was',
  'at',
  'theatre',
  'in',


In [226]:
test_try = prepare_test_data(c_t_t, 2, vocab)

{'went': 0, 'used': 1, 'series': 2, 'offer': 3, 'debut': 4, 'appointed': 5, 'track': 6, 'above': 7, 'coin': 8, 'organization': 9, 'regional': 10, 'was': 11, 'support': 12, 'buildings': 13, 'few': 14, 'normal': 15, 'festival': 16, 'tale': 17, 'thomas': 18, 'bruins': 19, 'show': 20, 'blue': 21, 'because': 22, 'minister': 23, 'agreed': 24, 'got': 25, 'pope': 26, 'london': 27, 'young': 28, 'deities': 29, 'have': 30, 'whose': 31, 'house': 32, 'start': 33, 'something': 34, 'park': 35, 'cast': 36, 'law': 37, 'remaining': 38, 'germany': 39, 'popular': 40, 'far': 41, 'george': 42, 'sexual': 43, 'effort': 44, 'occupation': 45, 'deal': 46, 'gabbar': 47, 'ruler': 48, 'peace': 49, 'praised': 50, 'college': 51, 'light': 52, 'genus': 53, 'near': 54, 'mid': 55, 'date': 56, 'e': 57, 'instead': 58, 'h': 59, 'uk': 60, 'average': 61, 'jai': 62, 'opportunities': 63, 'over': 64, 'typically': 65, 'chemotherapy': 66, 'competition': 67, 'won': 68, 'october': 69, 'forms': 70, 'producer': 71, 'houston': 72, 'wid

In [227]:
test_try

[['is', ['an', 'television']],
 ['an', ['is', 'television', 'and']],
 ['television', ['is', 'an', 'and']],
 ['and', ['an', 'television', 'he']],
 ['he', ['and', 'had', 'a']],
 ['had', ['he', 'a', 'role']],
 ['a', ['he', 'had', 'role', 'on']],
 ['role', ['had', 'a', 'on', 'the']],
 ['on', ['a', 'role', 'the', 'television']],
 ['the', ['role', 'on', 'television', 'series']],
 ['television', ['on', 'the', 'series', 'the']],
 ['series', ['the', 'television', 'the', 'in']],
 ['the', ['television', 'series', 'in', 'this']],
 ['in', ['series', 'the', 'this', 'was']],
 ['this', ['the', 'in', 'was', 'by']],
 ['was', ['in', 'this', 'by', 'a']],
 ['by', ['this', 'was', 'a', 'role']],
 ['a', ['was', 'by', 'role', 'in']],
 ['role', ['by', 'a', 'in', 'the']],
 ['in', ['a', 'role', 'the', 'by']],
 ['the', ['role', 'in', 'by', 'which']],
 ['by', ['in', 'the', 'which', 'was']],
 ['which', ['the', 'by', 'was', 'in']],
 ['was', ['by', 'which', 'in', 'at']],
 ['in', ['which', 'was', 'at', 'the']],
 ['at',

In [228]:
avg_mrr = calculate_mrr_for_dataset(test_try, embedding_matrix, word_index)
print(f"Mean Reciprocal Rank for test data: {avg_mrr:.4f}")

['is', ['an', 'television']]
['an', ['is', 'television', 'and']]
['television', ['is', 'an', 'and']]
['and', ['an', 'television', 'he']]
['he', ['and', 'had', 'a']]
['had', ['he', 'a', 'role']]
['a', ['he', 'had', 'role', 'on']]
['role', ['had', 'a', 'on', 'the']]
['on', ['a', 'role', 'the', 'television']]
['the', ['role', 'on', 'television', 'series']]
['television', ['on', 'the', 'series', 'the']]
['series', ['the', 'television', 'the', 'in']]
['the', ['television', 'series', 'in', 'this']]
['in', ['series', 'the', 'this', 'was']]
['this', ['the', 'in', 'was', 'by']]
['was', ['in', 'this', 'by', 'a']]
['by', ['this', 'was', 'a', 'role']]
['a', ['was', 'by', 'role', 'in']]
['role', ['by', 'a', 'in', 'the']]
['in', ['a', 'role', 'the', 'by']]
['the', ['role', 'in', 'by', 'which']]
['by', ['in', 'the', 'which', 'was']]
['which', ['the', 'by', 'was', 'in']]
['was', ['by', 'which', 'in', 'at']]
['in', ['which', 'was', 'at', 'the']]
['at', ['was', 'in', 'the']]
['the', ['in', 'at', 'he']]
