# GloVE

Let's work on implementation of GloVE.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

print(torch.cuda.is_available()) # True if GPU is available
print(torch.cuda.device_count()) # Number of GPUs available
print(torch.cuda.current_device()) # Device index

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Select device
print(device)


True
1
0
cuda


In [2]:
import nltk # natural language toolkit
from nltk.corpus import reuters # Reuters dataset
nltk.__version__

'3.9.1'

In [3]:
nltk.download('punkt') # download the punkt tokenizer
nltk.download('reuters') # download the reuters dataset
categiries = reuters.categories() # get the list of categories
print(categiries[:10])  # print the first 10 categories

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mgmgk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\mgmgk\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee']


## 1. Load data

In [4]:
corpus = reuters.sents(categories='coffee') # get the list of sentences in the coffee category
# corpus = reuters.sents()  # Alternate way to get all the categories from reuters dataset
                            # When I tried this, it causes memory overflow and the kernel crashes,
                            # so I will stick to the coffee category for now for testing
corpus

[['INDONESIAN', 'COMMODITY', 'EXCHANGE', 'MAY', 'EXPAND', 'The', 'Indonesian', 'Commodity', 'Exchange', 'is', 'likely', 'to', 'start', 'trading', 'in', 'at', 'least', 'one', 'new', 'commodity', ',', 'and', 'possibly', 'two', ',', 'during', 'calendar', '1987', ',', 'exchange', 'chairman', 'Paian', 'Nainggolan', 'said', '.'], ['He', 'told', 'Reuters', 'in', 'a', 'telephone', 'interview', 'that', 'trading', 'in', 'palm', 'oil', ',', 'sawn', 'timber', ',', 'pepper', 'or', 'tobacco', 'was', 'being', 'considered', '.'], ...]

In [5]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [6]:
#create handy mapping between integer and word to create word2index
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['beans']

2928

In [7]:
# adding 'UNK' to handle unknown word tokens
last_vocab_idx = len(vocabs)
last_vocab_idx
vocabs.append('<UNK>')
word2index['<UNK>'] = last_vocab_idx

In [8]:
# reverse mapping from word2index to index2word
index2word = {v:k for k, v in word2index.items()}
index2word[5]

'AGO'

## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [9]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({',': 1637,
         '.': 1618,
         'the': 1462,
         'to': 968,
         'of': 819,
         'in': 609,
         'and': 585,
         'said': 576,
         'a': 546,
         'coffee': 375,
         'for': 320,
         "'": 284,
         'on': 272,
         '"': 271,
         'The': 237,
         's': 231,
         '-': 206,
         'is': 205,
         'export': 197,
         'be': 188,
         'quotas': 187,
         'will': 183,
         'that': 182,
         'ICO': 181,
         'Coffee': 168,
         'Brazil': 168,
         'by': 160,
         'not': 158,
         'mln': 158,
         'from': 157,
         'at': 152,
         '(': 149,
         'with': 146,
         'year': 145,
         'would': 141,
         'as': 132,
         'prices': 130,
         'have': 126,
         'this': 125,
         'was': 124,
         ',"': 123,
         'are': 120,
         'market': 120,
         ')': 119,
         'an': 116,
         'producers': 116,
         'bags': 116,
 

Prepare Training Data

In [10]:
# function was modified to use dynamic window size, with default 2

def generate_skip_grams(corpus, window_size=2):

    skip_grams = []
    for doc in corpus:
        for i, center in enumerate(doc):
            # Determine the range of context words
            start = max(0, i - window_size)
            end = min(len(doc), i + window_size + 1)
            context_words = [doc[j] for j in range(start, end) if j != i]

            # Create skip-grams for the current center word
            for context in context_words:
                skip_grams.append((center, context))
    
    return skip_grams

# Generate skip-grams with a window size of 2
skip_grams = generate_skip_grams(corpus, window_size=2)
print(skip_grams)

skip_grams = []
for doc in corpus:
    for i in range(1, len(doc)-1):
        center = doc[i]
        outside = [doc[i-1], doc[i+1]]
        for each_out in outside:
            skip_grams.append((center, each_out))
skip_grams

[('INDONESIAN', 'COMMODITY'), ('INDONESIAN', 'EXCHANGE'), ('COMMODITY', 'INDONESIAN'), ('COMMODITY', 'EXCHANGE'), ('COMMODITY', 'MAY'), ('EXCHANGE', 'INDONESIAN'), ('EXCHANGE', 'COMMODITY'), ('EXCHANGE', 'MAY'), ('EXCHANGE', 'EXPAND'), ('MAY', 'COMMODITY'), ('MAY', 'EXCHANGE'), ('MAY', 'EXPAND'), ('MAY', 'The'), ('EXPAND', 'EXCHANGE'), ('EXPAND', 'MAY'), ('EXPAND', 'The'), ('EXPAND', 'Indonesian'), ('The', 'MAY'), ('The', 'EXPAND'), ('The', 'Indonesian'), ('The', 'Commodity'), ('Indonesian', 'EXPAND'), ('Indonesian', 'The'), ('Indonesian', 'Commodity'), ('Indonesian', 'Exchange'), ('Commodity', 'The'), ('Commodity', 'Indonesian'), ('Commodity', 'Exchange'), ('Commodity', 'is'), ('Exchange', 'Indonesian'), ('Exchange', 'Commodity'), ('Exchange', 'is'), ('Exchange', 'likely'), ('is', 'Commodity'), ('is', 'Exchange'), ('is', 'likely'), ('is', 'to'), ('likely', 'Exchange'), ('likely', 'is'), ('likely', 'to'), ('likely', 'start'), ('to', 'is'), ('to', 'likely'), ('to', 'start'), ('to', 'tra

[('COMMODITY', 'INDONESIAN'),
 ('COMMODITY', 'EXCHANGE'),
 ('EXCHANGE', 'COMMODITY'),
 ('EXCHANGE', 'MAY'),
 ('MAY', 'EXCHANGE'),
 ('MAY', 'EXPAND'),
 ('EXPAND', 'MAY'),
 ('EXPAND', 'The'),
 ('The', 'EXPAND'),
 ('The', 'Indonesian'),
 ('Indonesian', 'The'),
 ('Indonesian', 'Commodity'),
 ('Commodity', 'Indonesian'),
 ('Commodity', 'Exchange'),
 ('Exchange', 'Commodity'),
 ('Exchange', 'is'),
 ('is', 'Exchange'),
 ('is', 'likely'),
 ('likely', 'is'),
 ('likely', 'to'),
 ('to', 'likely'),
 ('to', 'start'),
 ('start', 'to'),
 ('start', 'trading'),
 ('trading', 'start'),
 ('trading', 'in'),
 ('in', 'trading'),
 ('in', 'at'),
 ('at', 'in'),
 ('at', 'least'),
 ('least', 'at'),
 ('least', 'one'),
 ('one', 'least'),
 ('one', 'new'),
 ('new', 'one'),
 ('new', 'commodity'),
 ('commodity', 'new'),
 ('commodity', ','),
 (',', 'commodity'),
 (',', 'and'),
 ('and', ','),
 ('and', 'possibly'),
 ('possibly', 'and'),
 ('possibly', 'two'),
 ('two', 'possibly'),
 ('two', ','),
 (',', 'two'),
 (',', 'duri

In [11]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('said', '.'): 303,
         ("'", 's'): 224,
         ('s', "'"): 224,
         ('of', 'the'): 211,
         ('the', 'of'): 211,
         ('S', '.'): 196,
         ('.', 'S'): 191,
         (',', 'the'): 135,
         ('the', ','): 135,
         ('in', 'the'): 116,
         ('the', 'in'): 116,
         ('.', 'U'): 107,
         ('U', '.'): 97,
         (',', '000'): 90,
         ('000', ','): 90,
         ('International', 'Coffee'): 76,
         ('Coffee', 'International'): 76,
         ('export', 'quotas'): 73,
         ('quotas', 'export'): 73,
         ('on', 'the'): 65,
         ('the', 'on'): 65,
         ('for', 'the'): 64,
         ('the', 'for'): 64,
         ('he', 'said'): 63,
         ('said', 'he'): 63,
         ('to', 'the'): 58,
         ('the', 'to'): 58,
         ('mln', 'bags'): 56,
         ('bags', 'mln'): 56,
         ('1', '.'): 52,
         ('.', '1'): 51,
         ('they', 'said'): 51,
         ('said', 'they'): 51,
         (',', 'said'): 50,
        

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [12]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [13]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [14]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [15]:
batch_size = 2
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

In [16]:
print(f"Size of X is : {x}")
print(f"Size of Y is : {y}")
print(f"Size of cooc is : {cooc}")
print(f"Size of weighting is : {weighting}")


Size of X is : [[3487]
 [3046]]
Size of Y is : [[3915]
 [ 641]]
Size of cooc is : [[0.69314718]
 [0.69314718]]
Size of weighting is : [[0.05318296]
 [0.05318296]]


## 4. Model

<img src ="../figures/glove.png" width=400>

In [17]:
# Define Glove model for training word embeddings from scratch

class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

    def get_embed(self, index):
        # Retrieve the embedding for a specific word index.
        with torch.no_grad():  # Ensure gradients are not tracked
            return self.center_embedding(torch.LongTensor([index])).squeeze(0)

In [18]:
#test our system
voc_size = len(vocabs)
emb_size = 2
model = Glove(voc_size, emb_size)

In [19]:
# prepare the data
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [20]:
# sending model and all the tensors to cuda device
model.to(device)
x_tensor = x_tensor.to(device)
y_tensor = y_tensor.to(device)
cooc_tensor = cooc_tensor.to(device)
weighting_tensor = weighting_tensor.to(device)
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [21]:
loss

tensor(0.3846, device='cuda:0', grad_fn=<SumBackward0>)

## 5. Training

In [22]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
import time

# Training
num_epochs = 5000
start = time.time()

for epoch in range(num_epochs):
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch).to(device)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch).to(device)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch).to(device) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f}")

# Calculate the elapsed time
end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)
print(f"Training of {num_epochs} epochs were completed in {epoch_mins}m {epoch_secs}s.")


Epoch: 1000 | cost: 23.062525
Epoch: 2000 | cost: 10.680178
Epoch: 3000 | cost: 2.802446
Epoch: 4000 | cost: 17.618662
Epoch: 5000 | cost: 3.055760
Training of 5000 epochs were completed in 1m 37s.


5. Model Comparism and Analysis

In [25]:
# function for opening the word anologies text file
def open_file(path_to_file):
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

In [26]:
# 1. Opening the word analogies text file
file_path = "data/word-test.v1.txt"
content = open_file(file_path)

# 2. Parsing the content of the file and create sematic and syntactic analogies 
semantic = []
syntactic = []

current_test = semantic
for sent in content:
    if sent.startswith(':'):
        if 'gram' in sent:
            current_test = syntactic
        else:
            current_test = semantic
        continue
    
    current_test.append(sent.strip().split())

print(f"Number of semantic analogies: {len(semantic)}")
print(f"Number of syntactic analogies: {len(syntactic)}")

Number of semantic analogies: 8869
Number of syntactic analogies: 10675


In [27]:
# 1. Function to find the closest word to a given analogy

def find_analogy(word_a, word_b, word_c, model, word2index, index2word):
    try:
        # Get the embeddings for the words
        emb_a = model.get_embed(word2index[word_a])
        emb_b = model.get_embed(word2index[word_b])
        emb_c = model.get_embed(word2index[word_c])
        
        # Perform analogy computation: emb_a - emb_b + emb_c
        analogy_vector = emb_a - emb_b + emb_c

        # Find the closest word to the analogy vector
        all_embeddings = model.center_embedding.weight  # Ensure this tensor is on the same device
        scores = torch.matmul(analogy_vector, all_embeddings.T)
        predicted_index = scores.argmax().item()

        return index2word[predicted_index]
    except KeyError as e:
        return f"Word not in vocabulary: {e}"

# 2. Function to evaluate the model on the analogies

def evaluate_analogies(analogies, model, word2index, index2word):
    correct = 0
    for analogy in analogies:
        word_a, word_b, word_c, word_d = analogy
        predicted_word = find_analogy(word_a, word_b, word_c, model, word2index, index2word)
        if predicted_word == word_d:
            correct += 1
    return correct / len(analogies)

# Evaluate semantic and syntactic analogies
model.to('cpu')
semantic_accuracy = evaluate_analogies(semantic, model, word2index, index2word)
syntactic_accuracy = evaluate_analogies(syntactic, model, word2index, index2word)

print(f"Semantic analogy accuracy: {semantic_accuracy * 100:.2f}%")
print(f"Syntactic analogy accuracy: {syntactic_accuracy * 100:.2f}%")

Semantic analogy accuracy: 0.00%
Syntactic analogy accuracy: 0.00%


In [28]:
# 1. Opening the similarities data text file
file_path = "data/wordsim_similarity_goldstandard.txt"
content = open_file(file_path)

similarity_dataset = []
for sent in content:
    word1, word2, similarity = sent.split('\t')
    similarity_dataset.append((word1, word2, float(similarity)))

print(similarity_dataset)



In [29]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [30]:
from scipy.stats import spearmanr

# Compute the dot product for each word pair
dot_products = []
similarity_scores = []

for word1, word2, similarity in similarity_dataset:
    if word1 in word2index and word2 in word2index:
        emb1 = model.get_embed(word2index[word1])
        emb2 = model.get_embed(word2index[word2])
        dot_product = torch.dot(emb1, emb2).item()
        dot_products.append(dot_product)
        similarity_scores.append(similarity)

# Calculate the Spearman correlation
correlation, _ = spearmanr(dot_products, similarity_scores)
print(f"Spearman correlation: {correlation:.4f}")

Spearman correlation: -0.2576


7. Save the model

In [31]:
import pickle

torch.save(model.state_dict(), 'glove.model')

glove_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}

pickle.dump(glove_args, open('glove.args', 'wb'))

In [32]:
glove_args = pickle.load(open('glove.args', 'rb'))
print("Loaded Arguments:", glove_args)  # Debug: Check what arguments were saved

glove_args.pop('word2index', None)  # Remove keys not required by the constructor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = Glove(**glove_args)

# Load the model weights onto the correct device
load_model.load_state_dict(torch.load('glove.model', map_location=device))
load_model.eval()  # Set the model to evaluation mode

Loaded Arguments: {'voc_size': 4488, 'emb_size': 2, 'word2index': {'strong': 0, 'convene': 1, 'however': 2, 'Ours': 3, 'border': 4, 'AGO': 5, 'pact': 6, 'different': 7, 'German': 8, 'Paton': 9, "'": 10, '900': 11, 'operate': 12, 'Production': 13, 'Drake': 14, 'roasters': 15, 'Commission': 16, 'voluntary': 17, 'THROUGH': 18, 'these': 19, 'states': 20, 'consequences': 21, 'us': 22, 'mouth': 23, 'Sebaana': 24, 'off': 25, 'Lehman': 26, 'Demico': 27, 'backing': 28, 'GUAXUPE': 29, 'Bomani': 30, 'rose': 31, '3198': 32, 'severe': 33, 'cooperative': 34, 'measure': 35, 'may': 36, 'castor': 37, 'arrangement': 38, 'shortfall': 39, 'registering': 40, 'Bureau': 41, 'initiative': 42, 'moment': 43, 'nine': 44, 'French': 45, 'clause': 46, 'level': 47, 'solve': 48, 'background': 49, '255': 50, 'content': 51, 'clarification': 52, 'QUOTA': 53, 'repayments': 54, 'Chartered': 55, 'attracted': 56, 'weather': 57, 'exporters': 58, 'decided': 59, 'pct': 60, 'pack': 61, 'opening': 62, 'VOTUPORANGA': 63, '.': 64,

  load_model.load_state_dict(torch.load('glove.model', map_location=device))


Glove(
  (center_embedding): Embedding(4488, 2)
  (outside_embedding): Embedding(4488, 2)
  (center_bias): Embedding(4488, 1)
  (outside_bias): Embedding(4488, 1)
)

In [33]:
sample_input = torch.tensor([[1], [2]])  # Replace with appropriate test data
print("Sample Output:", load_model(sample_input, sample_input, cooc_tensor, weighting_tensor))

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [162]:
words = ('import', 'export', 'soccer', 'king', 'rice','war', 'crop' )  # Replace with the word analogy to find

# Convert word to index
for word in words:
    if word in word2index:
        word_index = word2index[word]
        embedding = load_model.get_embed(word_index)  # Pass the index
        print(f"Embedding for '{word}': {embedding}")
    else:
        print(f"Word '{word}' not found in vocabulary.")

Embedding for 'import': tensor([1.1444, 0.6871])
Embedding for 'export': tensor([ 0.9110, -0.2708])
Word 'soccer' not found in vocabulary.
Word 'king' not found in vocabulary.
Embedding for 'rice': tensor([0.1855, 0.9371])
Embedding for 'war': tensor([0.1801, 0.3997])
Embedding for 'crop': tensor([0.2758, 0.6148])


In [163]:
def get_top_similar_contexts(query, model, corpus, word2index, top_n=10):
    # Convert the query to its embedding
    query_embedding = model.get_embed(word2index[query])
    
    # Flatten the corpus to get all words
    flattened_corpus = [word for sentence in corpus for word in sentence]
    
    # Compute the dot product between the query embedding and each word in the corpus
    similarities = []
    for word in flattened_corpus:
        if word in word2index:
            word_embedding = model.get_embed(word2index[word])
            dot_product = torch.dot(query_embedding, word_embedding).item()
            similarities.append((word, dot_product))
    
    # Sort the similarities in descending order and get the top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_similar_contexts = similarities[:top_n]
    
    return top_similar_contexts

# Example usage
query = 'robusta'  # Replace with your query word
if query in word2index:
    top_similar_contexts = get_top_similar_contexts(query, model, corpus, word2index)
    print(top_similar_contexts)
    #word_index = word2index[word]
    #embedding = load_model.get_embed(word_index)  # Pass the index
    #print(f"Embedding for '{word}': {embedding}")
else:
    print(f"Word '{query}' not found in vocabulary.")

[('supported', 3.2373929023742676), ('supported', 3.2373929023742676), ('route', 3.2316250801086426), ('route', 3.2316250801086426), ('route', 3.2316250801086426), ('FAILURE', 3.2306151390075684), ('FAILURE', 3.2306151390075684), ('FAILURE', 3.2306151390075684), ('FAILURE', 3.2306151390075684), ('FAILURE', 3.2306151390075684)]
