In [16]:
# imports
import nltk
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
def train_word2vec_model(sentences, sg, window_size, *, epochs=100, min_count=1, workers=2, seed=1):
    """Train a gensim Word2Vec model on an already-tokenized corpus.

    Args:
        sentences: Iterable of token lists, one list per sentence.
        sg: Training architecture flag passed to gensim: 0 for CBOW, 1 for Skip-gram.
        window_size: Maximum distance between the current and the predicted word.
        epochs: Number of passes over the corpus.
        min_count: Words with a lower total frequency are ignored. The default
            of 1 keeps every token, which this tiny corpus needs.
        workers: Number of training threads.
        seed: Seed for gensim's random number generator (1 is gensim's own
            default, so omitting it keeps the previous behaviour).

    Returns:
        The trained ``Word2Vec`` model.

    Note:
        Per the gensim documentation, a fully reproducible run also requires
        ``workers=1``; with several worker threads the results can vary
        slightly between runs even with a fixed seed.
    """
    return Word2Vec(
        sentences,
        window=window_size,
        min_count=min_count,
        sg=sg,
        epochs=epochs,
        workers=workers,
        seed=seed,
    )

In [18]:
# Toy corpus: each of "bank", "rose", "lead", "book" and "file" appears in two
# sentences with a different sense.
sentences = [
    "The bank is located near the river.",
    "The bank approved my loan application.",
    "He rose from his chair to close the window.",
    "The rose bloomed beautifully in the garden.",
    "The lead actor delivered a stunning performance.",
    "Exposure to lead is harmful to health.",
    "She is reading a book in the library.",
    "The book mentioned a fascinating historical event.",
    "I need to file a report for my manager.",
    "He lost the file containing important documents.",
]

# Lowercase every sentence first, then split it into tokens with NLTK.
tokenized_sentences = [nltk.word_tokenize(text) for text in map(str.lower, sentences)]
print("Tokenized sentences:", tokenized_sentences)

Tokenized sentences: [['the', 'bank', 'is', 'located', 'near', 'the', 'river', '.'], ['the', 'bank', 'approved', 'my', 'loan', 'application', '.'], ['he', 'rose', 'from', 'his', 'chair', 'to', 'close', 'the', 'window', '.'], ['the', 'rose', 'bloomed', 'beautifully', 'in', 'the', 'garden', '.'], ['the', 'lead', 'actor', 'delivered', 'a', 'stunning', 'performance', '.'], ['exposure', 'to', 'lead', 'is', 'harmful', 'to', 'health', '.'], ['she', 'is', 'reading', 'a', 'book', 'in', 'the', 'library', '.'], ['the', 'book', 'mentioned', 'a', 'fascinating', 'historical', 'event', '.'], ['i', 'need', 'to', 'file', 'a', 'report', 'for', 'my', 'manager', '.'], ['he', 'lost', 'the', 'file', 'containing', 'important', 'documents', '.']]


In [19]:
# Train one model per architecture on the same corpus with a window of 3:
# sg=0 selects CBOW, sg=1 selects Skip-gram.
cbow_model = train_word2vec_model(sentences=tokenized_sentences, sg=0, window_size=3)
sg_model = train_word2vec_model(sentences=tokenized_sentences, sg=1, window_size=3)

# Only the CBOW model's nearest neighbours of 'bank' are printed here.
print("CBOW Model:", cbow_model.wv.most_similar('bank', topn=3))


CBOW Model: [('a', 0.3154166042804718), ('containing', 0.28749239444732666), ('the', 0.27905935049057007)]


In [20]:
def caculate_similarity(word, cbow_model, sg_model):
    """Return the cosine similarity between the two models' vectors for ``word``.

    The vector for ``word`` is looked up in each model's ``wv`` table, reshaped
    into a single-row matrix (the 2-D shape ``cosine_similarity`` expects), and
    the only entry of the resulting 1x1 similarity matrix is returned.

    Args:
        word: Token to look up in both models.
        cbow_model: Model supplying the first vector via ``.wv[word]``.
        sg_model: Model supplying the second vector via ``.wv[word]``.

    Returns:
        The scalar cosine similarity between the two vectors.
    """
    # Same lookup order as before: CBOW first, then Skip-gram.
    row_vectors = [model.wv[word].reshape(1, -1) for model in (cbow_model, sg_model)]
    similarity_matrix = cosine_similarity(*row_vectors)
    return similarity_matrix[0][0]

def get_embeddings_similarity(cbow_model, sg_model, sentences=None):
    """Compute, for every distinct token, the similarity between the two models' vectors.

    Args:
        cbow_model: First model (its vectors are read via ``.wv``).
        sg_model: Second model (its vectors are read via ``.wv``).
        sentences: Iterable of token lists whose tokens are scored. When
            omitted, the notebook-level ``tokenized_sentences`` corpus is used,
            which is what this function always did before.

    Returns:
        dict mapping each token to its cosine similarity, with keys in order
        of first appearance in ``sentences``.
    """
    if sentences is None:
        # Backward-compatible default: fall back to the notebook's global corpus.
        sentences = tokenized_sentences

    similarity_per_token = {}
    for sentence in sentences:
        for token in sentence:
            # A token's score does not depend on where it occurs, so compute it
            # only on first sight instead of once per occurrence.
            if token not in similarity_per_token:
                similarity_per_token[token] = caculate_similarity(token, cbow_model, sg_model)
    return similarity_per_token
def get_most_similar_embeddings(cbow_model, sg_model, topn=10):
    """Print the tokens whose vectors agree most between the two models.

    Args:
        cbow_model: First model to compare.
        sg_model: Second model to compare.
        topn: How many of the highest-scoring tokens to print (default 10,
            the previously hard-coded value).

    Returns:
        None. The ranked ``(token, similarity)`` pairs are printed, not returned.
    """
    similarity_per_token = get_embeddings_similarity(cbow_model, sg_model)
    # Rank tokens by similarity, highest first, and keep the leading `topn`.
    most_similar_embeddings = sorted(
        similarity_per_token.items(), key=lambda item: item[1], reverse=True
    )[:topn]
    print("The most similar embeddings between CBOW and Skip-gram models are: ", most_similar_embeddings)

### The main difference between CBOW and Skip-gram is that CBOW predicts the target word from the context words, while Skip-gram predicts the context words from the target word.
### Because the number of training sentences is small, most words end up with very similar vectors in the two models, as measured by cosine similarity.

In [21]:
# Cosine similarity between the CBOW and Skip-gram vectors of each token
# (values close to 1 mean the two models' vectors point in almost the same direction).
print(get_embeddings_similarity(cbow_model, sg_model))

{'the': 0.9741811, 'bank': 0.9870091, 'is': 0.98880196, 'located': 0.9942698, 'near': 0.9911646, 'river': 0.99578196, '.': 0.95895165, 'approved': 0.9902206, 'my': 0.98922086, 'loan': 0.9850665, 'application': 0.99126923, 'he': 0.97539026, 'rose': 0.9722186, 'from': 0.9860454, 'his': 0.98266244, 'chair': 0.98077714, 'to': 0.9688984, 'close': 0.9866456, 'window': 0.9846637, 'bloomed': 0.99505574, 'beautifully': 0.9943323, 'in': 0.99003625, 'garden': 0.997389, 'lead': 0.9891132, 'actor': 0.98969376, 'delivered': 0.9923188, 'a': 0.979106, 'stunning': 0.9941418, 'performance': 0.9896967, 'exposure': 0.99101806, 'harmful': 0.9970543, 'health': 0.9956724, 'she': 0.99402016, 'reading': 0.990484, 'book': 0.9785332, 'library': 0.9955744, 'mentioned': 0.9935018, 'fascinating': 0.98790365, 'historical': 0.9924734, 'event': 0.9920543, 'i': 0.9710286, 'need': 0.98716927, 'file': 0.9830948, 'report': 0.9768275, 'for': 0.98063785, 'manager': 0.9897022, 'lost': 0.99143106, 'containing': 0.98155266, 'i

## The most similar embeddings between the models are:

In [22]:
# Print the tokens whose CBOW and Skip-gram vectors have the highest cosine similarity.
get_most_similar_embeddings(cbow_model, sg_model)

The most similar embeddings between CBOW and Skip-gram models are:  [('garden', 0.997389), ('harmful', 0.9970543), ('river', 0.99578196), ('health', 0.9956724), ('library', 0.9955744), ('bloomed', 0.99505574), ('beautifully', 0.9943323), ('located', 0.9942698), ('stunning', 0.9941418), ('she', 0.99402016)]


In [23]:
# Window size 5: retrain both architectures (sg=0 is CBOW, sg=1 is Skip-gram).
cbow_model_5 = train_word2vec_model(sentences=tokenized_sentences, sg=0, window_size=5)
sg_model_5 = train_word2vec_model(sentences=tokenized_sentences, sg=1, window_size=5)

# Per-token similarity between the two window-5 models.
print("Word embeddings with window size 5:")
print(get_embeddings_similarity(cbow_model=cbow_model_5, sg_model=sg_model_5))

# Tokens on which the two window-5 models agree most.
print("Most similar embeddings with window size 5:")
get_most_similar_embeddings(cbow_model=cbow_model_5, sg_model=sg_model_5)

Word embeddings with window size 5:
{'the': 0.9589076, 'bank': 0.9684531, 'is': 0.94957906, 'located': 0.97929347, 'near': 0.964545, 'river': 0.9521451, '.': 0.92507553, 'approved': 0.97148883, 'my': 0.898938, 'loan': 0.9673243, 'application': 0.9781905, 'he': 0.92080545, 'rose': 0.94635326, 'from': 0.93841505, 'his': 0.94708896, 'chair': 0.9413516, 'to': 0.9108333, 'close': 0.9517263, 'window': 0.94500816, 'bloomed': 0.9645338, 'beautifully': 0.9667676, 'in': 0.9525105, 'garden': 0.95785356, 'lead': 0.94001126, 'actor': 0.96895206, 'delivered': 0.97486675, 'a': 0.9332437, 'stunning': 0.95716643, 'performance': 0.96036243, 'exposure': 0.9691197, 'harmful': 0.9708192, 'health': 0.9768637, 'she': 0.9726185, 'reading': 0.9398199, 'book': 0.94001997, 'library': 0.93195194, 'mentioned': 0.96110976, 'fascinating': 0.9851081, 'historical': 0.9561414, 'event': 0.9729177, 'i': 0.93829733, 'need': 0.9249449, 'file': 0.92935926, 'report': 0.9383264, 'for': 0.9025316, 'manager': 0.9574868, 'lost':

In [24]:
# Window size 7: retrain both architectures (sg=0 is CBOW, sg=1 is Skip-gram).
cbow_model_7 = train_word2vec_model(sentences=tokenized_sentences, sg=0, window_size=7)
sg_model_7 = train_word2vec_model(sentences=tokenized_sentences, sg=1, window_size=7)

# Per-token similarity between the two window-7 models.
print("Word embeddings with window size 7:")
print(get_embeddings_similarity(cbow_model=cbow_model_7, sg_model=sg_model_7))

# Tokens on which the two window-7 models agree most.
print("Most similar embeddings with window size 7:")
get_most_similar_embeddings(cbow_model=cbow_model_7, sg_model=sg_model_7)

Word embeddings with window size 7:
{'the': 0.9347057, 'bank': 0.96868795, 'is': 0.9371962, 'located': 0.95291346, 'near': 0.9732104, 'river': 0.9297902, '.': 0.9388637, 'approved': 0.92511964, 'my': 0.9226874, 'loan': 0.9317084, 'application': 0.9640964, 'he': 0.899395, 'rose': 0.9198896, 'from': 0.8877254, 'his': 0.9139882, 'chair': 0.91258866, 'to': 0.89039826, 'close': 0.925623, 'window': 0.9335352, 'bloomed': 0.96132356, 'beautifully': 0.95809305, 'in': 0.9199599, 'garden': 0.92427534, 'lead': 0.92133504, 'actor': 0.9308353, 'delivered': 0.923897, 'a': 0.9232762, 'stunning': 0.93427956, 'performance': 0.9426071, 'exposure': 0.9531121, 'harmful': 0.96049523, 'health': 0.97883457, 'she': 0.9393894, 'reading': 0.90316975, 'book': 0.8989253, 'library': 0.9317846, 'mentioned': 0.95454866, 'fascinating': 0.89267653, 'historical': 0.8790003, 'event': 0.9606312, 'i': 0.9291498, 'need': 0.92205733, 'file': 0.92685515, 'report': 0.95600015, 'for': 0.9099671, 'manager': 0.9403404, 'lost': 0.

### We can see that the cosine similarity between the two models' vectors for a word changes with the window size; in the outputs above it generally decreases as the window grows from 3 to 5 to 7.

In [25]:
# TODO 1.g and 1.i

In [26]:
# Homonyms from the corpus: report how closely the window-3 CBOW and Skip-gram
# vectors agree for each of them.
words_to_compare = ['bank', 'rose', 'lead', 'book', 'file']
for word in words_to_compare:
    print("For the word:", word, "the similarity between CBOW and Skip-gram models is:")
    print(caculate_similarity(word=word, cbow_model=cbow_model, sg_model=sg_model))

For the word: bank the similarity between CBOW and Skip-gram models is:
0.9870091
For the word: rose the similarity between CBOW and Skip-gram models is:
0.9722186
For the word: lead the similarity between CBOW and Skip-gram models is:
0.9891132
For the word: book the similarity between CBOW and Skip-gram models is:
0.9785332
For the word: file the similarity between CBOW and Skip-gram models is:
0.9830948
