In [40]:
# imports
import nltk
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
def train_word2vec_model(sentences, sg, window_size, **word2vec_kwargs):
    """Train a gensim ``Word2Vec`` model on pre-tokenized sentences.

    Args:
        sentences: Iterable of token lists, one list per sentence.
        sg: Training-algorithm flag forwarded to gensim (1 = skip-gram,
            0 = CBOW).
        window_size: Context window size, forwarded as ``window``.
        **word2vec_kwargs: Any further ``Word2Vec`` keyword arguments, e.g.
            ``vector_size``, ``epochs``, ``seed`` or ``workers``.
            ``min_count`` defaults to 1 so that no token of a small corpus is
            dropped from the vocabulary; pass ``min_count=...`` to override.

    Returns:
        The trained ``Word2Vec`` model.

    Raises:
        TypeError: If ``window`` or ``sg`` is passed again through
            ``word2vec_kwargs`` (duplicate keyword argument).
    """
    # NOTE(review): the CBOW-vs-skip-gram cosine similarities recorded in this
    # notebook are all ~1.0, which suggests the vectors barely move away from a
    # shared initialisation on such a tiny corpus. Passing a larger ``epochs``
    # through word2vec_kwargs is presumably needed for a meaningful
    # comparison -- confirm.
    params = {"min_count": 1, **word2vec_kwargs}
    return Word2Vec(sentences, window=window_size, sg=sg, **params)

In [62]:
# Toy corpus: each of "bank", "rose", "lead", "book" and "file" appears in
# exactly two sentences, in two different contexts.
sentences = [
    "The bank is located near the river.",
    "The bank approved my loan application.",
    "He rose from his chair to close the window.",
    "The rose bloomed beautifully in the garden.",
    "The lead actor delivered a stunning performance.",
    "Exposure to lead is harmful to health.",
    "She is reading a book in the library.",
    "The book mentioned a fascinating historical event.",
    "I need to file a report for my manager.",
    "He lost the file containing important documents.",
]

# Lower-case each sentence before tokenizing so that "The" and "the" share one
# vocabulary entry. Punctuation is kept as its own token.
tokenized_sentences = [nltk.word_tokenize(text.lower()) for text in sentences]
print("Tokenized sentences:", tokenized_sentences)

Tokenized sentences: [['the', 'bank', 'is', 'located', 'near', 'the', 'river', '.'], ['the', 'bank', 'approved', 'my', 'loan', 'application', '.'], ['he', 'rose', 'from', 'his', 'chair', 'to', 'close', 'the', 'window', '.'], ['the', 'rose', 'bloomed', 'beautifully', 'in', 'the', 'garden', '.'], ['the', 'lead', 'actor', 'delivered', 'a', 'stunning', 'performance', '.'], ['exposure', 'to', 'lead', 'is', 'harmful', 'to', 'health', '.'], ['she', 'is', 'reading', 'a', 'book', 'in', 'the', 'library', '.'], ['the', 'book', 'mentioned', 'a', 'fascinating', 'historical', 'event', '.'], ['i', 'need', 'to', 'file', 'a', 'report', 'for', 'my', 'manager', '.'], ['he', 'lost', 'the', 'file', 'containing', 'important', 'documents', '.']]


In [72]:
# Train one CBOW (sg=0) and one skip-gram (sg=1) model on the same corpus,
# both with a context window of 3.
cbow_model, sg_model = (
    train_word2vec_model(tokenized_sentences, sg=algorithm, window_size=3)
    for algorithm in (0, 1)
)
# Only the CBOW neighbours of "bank" are printed in this cell.
print("CBOW Model:", cbow_model.wv.most_similar("bank", topn=3))


CBOW Model: [('containing', 0.1821729987859726), ('a', 0.1726524829864502), ('my', 0.16703678667545319)]


## d. Differences between the embeddings

In [None]:
def caculate_similarity(word, cbow_model, sg_model):
    """Return the cosine similarity between ``word``'s two embeddings.

    The word is looked up in ``cbow_model.wv`` and ``sg_model.wv``; each
    vector is reshaped to a single-row matrix because ``cosine_similarity``
    works on 2-D inputs. The single entry of the resulting matrix is returned.
    """
    cbow_row = cbow_model.wv[word].reshape(1, -1)
    sg_row = sg_model.wv[word].reshape(1, -1)
    pairwise = cosine_similarity(cbow_row, sg_row)
    return pairwise[0][0]

def get_most_similar_embeddings(cbow_model, sg_model, sentences=None, top_n=5):
    """Print the tokens whose CBOW and skip-gram embeddings agree the most.

    Every distinct token of the corpus is scored with ``caculate_similarity``
    (cosine similarity between its vector in each model), and the
    highest-scoring ``(token, similarity)`` pairs are printed.

    Args:
        cbow_model: Model whose ``wv`` holds the CBOW vectors.
        sg_model: Model whose ``wv`` holds the skip-gram vectors.
        sentences: Iterable of token lists to take the vocabulary from.
            Defaults to the notebook-level ``tokenized_sentences``.
        top_n: Number of pairs to print (default 5).

    Returns:
        None. The ranking is only printed.
    """
    if sentences is None:
        # Backward-compatible default: the corpus tokenized earlier in the notebook.
        sentences = tokenized_sentences

    # dict.fromkeys de-duplicates while keeping first-occurrence order, so each
    # token is scored once and ties keep the same order (sorted() is stable).
    unique_tokens = dict.fromkeys(
        token for sentence in sentences for token in sentence
    )
    similarity_per_token = {
        token: caculate_similarity(token, cbow_model, sg_model)
        for token in unique_tokens
    }
    ranked = sorted(
        similarity_per_token.items(), key=lambda item: item[1], reverse=True
    )
    print("The most similar embeddings between CBOW and Skip-gram models are: ", ranked[:top_n])


In [73]:
# Window size 3: compare the CBOW / skip-gram pair trained in the earlier cell.
get_most_similar_embeddings(cbow_model, sg_model)

# Window size 5: train a fresh CBOW (sg=0) / skip-gram (sg=1) pair, then compare.
cbow_model_5, sg_model_5 = (
    train_word2vec_model(tokenized_sentences, sg=algorithm, window_size=5)
    for algorithm in (0, 1)
)
get_most_similar_embeddings(cbow_model_5, sg_model_5)

# Window size 7: same procedure with the widest context window.
cbow_model_7, sg_model_7 = (
    train_word2vec_model(tokenized_sentences, sg=algorithm, window_size=7)
    for algorithm in (0, 1)
)
get_most_similar_embeddings(cbow_model_7, sg_model_7)

The most similar embeddings between CBOW and Skip-gram models are:  [('library', 1.0000001), ('application', 1.0), ('bloomed', 1.0), ('in', 1.0), ('performance', 1.0)]
The most similar embeddings between CBOW and Skip-gram models are:  [('near', 1.0000001), ('i', 1.0000001), ('bank', 1.0), ('application', 1.0), ('in', 1.0)]
The most similar embeddings between CBOW and Skip-gram models are:  [('i', 1.0000001), ('application', 1.0), ('located', 0.99999994), ('approved', 0.99999994), ('loan', 0.99999994)]
