# Training a word2vec model


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import random_split
from torch.utils.data import Dataset
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm

%matplotlib inline
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Silence warnings for the rest of the notebook in two ways: install a blanket
# 'ignore' filter, and replace ``warnings.warn`` with the no-op defined above.
warnings.filterwarnings('ignore')
warnings.warn = warn

In [2]:
def find_similar_words(target_word, embedding_dict, top_k=2):
    """Return the words whose embeddings are most cosine-similar to ``target_word``.

    Args:
        target_word: Key to look up in ``embedding_dict``.
        embedding_dict: Mapping from word to a 1-D numeric vector (anything
            accepted by ``np.dot`` and ``np.linalg.norm``).
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of up to ``top_k`` words ordered from most to least similar
        (ties keep the dictionary's iteration order), or an explanatory
        string when ``target_word`` is not a key of ``embedding_dict``.
    """
    if target_word not in embedding_dict:
        return f"Word '{target_word}' not found in embeddings."

    target_vector = embedding_dict[target_word]
    # The target norm does not change inside the loop, so compute it once.
    target_norm = np.linalg.norm(target_vector)
    similarities = {}

    for word, vector in embedding_dict.items():
        if word == target_word:
            continue
        denominator = target_norm * np.linalg.norm(vector)
        if denominator == 0:
            # A zero-length vector has no direction. Score it 0.0 instead of
            # letting 0/0 produce NaN, which would make the sort order undefined.
            similarities[word] = 0.0
        else:
            similarities[word] = np.dot(target_vector, vector) / denominator

    sorted_words = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return [word for word, _ in sorted_words[:top_k]]

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# Training a word2vec model from gensim

In [4]:
# Toy corpus: ten short, already-tokenized sentences about food.
raw_sentences = [
    ["I", "like", "to", "eat", "pizza"],
    ["Pizza", "is", "my", "favorite", "food"],
    ["I", "enjoy", "eating", "pasta"],
    ["Pasta", "is", "also", "delicious", "and", "nutritious"],
    ["My", "friends", "and", "I", "often", "order", "pizza", "for", "dinner"],
    ["Healthy", "food", "choices", "include", "vegetables", "and", "fruits"],
    ["Sometimes", "I", "like", "to", "cook", "Italian", "recipes"],
    ["Eating", "out", "can", "be", "fun", "but", "home-cooked", "meals", "are", "better"],
    ["Do", "you", "prefer", "pizza", "or", "pasta", "?"],
    ["I", "always", "try", "new", "recipes", "on", "weekends"],
]

# Lowercase every token so that e.g. "Pizza" and "pizza" become the same word.
sentences = [list(map(str.lower, tokens)) for tokens in raw_sentences]
sentences

[['i', 'like', 'to', 'eat', 'pizza'],
 ['pizza', 'is', 'my', 'favorite', 'food'],
 ['i', 'enjoy', 'eating', 'pasta'],
 ['pasta', 'is', 'also', 'delicious', 'and', 'nutritious'],
 ['my', 'friends', 'and', 'i', 'often', 'order', 'pizza', 'for', 'dinner'],
 ['healthy', 'food', 'choices', 'include', 'vegetables', 'and', 'fruits'],
 ['sometimes', 'i', 'like', 'to', 'cook', 'italian', 'recipes'],
 ['eating',
  'out',
  'can',
  'be',
  'fun',
  'but',
  'home-cooked',
  'meals',
  'are',
  'better'],
 ['do', 'you', 'prefer', 'pizza', 'or', 'pasta', '?'],
 ['i', 'always', 'try', 'new', 'recipes', 'on', 'weekends']]

In [5]:
# Create an untrained model. The corpus is deliberately NOT passed here: when the
# constructor receives sentences, gensim builds the vocabulary and trains right
# away, which would duplicate the explicit build_vocab / train steps performed
# in the following cells.
w2v_model = Word2Vec(
    vector_size=100, # Embedding dimension for each word (100-D vectors)
    window=3,        # Context window size (3 words to the left & right)
    min_count=1,     # Keep every word, including those that appear only once
    workers=4        # Number of worker threads used during training
)

In [6]:
# Scan the tokenized corpus and build the model's vocabulary from it.
# NOTE(review): progress_per presumably only controls how often gensim logs progress — confirm in the gensim docs.
w2v_model.build_vocab(sentences, progress_per=10000)

In [7]:
# Number of sentences the model counted while building its vocabulary (10 for this toy corpus).
w2v_model.corpus_count

10

In [8]:
# Train the model on the same corpus for 10 epochs; total_examples is the number of
# sentences in one pass over the corpus (taken from corpus_count).
# NOTE(review): report_delay=1 presumably sets the delay between progress log lines — confirm in the gensim docs.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

(199, 670)

We have trained a word2vec model using the `gensim` library. We can now access the word embeddings through `w2v_model.wv` and explore various operations such as finding similar words, calculating word similarities, and more.

Use the trained model to find similar words to "pizza" and calculate the similarity between "pizza" and "pasta". 


In [9]:
# Finding similar words: query the trained vectors for the nearest neighbours of "pizza".
# The result is a list of (word, similarity score) pairs, highest score first.
similar_words = w2v_model.wv.most_similar("pizza")
print("Similar words to 'pizza':", similar_words)

Similar words to 'pizza': [('delicious', 0.2527044713497162), ('sometimes', 0.20118068158626556), ('better', 0.1951061487197876), ('out', 0.17653489112854004), ('to', 0.17132723331451416), ('also', 0.15045233070850372), ('are', 0.14713196456432343), ('is', 0.1391201913356781), ('friends', 0.11050225049257278), ('healthy', 0.09928988665342331)]


In [10]:
# Same query for "eating"; note that this overwrites `similar_words` from the previous cell.
similar_words = w2v_model.wv.most_similar("eating")
print("Similar words to 'eating':", similar_words)

Similar words to 'eating': [('?', 0.2684538662433624), ('home-cooked', 0.14282964169979095), ('also', 0.1284063458442688), ('weekends', 0.10976455360651016), ('pasta', 0.10961795598268509), ('favorite', 0.10887644439935684), ('include', 0.10816717147827148), ('vegetables', 0.10177747160196304), ('choices', 0.09926041215658188), ('often', 0.09616386890411377)]


In [11]:
# Calculating word similarity: a single similarity score between the vectors of two words.
similarity = w2v_model.wv.similarity("pizza", "pasta")
print("Similarity between 'pizza' and 'pasta':", similarity)

Similarity between 'pizza' and 'pasta': -0.005449652


In [12]:
# Same comparison for "pizza" vs. "delicious"; this overwrites `similarity` from the previous cell.
similarity = w2v_model.wv.similarity("pizza", "delicious")
print("Similarity between 'pizza' and 'delicious':", similarity)

Similarity between 'pizza' and 'delicious': 0.25270447


The word embeddings obtained from the model would be more meaningful and informative with larger and more diverse training data.


### Using the trained model to initialize a PyTorch embedding layer that can be reused in downstream tasks

In [14]:
# Show the vocabulary in index order, followed by the embedding matrix (one row per word).
print(w2v_model.wv.index_to_key)
print(w2v_model.wv.vectors)

['i', 'pizza', 'and', 'pasta', 'is', 'my', 'food', 'eating', 'to', 'like', 'recipes', 'eat', 'often', 'healthy', 'dinner', 'for', 'order', 'delicious', 'friends', 'nutritious', 'also', 'include', 'enjoy', 'favorite', 'choices', 'weekends', 'on', 'are', 'new', 'try', 'always', '?', 'or', 'prefer', 'you', 'do', 'better', 'meals', 'fruits', 'home-cooked', 'but', 'fun', 'be', 'can', 'out', 'italian', 'cook', 'sometimes', 'vegetables']
[[-0.0005448   0.00021663  0.00507805 ... -0.00705724  0.00088444
   0.0063876 ]
 [-0.00714774  0.00125787 -0.00719377 ...  0.00485981  0.00078154
   0.00302692]
 [ 0.00768154  0.00913524  0.00111634 ...  0.00830145 -0.00610254
   0.00946844]
 ...
 [ 0.00210629  0.00573915 -0.00212659 ...  0.00444153 -0.00810177
  -0.00406681]
 [-0.00696055 -0.00245817 -0.00802504 ...  0.00274619  0.00561105
   0.00257519]
 [-0.00497406 -0.00127645  0.00327384 ... -0.00696506  0.00576213
  -0.0094267 ]]


In [15]:
# Sanity check: the vocabulary and the embedding matrix have the same number of entries.
print(len(w2v_model.wv.index_to_key))
print(len(w2v_model.wv.vectors))

49
49


In [16]:
word_vectors = w2v_model.wv
# Map each vocabulary word to its row index in the embedding matrix.
word_to_index = {word: index for index, word in enumerate(word_vectors.index_to_key)}

# Create an nn.Embedding with one row per vocabulary word and load it with the trained vectors
embedding_dim = w2v_model.vector_size
embedding = torch.nn.Embedding(len(word_vectors.index_to_key), embedding_dim)
# Copy inside no_grad rather than writing through `.data`: the copy still stays
# out of the autograd graph, without using the discouraged `.data` attribute.
with torch.no_grad():
    embedding.weight.copy_(torch.from_numpy(word_vectors.vectors))

# Display the loaded weights as a plain tensor.
embedding.weight.detach()

tensor([[-0.0005,  0.0002,  0.0051,  ..., -0.0071,  0.0009,  0.0064],
        [-0.0071,  0.0013, -0.0072,  ...,  0.0049,  0.0008,  0.0030],
        [ 0.0077,  0.0091,  0.0011,  ...,  0.0083, -0.0061,  0.0095],
        ...,
        [ 0.0021,  0.0057, -0.0021,  ...,  0.0044, -0.0081, -0.0041],
        [-0.0070, -0.0025, -0.0080,  ...,  0.0027,  0.0056,  0.0026],
        [-0.0050, -0.0013,  0.0033,  ..., -0.0070,  0.0058, -0.0094]])

In [17]:
# Look up one word: map it to its row index, then pass that index through the embedding layer.
word = "pizza"
word_index = word_to_index[word]
index_tensor = torch.tensor([word_index], dtype=torch.long)
word_embedding = embedding(index_tensor)
print(f"Word: {word}, Embedding: {word_embedding.detach().numpy()}")

Word: pizza, Embedding: [[-0.00714774  0.00125787 -0.00719377 -0.002252    0.0037566   0.00583821
   0.00122404  0.00211798 -0.00412855  0.00723212 -0.00633135  0.00459581
  -0.00820093  0.00205702 -0.00499275 -0.00427009 -0.00306797  0.0056457
   0.00580719 -0.00504387  0.00079023 -0.00848691  0.00781246  0.00922173
  -0.00274401  0.00082376  0.00072107  0.00548828 -0.00863547  0.00056937
   0.00687279  0.00222707  0.00113017 -0.00939239  0.00846144 -0.00623696
  -0.00297992  0.00349776 -0.00079536  0.0014085   0.00178815 -0.00684685
  -0.00971646  0.00909191  0.00624171 -0.00692963  0.00335529  0.00018195
   0.00478414 -0.00712823  0.00402592  0.00435332  0.00997025 -0.00449254
  -0.00139026 -0.00733258 -0.00967484 -0.0090679  -0.00100836 -0.00650289
   0.00486133 -0.00623833  0.00254642  0.00078534 -0.00338036 -0.00094708
   0.00994053  0.00917958 -0.00450207  0.00911223 -0.0056666   0.00595734
  -0.00305531  0.00343974  0.00304454  0.00688019 -0.00234786  0.00879942
   0.00760095 -