# **Using Word2Vec in Gensim**

Word2Vec is a technique that is able to learn distributed vector representations of words using a large corpus of text. It was developed by researchers at Google and has been widely used for natural language processing tasks.

In [32]:
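# Install packages if required (%pip installs into the environment of the running kernel)
# %pip install scikit-learn
# %pip install gensim
# %pip install spacy
# %pip install nltk
# %pip install torch
# %pip install scipy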

There are some different models built with different texts available through Gensim.

**Tip:** You may want to check the size of these pretrained models before you decide if you want to load them into memory or not. You can do this without loading as shown below.

In [1]:
import gensim.downloader

# Names of every pretrained embedding set published through gensim-data
model_catalog = gensim.downloader.info()['models']
print(list(model_catalog))
print("\n")
# Metadata for one model (record count, file size, description, ...).
# Kept as the last expression so the notebook displays the returned dict.
gensim.downloader.info('word2vec-google-news-300')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']




{'num_records': 3000000,
 'file_size': 1743563840,
 'base_dataset': 'Google News (about 100 billion words)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/word2vec-google-news-300/__init__.py',
 'license': 'not found',
 'parameters': {'dimension': 300},
 'description': "Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality' (https://code.google.com/archive/p/word2vec/).",
 'read_more': ['https://code.google.com/archive/p/word2vec/',
  'https://arxiv.org/abs/1301.3781',
  'https://arxiv.org/abs/1310.4546',
  'https://www.microsoft.com/en-us/research/publication/linguistic-regularities-in-continuous-space-word-representations/?from=http%3A%2F%2Fresearch.microsoft.com%2Fpubs%2F189726%2Frvec

You can load a pretrained Word2Vec model with the following:

In [3]:
# Download and load the "word2vec-google-news-300" embeddings.
# The info() output above reports a file_size of 1743563840 bytes (~1.7 GB),
# so this is a large download that is then held in memory.
w2v_model = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# since these models can be quite big, you may want to remove them from working memory after you are done with them
# del w2v_model

In [27]:
# index_to_key has one entry per vocabulary item, so its length is the vocabulary size.
# NOTE(review): the printed label says "corpus", but this counts the words/phrases in the
# model's vocabulary, not the size of the text the model was trained on.
print("Number of words in corpus: ",len(w2v_model.index_to_key)) # Number of words in the vocabulary

Number of words in corpus:  3000000


In [28]:
# Vector representation of a single word: look it up once and reuse it
computer_vec = w2v_model['computer']
print(computer_vec)

# Number of dimensions in the embedding
print("Length of word vector: ", len(computer_vec))

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

In [29]:
# similarity() scores how close two word vectors are (cosine similarity; higher = more similar).
# A related pair: 'cat' and 'dog'
print("Similarity between cat and dog: ", w2v_model.similarity('cat', 'dog'))

# An unrelated pair for contrast: 'cat' and 'computer'
print("Similarity between cat and computer: ", w2v_model.similarity('cat', 'computer'))

Similarity between cat and dog:  0.76094574
Similarity between cat and computer:  0.17324439


In [30]:
# most_similar() combines the `positive` words (added) and the `negative` words (subtracted)
# and returns the vocabulary entries closest to the result -- here: king + queen - woman.
# NOTE(review): the textbook analogy "king - man + woman ~ queen" would be written as
# positive=["king", "woman"], negative=["man"]; confirm this query is the one intended.
result = w2v_model.most_similar(negative=["woman"], positive=["king", "queen"])
# Bare last expression so the notebook displays the (word, score) pairs
result

[('kings', 0.6367637515068054),
 ('monarch', 0.5600019693374634),
 ('queens', 0.5444643497467041),
 ('princes', 0.5285636782646179),
 ('royal', 0.510769784450531),
 ('prince', 0.4869095981121063),
 ('NYC_anglophiles_aflutter', 0.4691288471221924),
 ('crown_prince', 0.46789005398750305),
 ('Savory_aromas_wafted', 0.4651806354522705),
 ('royals', 0.46382129192352295)]

In [None]:
# 'proctocolitis' is NOT in this pretrained model's vocabulary (it is out-of-vocabulary),
# so the lookup raises KeyError. The exception is the point of this cell, so it is
# caught and printed rather than left to abort a "Run All".
try:
    print(w2v_model['proctocolitis'])
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "Key 'proctocolitis' not present"

In [31]:
# Examine the model by listing the words most similar to a given word.
# most_similar() returns the top 10 by default; topn=5 limits the result to the top 5.
w2v_model.most_similar('horse', topn=5)

[('horses', 0.8654032945632935),
 ('racehorse', 0.752392590045929),
 ('stallion', 0.7200170159339905),
 ('thoroughbred', 0.7158915400505066),
 ('horseman', 0.6845748424530029)]

We have also seen that spaCy can give us a word or sentence representation.

In [None]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline
nlp = spacy.load('en_core_web_sm')
# process a sentence using the model
mydoc = nlp("Canada is a large country")
# Get a vector for an individual word by indexing into the processed document
#print(mydoc[0].vector) #vector for 'Canada', the first word in the text
print(mydoc.vector) #Averaged vector for the entire sentence

[-2.44864374e-01 -1.56845257e-01 -5.19747622e-02  5.86494267e-01
  8.10811967e-02 -1.65754989e-01  7.57052720e-01  2.63185889e-01
  1.40734492e-02  2.51211464e-01  2.43307427e-01 -2.79111534e-01
 -3.70179832e-01  5.22314429e-01 -5.23915410e-01  4.84695425e-03
  4.30857569e-01 -2.19760254e-01 -3.72532457e-01  1.71566337e-01
 -2.67529279e-01  2.24802848e-02 -3.03287357e-01 -1.04288436e-01
  1.51315406e-01 -5.31261384e-01  4.36048269e-01  2.97305524e-01
  4.72418487e-01  3.90211403e-01  2.69951403e-01  2.36672014e-01
  4.59462464e-01 -4.97865111e-01 -1.82451054e-01 -1.67997599e-01
  1.93978697e-01  5.16766071e-01 -2.88335413e-01  3.74710053e-01
 -1.11499667e-01  3.33659947e-01  5.49611822e-02  2.53970414e-01
 -5.02043903e-01  3.85194987e-01 -1.86397389e-01  8.60191345e-01
  2.11835742e-01 -1.24764726e-01 -7.09948778e-01  7.70933092e-01
 -1.79754198e-01 -6.63751960e-01 -4.01271343e-01  1.83464423e-01
 -2.96254933e-01  7.63848484e-01 -3.35624158e-01 -1.81755573e-01
  5.62856086e-02 -5.20981

## **Training from Scratch Using Gensim**

Sometimes, Word2Vec as a pretrained model is not enough.

**Word2Vec cannot handle out-of-vocabulary (OOV) words.** Perhaps the pretrained model was not trained on a large portion of the vocabulary you are interested in (for example, you are working with text that has a lot of scientific or medical vocabulary). Or perhaps your data is nothing like the text used to train the pretrained model, making it completely useless for your task, e.g., graph embedding, where you would like a representation of the nodes of a graph.

It is quite simple to train Word2Vec from scratch using Gensim on your own vocabulary.

Below is an example of training a Word2Vec model with Gensim.

**Training Format: Gensim's Word2Vec requires a list of lists type of input. Every document is a list of tokens for that document, and these documents are then also stored in a list.**

In [25]:
from gensim.models import Word2Vec

# Toy training data: a list of documents, each document being a list of tokens
corpus = [
    ['dog', 'bites', 'man'],
    ['man', 'bites', 'dog'],
    ['dog', 'eats', 'meat'],
    ['man', 'eats', 'food'],
]

# Train the model on the toy corpus
model_cbow = Word2Vec(
    corpus,
    vector_size=100,
    window=3,
    min_count=1,
    sg=0,
)

# Save the model (if you want)
# model_cbow.save("word2vec.model")

Take a look at the Word2Vec definition [here](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec). What do the following parameters mean?


*   vector_size
*   window
*   min_count
*   sg

As you can see, it is possible to train Word2Vec without having any knowledge of the underlying architecture. However, it is useful to have some idea of what each hyperparameter means, what these parameters refer to in the training procedure, and how the training data should be prepared. Having this knowledge is also beneficial if something goes wrong or the behaviour isn't as expected!

In [26]:
from gensim.models import FastText

# Train a FastText model on the same toy corpus used for Word2Vec above.
# NOTE(review): max_n=3 presumably caps the character n-gram length at 3 -- confirm against the gensim FastText docs.
# Note: the name `model` is reused for the PyTorch network later in this notebook.
model = FastText(corpus, min_count=1, max_n=3)
vector = model.wv['dog']  # get vector for word
# vector
# NOTE(review): 'and' does not occur in the toy corpus; presumably this line is meant to show
# FastText answering a query for an out-of-vocabulary word -- confirm before uncommenting.
# model.wv.most_similar('and')


### **Can we update a model we already have?**
Yes!

In [7]:
from gensim.models import Word2Vec

# Train a tiny initial model and show its vocabulary
old_sentences = [["bad","robots"],["good","human"]]

old_model = Word2Vec(old_sentences, vector_size = 4, window=5, min_count = 1)
print(old_model.wv.key_to_index)

# Round-trip through disk to show that a previously saved model can be resumed
old_model.save("old_model")
new_model = Word2Vec.load("old_model")

new_sentences = [['yes', 'this', 'is', 'the', 'word2vec', 'model'],[ 'if',"you","have","think","about","it"]]
# update=True extends the existing vocabulary with the new words instead of rebuilding it
new_model.build_vocab(new_sentences, update = True)
# total_examples is the number of sentences being trained on; derive it from the data so it
# cannot drift out of sync, and read the epoch count from the model that is being trained
new_model.train(new_sentences, total_examples=len(new_sentences), epochs=new_model.epochs)
print(new_model.wv.key_to_index)



{'human': 0, 'good': 1, 'robots': 2, 'bad': 3}
{'human': 0, 'good': 1, 'robots': 2, 'bad': 3, 'yes': 4, 'this': 5, 'is': 6, 'the': 7, 'word2vec': 8, 'model': 9, 'if': 10, 'you': 11, 'have': 12, 'think': 13, 'about': 14, 'it': 15}


## **Training from Scratch Using PyTorch (More Advanced)**

This is just a simple example of how you could implement a word2vec model from scratch using PyTorch. There are many variations of the word2vec model that you could explore, such as using subword embeddings or incorporating additional context information into the model.

The architecture for Word2Vec is relatively simple (~4 lines of important statements to set up how it works), and below we set up a blueprint that we can use to initialize the network.

In [8]:
from torch import nn

In [9]:
class Word2Vec(nn.Module):
    """Minimal word2vec-style network: an embedding lookup followed by a linear projection.

    Note: once this cell has run, the name ``Word2Vec`` refers to this class and
    no longer to gensim's ``Word2Vec`` imported earlier in the notebook.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the two layers of the network.

        vocab_size: number of distinct token ids (rows of the embedding table,
            and number of output scores).
        embedding_size: length of the vector learned for each token.
        """
        # Initialise nn.Module first so the layers assigned below are registered as sub-modules
        super().__init__()
        # Lookup table: integer token id -> dense vector of length embedding_size
        # (the same result as multiplying a one-hot input by a weight matrix)
        self.embed = nn.Embedding(vocab_size, embedding_size)
        # Linear output layer: hidden vector -> one score per vocabulary word.
        # No bias/intercept term is used (bias=False).
        self.expand = nn.Linear(embedding_size, vocab_size, bias=False)

    def forward(self, input):
        """Return one score (logit) per vocabulary word for every token id in ``input``."""
        # Encode ids to their low-dimensional vectors, then expand to vocabulary-sized logits
        return self.expand(self.embed(input))

That is it!

Now, let's test this out with some data. First, we will prepare the data and convert the tokens into integers instead of strings. Gensim does this part for us automatically, but with PyTorch we will have to do this step ourselves.

In [11]:
import nltk
# word_tokenize needs the Punkt tokenizer data
nltk.download('punkt')
# Recent NLTK releases look up the 'punkt_tab' resource instead of 'punkt';
# fetch it as well (quietly) so word_tokenize works whichever NLTK version is installed
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import word_tokenize

# Tokenize the text data
text = "The farm was home to a variety of animals, each with their own distinct personalities and characteristics. The cows were docile and hardworking, providing the farm with milk and cream. The pigs were intelligent and ambitious, often vying for more power and control. The chickens clucked and pecked around the barnyard, laying eggs for the farm's breakfast. The horses were strong and proud, plowing the fields and carrying heavy loads. The sheep were gentle and timid, content to graze in the meadow. But the true leader of the farm was a pig named Napoleon, who through manipulation and deceit, rose to power and convinced the other animals to overthrow their human owner and run the farm themselves, with the pigs as the ruling class in George Orwell's Animal Farm."
tokens = word_tokenize(text)

# Vocabulary: the set of distinct tokens
vocab = set(tokens)

# Training data: pair every token with each neighbour at most window_size
# positions to its left or right (never with itself)
window_size = 2
data = [
    (word, tokens[j])
    for i, word in enumerate(tokens)
    for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1))
    if j != i
]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


We will make use of DataLoader for preparing our training dataset so we are going to make each of our words an integer.

In [12]:
# Create a mapping from integers to words.
# `vocab` is a set, and the iteration order of a set of strings can change between
# kernel sessions; sorting first gives every word the same id on every run.
id2tok = dict(enumerate(sorted(vocab)))

# Create a mapping from words to integers
word2int = {word: i for i, word in id2tok.items()}

# Convert the (center, context) word pairs to pairs of integer ids.
# Note: `data` is overwritten here, so re-running this cell without first re-running
# the cell that builds the word pairs raises KeyError.
data = [(word2int[center], word2int[context]) for center, context in data]

# Create a Pytorch dataloader
import torch
from torch.utils.data import DataLoader

dataloader = DataLoader(data, batch_size=32, shuffle=True)

In [13]:
# Seed PyTorch's global random number generator so that weight initialisation and the
# DataLoader's shuffling are repeatable when the notebook is run top to bottom
torch.manual_seed(0)

# Length of each learned word vector
feature_size = 100
# Note: `Word2Vec` here is the PyTorch class defined above, not gensim's
model = Word2Vec(vocab_size=len(vocab), embedding_size=feature_size)

# Relevant if you have a GPU you want to use, we will ignore this step
#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#model.to(device)

# Training parameters
learning = 3e-4  # learning rate handed to the optimizer
epochs = 200     # number of full passes over the dataloader
# Cross-entropy between the predicted logits and the true context-word id
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning)

Now for the actual training!

In [14]:
# Mean batch loss of every epoch, in order
running_loss = []
for epoch in range(epochs):
    # Sum of the batch losses seen so far in this epoch
    epoch_loss = 0
    # `i` (the batch index) is not used inside the loop
    for i, (center, context) in enumerate(dataloader):
        # again, the commented to(device) code is only if you want to make use of GPU
        #center, context = center.to(device), context.to(device)
        #print(center, context)
        # Clear the gradients left over from the previous batch
        optimizer.zero_grad()
        # One score per vocabulary word for each center word in the batch
        logits = model(input=center)
        # Compare the scores with the ids of the true context words
        loss = loss_fn(logits, context)

        epoch_loss += loss.item()
        loss.backward() # Backpropagate: compute the gradient of the loss for every weight
        # Apply the weight update using the gradients computed by backward()
        optimizer.step()
    # Turn the summed loss into the mean loss per batch
    epoch_loss /= len(dataloader)
    running_loss.append(epoch_loss)

print(running_loss)

[4.602848906266062, 4.526897079066226, 4.47815317856638, 4.46295135899594, 4.367240039925826, 4.352342555397435, 4.28582333263598, 4.266253345891049, 4.214065300790887, 4.174345317639802, 4.134563596625077, 4.083735842453806, 4.070022319492541, 4.0348262159447925, 3.9781655135907625, 3.956468193154586, 3.928064835698981, 3.8824787516342965, 3.855940241562693, 3.8301482953523336, 3.8001373316112317, 3.7539776877353064, 3.7328179259049263, 3.7080609672947933, 3.667464996639051, 3.649512253309551, 3.605811508078324, 3.589195728302002, 3.574406536001908, 3.5508708577407035, 3.5093746687236584, 3.4813657183396187, 3.467848627190841, 3.4519452044838355, 3.38978153780887, 3.392011203263935, 3.37001895904541, 3.344103223399112, 3.3390895190991854, 3.3091216463791695, 3.2933091991826107, 3.272037204943205, 3.2467315071507503, 3.233396517603021, 3.2131396594800448, 3.188509050168489, 3.171923926002101, 3.1782631497634086, 3.152783055054514, 3.118756369540566, 3.088004375758924, 3.086964419013575

As you can see from the running_loss values, we are slowly yet surely decreasing our error. However, do our representations make sense yet? We can check on our representations below for each word in our vocabulary.

In [15]:
# Use the rows of the output layer's weight matrix as the word vectors (one row per vocabulary id).
# NOTE(review): the input embeddings in model.embed.weight are the other common choice -- confirm which is intended.
wordvecs = model.expand.weight.cpu().detach().numpy() #just want the vectors now so we detach from tensor object
# Show the vector stored for vocabulary id 0
print(wordvecs[0])

[ 2.09599644e-01 -2.69629732e-02 -2.12840706e-01 -8.20920914e-02
 -5.04826829e-02 -3.78768966e-02  1.14996642e-01  1.83683857e-02
 -9.31274071e-02  1.45465389e-01  7.30959848e-02  2.29303213e-03
 -4.28109281e-02  1.80684477e-01 -8.86503831e-02  1.25743851e-01
  1.21448645e-02  1.26253385e-02 -1.44656032e-01  9.45089385e-02
  1.58332035e-01 -2.33851315e-04 -2.56919172e-02  2.18188763e-02
  1.17239535e-01  3.13984901e-02  6.82145134e-02  6.63065091e-02
 -8.44094157e-02  5.30883633e-02 -8.95121619e-02  3.00137289e-02
 -3.41570377e-02  6.10541329e-02  7.55806714e-02 -1.33047402e-01
 -1.12024911e-01  1.59632504e-01 -5.45366332e-02 -4.07549702e-02
 -1.56891435e-01 -8.51264149e-02  5.40869348e-02 -1.22832097e-02
  7.58123621e-02  2.73967050e-02  1.27216890e-01  7.79426470e-02
  1.95566490e-01  1.71155930e-01  1.32297695e-01  2.25129843e-01
 -3.56455147e-02 -4.44173217e-02 -7.88564757e-02 -3.94883566e-02
  1.27443865e-01 -1.21899888e-01 -1.04736909e-01  1.27263665e-01
 -5.06845117e-02 -1.94851

Below we have some code to calculate the cosine distance between vectors, which we will cover more when we cover visualization methods.

In [23]:
from scipy.spatial import distance
import numpy as np

# Words whose nearest neighbours are printed below; each must match a token exactly
# (the tokens were not lower-cased, so the lookup is case-sensitive)
words_of_interest = ['Napoleon', 'horses']

def get_distance_matrix(wordvecs, metric):
    """Return the square matrix of pairwise distances between the rows of ``wordvecs``.

    ``metric`` is passed straight through to ``scipy.spatial.distance.pdist``
    (for example ``'cosine'``).
    """
    # pdist gives the condensed (one entry per pair) form; squareform expands it to n x n
    condensed = distance.pdist(wordvecs, metric)
    return distance.squareform(condensed)

def get_k_similar_words(word, dist_matrix, k=5, vocab_index=None, vocab_words=None):
    """Return the ``k`` entries closest to ``word`` according to ``dist_matrix``.

    Parameters:
        word: the query token.
        dist_matrix: square matrix where ``dist_matrix[a][b]`` is the distance
            between the vectors with ids ``a`` and ``b``.
        k: maximum number of neighbours to return; fewer are returned when the
            vocabulary has fewer than ``k`` other entries.
        vocab_index: optional token -> id mapping; defaults to the notebook's
            ``word2int``.
        vocab_words: optional id -> token mapping; defaults to the notebook's
            ``id2tok``.

    Returns:
        A list of ``(id, token, distance)`` tuples ordered from nearest to farthest.

    Raises:
        KeyError: if ``word`` is not present in the token -> id mapping.
    """
    index_of = word2int if vocab_index is None else vocab_index
    word_of = id2tok if vocab_words is None else vocab_words

    idx = index_of[word]
    dists = np.asarray(dist_matrix[idx])
    # A stable sort keeps ties in id order, so the result is deterministic
    order = np.argsort(dists, kind="stable")
    # Remove the query word by its id instead of assuming it sorts first: another
    # word at distance 0 could otherwise be dropped in its place
    neighbours = order[order != idx][:max(k, 0)]
    return [(i, word_of[i], dists[i]) for i in neighbours]

# Pairwise cosine distances between all learned word vectors
dmat = get_distance_matrix(wordvecs, 'cosine')

# For each word of interest, print the tokens of its nearest neighbours
for word in words_of_interest:
    neighbours = get_k_similar_words(word, dmat)
    print(word, [token for _, token, _ in neighbours], "\n")

Napoleon ['named', 'through', 'pig', 'manipulation', 'a'] 

horses ['sheep', 'cows', 'control', 'characteristics', 'chickens'] 

