### Training Word2Vec

In [None]:
from gensim.models import Word2Vec
import warnings

# Silence library warnings so the tutorial output stays readable.
warnings.filterwarnings('ignore')

# Define the training data.
# Gensim's Word2Vec expects a "list of lists": the corpus is a list of
# documents, and every document is itself a list of tokens.
corpus = [['dog','bites','man'], ["man", "bites" ,"dog"],["dog","eats","meat"],["man", "eats","food"]]

# Train the models.
# seed=1 is gensim's default, spelled out here; workers=1 removes the
# thread-scheduling jitter that otherwise lets two runs produce different vectors.
# (Full reproducibility across interpreter launches on Python 3 additionally
# requires a fixed PYTHONHASHSEED.)
model_cbow = Word2Vec(corpus, min_count=1, sg=0, seed=1, workers=1)      # CBOW architecture
model_skipgram = Word2Vec(corpus, min_count=1, sg=1, seed=1, workers=1)  # skip-gram architecture



**Continuous Bag of Words (CBOW)**

In [None]:
# Summarize the trained CBOW model
print(model_cbow)

# Summarize the vocabulary
words = list(model_cbow.wv.vocab)
print(words)

# Access the vector for one word through the model's KeyedVectors (`wv`);
# indexing the model object directly (model['dog']) is deprecated in gensim.
print(model_cbow.wv['dog'])

Word2Vec(vocab=6, size=100, alpha=0.025)
['dog', 'bites', 'man', 'eats', 'meat', 'food']
[ 3.2897668e-03  4.6496373e-03 -4.4211568e-03  3.3648077e-03
 -1.1701586e-03  1.0055874e-03  4.6089590e-03 -4.5262506e-05
  5.8132049e-04 -5.3696829e-04  1.2484964e-03  3.9657694e-03
  1.9884675e-03 -8.8840211e-04 -3.0649202e-03 -4.4097835e-03
 -3.7679812e-03 -4.0984661e-03 -3.9008597e-03 -1.1704471e-03
 -4.9964855e-03  3.0908786e-04  2.2519611e-04 -4.7683809e-03
 -1.7676577e-03 -4.6025962e-03  4.1042096e-03 -1.1584435e-03
  2.6916882e-03  4.6389741e-03  4.2366772e-03 -3.7566214e-03
  1.3053344e-03 -3.1574036e-03  3.7432534e-03 -4.9080858e-03
  4.2039519e-03 -3.7083230e-03 -6.2805414e-04 -2.8593657e-03
  9.1623917e-04  2.0444888e-04  2.4431532e-03  2.0057396e-03
  2.6604258e-03  4.3454873e-03  4.0471465e-03 -2.2070173e-03
 -3.9086002e-03  2.6548572e-03  2.9771295e-03  2.8717120e-03
 -2.7624976e-03  2.4746892e-03 -4.3273875e-03  2.8125062e-03
 -3.1465148e-03 -3.5907680e-03  4.4358894e-03  4.6197162e

In [None]:
# Compute the similarity between word pairs through the model's KeyedVectors
# (`wv`); calling similarity() on the model object itself is deprecated in gensim.
print("Similarity between eats and bites:",model_cbow.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_cbow.wv.similarity('eats', 'man'))

Similarity between eats and bites: 0.033017933
Similarity between eats and man: -0.10586784


From the similarity scores above, we can conclude that "eats" is more similar to "bites" than to "man".

In [None]:
# Words most similar to "meat", queried through the model's KeyedVectors (`wv`);
# calling most_similar() on the model object itself is deprecated in gensim.
model_cbow.wv.most_similar('meat')

[('man', -0.021876007318496704),
 ('bites', -0.05190521106123924),
 ('food', -0.06278383731842041),
 ('eats', -0.09210361540317535),
 ('dog', -0.15392762422561646)]

In [None]:
# Write the CBOW model to disk
cbow_model_path = 'model_cbow.bin'
model_cbow.save(cbow_model_path)

# Read it back into a new object and print its summary
new_model_cbow = Word2Vec.load(cbow_model_path)
print(new_model_cbow)

Word2Vec(vocab=6, size=100, alpha=0.025)


**SkipGram** <br>
In skipgram, the task is to predict the context words from the center word.

In [None]:
# Summarize the trained skip-gram model
print(model_skipgram)

# Summarize the vocabulary
words = list(model_skipgram.wv.vocab)
print(words)

# Access the vector for one word through the model's KeyedVectors (`wv`);
# indexing the model object directly (model['dog']) is deprecated in gensim.
print(model_skipgram.wv['dog'])

Word2Vec(vocab=6, size=100, alpha=0.025)
['dog', 'bites', 'man', 'eats', 'meat', 'food']
[ 3.2897668e-03  4.6496373e-03 -4.4211568e-03  3.3648077e-03
 -1.1701586e-03  1.0055874e-03  4.6089590e-03 -4.5262506e-05
  5.8132049e-04 -5.3696829e-04  1.2484964e-03  3.9657694e-03
  1.9884675e-03 -8.8840211e-04 -3.0649202e-03 -4.4097835e-03
 -3.7679812e-03 -4.0984661e-03 -3.9008597e-03 -1.1704471e-03
 -4.9964855e-03  3.0908786e-04  2.2519611e-04 -4.7683809e-03
 -1.7676577e-03 -4.6025962e-03  4.1042096e-03 -1.1584435e-03
  2.6916882e-03  4.6389741e-03  4.2366772e-03 -3.7566214e-03
  1.3053344e-03 -3.1574036e-03  3.7432534e-03 -4.9080858e-03
  4.2039519e-03 -3.7083230e-03 -6.2805414e-04 -2.8593657e-03
  9.1623917e-04  2.0444888e-04  2.4431532e-03  2.0057396e-03
  2.6604258e-03  4.3454873e-03  4.0471465e-03 -2.2070173e-03
 -3.9086002e-03  2.6548572e-03  2.9771295e-03  2.8717120e-03
 -2.7624976e-03  2.4746892e-03 -4.3273875e-03  2.8125062e-03
 -3.1465148e-03 -3.5907680e-03  4.4358894e-03  4.6197162e

In [None]:
# Compute the similarity between word pairs through the model's KeyedVectors
# (`wv`); calling similarity() on the model object itself is deprecated in gensim.
print("Similarity between eats and bites:",model_skipgram.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_skipgram.wv.similarity('eats', 'man'))

Similarity between eats and bites: 0.033021368
Similarity between eats and man: -0.10586565


In [None]:
# Words most similar to "meat", queried through the model's KeyedVectors (`wv`);
# calling most_similar() on the model object itself is deprecated in gensim.
model_skipgram.wv.most_similar('meat')

[('man', -0.021876007318496704),
 ('bites', -0.05190519243478775),
 ('food', -0.06278384476900101),
 ('eats', -0.09217316657304764),
 ('dog', -0.15392763912677765)]

In [None]:
# Write the skip-gram model to disk
skipgram_model_path = 'model_skipgram.bin'
model_skipgram.save(skipgram_model_path)

# Read it back into a new object and print its summary
new_model_skipgram = Word2Vec.load(skipgram_model_path)
print(new_model_skipgram)

Word2Vec(vocab=6, size=100, alpha=0.025)


Read more:

https://radimrehurek.com/gensim/models/word2vec.html

https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial

### Document Vectors
Doc2vec allows us to directly learn representations for texts of arbitrary length (phrases, sentences, paragraphs and documents) by taking the context of the words in the text into account.

### Doc2Vec using averaging via spaCy

### Training Doc2Vec using Gensim

In [None]:
# To install only the requirements of this notebook, uncomment the lines below and run this cell

# ===========================

!pip install gensim==3.6.0
!pip install spacy==2.2.4
!pip install nltk==3.2.5

# ===========================

In [None]:
import warnings
# Ignore all warnings. The filter is installed before the remaining imports,
# so it also applies to warnings raised while those modules load.
warnings.filterwarnings('ignore')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from pprint import pprint
import nltk
# Download the 'punkt' tokenizer data (presumably what word_tokenize relies on
# in the pinned NLTK version — confirm if the NLTK pin changes).
nltk.download('punkt')

In [None]:
# Toy corpus: four short sentences, each treated as one document.
data = [
    "dog bites man",
    "man bites dog",
    "dog eats meat",
    "man eats food",
]

# Wrap every sentence as a TaggedDocument: its lower-cased tokens plus a single
# tag, the sentence's position in `data` rendered as a string.
tagged_data = [
    TaggedDocument(words=word_tokenize(sentence.lower()), tags=[str(doc_id)])
    for doc_id, sentence in enumerate(data)
]

In [None]:
# Display the tagged documents (token list plus tag) as the cell output.
tagged_data

In [None]:
# DBOW (distributed bag of words, dm=0).
# dbow_words=1 trains word vectors (skip-gram style) alongside the document
# vectors. With the default dbow_words=0 only document vectors are trained, so
# word-level queries on model_dbow.wv would run against untrained, randomly
# initialised word vectors.
# seed=1 (gensim's default, made explicit) and workers=1 remove the
# thread-scheduling jitter that otherwise makes runs differ from each other.
model_dbow = Doc2Vec(tagged_data, vector_size=20, min_count=1, epochs=2, dm=0,
                     dbow_words=1, seed=1, workers=1)

In [None]:
# Infer and print the feature vector of the tokenised sentence "man eats food".
query_tokens = ['man', 'eats', 'food']
print(model_dbow.infer_vector(query_tokens))

In [None]:
# Top 5 words most similar to "man" according to the model's word vectors.
nearest_to_man = model_dbow.wv.most_similar("man", topn=5)
nearest_to_man

In [None]:
# Similarity between the one-word sets ["dog"] and ["man"].
dog_man_similarity = model_dbow.wv.n_similarity(["dog"], ["man"])
dog_man_similarity

In [None]:
# DM (distributed memory, dm=1).
# seed=1 (gensim's default, made explicit) and workers=1 remove the
# thread-scheduling jitter that otherwise makes runs differ from each other.
model_dm = Doc2Vec(tagged_data, min_count=1, vector_size=20, epochs=2, dm=1,
                   seed=1, workers=1)

# Feature vector inferred for the tokenised sentence "man eats food"
print("Inference Vector of man eats food\n ",model_dm.infer_vector(['man','eats','food']))

# Word-level queries against the model's word vectors
print("Most similar words to man in our corpus\n",model_dm.wv.most_similar("man",topn=5))
print("Similarity between man and dog: ",model_dm.wv.n_similarity(["dog"],["man"]))

What happens when we compare words that are not in the vocabulary?

In [None]:
# 'covid' never appears in the training corpus, so it has no word vector and
# the lookup fails. Catch the error and print it so the notebook demonstrates
# the failure without aborting a full run.
try:
    model_dm.wv.n_similarity(['covid'],['man'])
except KeyError as err:
    print("KeyError:", err)