### Load Gensim Library

In [13]:
#!pip install gensim

In [14]:
import gensim
import warnings
warnings.filterwarnings('ignore')

In [15]:
import logging

# Show INFO-level log records (the training progress lines printed further
# down come through this logger), each prefixed with timestamp and severity.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [16]:
#This is needed only if you have uploaded data to Google drive
#from google.colab import drive
#drive.mount('/gdrive')

In [17]:
import pandas as pd

# Point this at wherever the zip file is stored, e.g. on Colab:
# '/gdrive/My Drive/Statistical NLP AIML/unlabeledTrainData.tsv.zip'
reviews_path = 'unlabeledTrainData.tsv.zip'

# Tab-separated file with a header row; quoting=3 (csv.QUOTE_NONE) leaves the
# double quotes inside the fields as literal characters.
df = pd.read_csv(reviews_path, header=0, delimiter="\t", quoting=3)
print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


### Function to Clean up data

In [18]:
import re, string

def clean_str(string):
  """
  Normalise a raw review into lowercase, space-separated alphabetic words.

  Steps: remove URLs, replace every character that is not an ASCII letter
  with a space, lowercase, and collapse runs of whitespace.

  Args:
    string: Raw review text. (The name shadows the ``string`` module inside
      this function; it is kept so existing keyword callers keep working.)

  Returns:
    The cleaned text, or "" when the input is not a ``str`` (for example a
    NaN/None coming from a missing value).
  """
  # Best-effort for missing values: return an empty document instead of
  # raising, but without a bare ``except`` that would also hide real bugs.
  if not isinstance(string, str):
    return ""
  # Strip URLs wherever they occur. The previous pattern was anchored to the
  # start of a line and required a literal "<>" after "://", so it never
  # matched and URL fragments leaked into the vocabulary.
  string = re.sub(r"https?://\S+", " ", string)
  # Digits, punctuation and markup characters all become separators.
  string = re.sub(r"[^A-Za-z]", " ", string)
  # split() without arguments already discards empty tokens.
  return " ".join(string.lower().split())

### Clean the Data using routine above

In [19]:
# Add a 'clean_review' column by running clean_str over every raw review,
# then preview the first rows to compare raw and cleaned text.
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


In [20]:
# Tokens of the first cleaned review (row label 0), split on single spaces
df.loc[0, 'clean_review'].split(' ')

['watching',
 'time',
 'chasers',
 'it',
 'obvious',
 'that',
 'it',
 'was',
 'made',
 'by',
 'a',
 'bunch',
 'of',
 'friends',
 'maybe',
 'they',
 'were',
 'sitting',
 'around',
 'one',
 'day',
 'in',
 'film',
 'school',
 'and',
 'said',
 'hey',
 'let',
 's',
 'pool',
 'our',
 'money',
 'together',
 'and',
 'make',
 'a',
 'really',
 'bad',
 'movie',
 'or',
 'something',
 'like',
 'that',
 'what',
 'ever',
 'they',
 'said',
 'they',
 'still',
 'ended',
 'up',
 'making',
 'a',
 'really',
 'bad',
 'movie',
 'dull',
 'story',
 'bad',
 'script',
 'lame',
 'acting',
 'poor',
 'cinematography',
 'bottom',
 'of',
 'the',
 'barrel',
 'stock',
 'music',
 'etc',
 'all',
 'corners',
 'were',
 'cut',
 'except',
 'the',
 'one',
 'that',
 'would',
 'have',
 'prevented',
 'this',
 'film',
 's',
 'release',
 'life',
 's',
 'like',
 'that']

### Convert Review to a Word List

In [21]:
# Tokenise every cleaned review into its list of words.
# split() without an argument yields [] for an empty review, whereas
# split(' ') would yield [''] and hand an empty-string "word" to the model.
# For non-empty cleaned reviews (single-space separated) the result is the same.
documents = [doc.split() for doc in df['clean_review']]

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [22]:
# Number of tokens in the first review
print(len(documents[0]))

90


In [23]:
# Number of tokens in the second review
len(documents[1])

82

### Build the Model

In [25]:
# Training hyper-parameters for the Word2Vec model
w2v_params = dict(
    min_count=5,      # ignore words whose total frequency is lower than this
    workers=6,        # number of worker threads used for training
    vector_size=300,  # dimensionality of each word embedding
    window=5,         # maximum distance between current and predicted word
    epochs=10,        # number of passes over the text corpus
)

# Build the vocabulary from the tokenised reviews and train the model
model = gensim.models.Word2Vec(documents, **w2v_params)

2021-11-11 11:52:47,434 : INFO : collecting all words and their counts
2021-11-11 11:52:47,435 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-11-11 11:52:48,222 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2021-11-11 11:52:48,982 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2021-11-11 11:52:49,845 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2021-11-11 11:52:50,634 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2021-11-11 11:52:51,442 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2021-11-11 11:52:51,444 : INFO : Creating a fresh vocabulary
2021-11-11 11:52:51,815 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 39730 unique words (39.54060052349247%% of original 100479, drops 60749)', 'datetime': '2021-11-11T11:5

2021-11-11 11:53:37,018 : INFO : EPOCH 2 - PROGRESS: at 69.86% examples, 379797 words/s, in_qsize 8, out_qsize 3
2021-11-11 11:53:38,049 : INFO : EPOCH 2 - PROGRESS: at 74.66% examples, 381798 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:53:39,062 : INFO : EPOCH 2 - PROGRESS: at 79.32% examples, 382791 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:53:40,080 : INFO : EPOCH 2 - PROGRESS: at 83.85% examples, 383565 words/s, in_qsize 10, out_qsize 1
2021-11-11 11:53:41,081 : INFO : EPOCH 2 - PROGRESS: at 88.75% examples, 385668 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:53:42,112 : INFO : EPOCH 2 - PROGRESS: at 93.44% examples, 386700 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:53:43,122 : INFO : EPOCH 2 - PROGRESS: at 98.16% examples, 388220 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:53:43,429 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-11-11 11:53:43,446 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-11-11 11:53:43,4

2021-11-11 11:54:33,580 : INFO : EPOCH 5 - PROGRESS: at 17.90% examples, 385794 words/s, in_qsize 10, out_qsize 1
2021-11-11 11:54:34,617 : INFO : EPOCH 5 - PROGRESS: at 21.91% examples, 376566 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:54:35,629 : INFO : EPOCH 5 - PROGRESS: at 27.17% examples, 391155 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:54:36,644 : INFO : EPOCH 5 - PROGRESS: at 32.40% examples, 401260 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:54:37,677 : INFO : EPOCH 5 - PROGRESS: at 37.23% examples, 403619 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:54:38,695 : INFO : EPOCH 5 - PROGRESS: at 42.82% examples, 413952 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:54:39,741 : INFO : EPOCH 5 - PROGRESS: at 47.87% examples, 416102 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:54:40,765 : INFO : EPOCH 5 - PROGRESS: at 53.10% examples, 419525 words/s, in_qsize 10, out_qsize 1
2021-11-11 11:54:41,782 : INFO : EPOCH 5 - PROGRESS: at 58.27% examples, 422681 words/s,

2021-11-11 11:55:31,698 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-11-11 11:55:31,763 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-11 11:55:31,767 : INFO : EPOCH - 7 : training on 12084660 raw words (8901039 effective words) took 20.8s, 428104 effective words/s
2021-11-11 11:55:32,858 : INFO : EPOCH 8 - PROGRESS: at 5.28% examples, 458366 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:55:33,862 : INFO : EPOCH 8 - PROGRESS: at 10.48% examples, 457011 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:55:34,909 : INFO : EPOCH 8 - PROGRESS: at 16.00% examples, 459418 words/s, in_qsize 12, out_qsize 0
2021-11-11 11:55:35,927 : INFO : EPOCH 8 - PROGRESS: at 21.56% examples, 464888 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:55:36,930 : INFO : EPOCH 8 - PROGRESS: at 26.74% examples, 464214 words/s, in_qsize 11, out_qsize 0
2021-11-11 11:55:37,944 : INFO : EPOCH 8 - PROGRESS: at 32.08% examples, 465206 words/s, in_qsize 11, out_qsiz

2021-11-11 11:56:32,816 : INFO : EPOCH 10 - PROGRESS: at 98.24% examples, 404577 words/s, in_qsize 10, out_qsize 1
2021-11-11 11:56:33,094 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-11-11 11:56:33,098 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-11-11 11:56:33,150 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-11-11 11:56:33,152 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-11-11 11:56:33,162 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-11-11 11:56:33,184 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-11 11:56:33,186 : INFO : EPOCH - 10 : training on 12084660 raw words (8902042 effective words) took 22.0s, 404790 effective words/s
2021-11-11 11:56:33,191 : INFO : Word2Vec lifecycle event {'msg': 'training on 120846600 raw words (89014752 effective words) took 219.7s, 405184 effective words/s', 'datetime': '2021-11-11T11:56:33.

# Exploring the model

### How many words in the model

### Get an embedding for a word

In [26]:
# Look up the learned embedding vector for the word 'great'
model.wv['great']

array([-5.63788593e-01, -6.89677417e-01, -3.01642627e-01,  2.18799639e+00,
       -2.50496149e-01,  8.78002942e-01, -1.17330766e+00, -3.51755649e-01,
        1.30101585e+00,  1.03170145e+00, -5.80828428e-01,  1.41137004e+00,
        1.18380570e+00, -7.72952199e-01, -3.44287634e-01, -1.88736096e-01,
       -9.02953684e-01,  1.06075895e+00, -1.02779019e+00,  1.70692945e+00,
       -7.66478896e-01,  3.65525514e-01, -3.22231650e-01,  1.95920289e-01,
        4.21076924e-01, -1.31468916e+00,  1.75107563e+00,  6.16433203e-01,
        3.64473015e-02, -1.09578215e-01,  1.59861195e+00,  7.57923946e-02,
        1.92975760e+00,  1.64625376e-01,  9.86693799e-01, -1.74801648e-01,
        1.67123228e-01,  1.64913201e+00, -7.62779653e-01, -1.41143692e+00,
       -2.93485194e-01,  9.29598927e-01, -1.43900037e-01,  2.58652091e-01,
       -3.48249704e-01, -2.06696421e-01,  3.36588502e-01,  1.47412193e+00,
        8.17290962e-01, -6.15421653e-01,  6.21364295e-01, -6.72095835e-01,
       -1.08734906e+00, -

In [27]:
# Look up the learned embedding vector for the word 'amazing'
model.wv['amazing']

array([-0.5192292 ,  1.1820428 , -0.88530517, -0.5201202 ,  0.04064511,
       -0.7046154 , -1.378359  ,  1.2876801 ,  0.48252356,  1.3447504 ,
       -0.5131772 ,  1.7878995 , -0.19430386, -0.6603103 ,  0.55657107,
        0.8525216 , -0.61799127,  0.6510389 , -0.4742072 ,  1.0943985 ,
       -0.41665834,  2.268133  , -0.30502835,  0.32585153,  0.577128  ,
       -0.42187846,  1.2556466 , -0.90146893,  1.4661648 , -0.1984029 ,
        1.0675898 ,  0.92750037,  0.9585684 , -0.26613882, -0.62172794,
       -0.06825158,  0.87711036,  1.1523486 ,  0.08155826, -0.10478573,
       -0.09048163,  1.3414595 , -0.28803468, -0.11576611, -0.6109621 ,
        0.21873492, -0.22451432,  0.9202399 , -0.43053088, -0.45861286,
       -0.65917236, -0.4253391 , -0.09851655, -0.4524087 , -0.9050053 ,
        0.18435404,  0.89784175,  0.4950049 ,  1.010324  ,  0.47974342,
       -1.1468371 , -1.5887647 ,  1.322706  ,  2.1893675 ,  2.2134278 ,
       -1.2545288 ,  0.42036268,  0.532242  ,  1.0246134 , -0.75

### Finding Words which have similar meaning

In [28]:
# Words whose embeddings are closest to 'amazing', with their similarity scores
model.wv.most_similar('amazing')

[('incredible', 0.7770970463752747),
 ('awesome', 0.750782310962677),
 ('outstanding', 0.6993253231048584),
 ('astounding', 0.6844815015792847),
 ('exceptional', 0.674214780330658),
 ('astonishing', 0.6584542989730835),
 ('fantastic', 0.6551292538642883),
 ('excellent', 0.641973078250885),
 ('wonderful', 0.6101550459861755),
 ('extraordinary', 0.5955072045326233)]

In [29]:
# Words whose embeddings are closest to 'delhi'.
# NOTE(review): the neighbours returned are mostly unrelated place names and
# odd words — presumably 'delhi' is rare in this movie-review corpus; confirm.
model.wv.most_similar('delhi')

[('copenhagen', 0.557997465133667),
 ('donegal', 0.5411016345024109),
 ('hampshire', 0.5366488099098206),
 ('bluesmobile', 0.5328510403633118),
 ('nursemaid', 0.5291029810905457),
 ('cornwall', 0.5224801301956177),
 ('upstate', 0.5188958644866943),
 ('louisiana', 0.5188421607017517),
 ('orleans', 0.5145027041435242),
 ('mexico', 0.5084513425827026)]

### Find the word which is not like others

### Saving the model

In [30]:
# Persist the trained model to disk under the file prefix 'word2vec-movie-50'.
# NOTE(review): the "50" in the name does not match vector_size=300 used when
# the model was built — confirm the intended file name.
model.save('word2vec-movie-50')

2021-11-11 11:58:37,929 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'word2vec-movie-50', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-11-11T11:58:37.929976', 'gensim': '4.1.2', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2021-11-11 11:58:37,930 : INFO : storing np array 'vectors' to word2vec-movie-50.wv.vectors.npy
2021-11-11 11:58:38,061 : INFO : storing np array 'syn1neg' to word2vec-movie-50.syn1neg.npy
2021-11-11 11:58:38,165 : INFO : not storing attribute cum_table
2021-11-11 11:58:38,239 : INFO : saved word2vec-movie-50


In [19]:
# Load the previously saved model back from disk.
# NOTE(review): the recorded output of this cell is from an earlier run
# (older date and gensim version than the save cell) — re-run to refresh it.
model = gensim.models.Word2Vec.load('word2vec-movie-50')

2021-08-28 14:05:15,794 : INFO : loading Word2Vec object from word2vec-movie-50
2021-08-28 14:05:15,807 : INFO : loading wv recursively from word2vec-movie-50.wv.* with mmap=None
2021-08-28 14:05:15,808 : INFO : loading vectors from word2vec-movie-50.wv.vectors.npy with mmap=None
2021-08-28 14:05:15,831 : INFO : loading syn1neg from word2vec-movie-50.syn1neg.npy with mmap=None
2021-08-28 14:05:15,861 : INFO : setting ignored attribute cum_table to None
2021-08-28 14:05:16,241 : INFO : Word2Vec lifecycle event {'fname': 'word2vec-movie-50', 'datetime': '2021-08-28T14:05:16.241096', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


1. Word analogies can be written as vector equations, e.g. king + man = queen + ? (so ? = king + man − queen).
2. The training corpus here may be too small for such equations to resolve cleanly.

In [31]:
# Analogy query: 'woman' and 'king' contribute positively, 'man' negatively
# (king - man + woman); return the 10 closest words.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

[('queen', 0.43734657764434814),
 ('princess', 0.4300723075866699),
 ('commoner', 0.3797640800476074),
 ('countess', 0.3688327670097351),
 ('rajput', 0.3613698482513428),
 ('belle', 0.3526665270328522),
 ('empress', 0.3479282557964325),
 ('maria', 0.3453648090362549),
 ('saint', 0.3422326445579529),
 ('prince', 0.33791476488113403)]

In [32]:
# Analogy query: hero - man + woman; return the 5 closest words
model.wv.most_similar(positive=['woman', 'hero'], negative=['man'], topn=5)

[('heroine', 0.6039318442344666),
 ('protagonist', 0.48086097836494446),
 ('villain', 0.3953973352909088),
 ('girl', 0.3931638300418854),
 ('damsel', 0.3856784403324127)]

In [33]:
# Analogy query: father - man + woman, using the default number of results
model.wv.most_similar(positive=['woman','father'], negative=['man'])

[('mother', 0.6816985011100769),
 ('daughter', 0.6567928791046143),
 ('grandmother', 0.5871499180793762),
 ('sister', 0.580805242061615),
 ('aunt', 0.5742297768592834),
 ('wife', 0.5611377954483032),
 ('mom', 0.5472424030303955),
 ('parents', 0.5446527600288391),
 ('spouse', 0.5401191115379333),
 ('girlfriend', 0.5313502550125122)]

In [34]:
# Raw vector arithmetic on the embeddings: king + man - queen.
# The result is a plain vector, not a word; no nearest-word lookup is done here.
model.wv['king'] + model.wv['man'] - model.wv['queen']

array([-4.5481160e-01, -3.3779359e-01, -1.5244037e-02, -2.0211082e+00,
       -6.7356205e-01,  2.3306243e+00, -9.4789946e-01,  2.1488960e+00,
        2.3688393e+00,  1.7235390e+00,  1.1841875e+00,  6.4840496e-01,
       -7.3191440e-01, -8.4619927e-01, -6.3268524e-01,  1.4771940e+00,
       -2.2085562e+00,  1.3961335e+00,  1.9187338e+00,  3.2880797e+00,
        4.9677408e-01,  1.3372293e+00, -2.2337956e+00,  9.2195255e-01,
        7.8062302e-01, -1.7935306e-02, -6.9245040e-01,  1.6122983e+00,
       -1.2887175e+00, -1.2718621e+00, -7.3152548e-01, -1.6742787e+00,
        1.0483837e+00, -1.8832542e+00,  2.0220120e+00, -1.9787261e+00,
       -2.5853026e-01, -2.0706016e-01,  4.3984124e-01,  1.9710746e+00,
       -1.3892562e+00,  6.8506682e-01,  1.4197485e+00, -1.1302315e+00,
       -5.7005870e-01, -9.5372152e-01,  9.5599091e-01,  3.1192303e-03,
       -7.7563405e-01, -2.0905914e+00,  2.1767612e+00, -1.2706807e+00,
       -1.1860112e+00,  4.3988600e-02, -2.2893026e+00,  5.9431046e-01,
      