### Load Gensim Library

In [1]:
!pip install gensim --quiet

You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import gensim

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [4]:
import pandas as pd

#change file path to point to where you have stored the zip file.
df = pd.read_csv('unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)

print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


### Function to Clean up data

In [5]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

### Clean the Data using routine above

In [6]:
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


In [7]:
df.shape

(50000, 3)

In [8]:
df.loc[0, 'clean_review']

'watching time chasers it obvious that it was made by a bunch of friends maybe they were sitting around one day in film school and said hey let s pool our money together and make a really bad movie or something like that what ever they said they still ended up making a really bad movie dull story bad script lame acting poor cinematography bottom of the barrel stock music etc all corners were cut except the one that would have prevented this film s release life s like that'

### Convert Review to a Word List

In [9]:
#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review']:
    documents.append(doc.split(' '))

print(len(documents))


50000


In [10]:
print(documents[0])

['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [11]:
len(documents[100])

57

### Build the Model

In [12]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPU Cores
                               size=50,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               iter=10   #Number of iterations over the text corpus
                              )  

2020-06-06 18:48:15,515 : INFO : collecting all words and their counts
2020-06-06 18:48:15,516 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-06-06 18:48:15,933 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2020-06-06 18:48:16,381 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2020-06-06 18:48:16,842 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2020-06-06 18:48:17,301 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2020-06-06 18:48:17,757 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2020-06-06 18:48:17,758 : INFO : Loading a fresh vocabulary
2020-06-06 18:48:18,092 : INFO : effective_min_count=10 retains 28322 unique words (28% of original 100479, drops 72157)
2020-06-06 18:48:18,093 : INFO : effective_min_count=10 leaves 11910457 word cor

# Exploring the model

### How many words in the model

In [13]:
#Model size
model.wv.syn0.shape

  


(28322, 50)

In [14]:
# Vocablury of the model
model.wv.vocab

{'watching': <gensim.models.keyedvectors.Vocab at 0x7fdbc14ef310>,
 'time': <gensim.models.keyedvectors.Vocab at 0x7fdbc14efd10>,
 'chasers': <gensim.models.keyedvectors.Vocab at 0x7fdbc14efd90>,
 'it': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3a10>,
 'obvious': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3b10>,
 'that': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3ad0>,
 'was': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3590>,
 'made': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3890>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3710>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f37d0>,
 'bunch': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3810>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3950>,
 'friends': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3a50>,
 'maybe': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3b50>,
 'they': <gensim.models.keyedvectors.Vocab at 0x7fdbc14f3750>,
 'were': <gensim.models.keyedvectors.Vocab at 0x7f

### Get an embedding for a word

In [15]:
model.wv['flower']

array([-4.34260935e-01,  8.02268088e-01, -4.69377643e-04,  9.05431867e-01,
       -9.80103850e-01,  9.65716600e-01,  1.33022487e+00, -4.53235298e-01,
       -2.29285344e-01,  1.06126584e-01,  1.54020011e-01,  7.77248383e-01,
        9.00823951e-01, -8.74153912e-01, -3.29724193e-01,  5.69765456e-02,
        7.19763994e-01,  8.26199532e-01,  1.12421560e+00,  3.07759553e-01,
       -9.69164073e-01, -9.39934373e-01,  3.06783110e-01,  2.36838758e-01,
        5.33786356e-01,  7.83444557e-05, -7.61947751e-01,  1.16868055e+00,
        1.46542117e-01,  1.11196709e+00,  4.32975553e-02, -7.11031184e-02,
       -1.32739699e+00, -1.29037249e+00, -9.47170034e-02, -1.06679010e+00,
        1.53662717e+00, -8.08721483e-01, -5.88325381e-01, -7.85349727e-01,
       -4.47711080e-01, -5.19469008e-02, -3.42639923e-01, -2.45102301e-01,
       -4.44107354e-01,  5.95890135e-02,  6.58604324e-01, -8.47474858e-03,
       -6.41969442e-01,  3.80336761e-01], dtype=float32)

### Finding Words which have similar meaning

In [16]:
model.wv.most_similar('great')

2020-06-06 18:50:31,396 : INFO : precomputing L2-norms of word weight vectors


[('fantastic', 0.8879280090332031),
 ('wonderful', 0.8793569207191467),
 ('terrific', 0.8775209188461304),
 ('fine', 0.8503574728965759),
 ('good', 0.821395993232727),
 ('superb', 0.7864354848861694),
 ('brilliant', 0.7832158803939819),
 ('perfect', 0.7734867334365845),
 ('nice', 0.7627489566802979),
 ('remarkable', 0.7532870769500732)]

### Find the word which is not like others

In [17]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

### Saving the model

In [18]:
model.save('word2vec-movie-50')

2020-06-06 18:50:31,570 : INFO : saving Word2Vec object under word2vec-movie-50, separately None
2020-06-06 18:50:31,571 : INFO : not storing attribute vectors_norm
2020-06-06 18:50:31,572 : INFO : not storing attribute cum_table
2020-06-06 18:50:31,859 : INFO : saved word2vec-movie-50


In [19]:
#Load model from memory
model = gensim.models.Word2Vec.load('word2vec-movie-50')

2020-06-06 18:50:31,864 : INFO : loading Word2Vec object from word2vec-movie-50
2020-06-06 18:50:32,239 : INFO : loading wv recursively from word2vec-movie-50.wv.* with mmap=None
2020-06-06 18:50:32,240 : INFO : setting ignored attribute vectors_norm to None
2020-06-06 18:50:32,241 : INFO : loading vocabulary recursively from word2vec-movie-50.vocabulary.* with mmap=None
2020-06-06 18:50:32,241 : INFO : loading trainables recursively from word2vec-movie-50.trainables.* with mmap=None
2020-06-06 18:50:32,242 : INFO : setting ignored attribute cum_table to None
2020-06-06 18:50:32,242 : INFO : loaded word2vec-movie-50


1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [20]:
model.most_similar(positive=['king','man'], negative=['queen'])

  """Entry point for launching an IPython kernel.
2020-06-06 18:50:32,302 : INFO : precomputing L2-norms of word weight vectors


[('joker', 0.5461510419845581),
 ('himself', 0.530073881149292),
 ('tong', 0.5214005708694458),
 ('master', 0.5085506439208984),
 ('spielberg', 0.5047746896743774),
 ('batman', 0.5030048489570618),
 ('soldier', 0.5021239519119263),
 ('scientist', 0.498812735080719),
 ('nemesis', 0.4957764744758606),
 ('buio', 0.4940866231918335)]

In [21]:
model.wv['king'] + model.wv['man'] - model.wv['queen']

array([-0.43550405,  3.025441  ,  2.0266876 ,  1.5845662 ,  2.531186  ,
        2.752121  , -2.8920875 , -3.948574  ,  0.9305538 , -1.6557367 ,
        0.16165793, -0.688841  , -2.6849446 ,  0.20468831,  5.263345  ,
       -0.454705  ,  1.1698976 , -0.08947134,  1.7938306 , -3.735002  ,
       -2.4357042 ,  1.0794291 ,  1.049356  , -0.5929636 , -0.3230077 ,
        0.63850605,  3.1798694 ,  4.4344    , -2.2810762 ,  0.86981905,
       -3.8866887 ,  5.330084  , -3.8001685 , -0.2797408 ,  0.5454081 ,
        4.084464  , -2.1803412 , -1.8875592 , -3.8348904 ,  4.7690983 ,
       -2.21694   , -5.4260883 , -2.90832   ,  0.67111504, -1.7269541 ,
       -0.55388296,  0.98585474,  1.8247118 , -1.9388968 ,  1.5114752 ],
      dtype=float32)