### Load Gensim Library

In [1]:
!pip install gensim



In [0]:
import gensim

In [0]:
import logging
#Show INFO-level (and above) log records, each prefixed with timestamp and level.
#The training progress lines printed further down the notebook use this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [4]:
#Colab only: mount Google Drive at /gdrive so the dataset stored there can be read.
#Skip this cell if the data has not been uploaded to Google Drive.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [5]:
import pandas as pd

#Change the file path to point to where you have stored the zip file.
#header=0 takes the column names from the first row; quoting=3 is csv.QUOTE_NONE,
#so the double quotes around each field are kept as literal characters.
df = pd.read_csv('/gdrive/My Drive/AI-ML/unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)

#df.shape is (rows, columns)
print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [6]:
#Show the full text of the first review (df.head() only shows a truncated preview)
df.loc[0, 'review']

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

### Function to Clean up data

In [0]:
import re, string

def clean_str(string):
  """
  Normalize a raw review into lowercase, space-separated alphabetic tokens.

  Steps: drop http(s) URLs, replace every character outside A-Z/a-z with a
  space, lowercase, and collapse runs of whitespace into single spaces.

  Args:
    string: Raw review text. (The name shadows the `string` module inside
      this function; it is kept so existing keyword callers keep working.)

  Returns:
    The cleaned text, or "" when the input is not a str (for example a
    missing value).
  """
  # Non-text values used to be absorbed by a bare `except:`; handle them
  # explicitly so real errors and KeyboardInterrupt are no longer swallowed.
  if not isinstance(string, str):
    return ""
  # The previous pattern required a literal "<>" right after "://", so it
  # never matched a real URL. Substitute a space so neighbouring words do
  # not get glued together.
  text = re.sub(r'https?://\S+', ' ', string)
  text = re.sub(r"[^A-Za-z]", " ", text)
  # split() with no argument already discards empty tokens, so no extra
  # length filter is needed.
  return " ".join(text.lower().split())

### Clean the Data using routine above

In [8]:
#Add a 'clean_review' column by running clean_str on every raw review
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


### Convert Review to a Word List

In [9]:
#One token list per review.
#split() with no argument returns [] for an empty review, whereas split(' ')
#would return [''] and feed a bogus empty-string token to the model.
documents = [doc.split() for doc in df['clean_review']]

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [0]:
#documents[0]

### Build the Model

In [13]:
#Train a Word2Vec model on the tokenised reviews
#NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed them to `vector_size` and `epochs` - confirm against the installed version
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of worker threads used for training
                               size=50,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               iter=10   #Number of iterations over the text corpus
                              ) 

2020-06-13 06:12:11,712 : INFO : collecting all words and their counts
2020-06-13 06:12:11,713 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-06-13 06:12:12,229 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2020-06-13 06:12:12,720 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2020-06-13 06:12:13,230 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2020-06-13 06:12:13,734 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2020-06-13 06:12:14,246 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2020-06-13 06:12:14,247 : INFO : Loading a fresh vocabulary
2020-06-13 06:12:14,672 : INFO : effective_min_count=10 retains 28322 unique words (28% of original 100479, drops 72157)
2020-06-13 06:12:14,673 : INFO : effective_min_count=10 leaves 11910457 word cor

In [0]:
#documents[0]

# Exploring the model

### How many words are in the model?

In [14]:
#Model size: (number of words kept in the vocabulary, embedding dimension)
#`vectors` is the current name of the embedding matrix; `syn0` is a deprecated alias for it
model.wv.vectors.shape

  


(28322, 50)

In [15]:
# Vocabulary of the model: maps each retained word to its Vocab entry
model.wv.vocab

{'watching': <gensim.models.keyedvectors.Vocab at 0x7f1e268a70f0>,
 'time': <gensim.models.keyedvectors.Vocab at 0x7f1e268a7160>,
 'chasers': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e390>,
 'it': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e438>,
 'obvious': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e470>,
 'that': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e4e0>,
 'was': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e518>,
 'made': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e550>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e588>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e5c0>,
 'bunch': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e5f8>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e630>,
 'friends': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e668>,
 'maybe': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e6a0>,
 'they': <gensim.models.keyedvectors.Vocab at 0x7f1ddc64e6d8>,
 'were': <gensim.models.keyedvectors.Vocab at 0x7f

### Get an embedding for a word

In [18]:
#Look up the learned 50-dimensional vector for 'flower'
model.wv['flower']

array([-0.14810558, -0.71751475,  0.60191846, -0.18540882,  0.7577236 ,
       -0.2769248 ,  0.31264266,  0.7332015 ,  0.9792783 ,  1.4231142 ,
        0.7711015 , -0.62922776,  1.0930786 , -0.5608674 , -0.70426786,
        1.1644068 , -0.6798673 , -0.00758035, -0.5645974 , -0.17416641,
       -0.19669205,  1.0966979 ,  0.861114  ,  0.2700012 ,  0.57455045,
       -0.31086162, -1.2841402 , -0.29588827, -0.21599896, -0.39557472,
       -0.04301611, -0.111603  ,  0.9452557 , -0.78471106,  0.09595187,
       -1.8731594 , -0.06825525,  0.8234651 ,  0.37180844, -0.30128375,
       -0.5152668 , -0.43481266, -0.56009793,  0.5890917 , -0.993304  ,
       -0.20824367,  1.1495295 , -0.10978289, -0.1285663 , -0.55448395],
      dtype=float32)

### Finding Words which have similar meaning

In [19]:
#Nearest neighbours of 'great' in the embedding space, as (word, similarity score) pairs
model.wv.most_similar('great')

2020-06-13 06:19:22,321 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('fantastic', 0.887427568435669),
 ('wonderful', 0.8820511698722839),
 ('terrific', 0.8731141686439514),
 ('fine', 0.8548595309257507),
 ('good', 0.8238403797149658),
 ('brilliant', 0.804119884967804),
 ('superb', 0.7887728214263916),
 ('perfect', 0.768755316734314),
 ('nice', 0.7584500908851624),
 ('remarkable', 0.7455489635467529)]

### Find the word that does not belong with the others

In [20]:
#Query through model.wv: calling doesnt_match on the model itself is deprecated
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'kitchen'

### Saving the model

In [0]:
#Persist the trained model to a file named 'word2vec-movie-50' in the current working directory
model.save('word2vec-movie-50')

In [0]:
#Load the saved model back from disk
model = gensim.models.Word2Vec.load('word2vec-movie-50')

1. Analogy arithmetic: solve king + man = queen + ?, i.e. find the words closest to king + man - queen.
2. This corpus may not contain enough data for the analogy to come out cleanly.

In [21]:
#Words closest to king + man - queen.
#Query through model.wv: calling most_similar on the model itself is deprecated
model.wv.most_similar(positive=['king','man'], negative=['queen'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('soldier', 0.572564423084259),
 ('batman', 0.5400917530059814),
 ('joker', 0.5323717594146729),
 ('buio', 0.5322198271751404),
 ('prophecy', 0.5291051268577576),
 ('spielberg', 0.5290843844413757),
 ('scientist', 0.5277968645095825),
 ('boy', 0.5190491676330566),
 ('seagal', 0.5157371163368225),
 ('marine', 0.4934084713459015)]

In [0]:
#Raw vector for king + man - queen, the same combination passed to most_similar above
model.wv['king'] + model.wv['man'] - model.wv['queen']