 
 # Word2Vec tutorial using GENSIM
 
 In this tutorial we first show how to use pretrained models, then we train our own model on a custom dataset. 
 
 ### Part 1:  Pretrained models
 Gensim provides several pre-trained models that can be downloaded and used directly.


In [None]:
# install gensim package to load and use the model
!pip install gensim

In [13]:
import gensim

We will load the model, demonstrate some features of the gensim package, and then compute a sentence vector using the word embeddings

In [2]:
import gensim.downloader as gensim_pretrained

In [3]:
import gensim.downloader
# Print the names of all pretrained models the gensim downloader knows about.
model_catalogue = gensim.downloader.info()['models']
print(list(model_catalogue))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [4]:
# Fetch the downloader catalogue once; later cells reuse `info`.
info = gensim_pretrained.info()

# Show the first (name, metadata) pair in name order to see which metadata fields exist.
models_by_name = sorted(info["models"].items())
models_by_name[:1]

[('__testing_word2vec-matrix-synopsis',
  {'description': '[THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix.',
   'parameters': {'dimensions': 50},
   'preprocessing': 'Converted to w2v using a preprocessed corpus. Converted to w2v format with `python3.5 -m gensim.models.word2vec -train <input_filename> -iter 50 -output <output_filename>`.',
   'read_more': [],
   'checksum': '534dcb8b56a360977a269b7bfc62d124',
   'file_name': '__testing_word2vec-matrix-synopsis.gz',
   'parts': 1})]

In [3]:
# One summary line per pretrained model: name, record count, start of the description.
for model_name, model_data in sorted(info["models"].items()):
    # Entries without a 'num_records' field are reported as -1.
    record_count = model_data.get('num_records', -1)
    # Only the first 20 characters of the description are shown.
    short_description = model_data['description'][:20] + '...'
    print('%s (%d records) : %s' % (model_name, record_count, short_description))

__testing_word2vec-matrix-synopsis (-1 records) : [THIS IS ONLY FOR TE...
conceptnet-numberbatch-17-06-300 (1917247 records) : ConceptNet Numberbat...
fasttext-wiki-news-subwords-300 (999999 records) : 1 million word vecto...
glove-twitter-100 (1193514 records) : Pre-trained vectors ...
glove-twitter-200 (1193514 records) : Pre-trained vectors ...
glove-twitter-25 (1193514 records) : Pre-trained vectors ...
glove-twitter-50 (1193514 records) : Pre-trained vectors ...
glove-wiki-gigaword-100 (400000 records) : Pre-trained vectors ...
glove-wiki-gigaword-200 (400000 records) : Pre-trained vectors ...
glove-wiki-gigaword-300 (400000 records) : Pre-trained vectors ...
glove-wiki-gigaword-50 (400000 records) : Pre-trained vectors ...
word2vec-google-news-300 (3000000 records) : Pre-trained vectors ...
word2vec-ruscorpora-300 (184973 records) : Word2vec Continuous ...


In [4]:
# Let's explore different types of word embeddings.
# Each load() call fetches the named model through the gensim downloader.

# fastText: load the "fasttext-wiki-news-subwords-300" embeddings
fasttext = gensim_pretrained.load('fasttext-wiki-news-subwords-300')

# GloVe: load the "glove-twitter-50" embeddings
glove = gensim_pretrained.load('glove-twitter-50')

# Word2Vec: load the "word2vec-google-news-300" embeddings
w2v = gensim_pretrained.load("word2vec-google-news-300")





In [5]:
# Find the words closest to "man" in the fastText embeddings.
# topn=10 is the library default, spelled out here for clarity.
fasttext.most_similar('man', topn=10)

[('woman', 0.8119808435440063),
 ('man--', 0.7323855757713318),
 ('man--and', 0.7230692505836487),
 ('person', 0.7203925848007202),
 ('mad-man', 0.7037578225135803),
 ('guy', 0.6992257833480835),
 ('god-man', 0.69350266456604),
 ('boy-man', 0.6925113797187805),
 ('man--the', 0.6904609203338623),
 ('man-love', 0.687400221824646)]

In [6]:
# Same nearest-neighbour query against the GloVe (Twitter) embeddings.
glove.most_similar('man', topn=10)

[('boy', 0.7652449011802673),
 ('dude', 0.752370297908783),
 ('guy', 0.737877368927002),
 ('was', 0.7247805595397949),
 ("'s", 0.7206680774688721),
 ('bad', 0.7175806164741516),
 ('men', 0.7122883200645447),
 ('hell', 0.7033430337905884),
 ('shit', 0.7005720138549805),
 ('that', 0.6958515644073486)]

In [9]:
# NOTE(review): the equivalent Word2Vec query is disabled and the reason is not
# recorded here — TODO confirm, then either enable it or remove this cell.
# w2v.most_similar('man')

In [15]:
# Compare the distance between related words with the distance between unrelated ones.
# NOTE(review): "women" is plural while "man" is singular — "woman" may have been
# intended; confirm before relying on this number.
fasttext.distance(w1="man", w2="women")


0.4535101056098938

In [16]:
# Unrelated pair, for comparison (fastText).
fasttext.distance(w1="man", w2="cat")

0.5612933933734894

In [17]:
# Related pair (GloVe).
# NOTE(review): plural "women" vs singular "man" — confirm "woman" was not intended.
glove.distance(w1="man", w2="women")

0.4418826699256897

In [18]:
# Unrelated pair, for comparison (GloVe).
glove.distance(w1="man", w2="tree")

0.6139519810676575

In [20]:
# Same unrelated pair with the Word2Vec (Google News) embeddings.
w2v.distance(w1="man", w2="tree")

0.7706254124641418

In [22]:
# Analogy query: "king" - "man" + "women" -> expected to land near "queen".
# NOTE(review): the classic analogy uses singular "woman"; the plural "women"
# is used here — confirm intent.
glove.most_similar_cosmul(
    positive=["king", "women"],
    negative=["man"],
    topn=10,
)

[('royal', 0.9480992555618286),
 ('queen', 0.8974658846855164),
 ('african', 0.8896613717079163),
 ('american', 0.8735426068305969),
 ('heritage', 0.8707107305526733),
 ('republic', 0.8638789653778076),
 ('luxury', 0.858354389667511),
 ('goddess', 0.85495924949646),
 ('british', 0.8545292615890503),
 ('egyptian', 0.8537889122962952)]

In [24]:
# The same analogy query against the fastText embeddings.
# NOTE(review): plural "women" vs singular "man" — confirm "woman" was not intended.
fasttext.most_similar_cosmul(
    positive=["king", "women"],
    negative=["man"],
    topn=10,
)

[('kings', 0.9261068105697632),
 ('queens', 0.9183101654052734),
 ('monarchs', 0.8894803524017334),
 ('queen', 0.8843674659729004),
 ('princesses', 0.8770883679389954),
 ('kingships', 0.8723795413970947),
 ('noblewomen', 0.8700087070465088),
 ('co-rulers', 0.8698115348815918),
 ('princes', 0.8625684380531311),
 ('rulers', 0.8621167540550232)]

In [25]:
# Another analogy query: "cricket" - "bowler" + "batter".
fasttext.most_similar_cosmul(
    positive=["cricket", "batter"],
    negative=["bowler"],
    topn=10,
)

[('baseball', 0.837321400642395),
 ('batters', 0.832930862903595),
 ('crickets', 0.8299791216850281),
 ('batterings', 0.82322758436203),
 ('battering', 0.8182348608970642),
 ('bat-and-ball', 0.8147028088569641),
 ('pitch', 0.813496470451355),
 ('ground', 0.8052932024002075),
 ('battery', 0.8052542805671692),
 ('batting-practice', 0.8051658868789673)]

### Part 2: Training the Word2Vec Model for a custom dataset

Dataset source: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/#subsets



In [1]:
import pandas as pd

In [7]:
# Load the full Appliances review file (JSON Lines: one JSON record per line).
df_0 = pd.read_json("Appliances.json", lines=True)

# Preview the first three reviews.
df_0.head(n=3)


Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5,2.0,False,"11 27, 2013",A3NHUQ33CFH3VM,1118461304,{'Format:': ' Hardcover'},Greeny,Not one thing in this book seemed an obvious o...,Clear on what leads to innovation,1385510400,
1,5,,False,"11 1, 2013",A3SK6VNBQDNBJE,1118461304,{'Format:': ' Kindle Edition'},Leif C. Ulstrup,I have enjoyed Dr. Alan Gregerman's weekly blo...,Becoming more innovative by opening yourself t...,1383264000,
2,5,,False,"10 10, 2013",A3SOFHUR27FO3K,1118461304,{'Format:': ' Hardcover'},Harry Gilbert Miller III,Alan Gregerman believes that innovation comes ...,The World from Different Perspectives,1381363200,


In [8]:
# (rows, columns) of the full Appliances dataset
df_0.shape

(602777, 12)

This is a huge dataset. Since we are working on a local machine with only a CPU, let's download and load a smaller version of the same dataset.

In [9]:
# Load the smaller Appliances review file (JSON Lines: one JSON record per line).
df = pd.read_json("Appliances_5.json", lines=True)

# Preview the first two reviews.
df.head(n=2)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"08 22, 2013",A34A1UP40713F8,B00009W3I4,{'Style:': ' Dryer Vent'},James. Backus,I like this as a vent as well as something tha...,Great product,1377129600,,
1,5,True,"02 8, 2016",A1AHW6I678O6F2,B00009W3PA,{'Size:': ' 6-Foot'},kevin.,good item,Five Stars,1454889600,,


In [10]:
# (rows, columns) of the smaller dataset used for training below
df.shape

(2277, 12)

##### Initial Text Processing and Tokenization
Before embarking on any data science project, it's crucial to prepare the data. In the realm of NLP, this often involves basic tasks such as lowering all letter cases, trimming extra spaces, and discarding punctuation.

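Furthermore, it's common to eliminate stop words—common words like 'and', 'or', 'is', 'the', 'a', and 'an'—and to reduce words to their base form, turning 'running' into 'run', for example. In this notebook we only apply gensim's `simple_preprocess`, which lowercases and tokenizes the text and drops punctuation and very short tokens; as the output below shows, it does not remove stop words or reduce words to their base form.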

In [11]:
# Raw text of the first review, before any preprocessing.
df["reviewText"][0]

'I like this as a vent as well as something that will keep house warmer in winter.  I sanded it and then painted it the same color as the house.  Looks great.'

In [19]:
# Tokenize the first review with gensim's built-in preprocessor.
gensim.utils.simple_preprocess(df["reviewText"][0])

['like',
 'this',
 'as',
 'vent',
 'as',
 'well',
 'as',
 'something',
 'that',
 'will',
 'keep',
 'house',
 'warmer',
 'in',
 'winter',
 'sanded',
 'it',
 'and',
 'then',
 'painted',
 'it',
 'the',
 'same',
 'color',
 'as',
 'the',
 'house',
 'looks',
 'great']

Let's apply this to all the rows.

In [15]:
# Tokenize every review; the result holds one list of tokens per row of `df`.
reviewText = df["reviewText"].apply(gensim.utils.simple_preprocess)

In [17]:
# Inspect the tokenized reviews (pandas shows a truncated view of the Series)
reviewText

0       [like, this, as, vent, as, well, as, something...
1                                            [good, item]
2                    [fit, my, new, lg, dryer, perfectly]
3                    [good, value, for, electric, dryers]
4                  [price, and, delivery, was, excellent]
                              ...                        
2272    [works, great, used, it, and, an, extension, k...
2273    [anyone, who, thinks, they, don, have, problem...
2274                                               [good]
2275                                    [washer, washing]
2276                     [great, product, fast, shipping]
Name: reviewText, Length: 2277, dtype: object

###### Model Setup: 
Begin training the model on the reviews dataset. Set the context window to 8, meaning the model considers up to 8 words before and after the current word. Set `min_count=2` so that words appearing fewer than 2 times in the whole corpus are ignored when the vocabulary is built.

The workers parameter specifies the number of CPU threads that will be utilized during the process.

In [28]:
from gensim.models import Word2Vec 

# Create an untrained Word2Vec model; the corpus is supplied later via
# build_vocab() and train().
#   window=8    -> context of up to 8 words on each side of the current word
#   min_count=2 -> words seen fewer than 2 times in the corpus are ignored
#   workers=6   -> number of worker threads used for training
model = Word2Vec(window=8, min_count=2, workers=6)



In [29]:
# Scan the tokenized reviews to build the model's vocabulary,
# reporting progress every 1000 reviews.
model.build_vocab(reviewText, progress_per=1000)

###  Train the model 

In [30]:
# Check the default number of training epochs configured on the model
model.epochs

5

In [31]:
# Train on the tokenized reviews.
# NOTE(review): this runs 3 epochs although `model.epochs` (shown above) is 5 —
# confirm the shorter run is deliberate.
model.train(
    reviewText,
    total_examples=model.corpus_count,
    epochs=3,
)


(1297750, 1824300)

In [32]:
# Save the trained model to a file in the current working directory
model.save("w2v-amazon-Appliances-review.model")

Now, let's check the model's performance

In [40]:
# Look up the learned embedding of a single word; the result is a NumPy array
vector = model.wv['excellent']  
vector

array([ 0.01955483,  0.026554  ,  0.12635495, -0.01358451,  0.08903369,
        0.01314539,  0.02698027,  0.04025428, -0.01104448, -0.01525382,
        0.01045332, -0.02405204,  0.02749079,  0.008655  ,  0.0624397 ,
       -0.00289201,  0.02717862, -0.06202495, -0.05749196, -0.07579448,
        0.05440379, -0.00650001,  0.03160439, -0.00333485,  0.03388236,
        0.05081904,  0.03923526, -0.02998827,  0.05778816, -0.03301499,
       -0.00275999, -0.06732249,  0.08404669, -0.07702573, -0.00540868,
       -0.00895185, -0.00865892, -0.00627251,  0.02088036, -0.04213507,
       -0.06489728, -0.04447998,  0.00944065,  0.03026509, -0.01459111,
       -0.011775  ,  0.01047484,  0.00696635,  0.01904456,  0.10540314,
        0.00980016, -0.04144798, -0.05155573,  0.02541336, -0.01923385,
       -0.03410445,  0.05050702,  0.01763752, -0.04362899, -0.02700694,
        0.0239468 ,  0.03908361, -0.07309889, -0.00395656, -0.0506328 ,
        0.11971781, -0.02126649,  0.11820494, -0.01267322, -0.01

In [42]:
 # Five words closest to "excellent" in the model we just trained.
model.wv.most_similar("excellent", topn=5)

[('delivery', 0.8176872730255127),
 ('charge', 0.8024957180023193),
 ('batteries', 0.8015746474266052),
 ('build', 0.774316132068634),
 ('removes', 0.739495575428009)]

In [43]:
# Similarity between "excellent" and its closest neighbour found above.
model.wv.similarity("excellent", "delivery")


0.8176872

In [44]:
# Similarity between "excellent" and its second-closest neighbour.
model.wv.similarity("excellent", "charge")

0.80249566