In [2]:
import gensim
import pandas as pd

In [3]:
# Absolute path of the reviews CSV that this notebook reads.
reviews_csv = "/content/drive/MyDrive/Reviews.csv"

# Load the whole file into a DataFrame and show it as the cell output.
df = pd.read_csv(reviews_csv)
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(568454, 10)

# Simple Preprocessing & Tokenization

The first step in any data science task is to clean the data. For NLP, this typically means converting all words to lower case, trimming whitespace, and removing punctuation. We will do the same here.

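Additionally, one could remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and reduce words to their root forms (e.g. 'running' to 'run'). Those two steps are not applied in this notebook: the tokens shown below still contain words such as 'of', 'the' and 'and'.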

In [5]:
# Run gensim's simple_preprocess over every review body. The result is a
# Series with one list of lower-cased tokens per review.
text_column = df["Text"]
review_text = text_column.apply(gensim.utils.simple_preprocess)
review_text

0         [have, bought, several, of, the, vitality, can...
1         [product, arrived, labeled, as, jumbo, salted,...
2         [this, is, confection, that, has, been, around...
3         [if, you, are, looking, for, the, secret, ingr...
4         [great, taffy, at, great, price, there, was, w...
                                ...                        
568449    [great, for, sesame, chicken, this, is, good, ...
568450    [disappointed, with, the, flavor, the, chocola...
568451    [these, stars, are, small, so, you, can, give,...
568452    [these, are, the, best, treats, for, training,...
568453    [am, very, satisfied, product, is, as, adverti...
Name: Text, Length: 568454, dtype: object

In [6]:
# Token list produced for the review with index label 0.
review_text.loc[0]

['have',
 'bought',
 'several',
 'of',
 'the',
 'vitality',
 'canned',
 'dog',
 'food',
 'products',
 'and',
 'have',
 'found',
 'them',
 'all',
 'to',
 'be',
 'of',
 'good',
 'quality',
 'the',
 'product',
 'looks',
 'more',
 'like',
 'stew',
 'than',
 'processed',
 'meat',
 'and',
 'it',
 'smells',
 'better',
 'my',
 'labrador',
 'is',
 'finicky',
 'and',
 'she',
 'appreciates',
 'this',
 'product',
 'better',
 'than',
 'most']

In [7]:
# Raw, unprocessed text of the review with index label 0, for comparison
# with its tokenized form.
df.loc[0, "Text"]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

# Training the Word2Vec Model

Train the model on the reviews. Use a window of size 10, i.e. up to 10 words before the current word and 10 words after it. Only words that occur at least 2 times in the whole corpus are kept in the vocabulary; rarer words are ignored. This is configured with the `min_count` parameter.

`workers` defines how many CPU threads are used for training.

**Initialize the model**

In [8]:
# Hyperparameters for the (still untrained) Word2Vec model:
#   window    - maximum distance between the current and a context word
#   min_count - words seen fewer times than this in the corpus are ignored
#   workers   - number of worker threads used for training
w2v_params = {"window": 10, "min_count": 2, "workers": 4}

# No corpus is passed here, so this only configures the model; the
# vocabulary and the training run happen in later cells.
model = gensim.models.Word2Vec(**w2v_params)

**Build Vocabulary**

In [9]:
# Scan the tokenized reviews to build the model's vocabulary.
# progress_per controls how frequently progress is logged during the scan.
model.build_vocab(review_text, progress_per=1000)

**Train the Word2Vec Model**

In [10]:
# Train on the same tokenized reviews that were used to build the vocabulary.
# total_examples and epochs are read back from the model itself rather than
# hard-coded. The call returns a tuple of two counts, displayed as the cell
# output.
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(163770152, 215303610)

In [11]:
# Write the trained model to a file in the current working directory.
model.save("./word2vec-amazon-food-reviews-short.model")

# Finding Similar Words and Similarity between words

In [12]:
# Nearest neighbours of "bad" in the learned vector space, returned as
# (word, cosine similarity) pairs.
model.wv.most_similar(positive=["bad"])

[('terrible', 0.689831554889679),
 ('gross', 0.6616565585136414),
 ('horrible', 0.661617636680603),
 ('good', 0.660560667514801),
 ('nasty', 0.6580163836479187),
 ('weird', 0.6498607993125916),
 ('funny', 0.6461074948310852),
 ('shabby', 0.6069265007972717),
 ('awful', 0.60587078332901),
 ('offensive', 0.5970809459686279)]

In [13]:
# Similarity score between the vectors of two near-synonyms.
model.wv.similarity("cheap", "inexpensive")

0.46218523

In [16]:
# Similarity score between the vectors of two positive adjectives.
model.wv.similarity("great", "good")

0.75485754