In [2]:
# One-time environment setup, kept commented out so it does not run on every execution.
# Uncomment to install the packages for the Python 3.9 interpreter:
# !py -3.9 -m pip install gensim
# !py -3.9 -m pip install python-Levenshtein

In [3]:
import os

import gensim
import pandas as pd

# Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is
stored as a JSON file and can be read using pandas.
Link to the Dataset:
http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

Then use the `gunzip` command to unzip the file in a Git Bash terminal.

In [31]:
# The file is read in JSON-lines mode (lines=True): one record per line.
df = pd.read_json(
    "datasets/reviews_Cell_Phones_and_Accessories_5.json",
    lines=True,
)

In [5]:
# Preview the first five rows of the loaded reviews.
df.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [6]:
# (rows, columns) of the loaded reviews frame.
df.shape

(194439, 9)

We will use only the `reviewText` column for our model. First we need to preprocess it, and for that we will use the gensim library.

In [20]:
# Character count of the first raw review, before any preprocessing.
len(df["reviewText"][0])

189

In [21]:
# Run gensim's preprocessing on the first review to see the tokens it yields.
gensim.utils.simple_preprocess(df["reviewText"][0])

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [18]:
# Total number of characters left in the first review after preprocessing.
# Summing token lengths directly replaces the original per-character inner
# loop, which added len(letter) == 1 for every letter of every token.
temp = sum(len(word) for word in gensim.utils.simple_preprocess(df.reviewText[0]))

print(temp)

143


In [22]:
# Tokenise every review; the result is a Series whose elements are token lists.
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [23]:
# Create an untrained Word2Vec model. No corpus is passed here, so the
# vocabulary still has to be built and the model trained afterwards.
#   window    - maximum distance between the current and predicted word within a sentence
#   min_count - ignore all words with total frequency lower than this
#   workers   - number of worker threads used for training (more threads = faster training)
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)

In [24]:
# Build the vocabulary from the tokenised reviews before training.
model.build_vocab(
    review_text,
    progress_per=1000,
)

In [25]:
# Epoch count stored on the model; it was not set in the constructor call,
# so this is whatever the library defaults to.
model.epochs

5

In [26]:
# Number of documents recorded by build_vocab; it should equal the number of reviews.
model.corpus_count

194439

In [27]:
# Train on the tokenised reviews, reusing the document count and epoch
# setting already stored on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(61504929, 83868975)

In [32]:
# Persist the trained model to disk. The target directory is created first
# (a no-op when it already exists) so the save does not fail on a fresh
# checkout where `models/` is missing.
model_path = 'models/word2vec-amazon-call-review.model'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [33]:
# Ten nearest neighbours of 'bad' in the learned embedding space.
model.wv.most_similar('bad', topn=10)

[('terrible', 0.6566386818885803),
 ('shabby', 0.6369579434394836),
 ('good', 0.6006110310554504),
 ('horrible', 0.5973445177078247),
 ('okay', 0.5532851815223694),
 ('sad', 0.5209133625030518),
 ('funny', 0.5194187164306641),
 ('poor', 0.5169872641563416),
 ('awful', 0.5162353515625),
 ('pathetic', 0.5102324485778809)]

In [34]:
# Ten nearest neighbours of 'good' in the learned embedding space.
model.wv.most_similar('good', topn=10)

[('decent', 0.8230326175689697),
 ('great', 0.775018572807312),
 ('nice', 0.6960697770118713),
 ('fantastic', 0.6919859647750854),
 ('superb', 0.6371326446533203),
 ('excellent', 0.6339433789253235),
 ('bad', 0.6006110310554504),
 ('exceptional', 0.5770646929740906),
 ('terrific', 0.5711997151374817),
 ('reasonable', 0.568253755569458)]

In [35]:
# Similarity score between the vectors for 'bad' and 'good'.
model.wv.similarity('bad','good')

0.60061103

# Exercise
Train a word2vec model on the Sports & Outdoors Reviews Dataset. Once you have trained the model, find the words most
similar to 'awful' and find the similarities between the following word pairs: ('good', 'great'), ('slow', 'steady').

https://www.kaggle.com/datasets/aarishasifkhan/sports-and-outdoor-review-dataset

In [36]:
# Load the Sports & Outdoors reviews in JSON-lines mode and preview the first rows.
df = pd.read_json(
    "datasets/Sports_and_Outdoors_5.json",
    lines=True,
)
df.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [37]:
# (rows, columns) of the Sports & Outdoors reviews frame.
df.shape

(296337, 9)

In [40]:
# Tokenise every Sports & Outdoors review with gensim's preprocessing.
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)

In [41]:
# Display the tokenised reviews (pandas truncates the output).
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [43]:
# Fresh, untrained Word2Vec model for the Sports & Outdoors corpus, using the
# same window / min_count / workers settings as the cell-phone model.
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)

In [44]:
# Still 0 at this point: build_vocab has not been called on the new model yet.
model.corpus_count

0

In [45]:
# Epoch count stored on the new model (not set explicitly in the constructor call).
model.epochs

5

In [46]:
# Build the vocabulary from the tokenised Sports & Outdoors reviews.
model.build_vocab(
    review_text,
    progress_per=1000,
)

In [47]:
# After build_vocab this holds the number of documents in the corpus.
model.corpus_count

296337

In [48]:
# Train on the tokenised reviews, reusing the document count and epoch
# setting already stored on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91340397, 121496535)

Words most similar to 'awful', followed by the similarity of the word pairs
('good', 'great') and ('slow', 'steady'):

In [49]:
# Ten nearest neighbours of 'awful' in the Sports & Outdoors embedding space.
model.wv.most_similar('awful', topn=10)

[('horrible', 0.6661404371261597),
 ('overpowering', 0.6397011876106262),
 ('ugly', 0.6280115246772766),
 ('terrible', 0.6264380812644958),
 ('horrendous', 0.618258535861969),
 ('unpleasant', 0.5957173109054565),
 ('unusual', 0.5781636238098145),
 ('anomaly', 0.5716593861579895),
 ('authentic', 0.5691657066345215),
 ('overwhelming', 0.5670223236083984)]

In [50]:
# Similarity score between the vectors for 'good' and 'great'.
model.wv.similarity('good', 'great')

0.7890026

In [51]:
# Similarity score between the vectors for 'slow' and 'steady'.
model.wv.similarity('slow','steady')

0.37695825