#### Word2Vec Implementation using Gensim on Amazon Reviews Dataset

In [2]:
import gensim
import pandas as pd

In [8]:
# Load the Cell Phones & Accessories reviews into a DataFrame.
# lines=True tells pandas to parse the file as JSON Lines (one JSON object per line).
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_json('Cell_Phones_and_Accessories_5.json', lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [None]:
print(df.shape)
# Tokenise every review with gensim's simple_preprocess: text is lowercased,
# punctuation is stripped and very short tokens are dropped (e.g. "I", or the
# "t" left over from "don't" -- which is why 'don' and 'won' show up as tokens).
# Stop words are NOT removed: tokens such as 'and', 'the' and 'was' are kept.
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)
review_text.head()

(194439, 9)


0    [they, look, good, and, stick, good, just, don...
1    [these, stickers, work, like, the, review, say...
2    [these, are, awesome, and, make, my, phone, lo...
3    [item, arrived, in, great, time, and, was, in,...
4    [awesome, stays, on, and, looks, great, can, b...
Name: reviewText, dtype: object

In [17]:
# Inspect the full token list produced for the review with index label 0.
review_text.loc[0]

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [20]:
# Configure the Word2Vec model and build its vocabulary from the tokenised reviews
# (no training happens here; that is done by the separate model.train call).
#   window=7     - context window size around each target word
#   min_count=4  - words seen fewer than 4 times are left out of the vocabulary
#   workers=6    - number of training threads
# progress_per controls how often the vocabulary scan reports progress via logging.
# NOTE(review): gensim documents that runs with workers > 1 are not exactly
# reproducible, so re-running may give slightly different similarity scores
# than the recorded outputs -- confirm if exact reproducibility matters.
model = gensim.models.Word2Vec(window=7, min_count=4, workers=6)
model.build_vocab(review_text, progress_per=1000)

In [21]:
# Train the embeddings on the tokenised reviews. total_examples and epochs are
# taken from the model itself (model.corpus_count, model.epochs) rather than hard-coded.
# NOTE(review): per the gensim docs the returned tuple is
# (effective word count, total raw word count) -- confirm against the installed version.
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(61341653, 83868975)

In [23]:
# Persist the trained model to the working directory so it can be reloaded
# later without retraining.
model.save("word2vec_amazon_reviews.model")

In [32]:
# List the nearest neighbours of "poor" in the learned vector space as
# (word, similarity score) pairs.
# A high score does not imply a synonym: the recorded output also contains
# opposite-sentiment words such as 'superb' and 'stellar'.
model.wv.most_similar("poor")

[('lousy', 0.6696669459342957),
 ('terrible', 0.6670495867729187),
 ('questionable', 0.6436570882797241),
 ('shoddy', 0.6257575154304504),
 ('horrible', 0.6214812994003296),
 ('degraded', 0.6128764152526855),
 ('high', 0.6080411672592163),
 ('superb', 0.6056307554244995),
 ('stellar', 0.6012518405914307),
 ('lackluster', 0.5901828408241272)]

In [38]:
# Print the similarity score for a few hand-picked word pairs, one per line.
# .item() unwraps the NumPy scalar so a plain Python float is printed.
for word_a, word_b in (('good', 'bad'), ('awesome', 'great'), ('nice', 'worse')):
    print(model.wv.similarity(word_a, word_b).item())

0.604501485824585
0.7582744359970093
-0.10036548972129822


###### Kaggle Link: https://www.kaggle.com/datasets/abdallahwagih/amazon-reviews
###### Dataset Link: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

