# The reviews data

The data comes as a collection of gzip-compressed files with one JSON-style record per line. We can import these easily with the following script (we only use the Video Games data).

In [1]:
import numpy as np
import pandas as pd
import gzip
import nltk

def parse(path):
  """Lazily yield one record per line of a gzip-compressed file.

  Args:
    path: location of the gzip file; each line must contain one
      Python/JSON-style literal that ``eval`` can evaluate.
  Yields:
    The object obtained by evaluating each line.
  """
  # The context manager closes the archive once the generator is exhausted
  # or closed; previously the file handle was never released.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY(review): eval() executes whatever code the file contains.
      # Only use on trusted dumps; consider ast.literal_eval / json.loads
      # if the data format allows it.
      yield eval(l)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file; that position
  becomes the row index of the returned frame.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load the Video Games reviews and the product metadata from local gzip dumps.
df_reviews = getDF('./datasets/hackathon/reviews_Video_Games_5.json.gz')
df_meta = getDF('./datasets/hackathon/meta_Video_Games.json.gz')

# Q&A loaders are disabled; the Q&A data is not used in the cells below.
# df_qa_toys_and_games = getDF('./datasets/hackathon/QA_Video_Games.json.gz')
# df_QA_toys_and_games = getDF('./data/QA_Toys_and_Games.json.gz')

We take a quick look at the data first, starting with the reviews dataframe:

In [2]:
# Preview the first rows of the reviews frame.
df_reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2HD75EMZR8QLN,700099867,123,"[8, 12]",Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,1341792000,"07 9, 2012"
1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,4.0,Good rally game,1372550400,"06 30, 2013"
2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",1st shipment received a book instead of the ga...,1.0,Wrong key,1403913600,"06 28, 2014"
3,A1DLMTOTHQ4AST,700099867,ampgreen,"[7, 10]","I got this version instead of the PS3 version,...",3.0,"awesome game, if it did not crash frequently !!",1315958400,"09 14, 2011"
4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,DIRT 3,1308009600,"06 14, 2011"


and then the metadata dataframe:

In [3]:
# Preview the first rows of the product metadata frame.
df_meta.head()

Unnamed: 0,asin,description,price,imUrl,related,salesRank,categories,title,brand
0,0078764343,Brand new sealed!,37.98,http://ecx.images-amazon.com/images/I/513h6dPb...,"{'also_bought': ['B000TI836G', 'B003Q53VZC', '...",{'Video Games': 28655},"[[Video Games, Xbox 360, Games]]",,
1,043933702X,In Stock NOW. Eligible for FREE Super Saving ...,23.5,http://ecx.images-amazon.com/images/I/61KKRndV...,"{'also_viewed': ['B000067NP1', '0439573947', '...",{'Video Games': 44080},"[[Video Games, PC, Games]]",,
2,0439339987,Grandma Groupers kelp seeds are missing and wi...,8.95,http://ecx.images-amazon.com/images/I/416QZg89...,"{'also_bought': ['B000314VVU', 'B000PXUOTE', '...",{'Video Games': 49836},"[[Video Games, PC, Games]]",,
3,0439342260,This software is BRAND NEW. Packaging may diff...,,http://ecx.images-amazon.com/images/I/61Wvu-Uj...,{'also_viewed': ['043934302X']},{'Video Games': 49156},"[[Video Games, PC, Games]]",,
4,0439339960,a scholastic clubs fairs cd rom game,,http://ecx.images-amazon.com/images/I/51k3oRCF...,{'also_viewed': ['B00028D7TG']},{'Video Games': 52262},"[[Video Games, PC, Games]]",,


For our purposes, it is important to note that the `related` feature carries additional structure. More precisely, `related` is a dictionary of product-ID lists whose keys include 'also_bought', 'bought_together', and 'buy_after_viewing' (the preview above also shows an 'also_viewed' key).

In [4]:
import re
from collections import Counter

# Token separator: one or more non-word characters.
split_regex = r'\W+'

def simpleTokenize(string):
    """Lower-case ``string`` and break it into word tokens.

    Args:
        string (str): text to tokenize
    Returns:
        list: the non-empty pieces left after splitting on ``split_regex``
    """
    pieces = re.split(split_regex, string.lower())
    # Splitting leaves '' at the edges when the text starts or ends with a
    # separator; filter(None, ...) discards those empty strings.
    return list(filter(None, pieces))

In [5]:
from nltk.stem.porter import PorterStemmer

def stem_tokens(tokens, stemmer):
    """Stem every token with the supplied stemmer.

    Args:
        tokens: iterable of token strings
        stemmer: object exposing a ``stem(token)`` method
    Returns:
        list: the stemmed tokens, in input order
    """
    return [stemmer.stem(token) for token in tokens]

In [6]:
from nltk.stem.wordnet import WordNetLemmatizer
# Shared stemmer / lemmatizer instances. In the visible cells they are only
# mentioned in commented-out code.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def simplefilter(string):
    """Tokenize ``string`` and drop every token found in ``stop_words``.

    Args:
        string (str): raw text
    Returns:
        list: lower-cased tokens with stop words removed
    """
    # `stop_words` is a module-level name looked up at call time, so it only
    # has to exist before this function is first called.
    # Stemming / lemmatization variants were tried here and are disabled.
    kept = []
    for token in simpleTokenize(string):
        if token not in stop_words:
            kept.append(token)
    return kept

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split repeatable.
train_set, test_set = train_test_split(df_reviews, test_size=0.2, random_state=42)

In [8]:
# Preview the training split (row labels are the original review positions).
train_set.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
41024,A1T6XL9P487RGC,B00009ZVHY,"Xider ""MWAHAHAHAHA!""","[2, 3]",Prince of Persia: The Sands of Time is a prett...,4.0,"Good, but difficult",1072828800,"12 31, 2003"
111044,AUNBHG6AVL8MO,B001C6GVI6,Johnmrrs,"[0, 0]",Although the game can be slow at times I love ...,5.0,One of my favorite games on the ps2,1356307200,"12 24, 2012"
19454,A2Z548GT5948WH,B00005ML0Z,"J. Perry ""Zak""","[0, 0]",This game eliminates boredom and allows one to...,3.0,A fun PS2 Game,1360972800,"02 16, 2013"
36629,A3PPXVR5J6U2JD,B00008G8OA,"Michael J. Tresca ""Talien""","[1, 2]",I've always been a Godzilla fan but never knew...,4.0,ROOOOOOOOOOOOOOOAR!,1113523200,"04 15, 2005"
58632,A1T7CM8MBPXCM1,B0009WPZOA,XDonkey,"[4, 4]",AWESOME GRAPHICS and VERY FUN gameplay put thi...,4.0,Tons of fun - Must Have!,1149379200,"06 4, 2006"


In [9]:
# Fetch the NLTK corpora used by this notebook.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# English stop words as a set, so the membership test in simplefilter is fast.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to C:\Users\Hari
[nltk_data]     Ravindran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Hari
[nltk_data]     Ravindran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Add a `Text` column holding the stop-word-filtered tokens of each review.
train_set = train_set.assign(Text = train_set['reviewText'].apply(simplefilter))

In [11]:
# Check the new `Text` column.
train_set.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Text
41024,A1T6XL9P487RGC,B00009ZVHY,"Xider ""MWAHAHAHAHA!""","[2, 3]",Prince of Persia: The Sands of Time is a prett...,4.0,"Good, but difficult",1072828800,"12 31, 2003","[prince, persia, sands, time, pretty, darn, go..."
111044,AUNBHG6AVL8MO,B001C6GVI6,Johnmrrs,"[0, 0]",Although the game can be slow at times I love ...,5.0,One of my favorite games on the ps2,1356307200,"12 24, 2012","[although, game, slow, times, love, whole, cas..."
19454,A2Z548GT5948WH,B00005ML0Z,"J. Perry ""Zak""","[0, 0]",This game eliminates boredom and allows one to...,3.0,A fun PS2 Game,1360972800,"02 16, 2013","[game, eliminates, boredom, allows, one, enjoy..."
36629,A3PPXVR5J6U2JD,B00008G8OA,"Michael J. Tresca ""Talien""","[1, 2]",I've always been a Godzilla fan but never knew...,4.0,ROOOOOOOOOOOOOOOAR!,1113523200,"04 15, 2005","[always, godzilla, fan, never, knew, much, met..."
58632,A1T7CM8MBPXCM1,B0009WPZOA,XDonkey,"[4, 4]",AWESOME GRAPHICS and VERY FUN gameplay put thi...,4.0,Tons of fun - Must Have!,1149379200,"06 4, 2006","[awesome, graphics, fun, gameplay, put, high, ..."


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the raw training reviews, with simplefilter as the tokenizer
# so stop words never enter the vocabulary. `tfs` is the resulting
# document-term matrix for the training set.
tfidf = TfidfVectorizer(tokenizer = simplefilter)
tfs = tfidf.fit_transform(train_set['reviewText'])

In [31]:
# Preview the held-out split.
test_set.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
168500,A17M1HL6U2GS7M,B0047TLIBU,Storylover,"[31, 43]","Been playing this for two days now, and I am v...",5.0,It feels like hanging out with old friends aga...,1299715200,"03 10, 2011"
189019,A1FGU7F9UJ264N,B0053BCP40,Kelly Randall,"[0, 0]","I really enjoyed playing this game, although i...",4.0,Fun game,1393459200,"02 27, 2014"
94086,A3R7M2EP1EYNVT,B000WCQWR6,Thomas B. Hileman Jr.,"[0, 0]",This was kind of a gamble and was looking for ...,3.0,Shoot em up,1373155200,"07 7, 2013"
168763,A369KP0JV77JYT,B00498T500,Jon,"[4, 4]",***Updated 7/7/2011 - update marked below***I ...,4.0,"Fun for a group, don't pay too much for it",1302393600,"04 10, 2011"
17314,ABIKTKAWOMY8,B00005BIG7,Chance Farley,"[3, 3]",It looks like we've come a long way from past ...,5.0,One of the best fps's to ever hit a console,994982400,"07 13, 2001"


In [32]:
# Pick one test review by its index label.
# NOTE(review): the name `str` shadows the Python builtin for the rest of the
# session. Later cells read this name, so renaming must be done everywhere.
str = test_set['reviewText'][168500]

In [33]:
# Show the selected review text.
str

'Been playing this for two days now, and I am very pleased so far. [Update: I\'ve been playing for a week now, and I like it more and more as the days have gone by!]  People have spoken a lot about how things are different than Dragon Age: Origins.  I\'m going to assume that you know that, and I\'m going to focus on what I have enjoyed about this game so far.  First--the graphics are fantastic.  We just finished Final Fantasy XIII, which had spectacular graphics, and then we started Two Worlds II.  The graphics on Two Worlds II were such a comedown that we had to stop the game for a while so that we would stop expecting FFXIII style graphics.  Well, it looks like Two Worlds II will have to wait a little longer, because upon popping in this game, I was blown away by how beautiful everything looks. Movements are smooth and natural, the scenery is crisp and deeply detailed.  Monsters are terrifying.  Facial expressions are better than DA:O.  Sometimes moving in and out of the cut scenes i

In [16]:
# Vectorise the single review with the fitted TF-IDF model. The review is
# wrapped in a list, so the result has one row.
response = tfidf.transform([str])
print (response)

  (0, 186399)	0.0579303654709
  (0, 185018)	0.0395782169566
  (0, 184272)	0.0412108074774
  (0, 183316)	0.0529558765048
  (0, 183254)	0.0351127639056
  (0, 183136)	0.0898647687942
  (0, 183073)	0.0187141476462
  (0, 183001)	0.0265999576371
  (0, 182147)	0.0463443518411
  (0, 181439)	0.0308148981093
  (0, 181182)	0.0210523834778
  (0, 180698)	0.0309527468899
  (0, 180243)	0.0661244844718
  (0, 180071)	0.0555371783051
  (0, 179824)	0.0591899610791
  (0, 179739)	0.038589990049
  (0, 179647)	0.0248622326721
  (0, 179608)	0.0146904555034
  (0, 178841)	0.0471399334202
  (0, 178537)	0.0225762096076
  (0, 177872)	0.0614286789627
  (0, 177848)	0.0432122534239
  (0, 177807)	0.0438685456828
  (0, 177048)	0.0406104395127
  (0, 176627)	0.0195281808456
  :	:
  (0, 17889)	0.0575050154598
  (0, 17461)	0.0492170571704
  (0, 16920)	0.0614918021669
  (0, 16766)	0.031381887935
  (0, 16482)	0.0302755435546
  (0, 16315)	0.02006154448
  (0, 16082)	0.0435003686723
  (0, 15582)	0.0295611302805
  (0, 15454)	0.0

In [34]:
# Map each non-zero TF-IDF column of the sample review back to its term.
# `get_feature_names` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out`; use whichever the installed version provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    print (feature_names[col], ' - ', response[0, col])

zur  -  0.0579303654709
yet  -  0.0395782169566
xiii  -  0.0412108074774
would  -  0.0529558765048
worth  -  0.0351127639056
worlds  -  0.0898647687942
world  -  0.0187141476462
working  -  0.0265999576371
wish  -  0.0463443518411
wife  -  0.0308148981093
whole  -  0.0210523834778
wheel  -  0.0309527468899
well  -  0.0661244844718
week  -  0.0555371783051
weapons  -  0.0591899610791
weaker  -  0.038589990049
ways  -  0.0248622326721
way  -  0.0146904555034
want  -  0.0471399334202
wait  -  0.0225762096076
voices  -  0.0614286789627
voice  -  0.0432122534239
vocal  -  0.0438685456828
video  -  0.0406104395127
version  -  0.0195281808456
various  -  0.0252081650923
us  -  0.0473008123222
urchin  -  0.0676889548962
upon  -  0.0275716531674
upgrading  -  0.0349572350484
upgrade  -  0.0524291847125
update  -  0.0798377829058
understanding  -  0.0756324197634
understand  -  0.0250814735603
undercurrents  -  0.0676889548962
unacceptable  -  0.0401344878156
ultimately  -  0.0951119197482
two  

In [18]:
# Confirm that transform returns a sparse matrix rather than a dense array.
type(response)

scipy.sparse.csr.csr_matrix

In [35]:
# Drop the columns that are not used for modelling below.
train_set_num = train_set.drop(["reviewerName", "helpful", "summary", "unixReviewTime", "reviewTime"], axis = 1)

In [36]:
# Preview the reduced training frame.
train_set_num.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,Text
41024,A1T6XL9P487RGC,B00009ZVHY,Prince of Persia: The Sands of Time is a prett...,4.0,"[prince, persia, sands, time, pretty, darn, go..."
111044,AUNBHG6AVL8MO,B001C6GVI6,Although the game can be slow at times I love ...,5.0,"[although, game, slow, times, love, whole, cas..."
19454,A2Z548GT5948WH,B00005ML0Z,This game eliminates boredom and allows one to...,3.0,"[game, eliminates, boredom, allows, one, enjoy..."
36629,A3PPXVR5J6U2JD,B00008G8OA,I've always been a Godzilla fan but never knew...,4.0,"[always, godzilla, fan, never, knew, much, met..."
58632,A1T7CM8MBPXCM1,B0009WPZOA,AWESOME GRAPHICS and VERY FUN gameplay put thi...,4.0,"[awesome, graphics, fun, gameplay, put, high, ..."


In [37]:
# Separate the regression target (the `overall` rating) from the features.
training_labels = train_set["overall"].copy()
training = train_set_num.drop("overall", axis=1)

In [38]:
# Preview the feature frame without the target column.
training.head()

Unnamed: 0,reviewerID,asin,reviewText,Text
41024,A1T6XL9P487RGC,B00009ZVHY,Prince of Persia: The Sands of Time is a prett...,"[prince, persia, sands, time, pretty, darn, go..."
111044,AUNBHG6AVL8MO,B001C6GVI6,Although the game can be slow at times I love ...,"[although, game, slow, times, love, whole, cas..."
19454,A2Z548GT5948WH,B00005ML0Z,This game eliminates boredom and allows one to...,"[game, eliminates, boredom, allows, one, enjoy..."
36629,A3PPXVR5J6U2JD,B00008G8OA,I've always been a Godzilla fan but never knew...,"[always, godzilla, fan, never, knew, much, met..."
58632,A1T7CM8MBPXCM1,B0009WPZOA,AWESOME GRAPHICS and VERY FUN gameplay put thi...,"[awesome, graphics, fun, gameplay, put, high, ..."


In [23]:
# training = training.assign(TFIDF = training['reviewText'].apply(lambda str: tfidf.transform([str])))

In [24]:
# training.head()

In [25]:
# training_prepared = training.drop(["reviewerID", "asin", "reviewText"], axis = 1)

In [26]:
# training_prepared.head()

In [27]:
# training_prepared["TFIDF"][0].todense()

In [28]:
# training_prepared.head()

In [29]:
# Memory issues

# training = training.assign(TFIDF = training['TFIDF'].apply(lambda x: x.todense()))

In [30]:
from sklearn.linear_model import SGDRegressor
# Linear regression trained by SGD with no regularisation penalty, an initial
# learning rate of 0.1 and a fixed seed.
# NOTE(review): `n_iter` is accepted by the scikit-learn version that produced
# the outputs below; newer releases use `max_iter` instead - confirm against
# the installed version before re-running.
sgd_reg = SGDRegressor(n_iter=500, penalty=None, eta0=0.1, random_state=42)

In [39]:
# Fit on the TF-IDF matrix of the training reviews against their ratings.
sgd_reg.fit(tfs, training_labels.ravel())

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.1,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=500, penalty=None, power_t=0.25,
       random_state=42, shuffle=True, verbose=0, warm_start=False)

In [40]:
# Inspect the learned intercept and the per-term coefficients.
sgd_reg.intercept_, sgd_reg.coef_

(array([ 3.93183008]),
 array([-0.55103209, -0.26251813, -0.3220666 , ...,  0.02161779,
         0.12169506, -0.08671662]))

In [41]:
# Re-display the held-out reviews used for the spot checks below.
test_set.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
168500,A17M1HL6U2GS7M,B0047TLIBU,Storylover,"[31, 43]","Been playing this for two days now, and I am v...",5.0,It feels like hanging out with old friends aga...,1299715200,"03 10, 2011"
189019,A1FGU7F9UJ264N,B0053BCP40,Kelly Randall,"[0, 0]","I really enjoyed playing this game, although i...",4.0,Fun game,1393459200,"02 27, 2014"
94086,A3R7M2EP1EYNVT,B000WCQWR6,Thomas B. Hileman Jr.,"[0, 0]",This was kind of a gamble and was looking for ...,3.0,Shoot em up,1373155200,"07 7, 2013"
168763,A369KP0JV77JYT,B00498T500,Jon,"[4, 4]",***Updated 7/7/2011 - update marked below***I ...,4.0,"Fun for a group, don't pay too much for it",1302393600,"04 10, 2011"
17314,ABIKTKAWOMY8,B00005BIG7,Chance Farley,"[3, 3]",It looks like we've come a long way from past ...,5.0,One of the best fps's to ever hit a console,994982400,"07 13, 2001"


In [42]:
# Predicted rating for the sample review vectorised earlier as `response`.
sgd_reg.predict(response)

array([ 5.23034673])

In [43]:
# Keep the true ratings aside, then reduce the test frame to the review text.
test_set_labels = test_set["overall"].copy()
test_set_cleaned = test_set.drop(["reviewerName", "helpful", "summary", "unixReviewTime", "reviewTime", "reviewerID", "asin", "overall"], axis = 1)

In [44]:
# Only the `reviewText` column should remain.
test_set_cleaned.head()

Unnamed: 0,reviewText
168500,"Been playing this for two days now, and I am v..."
189019,"I really enjoyed playing this game, although i..."
94086,This was kind of a gamble and was looking for ...
168763,***Updated 7/7/2011 - update marked below***I ...
17314,It looks like we've come a long way from past ...


In [45]:
# Vectorise all test reviews in ONE batched call. Calling tfidf.transform
# once per row inside .apply() repeats the per-call overhead for every review,
# which made this cell slow enough that it had to be interrupted.
# `test_tfs` can be passed directly to sgd_reg.predict to score every review.
test_tfs = tfidf.transform(test_set_cleaned['reviewText'])

# Keep the per-review `TFIDF` column for the cells that follow: row i of the
# batch matrix is the 1-row sparse matrix for review i.
tfidf_rows = np.empty(test_tfs.shape[0], dtype=object)
for row_idx in range(test_tfs.shape[0]):
    tfidf_rows[row_idx] = test_tfs[row_idx]
test = test_set_cleaned.assign(TFIDF = tfidf_rows)

KeyboardInterrupt: 

In [None]:
# Keep only the per-review TF-IDF column.
test_tfidf = test.drop('reviewText', axis = 1)
test_tfidf.head()

In [None]:
# Take the first five rows as a small sample.
test_sample= test_tfidf.iloc[:5]

In [None]:
# Underlying array of the sample: one TFIDF entry per row.
test_sample.values

In [None]:
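# Not working: each entry of the TFIDF column is a separate one-row sparse
# matrix, so `.values` is an array of objects rather than a feature matrix.
# To predict on all entries at once, vectorise the whole text column in a
# single call instead, e.g.
#   sgd_reg.predict(tfidf.transform(test_set['reviewText']))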
# sgd_reg.predict(test_sample.values)

In [46]:
# Spot-check a second test review, selected by its index label.
str2 = test_set['reviewText'][189019]
response2 = tfidf.transform([str2])

In [47]:
# Predicted rating for the second review.
sgd_reg.predict(response2)

array([ 4.46361506])

In [48]:
# TODO: run predict on all test entries at once to compute the RMSE, then
# fine-tune the SGD learning rate (and possibly the number of iterations).
# Until then, spot-check a third test review.
str3 = test_set['reviewText'][94086]
response3 = tfidf.transform([str3])
sgd_reg.predict(response3)

array([ 3.12583316])

In [35]:
import gensim, logging
# Show log messages at INFO level with a timestamp prefix.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
# Smoke test: word2vec expects a list of sentences, each a list of tokens.
sentences = [['first', 'sentence'], ['second', 'sentence']]
# train word2vec on the two sentences
model = gensim.models.Word2Vec(sentences, min_count=1)

2017-09-08 19:24:01,437 : INFO : collecting all words and their counts
2017-09-08 19:24:01,438 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-08 19:24:01,438 : INFO : collected 3 word types from a corpus of 4 raw words and 2 sentences
2017-09-08 19:24:01,438 : INFO : Loading a fresh vocabulary
2017-09-08 19:24:01,454 : INFO : min_count=1 retains 3 unique words (100% of original 3, drops 0)
2017-09-08 19:24:01,454 : INFO : min_count=1 leaves 4 word corpus (100% of original 4, drops 0)
2017-09-08 19:24:01,454 : INFO : deleting the raw counts dictionary of 3 items
2017-09-08 19:24:01,470 : INFO : sample=0.001 downsamples 3 most-common words
2017-09-08 19:24:01,470 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)
2017-09-08 19:24:01,470 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes
2017-09-08 19:24:01,485 : INFO : resetting layer weights
2017-09-08 19:24:01,485 : INFO : training model with 3 workers o

In [36]:
import nltk.data
# Download only the Punkt sentence-tokenizer models that the next cell loads.
# A bare nltk.download() opens the interactive downloader, which blocks an
# unattended "Restart & Run All".
nltk.download('punkt')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [49]:
# Load the pre-trained English Punkt model; used below to split reviews into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [50]:
def simplefilter2(string):
    """Tokenize ``string`` while keeping every token, stop words included.

    Args:
        string (str): input string
    Returns:
        list: all tokens produced by simpleTokenize, in order
    """
    # No stop-word filtering or stemming is applied here.
    return list(simpleTokenize(string))

In [52]:
# Same frame as train_set, but `Text` now keeps stop words (simplefilter2).
train_set2 = train_set.assign(Text = train_set['reviewText'].apply(simplefilter2))

In [53]:
# Check that `Text` now contains stop words as well.
train_set2.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Text
41024,A1T6XL9P487RGC,B00009ZVHY,"Xider ""MWAHAHAHAHA!""","[2, 3]",Prince of Persia: The Sands of Time is a prett...,4.0,"Good, but difficult",1072828800,"12 31, 2003","[prince, of, persia, the, sands, of, time, is,..."
111044,AUNBHG6AVL8MO,B001C6GVI6,Johnmrrs,"[0, 0]",Although the game can be slow at times I love ...,5.0,One of my favorite games on the ps2,1356307200,"12 24, 2012","[although, the, game, can, be, slow, at, times..."
19454,A2Z548GT5948WH,B00005ML0Z,"J. Perry ""Zak""","[0, 0]",This game eliminates boredom and allows one to...,3.0,A fun PS2 Game,1360972800,"02 16, 2013","[this, game, eliminates, boredom, and, allows,..."
36629,A3PPXVR5J6U2JD,B00008G8OA,"Michael J. Tresca ""Talien""","[1, 2]",I've always been a Godzilla fan but never knew...,4.0,ROOOOOOOOOOOOOOOAR!,1113523200,"04 15, 2005","[i, ve, always, been, a, godzilla, fan, but, n..."
58632,A1T7CM8MBPXCM1,B0009WPZOA,XDonkey,"[4, 4]",AWESOME GRAPHICS and VERY FUN gameplay put thi...,4.0,Tons of fun - Must Have!,1149379200,"06 4, 2006","[awesome, graphics, and, very, fun, gameplay, ..."


In [54]:
def review_to_sentences(review, tokenizer):
    """Break a review into sentences and tokenize each of them.

    Args:
        review (str): raw review text
        tokenizer: sentence splitter exposing ``tokenize(text)``
    Returns:
        list: one token list (built by simplefilter2) per non-empty sentence
    """
    return [
        simplefilter2(sentence)
        for sentence in tokenizer.tokenize(review.strip())
        if len(sentence) > 0
    ]

In [55]:
# Flat list of tokenized sentences collected from every training review;
# this is the corpus passed to word2vec below.
sentences = []

print ("Parsing sentences from training set...")
# The raw `reviewText` column is split here; the `Text` column is not used.
for review in train_set2['reviewText']:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set...


In [56]:
import logging
# Show gensim's training progress at INFO level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Hyper-parameters passed to Word2Vec below.
num_features = 300    # Word vector dimensionality

min_word_count = 0   # Minimum word count, passed as min_count

num_workers = 4       # Number of threads to run in parallel

context = 10          # Context window size

downsampling = 1e-3   # Downsample setting for frequent words


from gensim.models import word2vec

print ("Training model...")

# NOTE(review): the `size` keyword matches the gensim version that produced
# the log below; confirm the parameter name against the installed version.
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

# NOTE(review): presumably normalises the vectors in place and discards the
# raw ones, after which training cannot continue - confirm in the gensim docs.
model.init_sims(replace=True)

# Persist the trained model under a name that records its dimensionality and window.
model_name = "300features_10context"
model.save(model_name)

# Each word is now represented by a num_features-dimensional (300) vector.
# NOTE(review): the original note says these are stored in a numpy array
# called syn0 - confirm for the installed gensim version.

2017-09-08 20:41:14,340 : INFO : 'pattern' package not found; tag filters are not available for English
2017-09-08 20:41:14,425 : INFO : collecting all words and their counts
2017-09-08 20:41:14,440 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2017-09-08 20:41:14,741 : INFO : PROGRESS: at sentence #10000, processed 213123 words, keeping 13066 word types
2017-09-08 20:41:14,973 : INFO : PROGRESS: at sentence #20000, processed 424848 words, keeping 18550 word types
2017-09-08 20:41:15,173 : INFO : PROGRESS: at sentence #30000, processed 643007 words, keeping 22719 word types
2017-09-08 20:41:15,396 : INFO : PROGRESS: at sentence #40000, processed 854215 words, keeping 25850 word types
2017-09-08 20:41:15,628 : INFO : PROGRESS: at sentence #50000, processed 1071610 words, keeping 28833 word types
2017-09-08 20:41:15,797 : INFO : PROGRESS: at sentence #60000, processed 1297731 words, keeping 32070 word types
2017-09-08 20:41:16,009 : INFO : PROGRESS: at sentence #70000, processed 1510682 words, keeping 34444 word types
2017-09-08 20:41:16,156 : INFO : PROGRESS: at sentence #80000, processed 1721676 words, keeping 36808 word types
2017-09-08 20:41:16,445 : INFO : PROGRESS: at sentence #90000, processed 1937425 words, keeping 3896

2017-09-08 20:41:29,732 : INFO : PROGRESS: at sentence #730000, processed 15672695 words, keeping 112485 word types
2017-09-08 20:41:29,947 : INFO : PROGRESS: at sentence #740000, processed 15895346 words, keeping 113370 word types
2017-09-08 20:41:30,110 : INFO : PROGRESS: at sentence #750000, processed 16104136 words, keeping 114239 word types
2017-09-08 20:41:30,279 : INFO : PROGRESS: at sentence #760000, processed 16311636 words, keeping 114951 word types
2017-09-08 20:41:30,495 : INFO : PROGRESS: at sentence #770000, processed 16527937 words, keeping 115839 word types
2017-09-08 20:41:30,696 : INFO : PROGRESS: at sentence #780000, processed 16746301 words, keeping 116582 word types
2017-09-08 20:41:30,865 : INFO : PROGRESS: at sentence #790000, processed 16956911 words, keeping 117219 word types
2017-09-08 20:41:31,034 : INFO : PROGRESS: at sentence #800000, processed 17172229 words, keeping 117928 word types
2017-09-08 20:41:31,228 : INFO : PROGRESS: at sentence #810000, processe

2017-09-08 20:41:42,692 : INFO : PROGRESS: at sentence #1440000, processed 30900695 words, keeping 162390 word types
2017-09-08 20:41:42,862 : INFO : PROGRESS: at sentence #1450000, processed 31114920 words, keeping 163025 word types
2017-09-08 20:41:43,024 : INFO : PROGRESS: at sentence #1460000, processed 31328779 words, keeping 163630 word types
2017-09-08 20:41:43,194 : INFO : PROGRESS: at sentence #1470000, processed 31544512 words, keeping 164278 word types
2017-09-08 20:41:43,379 : INFO : PROGRESS: at sentence #1480000, processed 31763110 words, keeping 164909 word types
2017-09-08 20:41:43,542 : INFO : PROGRESS: at sentence #1490000, processed 31977005 words, keeping 165487 word types
2017-09-08 20:41:43,711 : INFO : PROGRESS: at sentence #1500000, processed 32195092 words, keeping 166067 word types
2017-09-08 20:41:43,880 : INFO : PROGRESS: at sentence #1510000, processed 32405577 words, keeping 166723 word types
2017-09-08 20:41:44,043 : INFO : PROGRESS: at sentence #1520000,

2017-09-08 20:42:35,223 : INFO : PROGRESS: at 5.56% examples, 319906 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:36,230 : INFO : PROGRESS: at 5.78% examples, 319927 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:37,252 : INFO : PROGRESS: at 6.04% examples, 321902 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:38,278 : INFO : PROGRESS: at 6.32% examples, 324752 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:39,279 : INFO : PROGRESS: at 6.62% examples, 328895 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:40,302 : INFO : PROGRESS: at 6.96% examples, 333971 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:41,311 : INFO : PROGRESS: at 7.34% examples, 340733 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:42,314 : INFO : PROGRESS: at 7.72% examples, 346916 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:43,328 : INFO : PROGRESS: at 8.10% examples, 353082 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:42:44,345 : INFO : PROGRESS: at 8.49% examples, 358840 words/s, in_q

2017-09-08 20:43:56,062 : INFO : PROGRESS: at 31.87% examples, 437411 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:43:57,073 : INFO : PROGRESS: at 32.25% examples, 438485 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:43:58,083 : INFO : PROGRESS: at 32.64% examples, 439544 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:43:59,088 : INFO : PROGRESS: at 33.02% examples, 440607 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:44:00,094 : INFO : PROGRESS: at 33.39% examples, 441579 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:44:01,098 : INFO : PROGRESS: at 33.77% examples, 442605 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:44:02,106 : INFO : PROGRESS: at 34.15% examples, 443603 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:44:03,114 : INFO : PROGRESS: at 34.53% examples, 444564 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:44:04,119 : INFO : PROGRESS: at 34.91% examples, 445535 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:44:05,137 : INFO : PROGRESS: at 35.29% examples, 446434 wor

2017-09-08 20:45:16,743 : INFO : PROGRESS: at 59.62% examples, 465182 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:45:17,752 : INFO : PROGRESS: at 59.97% examples, 465461 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:45:18,767 : INFO : PROGRESS: at 60.33% examples, 465721 words/s, in_qsize 8, out_qsize 0
2017-09-08 20:45:19,772 : INFO : PROGRESS: at 60.69% examples, 466037 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:45:20,774 : INFO : PROGRESS: at 61.06% examples, 466436 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:45:21,787 : INFO : PROGRESS: at 61.44% examples, 466835 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:45:22,793 : INFO : PROGRESS: at 61.80% examples, 467104 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:45:23,805 : INFO : PROGRESS: at 62.13% examples, 467195 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:45:24,811 : INFO : PROGRESS: at 62.49% examples, 467419 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:45:25,818 : INFO : PROGRESS: at 62.84% examples, 467632 wor

2017-09-08 20:46:37,464 : INFO : PROGRESS: at 87.89% examples, 478943 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:46:38,465 : INFO : PROGRESS: at 88.24% examples, 479044 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:46:39,471 : INFO : PROGRESS: at 88.59% examples, 479186 words/s, in_qsize 8, out_qsize 0
2017-09-08 20:46:40,479 : INFO : PROGRESS: at 88.95% examples, 479327 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:46:41,484 : INFO : PROGRESS: at 89.30% examples, 479443 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:46:42,490 : INFO : PROGRESS: at 89.66% examples, 479586 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:46:43,499 : INFO : PROGRESS: at 90.02% examples, 479718 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:46:44,511 : INFO : PROGRESS: at 90.37% examples, 479819 words/s, in_qsize 7, out_qsize 0
2017-09-08 20:46:45,524 : INFO : PROGRESS: at 90.73% examples, 479948 words/s, in_qsize 8, out_qsize 0
2017-09-08 20:46:46,525 : INFO : PROGRESS: at 91.09% examples, 480093 wor

In [57]:
# Sanity check: ask the model which word does not belong with the others.
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [58]:
# Three countries and one city.
model.doesnt_match("france england germany berlin".split())

'berlin'

In [59]:
# Three cities and one country.
model.doesnt_match("paris berlin london austria".split())

'austria'

In [60]:
# Nearest neighbours of "man"; the result below is dominated by game-title fragments.
model.most_similar("man")

[('manms', 0.6613045930862427),
 ('bitsy', 0.6534435749053955),
 ('itsy', 0.6252572536468506),
 ('man2', 0.6038957834243774),
 ('evolutions', 0.5582209825515747),
 ('xeviousmappygalpussuper', 0.5467562079429626),
 ('g33k', 0.5364798307418823),
 ('buster', 0.5130997896194458),
 ('maiden', 0.5038331747055054),
 ('men', 0.49635887145996094)]

In [61]:
 # Nearest neighbours of a strongly negative word.
 model.most_similar("awful")

[('terrible', 0.7304427623748779),
 ('horrible', 0.7195299863815308),
 ('atrocious', 0.7008128762245178),
 ('horrid', 0.6946996450424194),
 ('aweful', 0.6593258380889893),
 ('horrendous', 0.6587698459625244),
 ('lousy', 0.6055406332015991),
 ('abysmal', 0.5954902768135071),
 ('laughable', 0.5682377815246582),
 ('sloppy', 0.5604789853096008)]

In [62]:
# Nearest neighbours of "good"; note that antonyms such as "bad" and "lousy"
# also appear in the result below.
model.most_similar("good")

[('decent', 0.7168729305267334),
 ('great', 0.6842114925384521),
 ('phenomenal', 0.5741235613822937),
 ('cool', 0.5702375173568726),
 ('nice', 0.5524464845657349),
 ('unimpressive', 0.5510821342468262),
 ('terrific', 0.5473631024360657),
 ('lousy', 0.5346128940582275),
 ('bad', 0.5320186018943787),
 ('solid', 0.5264005064964294)]

In [52]:
# Number of tokenized sentences in the word2vec training corpus.
print(len(sentences))

1849559


In [69]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one review.

    Args:
        words: iterable of token strings
        model: trained word2vec model; its ``wv`` attribute is used both for
            vocabulary membership tests and for vector lookup
        num_features: dimensionality of the word vectors
    Returns:
        featureVec: array of shape (num_features,) holding the mean vector,
            or all zeros when none of the words is in the vocabulary
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    # Ask the keyed vectors directly instead of rebuilding a set of the whole
    # vocabulary on every call; that rebuild cost far more than the averaging
    # itself and was repeated for every single review.
    vectors = model.wv

    for word in words:
        if word in vectors:
            nwords += 1
            featureVec = np.add(featureVec, vectors[word])

    # Guard against division by zero for reviews without any known word.
    if nwords != 0:
        featureVec = np.divide(featureVec, float(nwords))

    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    """ Build one averaged word-vector row per review.
    Args:
        reviews: sequence of reviews (must support len()); each review is
            handed to makeFeatureVec unchanged
        model: word2vec model
        num_features: number of features
    Returns:
        reviewFeatureVecs: float32 array with one row per review and
            num_features columns
    """
    total = len(reviews)

    # Allocate the whole result matrix up front; rows are filled in below
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for position, review in enumerate(reviews):
        # Report progress at every 1000th review (including the first one)
        if position % 1000 == 0:
            print ("Review %d of %d" % (position, total))
        reviewFeatureVecs[position] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

In [70]:
# Run every training review through simplefilter, then average its word vectors
clean_train_reviews = [simplefilter(text) for text in train_set2['reviewText']]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print ("Creating average feature vecs for test reviews")

# Same two steps for the test reviews
clean_test_reviews = [simplefilter(text) for text in test_set['reviewText']]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 185424
Review 1000 of 185424
Review 2000 of 185424
Review 3000 of 185424
Review 4000 of 185424
Review 5000 of 185424
Review 6000 of 185424
Review 7000 of 185424
Review 8000 of 185424
Review 9000 of 185424
Review 10000 of 185424
Review 11000 of 185424
Review 12000 of 185424
Review 13000 of 185424
Review 14000 of 185424
Review 15000 of 185424
Review 16000 of 185424
Review 17000 of 185424
Review 18000 of 185424
Review 19000 of 185424
Review 20000 of 185424
Review 21000 of 185424
Review 22000 of 185424
Review 23000 of 185424
Review 24000 of 185424
Review 25000 of 185424
Review 26000 of 185424
Review 27000 of 185424
Review 28000 of 185424
Review 29000 of 185424
Review 30000 of 185424
Review 31000 of 185424
Review 32000 of 185424
Review 33000 of 185424
Review 34000 of 185424
Review 35000 of 185424
Review 36000 of 185424
Review 37000 of 185424
Review 38000 of 185424
Review 39000 of 185424
Review 40000 of 185424
Review 41000 of 185424
Review 42000 of 185424
Review 43000 of 185424
R

In [71]:
# SGD-trained regressor for the averaged review vectors: no regularisation
# penalty, initial learning rate eta0=0.1, fixed seed for reproducibility.
# NOTE(review): `n_iter` was superseded by `max_iter` in later scikit-learn
# releases - check the installed version before upgrading the environment.
sgd_reg_avgReview = SGDRegressor(n_iter=500, penalty=None, eta0=0.1, random_state=42)

In [72]:
# Fit on the averaged training vectors; ravel() flattens the labels to 1-D.
# NOTE(review): assumes training_labels is row-aligned with train_set2 - confirm.
sgd_reg_avgReview.fit(trainDataVecs, training_labels.ravel())

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.1,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=500, penalty=None, power_t=0.25,
       random_state=42, shuffle=True, verbose=0, warm_start=False)

In [73]:
# Predict for the averaged test vectors.
# NOTE(review): the predictions are only displayed here; they are neither
# stored in a variable nor scored against the true ratings in this cell.
sgd_reg_avgReview.predict(testDataVecs)

array([ 4.17189479,  4.69751602,  3.77161215, ...,  4.42039445,
        4.0473744 ,  3.63924979])

In [75]:
from sklearn.cluster import KMeans

# Cluster the learned word vectors, aiming for an average of 5 words per cluster
word_vectors = model.wv.syn0
num_clusters = word_vectors.shape[0] // 5

# Initialize a k-means object and use it to assign every word vector to a cluster.
# k-means starts from random centroids, so the seed is fixed (same value as the
# SGD regressors in this notebook) to keep cluster ids reproducible across runs.
# NOTE(review): the recorded run of this cell ended in a MemoryError. With
# vocabulary/5 clusters the fit is very demanding; a smaller num_clusters or
# sklearn.cluster.MiniBatchKMeans may be needed - to be confirmed on the
# target machine.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=42)
idx = kmeans_clustering.fit_predict(word_vectors)

MemoryError: 

In [None]:
# Map each vocabulary word to its cluster number. The word list is read from
# model.wv, the same object whose syn0 matrix was clustered, so that the i-th
# word lines up with the i-th cluster label in idx.
word_centroid_map = dict(zip(model.wv.index2word, idx))

In [None]:
# For the first 10 clusters
for cluster in range(0,10):
    # Print the cluster number  
    print ("\nCluster %d" % cluster)

    # Find all of the words for that cluster number, and print them out
    words = []
    for i in range(0, len(word_centroid_map.values())):
        if(word_centroid_map.values()[i] == cluster):
            words.append(word_centroid_map.keys()[i])
        print (words)

In [None]:
def create_bag_of_centroids(wordlist, word_centroid_map, num_centroids=None):
    """ Count how many words of one review fall into each word cluster.
    Args:
        wordlist: iterable of word tokens making up one review
        word_centroid_map: Dictionary mapping word to cluster number
        num_centroids: optional length of the returned vector. When omitted it
            is derived from the map as the highest cluster number plus one.
            Passing it explicitly skips the scan over the whole map on every
            call and pins the vector width.
    Returns:
        bag_of_centroids: float32 numpy array of length num_centroids, where
            entry k is the number of words in wordlist mapped to cluster k.
            Words missing from word_centroid_map are ignored.
    Raises:
        IndexError: if an explicit num_centroids is not larger than a cluster
            number encountered for a word in wordlist.
    """
    if num_centroids is None:
        if word_centroid_map:
            # The number of clusters is the highest cluster index in the map plus one
            num_centroids = max(word_centroid_map.values()) + 1
        else:
            # An empty map defines no clusters; max() would raise on it
            num_centroids = 0

    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")

    # Loop over the words in the review. If the word is in the vocabulary,
    # find which cluster it belongs to and increment that cluster count by one
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [None]:
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros((train_set2['reviewText'].size, num_clusters), dtype="float32")

# Transform the training set reviews into bags of centroids
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1

test_centroids = np.zeros((test_set['reviewText'].size, num_clusters), dtype="float32")

counter = 0
for review in clean_test_reviews:
    test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1

In [None]:
# Second SGD regressor, for the bag-of-centroids features; it uses the same
# hyper-parameters and seed as sgd_reg_avgReview.
sgd_reg_cluster = SGDRegressor(n_iter=500, penalty=None, eta0=0.1, random_state=42)

In [None]:
# Fit on the training bag-of-centroids matrix; ravel() flattens the labels to 1-D.
sgd_reg_cluster.fit(train_centroids, training_labels.ravel())

In [None]:
# Predict for the test bag-of-centroids matrix.
# NOTE(review): as with the averaged-vector model, the predictions are not
# stored or evaluated in this cell.
sgd_reg_cluster.predict(test_centroids)