In [1]:
import pandas as pd
import numpy as np

import gensim
from gensim.models.doc2vec import Doc2Vec



In [2]:
embedding_model = Doc2Vec.load('./text8_doc2vec.model')


In [3]:
%%time
food_dataframe = pd.read_csv("reviews.csv", encoding='utf8')#, low_memory=False)
food_df = food_dataframe.copy()


CPU times: user 9.56 s, sys: 2.04 s, total: 11.6 s
Wall time: 22.5 s


In [4]:
food_df.head()

Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25T21:44:00Z,2000-01-25T21:44:00Z
1,7,4384,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",2001-10-17T16:49:59Z,2001-10-17T16:49:59Z
2,9,4523,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...,2000-02-25T09:00:00Z,2000-02-25T09:00:00Z
3,13,7435,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...,2000-03-13T21:15:00Z,2000-03-13T21:15:00Z
4,14,44,2085,Tony Small,5,An excellent dish.,2000-03-28T12:51:00Z,2000-03-28T12:51:00Z


In [5]:
print("Before any processing, there are", len(food_df), "reviews.")

Before any processing, there are 1401982 reviews.


In [6]:
food_df = food_df[['AuthorId', 'AuthorName', 'Rating', 'Review']]
food_df.head()

Unnamed: 0,AuthorId,AuthorName,Rating,Review
0,2008,gayg msft,5,better than any you can get at a restaurant!
1,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ..."
2,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...
3,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...
4,2085,Tony Small,5,An excellent dish.


In [7]:
food_df = food_df[food_df.Rating != 0].reset_index(drop = True)


In [8]:
len(food_df)

1325734

In [9]:
food_df.dropna(inplace=True)

In [10]:
print("There are", len(food_df), "after dropping empty reviews and reviews with no ratings.")


There are 1325520 after dropping empty reviews and reviews with no ratings.


In [11]:
food_df.Rating.unique()

array([5, 4, 2, 1, 3])

In [12]:
# we will attribute a Sentiment 1 if author reported rating of 3 or higher
# and a Sentiment 0 if author reported rating of less than 3
food_df['Sentiment'] = 0
food_df.loc[food_df.Rating >= 3, 'Sentiment'] = 1
food_df.head()

Unnamed: 0,AuthorId,AuthorName,Rating,Review,Sentiment
0,2008,gayg msft,5,better than any you can get at a restaurant!,1
1,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",1
2,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...,0
3,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...,1
4,2085,Tony Small,5,An excellent dish.,1


Will now train a recurrent neural network for predicting sentiment on food.com reviews. Will embed each of the reviews as a single (300-dimensional) vector using the pretrained doc2vec model and then pass the vectorized review ($X$) with its sentiment $y$ -- where a sentiment is either a 1 or 0. 

My embedding takes one hundred thousand years on the entire food.com data set, so I'm going to train on a smaller sample until I can figure out how to speed up the process...

In [13]:
df = food_df[['Review','Sentiment']]
df.head()

Unnamed: 0,Review,Sentiment
0,better than any you can get at a restaurant!,1
1,"I cut back on the mayo, and made up the differ...",1
2,i think i did something wrong because i could ...,0
3,easily the best i have ever had. juicy flavor...,1
4,An excellent dish.,1


In [14]:
df = df.sample(frac=.02, replace=True, random_state=1)

In [15]:
len(df)

26510

In [16]:
%%time
tokenizedReviews = [gensim.utils.simple_preprocess(comment) for comment in df.Review.values]


CPU times: user 2.05 s, sys: 170 ms, total: 2.23 s
Wall time: 2.44 s


In [17]:
%%time
embedding_vectors = [embedding_model.infer_vector(tokenized_review) for tokenized_review in tokenizedReviews]


CPU times: user 36 s, sys: 1.21 s, total: 37.2 s
Wall time: 41.4 s


In [18]:
df['embeddingVector'] = embedding_vectors

In [19]:
df.head()

Unnamed: 0,Review,Sentiment,embeddingVector
128037,This was yummy and did kind of taste like nach...,1,"[-0.014341448, 0.011953876, 0.0034630944, -0.0..."
491757,"These were lovely, and tasted delish! I made m...",1,"[0.1450044, -0.14621297, 0.18056215, -0.033033..."
470926,This is my favorite guacamole recipe. It is gr...,1,"[0.019378975, 0.0039566695, -0.0062853205, -0...."
491265,hi loved it and so did all the family. Made it...,1,"[-0.011049533, -0.09011109, -0.05596567, -0.00..."
836493,"Served this at my 4th annual Halloween party, ...",1,"[-0.021316735, 0.023677586, 0.040562265, -0.02..."


In [20]:
# split the data set into positive and negative reviews
df_positive = df.loc[df['Sentiment']==1]
df_negative = df.loc[df['Sentiment']==0]


In [21]:
len(df_positive) + len(df_negative) == len(df)

True

In [22]:
print("There are", len(df_positive), "positive reviews and", len(df_negative), "negative reviews in the sample of ", len(df), "reviews.")

There are 25881 positive reviews and 629 negative reviews in the sample of  26510 reviews.


In [23]:
df_validation = pd.concat([df_negative.sample(int(.2*len(df_negative))), df_positive.sample(int(.2*len(df_negative)))])
df_train = df.drop(df_validation.index)

In [24]:
df_train.sample(n=10, random_state=220)

Unnamed: 0,Review,Sentiment,embeddingVector
561797,I really enjoyed this and will definately be m...,1,"[0.030001625, 0.04524819, -0.02766003, -0.0324..."
479642,Awesome pastry dough!! This was my first time...,1,"[-0.003590507, 0.00082318106, 0.076030016, 0.0..."
330944,I made this last night. It was fairly quick t...,1,"[0.053923544, 0.047585357, 0.11914902, 0.00890..."
7315,This is a great recipe and makes a wonderful q...,1,"[0.11321857, 0.062809914, 0.06060652, -0.01090..."
948224,Very nice. My tea had just a faint hint of or...,1,"[-0.007904674, 0.08306873, -0.020023046, -0.01..."
1255816,Excellent and refreshing salsa that I made as ...,1,"[0.028446663, 0.0066323574, 0.030905316, -0.01..."
725170,I made these for our vacation and we were so h...,1,"[0.09439738, 0.069615886, -0.010768046, -0.059..."
212732,Very close to a 5 star recipe. The only thing ...,1,"[0.0980529, 0.043266, 0.0032068559, -0.0898738..."
1255496,A flavoursome dish and very easy to make. I ma...,1,"[0.023964927, -0.03780505, -0.03322575, 0.0848..."
1074520,Wonderful pancakes and so easy to make. I love...,1,"[0.06526356, -0.061988525, -0.07451298, -0.013..."


In [25]:
X_train = np.array(df_train.embeddingVector.values.tolist())
y_train = np.array(df_train.Sentiment.values.tolist())
X_val = np.array(df_validation.embeddingVector.values.tolist())
y_val = np.array(df_validation.Sentiment.values.tolist())

In [26]:
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_val = np.reshape(X_val, (X_val.shape[0], X_val.shape[1], 1))


In [27]:
print(X_train.shape)
print(X_val.shape)

(26256, 300, 1)
(250, 300, 1)


In [28]:
# had to downgrade tensorflow for some reason?
# currently tensorflow==2.3.1
# and numpy had to downgrade to, numpy==1.18.5
import tensorflow as tf

In [29]:
from tensorflow.python.keras import models
from tensorflow.python.keras import layers
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import losses
from tensorflow.python.keras import metrics
from tensorflow.python.keras import regularizers

In [30]:
# ## Import all the keras stuff we'll need
# from keras import models
# from keras import layers
# from keras import optimizers
# from keras import losses
# from keras import metrics
# from keras import regularizers


In [31]:
## make the model object
model = models.Sequential()

In [32]:
## add the layers we want

# no embedding layer since we already did an embedding using doc2vec

# the first simplernn layer
model.add(layers.SimpleRNN(300, return_sequences=False))
# model.add(layers.SimpleRNN(100, input_shape=(1, 500),
#                          kernel_regularizer=regularizers.L2(0.01),
#                          bias_regularizer=regularizers.L2(0.01),
#                          recurrent_regularizer=regularizers.L2(0.01),
#                          return_sequences=True))

In [33]:
model.add(layers.Dense(1, activation='sigmoid'))

In [34]:
model.compile(optimizer='rmsprop',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

In [35]:
history = model.fit(X_train, y_train,
                    epochs = 5,
                    batch_size=128,
                    validation_data=(X_val,y_val))

## Note training this model can take a while

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
model.save('rnn_model_trained_on_food_text8_embedding')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: rnn_model_trained_on_food_text8_embedding/assets


For some reason, when I tried just importing the already constructed embedding vectors via `pd.read_csv("tahini-reviews-with-text8_embedding_vectors-and-sentiment_class.csv")`, the commas were dropped in the embedding vectors...

In [37]:
import gensim
from gensim.models.doc2vec import Doc2Vec

In [38]:
tahini_dataframe = pd.read_csv("salted-tahini-chocolate-chip-cookies-with-sentiment.csv")
tahini_df = tahini_dataframe.copy()
tahini_df.head()

Unnamed: 0,user,comment,sentiment
0,lmk,Yum. These took much longer than 16 minutes t...,pos
1,Sonya,If you follow the recipe as written the tahini...,pos
2,KV,I have made these cookies 5 times. My advice i...,pos
3,MaryN,I liked this- the tahini is slightly more subt...,pos
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,pos


In [39]:
tahini_df['sentiment_class'] = 0
tahini_df.loc[tahini_df['sentiment']=='pos', 'sentiment_class'] = 1
tahini_df.head()

Unnamed: 0,user,comment,sentiment,sentiment_class
0,lmk,Yum. These took much longer than 16 minutes t...,pos,1
1,Sonya,If you follow the recipe as written the tahini...,pos,1
2,KV,I have made these cookies 5 times. My advice i...,pos,1
3,MaryN,I liked this- the tahini is slightly more subt...,pos,1
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,pos,1


In [40]:
%%time
tokenizedComments = [gensim.utils.simple_preprocess(comment) for comment in tahini_df.comment.values]

CPU times: user 20.1 ms, sys: 627 µs, total: 20.8 ms
Wall time: 20.4 ms


In [41]:
tahini_df['tokenizedComments'] = tokenizedComments

In [42]:
%%time
embedding_vectors = [embedding_model.infer_vector(tokenized_comment) for tokenized_comment in tokenizedComments]

CPU times: user 480 ms, sys: 221 ms, total: 701 ms
Wall time: 702 ms


In [43]:
tahini_df['embeddingVector'] = embedding_vectors

In [44]:
tahini_df.head()

Unnamed: 0,user,comment,sentiment,sentiment_class,tokenizedComments,embeddingVector
0,lmk,Yum. These took much longer than 16 minutes t...,pos,1,"[yum, these, took, much, longer, than, minutes...","[0.023047142, 0.03272364, 0.105930656, 0.00287..."
1,Sonya,If you follow the recipe as written the tahini...,pos,1,"[if, you, follow, the, recipe, as, written, th...","[0.045890145, -0.0403712, 0.06738468, 0.021354..."
2,KV,I have made these cookies 5 times. My advice i...,pos,1,"[have, made, these, cookies, times, my, advice...","[0.02465725, -0.031621132, 0.19326253, 0.09497..."
3,MaryN,I liked this- the tahini is slightly more subt...,pos,1,"[liked, this, the, tahini, is, slightly, more,...","[0.0605256, 0.03904017, 0.071211174, -0.039034..."
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,pos,1,"[used, shaila, tweaks, baked, first, tray, str...","[0.014146562, 0.001976253, 0.010398472, 0.0432..."


In [45]:
X_test = np.array(tahini_df.embeddingVector.values.tolist())
y_test = np.array(tahini_df.sentiment_class.values.tolist())

In [46]:
X_test.shape

(355, 300)

In [47]:
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))


In [48]:
y_predicted = model.predict_classes(X_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [49]:
y_predicted = y_predicted.reshape(-1,)

In [50]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [51]:
accuracy_score(y_test, y_predicted)

0.6788732394366197

In [52]:
roc_auc_score(y_test, y_predicted)

0.5