In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import softmax
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import math

In [2]:
# Load the two raw inputs: the review log and the recipe catalogue.
reviews = pd.read_csv("../data/users/all_users.csv")
recipes = pd.read_csv("../data/recipes/all_recipes.csv")

In [3]:
# Lookup table giving every distinct user_id a dense 0-based integer id
# (its position in order of first appearance in `reviews`).
transformed_user_ids = (
    pd.DataFrame(reviews["user_id"].unique())
    .reset_index()
    .rename(columns={"index": "transformed_user_id", 0: "user_id"})
)

In [4]:
# Lookup table giving every distinct recipe_id a dense 0-based integer id
# (its position in order of first appearance in `reviews`).
transformed_recipe_ids = (
    pd.DataFrame(reviews["recipe_id"].unique())
    .reset_index()
    .rename(columns={"index": "transformed_recipe_id", 0: "recipe_id"})
)

In [5]:
# Attach the dense user and recipe ids to every review row.
reviews = (
    reviews
    .merge(transformed_user_ids, on="user_id")
    .merge(transformed_recipe_ids, on="recipe_id")
)

In [6]:
# reviews["transformed_rating"] = reviews.rating - 1

In [7]:
# Largest dense id on each axis. Ids are 0-based, so the embedding tables
# built later are sized with `+ 1` on top of these values.
item_vocab_size = reviews["transformed_recipe_id"].max()
user_vocab_size = reviews["transformed_user_id"].max()

In [8]:
# Build the collaborative-filtering network: user/item embeddings feed both a
# dot-product branch and a small MLP; the two are concatenated before a single
# linear output unit.

# Start from a clean Keras state and fix the NumPy / TensorFlow seeds.
tf.keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# Item tower: 128-d embedding lookup -> dropout -> flatten.
# input_dim is max id + 1 because item_vocab_size holds the largest 0-based id.
item_embedding_layer = layers.Embedding(input_dim = item_vocab_size + 1, output_dim=128, input_length = None)
item_id = keras.Input(shape=(1,), name='item_ids')
item_embeddings = item_embedding_layer(item_id)
item_embeddings = layers.Dropout(rate=0.3)(item_embeddings)
item_embeddings = layers.Flatten()(item_embeddings)
# Wrapped as a sub-model so its `.output` tensor can be reused below.
item_embeddings = keras.Model(inputs=item_id, outputs=item_embeddings)

# User tower: same structure as the item tower.
user_id = keras.Input(shape=(1,), name='user_ids')
user_embeddings = layers.Embedding(input_dim = user_vocab_size + 1, output_dim=128, input_length = None)(user_id)
user_embeddings = layers.Dropout(rate=0.3)(user_embeddings)
user_embeddings = layers.Flatten()(user_embeddings)
user_embeddings = keras.Model(inputs=user_id, outputs=user_embeddings)

# Dot-product branch over the two flattened embedding vectors.
dot = layers.Dot(axes=1)([user_embeddings.output, item_embeddings.output])
s = keras.Model(inputs=[user_id, item_id], outputs=dot)

# MLP branch over the concatenated embeddings:
# BatchNorm -> Dense(32, relu) -> Dropout -> BatchNorm -> Dense(16, relu).
combined = layers.concatenate([user_embeddings.output, item_embeddings.output])
z = layers.BatchNormalization(name="bn_top_0")(combined)
z = layers.Dense(32, activation='relu', name='top_combined_dense_1')(z)
z = layers.Dropout(rate=0.2)(z)
z = layers.BatchNormalization(name="bn_top_1")(z)
z = layers.Dense(16, activation='relu', name='top_combined_dense_2')(z)
# Join the MLP features with the dot-product score, then map to one
# output unit with no activation (a raw regression value).
z = layers.concatenate([z, s.output])
z = layers.Dense(1, name='predictions')(z)
model = keras.Model(inputs=[user_id, item_id], outputs=z)

In [9]:
# Configure training: Adam at lr=1e-3, minimising mean squared error and
# also reporting MSE as a tracked metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.MSE,
    metrics=[keras.metrics.MeanSquaredError()],
)

In [10]:
# Temporal split on the `date` column: everything before 2018 is used for
# training, 2018 onward is held out as the test set.
train = reviews.loc[reviews["date"] < "2018-01-01"]
test = reviews.loc[reviews["date"] >= "2018-01-01"]

In [11]:
# Fit split: every training row except the final 1000.
user_id_train = train["transformed_user_id"].iloc[:-1000]
item_id_train = train["transformed_recipe_id"].iloc[:-1000]
y_train = train["rating"].iloc[:-1000]

# Validation split: the final 1000 training rows.
user_id_val = train["transformed_user_id"].iloc[-1000:]
item_id_val = train["transformed_recipe_id"].iloc[-1000:]
y_val = train["rating"].iloc[-1000:]

# Test split: all rows of the held-out frame.
user_id_test = test["transformed_user_id"]
item_id_test = test["transformed_recipe_id"]
y_test = test["rating"]

In [13]:
# Train the collaborative-filtering model for 10 epochs in batches of 1000,
# evaluating on the held-out validation rows at the end of every epoch.
#
# Keras global state is deliberately NOT cleared here: `model` has already
# been built and compiled, and clear_session() is meant to be called before
# constructing a model, not between building and fitting one.
#
# NOTE(review): in the recorded run below, val_loss bottoms out around epoch 6
# and rises afterwards while the training loss keeps falling. Consider an
# EarlyStopping callback if this is re-tuned.
history = model.fit(
    [user_id_train, item_id_train],
    y_train,
    batch_size=1000,
    epochs=10,
    validation_data=([user_id_val, item_id_val], y_val),
)
print('\nhistory dict:', history.history)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

history dict: {'loss': [1.287615180015564, 0.8860304355621338, 0.6809209585189819, 0.5560032725334167, 0.4727649688720703, 0.39730843901634216, 0.3381701409816742, 0.29279038310050964, 0.25886866450309753, 0.23805417120456696], 'mean_squared_error': [1.287615180015564, 0.8860304355621338, 0.6809209585189819, 0.5560032725334167, 0.4727649688720703, 0.39730843901634216, 0.3381701409816742, 0.29279038310050964, 0.25886866450309753, 0.23805417120456696], 'val_loss': [6.5862226486206055, 3.2588632106781006, 1.6457340717315674, 1.0410786867141724, 0.8594183921813965, 0.8232988119125366, 0.8641886115074158, 0.9204229116439819, 0.9535016417503357, 0.9791634678840637], 'val_mean_squared_error': [6.5862226486206055, 3.2588632106781006, 1.6457340717315674, 1.0410786867141724, 0.8594183921813965, 0.8232988119125366, 0.8641886115074158, 0.9204229116439819, 0.9535016417503357, 0.9791634678

### Evaluation

#### Training metrics

In [14]:
# Score the rows the model was fitted on.
predictions = model.predict(x=[user_id_train, item_id_train])

In [15]:
# Root-mean-squared error between the true ratings and the predictions.
print(
    "train rmse:",
    math.sqrt(mean_squared_error(y_true=y_train, y_pred=predictions)),
)

train rmse: 0.3818415760277013


#### Test metrics

In [16]:
# Score the held-out test rows.
predictions = model.predict(x=[user_id_test, item_id_test])

In [17]:
# Root-mean-squared error between the true ratings and the predictions.
print(
    "test rmse:",
    math.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions)),
)

test rmse: 0.7569037349931316


### Adding content-based features to the network

In [18]:
# Only the join key and the title text are needed from the recipe catalogue.
recipes = recipes.loc[:, ["recipe_id", "title"]]

In [19]:
# Attach the recipe title to each review. The inner join keeps only reviews
# whose recipe_id is present in `recipes`.
train = pd.merge(train, recipes, how="inner", on="recipe_id")
test = pd.merge(test, recipes, how="inner", on="recipe_id")

In [20]:
# Row/column count of the training frame after the inner join with titles.
train.shape

(54427, 9)

In [21]:
# Row/column count of the test frame after the inner join with titles.
test.shape

(19282, 9)

In [22]:
# Display the merged training frame (the rendered output elides the middle rows).
train

Unnamed: 0,date,rating,recipe_id,review,user_id,username,transformed_user_id,transformed_recipe_id,title
0,2012-09-11,5,222234,This recipe was delicious! Instead of cherry p...,3419993,Pie84,0,0,Cherry Folditup
1,2012-12-25,5,222234,Made this recipe as instructed and it was easy...,3154459,hertzen,1,0,Cherry Folditup
2,2012-11-22,5,222234,This was a great recipe to make using leftover...,10370475,suziloo,2,0,Cherry Folditup
3,2015-07-07,5,222234,"Tasty , simple dessert that comes out cute as ...",2304335,Chelsea M.,3,0,Cherry Folditup
4,2016-12-31,5,222234,"Delicious!! I was pressed for time, so I didn'...",1967176,Amanda H.,4,0,Cherry Folditup
...,...,...,...,...,...,...,...,...,...
54422,2015-10-05,2,239466,Cooking time inadequate. Should have been cove...,16243497,david,54505,1080,Chef John's Cranberry Bean Ragout
54423,2015-07-02,2,240133,I must have overcooked it.under whelming,6013253,janalee,54507,1082,Roast Quail with Cured Lemon
54424,2014-07-13,4,237724,This was easy to make and delicious. I subbed ...,8837240,CAN,54508,1083,Sausage-Stuffed Piquillo Peppers
54425,2013-01-04,5,222237,Thank you Chef John.I was looking for somethin...,2499050,gigithefoodie,54537,1112,Cotechino and Braised Beans


In [23]:
# TF-IDF over recipe titles: sublinear term-frequency scaling, English stop
# words removed, and terms appearing in more than half of the titles dropped.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
)

In [24]:
# Learn the vocabulary and IDF weights from the training titles only, then
# reuse that fitted vectorizer on the test titles. Both results are
# converted to dense arrays.
train_summary = vectorizer.fit_transform(train["title"]).toarray()
test_summary = vectorizer.transform(test["title"]).toarray()

In [25]:
# Re-derive the fit / validation / test arrays from the title-enriched frames.
# The last 1000 rows of `train` are held out for validation.

# Guard against running this cell twice: once `train_summary` has been
# truncated it no longer lines up with `train`, and slicing it again would
# silently produce misaligned features.
assert len(train_summary) == len(train), (
    "train_summary is out of sync with train; "
    "re-run the TF-IDF cell before this one"
)

# Take the validation features from the FULL TF-IDF matrix *before*
# truncating it, so they correspond to the same rows as
# user_id_val / item_id_val / y_val. Slicing after the truncation would
# return the last 1000 *training* rows instead.
val_summary = train_summary[-1000:]
train_summary = train_summary[:-1000]

user_id_train = train.transformed_user_id[:-1000]
item_id_train = train.transformed_recipe_id[:-1000]
y_train = train.rating[:-1000]

user_id_val = train.transformed_user_id[-1000:]
item_id_val = train.transformed_recipe_id[-1000:]
y_val = train.rating[-1000:]

user_id_test = test.transformed_user_id
item_id_test = test.transformed_recipe_id
y_test = test.rating

# Every validation input must describe the same 1000 rows.
assert len(val_summary) == len(user_id_val) == len(item_id_val) == len(y_val)

In [26]:
# Shape of the TF-IDF title matrix left for fitting after the validation rows were sliced off.
train_summary.shape

(53427, 1059)

In [27]:
# Build the hybrid network: user/item embeddings plus a dense encoding of the
# TF-IDF title vector, concatenated and fed through a small MLP that ends in
# a single linear output unit.

# Start from a clean Keras state and fix the NumPy / TensorFlow seeds.
tf.keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# Item tower: 128-d embedding lookup -> dropout -> flatten.
# input_dim is max id + 1 because item_vocab_size holds the largest 0-based id.
item_embedding_layer = layers.Embedding(input_dim = item_vocab_size + 1, output_dim=128, input_length = None)
item_id = keras.Input(shape=(1,), name='item_ids')
item_embeddings = item_embedding_layer(item_id)
item_embeddings = layers.Dropout(rate=0.3)(item_embeddings)
item_embeddings = layers.Flatten()(item_embeddings)
# Wrapped as a sub-model so its `.output` tensor can be reused below.
item_embeddings = keras.Model(inputs=item_id, outputs=item_embeddings)

# User tower: same structure as the item tower.
user_id = keras.Input(shape=(1,), name='user_ids')
user_embeddings = layers.Embedding(input_dim = user_vocab_size + 1, output_dim=128, input_length = None)(user_id)
user_embeddings = layers.Dropout(rate=0.3)(user_embeddings)
user_embeddings = layers.Flatten()(user_embeddings)
user_embeddings = keras.Model(inputs=user_id, outputs=user_embeddings)

# Content tower: the TF-IDF title vector (width taken from train_summary)
# goes through Dense(128, relu) -> Dropout -> Dense(32, relu).
summary_inputs = keras.Input(shape=(train_summary.shape[1],), name='summary')
summ = layers.Dense(128, activation='relu', name='summary_dense_1')(summary_inputs)
summ = layers.Dropout(rate=0.2)(summ)
summ = layers.Dense(32, activation='relu', name='summary_dense_2')(summ)
summ = keras.Model(inputs=summary_inputs, outputs=summ)

# Top MLP over all three towers:
# BatchNorm -> Dense(32, relu) -> Dropout -> BatchNorm -> Dense(16, relu) -> Dense(1).
# Unlike the first model in this notebook, no dot-product branch is added here.
combined = layers.concatenate([user_embeddings.output, item_embeddings.output, summ.output])
z = layers.BatchNormalization(name="bn_top_0")(combined)
z = layers.Dense(32, activation='relu', name='top_combined_dense_1')(z)
z = layers.Dropout(rate=0.2)(z)
z = layers.BatchNormalization(name="bn_top_1")(z)
z = layers.Dense(16, activation='relu', name='top_combined_dense_2')(z)
z = layers.Dense(1, name='predictions')(z)
hybrid_model = keras.Model(inputs=[user_id, item_id, summary_inputs], outputs=z)

In [28]:
# Configure training: Adam at lr=1e-3, minimising mean squared error and
# also reporting MSE as a tracked metric.
hybrid_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.MSE,
    metrics=[keras.metrics.MeanSquaredError()],
)

In [29]:
# Train the hybrid model for 20 epochs in batches of 1000, evaluating on the
# held-out validation rows at the end of every epoch.
#
# Keras global state is deliberately NOT cleared here: `hybrid_model` has
# already been built and compiled, and clear_session() is meant to be called
# before constructing a model, not between building and fitting one.
history = hybrid_model.fit(
    [user_id_train, item_id_train, train_summary],
    y_train,
    batch_size=1000,
    epochs=20,
    validation_data=([user_id_val, item_id_val, val_summary], y_val),
)
print('\nhistory dict:', history.history)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

history dict: {'loss': [13.86633014678955, 3.1277248859405518, 1.1879215240478516, 0.8316421508789062, 0.6667829155921936, 0.5266468524932861, 0.4283468723297119, 0.3569146990776062, 0.3055184483528137, 0.26655295491218567, 0.2425268441438675, 0.22211511433124542, 0.20293638110160828, 0.19158945977687836, 0.17734265327453613, 0.1651856154203415, 0.15484431385993958, 0.14921505749225616, 0.13868504762649536, 0.13620835542678833], 'mean_squared_error': [13.86633014678955, 3.1277248859405518, 1.1879215240478516, 0.8316421508789062, 0.6667829155921936, 0.5266468524932861, 0.4283468723297119, 0.3569146990776062, 0.3055184483528137, 0.26655295491218567, 0.2425268441438675, 0.22211511433124542, 0.20293638110160828, 0.19158945977687836, 0.17734265327453613, 0.165185

### Evaluation

#### Training metrics

In [30]:
# Score the rows the hybrid model was fitted on.
predictions = hybrid_model.predict(
    x=[user_id_train, item_id_train, train_summary]
)

In [31]:
# Root-mean-squared error between the true ratings and the predictions.
print(
    "train rmse:",
    math.sqrt(mean_squared_error(y_true=y_train, y_pred=predictions)),
)

train rmse: 0.25392357373812124


#### Test metrics

In [32]:
# Score the held-out test rows with the hybrid model.
predictions = hybrid_model.predict(
    x=[user_id_test, item_id_test, test_summary]
)

In [33]:
# Root-mean-squared error between the true ratings and the predictions.
print(
    "test rmse:",
    math.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions)),
)

test rmse: 0.7529374208898267
