In [61]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import softmax
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import math

In [69]:
recipes = pd.read_csv("../src/data/recipes.csv")
reviews = pd.read_csv("../src/data/all_users.csv")
recipe_ids = pd.read_csv("../src/data/recipe_ids.csv")

In [70]:
transformed_user_ids = pd.DataFrame(reviews.user_id.unique()).reset_index()
transformed_user_ids.columns = ["transformed_user_id","user_id"]

In [71]:
transformed_recipe_ids = pd.DataFrame(reviews.recipe_id.unique()).reset_index()
transformed_recipe_ids.columns = ["transformed_recipe_id","recipe_id"]

In [72]:
reviews = reviews.merge(transformed_user_ids, on="user_id")
reviews = reviews.merge(transformed_recipe_ids, on="recipe_id")

In [73]:
# reviews["transformed_rating"] = reviews.rating - 1

In [74]:
user_vocab_size = reviews.transformed_user_id.max()
item_vocab_size = reviews.transformed_recipe_id.max()

In [75]:
tf.keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

item_embedding_layer = layers.Embedding(input_dim = item_vocab_size + 1, output_dim=128, input_length = None)
item_id = keras.Input(shape=(1,), name='item_ids')
item_embeddings = item_embedding_layer(item_id)
item_embeddings = layers.Dropout(rate=0.3)(item_embeddings)
item_embeddings = layers.Flatten()(item_embeddings)
item_embeddings = keras.Model(inputs=item_id, outputs=item_embeddings)

user_id = keras.Input(shape=(1,), name='user_ids')
user_embeddings = layers.Embedding(input_dim = user_vocab_size + 1, output_dim=128, input_length = None)(user_id)
user_embeddings = layers.Dropout(rate=0.3)(user_embeddings)
user_embeddings = layers.Flatten()(user_embeddings)
user_embeddings = keras.Model(inputs=user_id, outputs=user_embeddings)

dot = layers.Dot(axes=1)([user_embeddings.output, item_embeddings.output])
s = keras.Model(inputs=[user_id, item_id], outputs=dot)

combined = layers.concatenate([user_embeddings.output, item_embeddings.output])
z = layers.BatchNormalization(name="bn_top_0")(combined)
z = layers.Dense(32, activation='relu', name='top_combined_dense_1')(z)
z = layers.Dropout(rate=0.2)(z)
z = layers.BatchNormalization(name="bn_top_1")(z)
z = layers.Dense(16, activation='relu', name='top_combined_dense_2')(z)
z = layers.concatenate([z, s.output])
z = layers.Dense(1, name='predictions')(z)
model = keras.Model(inputs=[user_id, item_id], outputs=z)

In [76]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),  # Optimizer
              # Loss function to minimize
              loss=tf.keras.losses.MSE,  
              # List of metrics to monitor
              metrics= [keras.metrics.MeanSquaredError()])

In [77]:
train = reviews[reviews.date < "2018-01-01"]
test = reviews[reviews.date >= "2018-01-01"]

In [78]:
user_id_train = train.transformed_user_id[:-1000]
item_id_train = train.transformed_recipe_id[:-1000]
y_train = train.rating[:-1000]

user_id_test = test.transformed_user_id
item_id_test = test.transformed_recipe_id
y_test = test.rating

user_id_val = train.transformed_user_id[-1000:]
item_id_val = train.transformed_recipe_id[-1000:]
y_val = train.rating[-1000:]

In [79]:
keras.backend.clear_session()
# del model
history = model.fit([user_id_train, item_id_train], y_train,
                    batch_size=1000,
                    epochs=10,
                    # We pass some validation for
                    # monitoring validation loss and metrics
                    # at the end of each epoch
                    validation_data=([user_id_val, item_id_val], y_val)
                   )
print('\nhistory dict:', history.history)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

history dict: {'loss': [9.709579467773438, 1.9021233320236206, 1.0553921461105347, 0.77657151222229, 0.6223452091217041, 0.5174590349197388, 0.43526729941368103, 0.36824601888656616, 0.31542855501174927, 0.27612096071243286], 'mean_squared_error': [9.709579467773438, 1.9021233320236206, 1.0553921461105347, 0.77657151222229, 0.6223452091217041, 0.5174590349197388, 0.43526729941368103, 0.36824601888656616, 0.31542855501174927, 0.27612096071243286], 'val_loss': [14.52668285369873, 9.594083786010742, 4.712585926055908, 2.2471163272857666, 1.2299772500991821, 0.8960283398628235, 0.8189418911933899, 0.8511294722557068, 0.9038393497467041, 0.9297377467155457], 'val_mean_squared_error': [14.52668285369873, 9.594083786010742, 4.712585926055908, 2.2471163272857666, 1.2299772500991821, 0.8960283398628235, 0.8189418911933899, 0.8511294722557068, 0.9038393497467041, 0.9297377467155457]}


### Evaluation

#### Training metrics

In [80]:
predictions = model.predict([user_id_train, item_id_train])

In [81]:
print("train rmse:", math.sqrt(mean_squared_error(y_train, predictions)))

train rmse: 0.5276963342162481


#### Test metrics

In [82]:
predictions = model.predict([user_id_test, item_id_test])

In [83]:
print("test rmse:", math.sqrt(mean_squared_error(y_test, predictions)))

test rmse: 0.7869372477106613


### Adding content based features to the network

In [84]:
recipes = recipes[["id","summary"]]

In [85]:
train = train.merge(recipes, how="left", left_on="recipe_id",right_on="id")
test = test.merge(recipes, how="left", left_on="recipe_id",right_on="id")

In [86]:
train.shape

(54126, 10)

In [87]:
test.shape

(19122, 10)

In [88]:
train_summary = vectorizer.fit_transform(train.summary).toarray()
test_summary = vectorizer.transform(test.summary).toarray()

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [13]:
user_id_train = train.transformed_user_id[:-1000]
item_id_train = train.transformed_recipe_id[:-1000]
train_summary = train_summary[:-1000]
y_train = train.rating[:-1000]

user_id_test = test.transformed_user_id
item_id_test = test.transformed_recipe_id
y_test = test.rating

user_id_val = train.transformed_user_id[-1000:]
item_id_val = train.transformed_recipe_id[-1000:]
val_summary = train_summary[-1000:]
y_val = train.rating[-1000:]

In [14]:
train_summary.shape

(13182, 1935)

In [18]:
tf.keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

item_embedding_layer = layers.Embedding(input_dim = item_vocab_size + 1, output_dim=128, input_length = None)
item_id = keras.Input(shape=(1,), name='item_ids')
item_embeddings = item_embedding_layer(item_id)
item_embeddings = layers.Dropout(rate=0.3)(item_embeddings)
item_embeddings = layers.Flatten()(item_embeddings)
item_embeddings = keras.Model(inputs=item_id, outputs=item_embeddings)

user_id = keras.Input(shape=(1,), name='user_ids')
user_embeddings = layers.Embedding(input_dim = user_vocab_size + 1, output_dim=128, input_length = None)(user_id)
user_embeddings = layers.Dropout(rate=0.3)(user_embeddings)
user_embeddings = layers.Flatten()(user_embeddings)
user_embeddings = keras.Model(inputs=user_id, outputs=user_embeddings)

summary_inputs = keras.Input(shape=(1935,), name='summary')
summ = layers.Dense(128, activation='relu', name='summary_dense_1')(summary_inputs)
summ = layers.Dropout(rate=0.2)(summ)
summ = layers.Dense(32, activation='relu', name='summary_dense_2')(summ)
summ = keras.Model(inputs=summary_inputs, outputs=summ)

combined = layers.concatenate([user_embeddings.output, item_embeddings.output, summ.output])
z = layers.BatchNormalization(name="bn_top_0")(combined)
z = layers.Dense(32, activation='relu', name='top_combined_dense_1')(z)
z = layers.Dropout(rate=0.2)(z)
z = layers.BatchNormalization(name="bn_top_1")(z)
z = layers.Dense(16, activation='relu', name='top_combined_dense_2')(z)
z = layers.Dense(1, name='predictions')(z)
hybrid_model = keras.Model(inputs=[user_id, item_id, summary_inputs], outputs=z)

In [19]:
hybrid_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.003),  # Optimizer
              # Loss function to minimize
              loss=tf.keras.losses.MSE,  
              # List of metrics to monitor
              metrics= [keras.metrics.MeanSquaredError()])

In [20]:
keras.backend.clear_session()
# del model
history = hybrid_model.fit([user_id_train, item_id_train, train_summary], y_train,
                    batch_size=1000,
                    epochs=30,
                    # We pass some validation for
                    # monitoring validation loss and metrics
                    # at the end of each epoch
                    validation_data=([user_id_val, item_id_val, val_summary], y_val)
                   )
print('\nhistory dict:', history.history)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

history dict: {'loss': [15.155194282531738, 4.2404093742370605, 1.7883899211883545, 1.0155471563339233, 0.7349841594696045, 0.5847600698471069, 0.4749837815761566, 0.3853587210178375, 0.3257328271865845, 0.2886633276939392, 0.2518046200275421, 0.22438767552375793, 0.19992296397686005, 0.17970795929431915, 0.16424237191677094, 0.16463910043239594, 0.14751270413398743, 0.14227379858493805, 0.12645871937274933, 0.12270812690258026, 0.11864369362592697, 0.11337385326623917, 0.11472485214471817, 0.11007941514253616, 0.10789966583251953, 0.10451383888721466, 0.10349541902542114, 0.09192182868719101, 0.09502140432596207, 0.09120315313339233], 'mea

### Evaluation

#### Training metrics

In [22]:
predictions = hybrid_model.predict([user_id_train, item_id_train, train_summary])

In [23]:
print("train rmse:", math.sqrt(mean_squared_error(y_train, predictions)))

train rmse: 0.5278610039725392


#### Test metrics

In [24]:
predictions = hybrid_model.predict([user_id_test, item_id_test, test_summary])

In [25]:
print("test rmse:", math.sqrt(mean_squared_error(y_test, predictions)))

test rmse: 0.8310363581374476
