In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from scipy.special import softmax
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import math

In [2]:
# Read the three CSV inputs; the order of the paths matches the order of the
# assignment targets (recipes, reviews, recipe_ids).
# Note: `reviews` comes from all_users.csv; the alternative source
# "../src/data/reviews.csv" is intentionally left disabled.
recipes, reviews, recipe_ids = map(
    pd.read_csv,
    (
        "../src/data/recipes.csv",
        "../src/data/all_users.csv",
        "../src/data/recipe_ids.csv",
    ),
)

In [3]:
# Lookup table mapping each distinct `user_id` (in order of first appearance
# in `reviews`) to a dense, 0-based integer id. The positional index of the
# unique values becomes the `transformed_user_id` column.
transformed_user_ids = (
    pd.Series(reviews["user_id"].unique(), name="user_id")
    .rename_axis("transformed_user_id")
    .reset_index()
)

In [4]:
# Lookup table mapping each distinct `recipe_id` (in order of first appearance
# in `reviews`) to a dense, 0-based integer id. The positional index of the
# unique values becomes the `transformed_recipe_id` column.
transformed_recipe_ids = (
    pd.Series(reviews["recipe_id"].unique(), name="recipe_id")
    .rename_axis("transformed_recipe_id")
    .reset_index()
)

In [5]:
# Attach the dense 0-based user / recipe ids to every review row.
#
# Any transformed-id columns left over from a previous run of this cell are
# dropped first. Without that, re-running the cell makes pandas emit
# `transformed_*_id_x` / `transformed_*_id_y` columns, and the later
# `reviews.transformed_user_id` lookups fail.
#
# `validate="many_to_one"` asserts that each lookup table has exactly one row
# per key, so the merges can never duplicate review rows.
reviews = (
    reviews
    .drop(columns=["transformed_user_id", "transformed_recipe_id"], errors="ignore")
    .merge(transformed_user_ids, on="user_id", validate="many_to_one")
    .merge(transformed_recipe_ids, on="recipe_id", validate="many_to_one")
)

In [6]:
# Shift ratings down by one so the class labels start at 0, which is what the
# sparse categorical cross-entropy loss used for training expects.
reviews["transformed_rating"] = reviews["rating"].sub(1)

In [7]:
# Despite the `_vocab_size` names, these hold the LARGEST transformed id.
# Ids are dense and 0-based, so the number of distinct ids is this value + 1;
# the embedding layers add that 1 when sizing their tables.
user_vocab_size = reviews["transformed_user_id"].max()
item_vocab_size = reviews["transformed_recipe_id"].max()

In [8]:
# Start from a clean Keras state and pin the NumPy / TensorFlow seeds so the
# weight initialisation is repeatable.
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# NOTE: dense towers over raw item (19 features) and user (3 features) vectors
# were prototyped in this cell but are disabled; the model below only uses
# id embeddings.

# --- Item tower: item id -> 128-d embedding -> dropout -> flatten -----------
# The table has `item_vocab_size + 1` rows because `item_vocab_size` is the
# largest 0-based id, not a count.
item_embedding_layer = layers.Embedding(
    input_dim=item_vocab_size + 1,
    output_dim=128,
)
item_id = keras.Input(shape=(1,), name="item_ids")
item_embeddings = item_embedding_layer(item_id)
item_embeddings = layers.Dropout(rate=0.3)(item_embeddings)
item_embeddings = layers.Flatten()(item_embeddings)
item_embeddings = keras.Model(inputs=item_id, outputs=item_embeddings)

# --- User tower: user id -> 128-d embedding -> dropout -> flatten -----------
user_id = keras.Input(shape=(1,), name="user_ids")
user_embeddings = layers.Embedding(
    input_dim=user_vocab_size + 1,
    output_dim=128,
)(user_id)
user_embeddings = layers.Dropout(rate=0.3)(user_embeddings)
user_embeddings = layers.Flatten()(user_embeddings)
user_embeddings = keras.Model(inputs=user_id, outputs=user_embeddings)

# --- Interaction term: dot product of the two embedding vectors -------------
dot = layers.Dot(axes=1)([user_embeddings.output, item_embeddings.output])
s = keras.Model(inputs=[user_id, item_id], outputs=dot)

# --- MLP head over the concatenated embeddings ------------------------------
combined = layers.Concatenate()([user_embeddings.output, item_embeddings.output])
z = layers.BatchNormalization(name="bn_top_0")(combined)
z = layers.Dense(32, activation="relu", name="top_combined_dense_1")(z)
z = layers.Dropout(rate=0.2)(z)
z = layers.BatchNormalization(name="bn_top_1")(z)
z = layers.Dense(16, activation="relu", name="top_combined_dense_2")(z)

# Append the dot-product score to the MLP features, then project to 5 raw
# logits (one per rating class; no softmax is applied here).
z = layers.Concatenate()([z, s.output])
z = layers.Dense(5, name="predictions")(z)
model = keras.Model(inputs=[user_id, item_id], outputs=z)

In [9]:
# Configure training: Adam optimiser, sparse categorical cross-entropy on the
# raw logits produced by the `predictions` layer (hence `from_logits=True`),
# and accuracy as the monitored metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [10]:
# Temporal split: reviews dated before 2018 are used for training, the rest
# for testing.
# NOTE(review): `date` is compared against a string literal; if the column
# holds strings this is only chronological for ISO-style dates — confirm.
train = reviews[reviews["date"].lt("2018-01-01")]
test = reviews[reviews["date"].ge("2018-01-01")]

In [11]:
# Training inputs / labels: everything in `train` except its last 1000 rows.
user_id_train = train["transformed_user_id"].iloc[:-1000]
item_id_train = train["transformed_recipe_id"].iloc[:-1000]
y_train = train["transformed_rating"].iloc[:-1000]

# Validation inputs / labels: the last 1000 rows of `train` (positional
# slice, so whichever rows happen to come last in the frame).
user_id_val = train["transformed_user_id"].iloc[-1000:]
item_id_val = train["transformed_recipe_id"].iloc[-1000:]
y_val = train["transformed_rating"].iloc[-1000:]

# Test inputs / labels: the full `test` frame.
user_id_test = test["transformed_user_id"]
item_id_test = test["transformed_recipe_id"]
y_test = test["transformed_rating"]

In [12]:
# Inverse relative frequency of each rating class in the training labels
# (1 / share of rows); larger values mean rarer classes. Displayed as a guide
# for choosing class weights.
y_train.value_counts().pipe(lambda counts: 1 / (counts / counts.sum()))

4     1.279805
3     7.095766
2    22.655011
1    52.392505
0    69.084525
Name: transformed_rating, dtype: float64

In [13]:
# Train the rating classifier.
#
# Do NOT call `keras.backend.clear_session()` in this cell: the model is
# already built and compiled at this point, and clearing the global Keras
# state underneath a live model is not what that function is for. The session
# is cleared once, before the model is constructed.
history = model.fit(
    [user_id_train, item_id_train],
    y_train,
    batch_size=2000,
    epochs=10,
    # Up-weight the rare low ratings to counter the class imbalance
    # (class index = rating - 1). These are hand-picked constants, not
    # derived from the label distribution at run time.
    class_weight={
        0: 37,
        1: 36,
        2: 17,
        3: 5,
        4: 1,
    },
    # Held-out rows used to monitor validation loss and accuracy at the end
    # of each epoch.
    validation_data=([user_id_val, item_id_val], y_val),
)
print('\nhistory dict:', history.history)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

history dict: {'loss': [6.156229496002197, 5.1989264488220215, 4.688233375549316, 4.0345354080200195, 3.291860342025757, 2.552598714828491, 1.9457457065582275, 1.4960805177688599, 1.2375755310058594, 1.0396062135696411], 'accuracy': [0.2599668800830841, 0.3872680068016052, 0.48477205634117126, 0.5683093070983887, 0.6448255181312561, 0.7357979416847229, 0.8054436445236206, 0.848304033279419, 0.8781576156616211, 0.892180860042572], 'val_loss': [1.5793732404708862, 1.5590441226959229, 1.556445598602295, 1.5145902633666992, 1.376549482345581, 1.1580214500427246, 1.0053386688232422, 0.9313068985939026, 0.9248469471931458, 0.9442068934440613], 'val_accuracy': [0.35600000619888306, 0.4950000047683716, 0.31299999356269836, 0.41999998688697815, 0.7329999804496765, 0.7409999966621399, 0.7409999966621399, 0.7409999966621399, 0.7409999966621399, 0.7409999966621399]}


### Evaluation

#### Training metrics

In [14]:
# Evaluate the model on the test data using `evaluate`
# Evaluate the model on the TRAIN data using `evaluate`.
# `batch_size` equals the number of training rows, so the whole set is scored
# in a single batch.
print("\n# Evaluate on train data")
results = model.evaluate(
    [user_id_train, item_id_train],
    y_train,
    batch_size=len(y_train),
    verbose=0,
)
print("train loss, train accuracy:", results)


# Evaluate on train data
train loss, train accuracy: [0.6205962300300598, 0.7813687920570374]


In [15]:
# Verify with sklearn metrics
# Cross-check the accuracy reported by Keras with scikit-learn.
# The model outputs raw logits, so turn them into per-class probabilities and
# pick the most likely class for each row.
logits = model.predict([user_id_train, item_id_train])
proba = softmax(logits, axis=1)
predictions = proba.argmax(axis=1)
print("train accuracy:", accuracy_score(y_train, predictions))

train accuracy: 0.7813688212927756


In [16]:
# RMSE between the true and predicted class indices. `predictions` must hold
# the TRAIN predictions at this point (it is reused for the test set later).
# Both sides are shifted by the same constant (rating - 1), so this equals
# the RMSE on the original rating scale.
print(
    "train rmse:",
    math.sqrt(mean_squared_error(y_true=y_train, y_pred=predictions)),
)

train rmse: 0.849041304380012


#### Test metrics

In [17]:
# Evaluate the model on the test data using `evaluate`
# Evaluate the model on the test data using `evaluate`.
# `batch_size` equals the number of test rows, so the whole set is scored in
# a single batch.
print("\n# Evaluate on test data")
results = model.evaluate(
    [user_id_test, item_id_test],
    y_test,
    batch_size=len(y_test),
    verbose=0,
)
print("test loss, test accuracy:", results)


# Evaluate on test data
test loss, test accuracy: [0.7539902925491333, 0.8062964081764221]


In [18]:
# Verify with sklearn metrics
# Cross-check the accuracy reported by Keras with scikit-learn.
# The model outputs raw logits, so turn them into per-class probabilities and
# pick the most likely class for each row.
# Note: this overwrites `logits`, `proba` and `predictions` from the train
# evaluation.
logits = model.predict([user_id_test, item_id_test])
proba = softmax(logits, axis=1)
predictions = proba.argmax(axis=1)
print("test accuracy:", accuracy_score(y_test, predictions))

test accuracy: 0.8062964125091517


In [19]:
# RMSE between the true and predicted class indices. `predictions` must hold
# the TEST predictions at this point.
# Both sides are shifted by the same constant (rating - 1), so this equals
# the RMSE on the original rating scale.
print(
    "test rmse:",
    math.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions)),
)

test rmse: 0.7704944846414414
