In [2]:
from typing import Dict, Text
import pprintpp

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from tensorflow import keras # type: ignore
from keras import layers, Sequential, Model, losses, metrics, optimizers, backend




  from .autonotebook import tqdm as notebook_tqdm


In [3]:
interaction_data = pd.read_csv("./data/RAW_interactions.csv")
recipe_data = pd.read_csv("./data/RAW_recipes.csv")

interaction_train = pd.read_csv("./data/interactions_train.csv")
interaction_test = pd.read_csv("./data/interactions_test.csv")
interaction_validation = pd.read_csv("./data/interactions_validation.csv")

single_user_test = pd.read_csv("./data/single_user_data.csv")

In [4]:
# get avg ratings for each recipe and sort by number of ratings
df3_avg = interaction_data.groupby('recipe_id')
df3_avg = df3_avg['rating'].agg(['mean', 'count']).sort_values(by='count', ascending=False)
#show top 10s names from df2 and ratings from df3_avg
df3_avg = df3_avg.merge(recipe_data, left_on='recipe_id', right_on='id')
# change id to recipe_id
df3_avg = df3_avg.rename(columns={'id': 'recipe_id'})
# drop unnecessary columns
df3_avg = df3_avg.drop(columns=['minutes', 'contributor_id', 'n_steps', 'n_ingredients'])
df4 = df3_avg[:]

def score(mean, count):
    return mean + 2 * np.log10(count) -1

df3_avg = df3_avg[['recipe_id', 'mean', 'count']]
# merge columns mean and count to get score
df3_avg['score'] = df3_avg.apply(lambda x: score(x['mean'], x['count']), axis=1)
df3_avg = df3_avg.drop(columns=['mean', 'count'])

# merge interaction train and test with df3_avg
interaction_data = interaction_data.merge(df3_avg, left_on='recipe_id', right_on='recipe_id')
interaction_train = interaction_train.merge(df3_avg, left_on='recipe_id', right_on='recipe_id')
interaction_test = interaction_test.merge(df3_avg, left_on='recipe_id', right_on='recipe_id')
interaction_validation = interaction_validation.merge(df3_avg, left_on='recipe_id', right_on='recipe_id')

# merge df4 with df3_avg
df4 = df4.merge(df3_avg, left_on='recipe_id', right_on='recipe_id')
    


print(df3_avg.shape)
df3_avg.head(5)

(231637, 2)


Unnamed: 0,recipe_id,score
0,2886,9.601258
1,27208,9.697352
2,89204,9.617157
3,39087,9.862974
4,67256,9.57151


In [5]:
interaction_data = interaction_data.astype({'user_id': 'string', 'recipe_id':'string', 'rating':'float64', "score":'float64'})
interaction_train = interaction_train.astype({'user_id': 'string', 'recipe_id':'string', 'rating':'float64', "score":'float64'})
interaction_test = interaction_test.astype({'user_id': 'string', 'recipe_id':'string', 'rating':'float64', "score":'float64'})
interaction_validation = interaction_validation.astype({'user_id': 'string', 'recipe_id':'string', 'rating':'float64', "score":'float64'})

single_user_test = single_user_test.astype({'user_id': 'string', 'recipe_id':'string', 'rating':'float64'})

In [6]:
uniqueUserIds = interaction_data.user_id.unique()
uniqueFoodIds = interaction_data.recipe_id.unique()

In [7]:
class RankingModel(Model):

    def __init__(self):
        super().__init__()
        embedding_dimension = 32

        self.user_embeddings = Sequential([
                                    layers.experimental.preprocessing.StringLookup(
                                        vocabulary=uniqueUserIds, mask_token=None),
                                    layers.Embedding(len(uniqueUserIds)+1, embedding_dimension)
                                    ])

        self.food_embeddings = Sequential([
                                    layers.experimental.preprocessing.StringLookup(
                                        vocabulary=uniqueFoodIds, mask_token=None),
                                    layers.Embedding(len(uniqueFoodIds)+1, embedding_dimension),
                                    ])
        self.ratings = Sequential([
                            layers.Dense(256, activation="relu"),
                            layers.Dense(64,  activation="relu"),
                            layers.Dense(16,  activation="relu"),
                            layers.Dense(1)
                              ])

        
    def call(self, userId, foodId):
        user_embeddings  = self.user_embeddings (userId)
        food_embeddings = self.food_embeddings(foodId)
        return self.ratings(tf.concat([user_embeddings, food_embeddings], axis=1))

class FoodModel(tfrs.models.Model):

    def __init__(self):
        super().__init__()
        self.ranking_model: Model = RankingModel()
        self.task: layers.Layer   = tfrs.tasks.Ranking(
                                                    loss    =  losses.MeanSquaredError(),
                                                    metrics = [metrics.RootMeanSquaredError()])
        self.validation_metrics = [metrics.RootMeanSquaredError()]

    def compute_loss(self, features, training=False):
        rating_predictions = self.ranking_model(features["userID"], features["foodID"]  )

        return self.task( labels=features["rating"], predictions=rating_predictions)
    

    def save_model(self, *args, **kwargs):
        self.ranking_model.save(*args, **kwargs)

In [8]:
# score formula for ranking = avg rating + math.log(number of ratings)
def score_formula(avg_rating, num_ratings):
    return avg_rating + np.log(num_ratings)


train_data = tf.data.Dataset.from_tensor_slices(
{
    "userID":tf.cast(interaction_train.user_id.values, tf.string),
    "foodID":tf.cast(interaction_train.recipe_id.values, tf.string),
    "rating":tf.cast(interaction_train.rating.values, tf.float32),
    "score":tf.cast(interaction_train.score.values, tf.float32)
})

test_data = tf.data.Dataset.from_tensor_slices(
{
    "userID":tf.cast(interaction_test.user_id.values, tf.string),
    "foodID":tf.cast(interaction_test.recipe_id.values, tf.string),
    "rating":tf.cast(interaction_test.rating.values, tf.float32),
    "score":tf.cast(interaction_test.score.values, tf.float32)
})

validation_data = tf.data.Dataset.from_tensor_slices(
{
    "userID":tf.cast(interaction_validation.user_id.values, tf.string),
    "foodID":tf.cast(interaction_validation.recipe_id.values, tf.string),
    "rating":tf.cast(interaction_validation.rating.values, tf.float32),
    "score":tf.cast(interaction_validation.score.values, tf.float32)
})

In [11]:
tf.random.set_seed(42)

train_data = train_data.shuffle(100_000, seed=12, reshuffle_each_iteration=True)

In [17]:
model = FoodModel()
model.compile(optimizer=optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False),
                metrics=model.validation_metrics)
# clear cache from previous runs
backend.clear_session()

cached_train = train_data.shuffle(100_000).batch(8192).cache()
cached_test = test_data.batch(4096).cache()
cached_validation = validation_data.batch(4096).cache()
model.fit(cached_train, epochs=5, validation_data=cached_validation, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x19f79dbdd90>

In [13]:
model.evaluate(cached_test, return_dict=True, verbose=1)





{'root_mean_squared_error': 1.3714897632598877,
 'loss': 2.618216037750244,
 'regularization_loss': 0,
 'total_loss': 2.618216037750244}

In [14]:
import random
user_rand = random.choice(uniqueUserIds)
print(user_rand)
test_rating = {}
for m in test_data.take(10):
    test_rating[m["foodID"].numpy()]=RankingModel()(tf.convert_to_tensor([user_rand]),tf.convert_to_tensor([m["foodID"]])) # type: ignore
print("test_rating")
pprintpp.pprint(test_rating)

408066
test_rating
{
    b'118119': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.02444865]], dtype=float32)>,
    b'126118': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00833372]], dtype=float32)>,
    b'166712': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.0051669]], dtype=float32)>,
    b'186470': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.00497086]], dtype=float32)>,
    b'219596': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.0289848]], dtype=float32)>,
    b'228179': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00571151]], dtype=float32)>,
    b'298748': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00461718]], dtype=float32)>,
    b'435013': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.00280774]], dtype=float32)>,
    b'44551': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00873445]], dtype=float32)>,
    b'82783': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array(

In [15]:
print("Top 10 recommended products for User {}: ".format(user_rand))
for m in sorted(test_rating, key=test_rating.get, reverse=True): # type: ignore
    recipe = df4.loc[recipe_data['id'] == int(m.decode())]
    recipe = recipe[['recipe_id', 'mean', 'count', "score"]]
    print(recipe)

Top 10 recommended products for User 408066: 
        recipe_id  mean  count  score
170501     100419   5.0      1    4.0
        recipe_id  mean  count  score
188288     279963   5.0      1    4.0
        recipe_id  mean  count  score
205526     243247   5.0      1    4.0
       recipe_id  mean  count    score
26216       9166   4.0      8  4.80618
        recipe_id  mean  count  score
228424     315173   5.0      1    4.0
        recipe_id  mean  count    score
121728     339387   4.5      2  4.10206
       recipe_id  mean  count     score
11773       9425   4.4     15  5.752183
        recipe_id  mean  count  score
200904     207470   5.0      1    4.0
        recipe_id  mean  count  score
156275     364237   4.0      1    3.0
       recipe_id   mean  count    score
28415     272643  4.125      8  4.93118


In [16]:
# build and save model
model.save_model("./model/")

INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


TypeError: Unable to serialize <StringArray>
[     '38094',    '1293707',       '8937',     '126440',      '57222',
      '52282',     '124416', '2000192946',      '76535',     '273745',
 ...
     '157255', '2002300998', '2002212283', '2000497761', '2000145340',
 '2001868099',    '1197076',    '2405600', '2000127684',     '116593']
Length: 226570, dtype: string to JSON. Unrecognized type <class 'pandas.core.arrays.string_.StringArray'>.