In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from IPython.core.display_functions import display
import matplotlib.pyplot as plt

In [2]:
# Width of each user/item latent vector; passed as `embedding_size` when the
# model is built.
EMBEDDING_SIZE = 50


class RecommenderNet(keras.Model):
    """Embedding-based recommender: user/item dot product plus bias terms.

    Each user and each item gets a learned vector of length ``embedding_size``
    and a learned scalar bias. The prediction for a (user, item) pair is
    ``sigmoid(user_vector . item_vector + user_bias + item_bias)``.
    """

    def __init__(self, num_users, num_items, embedding_size, **kwargs):
        """Create the embedding tables.

        Args:
            num_users: Number of distinct user indices (rows of the user tables).
            num_items: Number of distinct item indices (rows of the item tables).
            embedding_size: Length of each user/item latent vector.
            **kwargs: Forwarded unchanged to ``keras.Model.__init__``.
        """
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        # One scalar bias per user / per item (embedding of width 1).
        self.user_bias = layers.Embedding(num_users, 1)
        self.item_embedding = layers.Embedding(
            num_items,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.item_bias = layers.Embedding(num_items, 1)

    def call(self, inputs):
        """Score a batch of (user, item) index pairs.

        Args:
            inputs: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in (0, 1).
        """
        user_ids = inputs[:, 0]
        item_ids = inputs[:, 1]
        user_vector = self.user_embedding(user_ids)
        user_bias = self.user_bias(user_ids)
        item_vector = self.item_embedding(item_ids)
        item_bias = self.item_bias(item_ids)
        # Row-wise dot product, one score per (user, item) pair.
        # tf.tensordot(..., 2) would also contract the batch axis and collapse
        # the whole batch into a single scalar shared by every prediction.
        dot_user_item = tf.reduce_sum(
            user_vector * item_vector, axis=1, keepdims=True
        )
        # Add all the components (including bias)
        x = dot_user_item + user_bias + item_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)

In [3]:
def df_encoding(df: pd.DataFrame) -> "tuple[pd.DataFrame, int, int, float, float]":
    """Encode raw user/item ids as dense integer indices and cast ratings.

    Distinct ``userId`` / ``itemId`` values are numbered ``0..n-1`` in order of
    first appearance and stored in new ``user`` / ``item`` columns. ``rating``
    is cast to ``float32``. The input frame is left untouched; the encoded
    data is returned as a new frame.

    Args:
        df: Ratings table with ``userId``, ``itemId`` and ``rating`` columns.

    Returns:
        ``(encoded_df, num_users, num_items, min_rating, max_rating)`` where
        the rating bounds are plain Python floats.

    Raises:
        ValueError: If ``df`` has no rows (no rating range can be computed).
        KeyError: If a required column is missing.
    """
    if len(df) == 0:
        raise ValueError("df_encoding() requires at least one rating row")

    user_ids = df["userId"].unique().tolist()
    item_ids = df["itemId"].unique().tolist()
    user_to_index = {raw_id: idx for idx, raw_id in enumerate(user_ids)}
    item_to_index = {raw_id: idx for idx, raw_id in enumerate(item_ids)}

    # assign() builds a new frame, so re-running a cell never sees a
    # half-mutated input.
    encoded = df.assign(
        user=df["userId"].map(user_to_index),
        item=df["itemId"].map(item_to_index),
        rating=df["rating"].astype(np.float32),
    )

    # min and max ratings will be used to normalize the ratings later.
    # Series.min/max are vectorised and skip NaN; float() keeps the returned
    # bounds as plain Python floats.
    min_rating = float(encoded["rating"].min())
    max_rating = float(encoded["rating"].max())

    return encoded, len(user_ids), len(item_ids), min_rating, max_rating

In [4]:
# Load the raw ratings table.
df_train = pd.read_csv("data/train.csv")
# Only a cell's final expression is rendered, so preview the frame once here.
df_train

Unnamed: 0,userId,itemId,rating,date
0,cgexjc,682978,4,2017-08-13 13:23:35
1,cgexjc,320898,4,2019-01-18 15:56:07
2,cgexjc,29028,4,2017-08-13 14:03:55
3,cgexjc,399148,5,2017-08-13 13:59:51
4,cgexjc,734055,4,2019-01-18 15:37:29
...,...,...,...,...
8612102,blqyzd,148210,1,2017-08-05 14:15:09
8612103,blqyzd,735535,4,2017-08-05 14:14:35
8612104,blqyzd,124242,3,2017-08-05 14:25:33
8612105,blqyzd,948393,4,2017-08-05 13:25:21


In [5]:
# Encode ids and recover the rating range used to scale the targets.
df_train, num_users, num_items, min_rating, max_rating = df_encoding(df_train)

# Reproducible 15% subsample of the full ratings table.
data = df_train.sample(frac=0.15, random_state=1111)

# Split the subsample 50/50 into training and validation rows.
train_df = data.sample(frac=0.5, random_state=1111)
# Hold out the rest of the *subsample*. Dropping from df_train instead would
# turn every unsampled row of the full table into validation data.
val_df = data.drop(train_df.index)

x_train = train_df[["user", "item"]].values
x_val = val_df[["user", "item"]].values

# Normalize the targets between 0 and 1 (the model ends in a sigmoid).
rating_span = max_rating - min_rating
assert rating_span > 0, "ratings are constant; cannot min-max scale"
y_train = ((train_df["rating"] - min_rating) / rating_span).values
y_val = ((val_df["rating"] - min_rating) / rating_span).values


model = RecommenderNet(num_users, num_items, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.0007),
)


history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=3,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Encode ids and recover the rating range used to scale the targets.
df_train, num_users, num_items, min_rating, max_rating = df_encoding(df_train)

# Reproducible 15% subsample of the full ratings table.
data = df_train.sample(frac=0.15, random_state=1111)

# Split the subsample 50/50 into training and validation rows.
train_df = data.sample(frac=0.5, random_state=1111)
# Hold out the rest of the *subsample*. Dropping from df_train instead would
# turn every unsampled row of the full table into validation data.
val_df = data.drop(train_df.index)

x_train = train_df[["user", "item"]].values
x_val = val_df[["user", "item"]].values

# Normalize the targets between 0 and 1 (the model ends in a sigmoid).
rating_span = max_rating - min_rating
assert rating_span > 0, "ratings are constant; cannot min-max scale"
y_train = ((train_df["rating"] - min_rating) / rating_span).values
y_val = ((val_df["rating"] - min_rating) / rating_span).values


model = RecommenderNet(num_users, num_items, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.0007),
)


# Same setup as the 15% / batch-size-64 run, but with batch_size=32.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=3,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/3

In [None]:
# Encode ids and recover the rating range used to scale the targets.
df_train, num_users, num_items, min_rating, max_rating = df_encoding(df_train)

# Reproducible 10% subsample of the full ratings table.
data = df_train.sample(frac=0.1, random_state=1111)

# Split the subsample 60/40 into training and validation rows.
train_df = data.sample(frac=0.6, random_state=1111)
# Hold out the rest of the *subsample*. Dropping from df_train instead would
# turn every unsampled row of the full table into validation data.
val_df = data.drop(train_df.index)

x_train = train_df[["user", "item"]].values
x_val = val_df[["user", "item"]].values

# Normalize the targets between 0 and 1 (the model ends in a sigmoid).
rating_span = max_rating - min_rating
assert rating_span > 0, "ratings are constant; cannot min-max scale"
y_train = ((train_df["rating"] - min_rating) / rating_span).values
y_val = ((val_df["rating"] - min_rating) / rating_span).values


model = RecommenderNet(num_users, num_items, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.0007),
)


history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=3,
    verbose=1,
    validation_data=(x_val, y_val),
)

In [None]:
# Encode ids and recover the rating range used to scale the targets.
df_train, num_users, num_items, min_rating, max_rating = df_encoding(df_train)

# Reproducible 10% subsample of the full ratings table.
data = df_train.sample(frac=0.1, random_state=1111)

# Split the subsample 60/40 into training and validation rows.
train_df = data.sample(frac=0.6, random_state=1111)
# Hold out the rest of the *subsample*. Dropping from df_train instead would
# turn every unsampled row of the full table into validation data.
val_df = data.drop(train_df.index)

x_train = train_df[["user", "item"]].values
x_val = val_df[["user", "item"]].values

# Normalize the targets between 0 and 1 (the model ends in a sigmoid).
rating_span = max_rating - min_rating
assert rating_span > 0, "ratings are constant; cannot min-max scale"
y_train = ((train_df["rating"] - min_rating) / rating_span).values
y_val = ((val_df["rating"] - min_rating) / rating_span).values


model = RecommenderNet(num_users, num_items, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.0007),
)


# Same setup as the 10% / batch-size-64 run, but with batch_size=32.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=3,
    verbose=1,
    validation_data=(x_val, y_val),
)