# Lesson 6b: Factorization Machines with Keras

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Okay bitte :((((

In [17]:
class TopPopRecommender():

    def fit(self, train):

        item_popularity = train[['movie_id','rating']].groupby(by='movie_id').count()

        self.train = train
        # We are not interested in sorting the popularity value,
        # but to order the items according to it
        self.popular_items = item_popularity.sort_values(by='rating',ascending=False).index


    def predict_top(self, user_id, at=5, remove_seen=True):

        if remove_seen:
            seen_items = self.train[self.train.user_id==user_id].movie_id.values
            unseen_items_mask = np.in1d(self.popular_items, seen_items, assume_unique=True, invert = True)
            unseen_items = self.popular_items[unseen_items_mask]
            recommended_items = unseen_items[0:at]

        else:
            recommended_items = self.popular_items[0:at]

        return recommended_items

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pathlib import Path
from zipfile import ZipFile
!mkdir models
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *

In [4]:
from google.colab import drive
drive.mount('/content/drive')
complete_train = pd.read_csv('/content/drive/My Drive/_Universität_HPI/Semester 6/Recommenders/Assignments/Assignment 1/data/train.csv')
complete_train_og = complete_train.copy()
test = pd.read_csv('/content/drive/My Drive/_Universität_HPI/Semester 6/Recommenders/Assignments/Assignment 1/givenExample/kaggle_baseline.csv')

Mounted at /content/drive


In [5]:
def text2seq(text, n_genre):
    """ using tokenizer to encoded the multi-level categorical feature
    """
    tokenizer = Tokenizer(lower=True, split='|',filters='', num_words=n_genre)
    tokenizer.fit_on_texts(text)
    seq = tokenizer.texts_to_sequences(text)
    seq = pad_sequences(seq, maxlen=3,padding='post')
    return seq

In [6]:
df = complete_train.copy()
df = df.rename(columns={"release_date": "movie_genre", "age": "sex", "sex": "age"})
df

Unnamed: 0,user_id,title,movie_id,rating,movie_genre,age,sex
0,2592,Top Gun (1986),1101,4,Action|Romance,50,M
1,4318,12 Angry Men (1957),1203,4,Drama,25,M
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,M
3,1706,Modern Times (1936),3462,5,Comedy,25,M
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,F
...,...,...,...,...,...,...,...
800162,59,"Big Chill, The (1983)",2352,4,Comedy|Drama,50,F
800163,4458,So I Married an Axe Murderer (1993),543,4,Comedy|Romance|Thriller,25,F
800164,1234,Almost Famous (2000),3897,4,Comedy|Drama,18,M
800165,4864,"Fish Called Wanda, A (1988)",1079,5,Comedy,18,M


In [7]:
n_genre = 18
df['movie_genre'] = text2seq(df['movie_genre'].values, n_genre=n_genre).tolist()

In [8]:
df

Unnamed: 0,user_id,title,movie_id,rating,movie_genre,age,sex
0,2592,Top Gun (1986),1101,4,"[3, 6, 0]",50,M
1,4318,12 Angry Men (1957),1203,4,"[2, 0, 0]",25,M
2,2756,Robocop 2 (1990),2986,2,"[3, 8, 5]",18,M
3,1706,Modern Times (1936),3462,5,"[1, 0, 0]",25,M
4,4813,Milk Money (1994),276,3,"[1, 6, 0]",35,F
...,...,...,...,...,...,...,...
800162,59,"Big Chill, The (1983)",2352,4,"[1, 2, 0]",50,F
800163,4458,So I Married an Axe Murderer (1993),543,4,"[1, 6, 4]",25,F
800164,1234,Almost Famous (2000),3897,4,"[1, 2, 0]",18,M
800165,4864,"Fish Called Wanda, A (1988)",1079,5,"[1, 0, 0]",18,M


In [9]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(df, test_size=0.2, random_state=7)

In [10]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *

def define_input_layers():
    # numerical features
    fea3_input = Input((1,), name = 'input_fea3')
    num_inputs = [fea3_input]
    # single level categorical features
    uid_input = Input((1,), name = 'input_uid') #user_id
    mid_input = Input((1,), name= 'input_mid')  #movie_id
    cat_sl_inputs = [uid_input, mid_input]

    # multi level categorical features (with 3 genres at most)
    genre_input = Input((3,), name = 'input_genre')
    cat_ml_inputs = [genre_input]

    inputs = num_inputs + cat_sl_inputs + cat_ml_inputs

    return inputs

inputs = define_input_layers()

In [11]:
def Tensor_Mean_Pooling(name = 'mean_pooling', keepdims = False):
    return Lambda(lambda x: K.mean(x, axis = 1, keepdims=keepdims), name = name)

def fm_1d(inputs, n_uid, n_mid, n_genre):

    # user feat3 + user embedding + movie embedding + genre embedding
    fea3_input, uid_input, mid_input, genre_input = inputs

    # all tensors are reshape to (None, 1)
    num_dense_1d = [Dense(1, name = 'num_dense_1d_fea4')(fea3_input)]
    cat_sl_embed_1d = [Embedding(n_uid + 1, 1, name = 'cat_embed_1d_uid')(uid_input),
                        Embedding(n_mid + 1, 1, name = 'cat_embed_1d_mid')(mid_input)]
    cat_ml_embed_1d = [Embedding(n_genre + 1, 1, mask_zero=True, name = 'cat_embed_1d_genre')(genre_input)]

    cat_sl_embed_1d = [Reshape((1,))(i) for i in cat_sl_embed_1d]
    cat_ml_embed_1d = [Tensor_Mean_Pooling(name = 'embed_1d_mean')(i) for i in cat_ml_embed_1d]

    # add all tensors
    y_fm_1d = Add(name = 'fm_1d_output')(num_dense_1d + cat_sl_embed_1d + cat_ml_embed_1d)

    return y_fm_1d

y_1d = fm_1d(inputs, 10, 10, 10)

In [12]:
def fm_2d(inputs, n_uid, n_mid, n_genre, k):

    fea3_input, uid_input, mid_input, genre_input = inputs

    num_dense_2d = [Dense(k, name = 'num_dense_2d_fea3')(fea3_input)] # shape (None, k)
    num_dense_2d = [Reshape((1,k))(i) for i in num_dense_2d] # shape (None, 1, k)

    cat_sl_embed_2d = [Embedding(n_uid + 1, k, name = 'cat_embed_2d_uid')(uid_input),
                       Embedding(n_mid + 1, k, name = 'cat_embed_2d_mid')(mid_input)] # shape (None, 1, k)

    cat_ml_embed_2d = [Embedding(n_genre + 1, k, name = 'cat_embed_2d_genre')(genre_input)] # shape (None, 3, k)
    cat_ml_embed_2d = [Tensor_Mean_Pooling(name = 'cat_embed_2d_genure_mean', keepdims=True)(i) for i in cat_ml_embed_2d] # shape (None, 1, k)

    # concatenate all 2d embed layers => (None, ?, k)
    embed_2d = Concatenate(axis=1, name = 'concat_embed_2d')(num_dense_2d + cat_sl_embed_2d + cat_ml_embed_2d)

    # calcuate the interactions by simplication
    # sum of (x1*x2) = sum of (0.5*[(xi)^2 - (xi^2)])
    tensor_sum = Lambda(lambda x: K.sum(x, axis = 1), name = 'sum_of_tensors')
    tensor_square = Lambda(lambda x: K.square(x), name = 'square_of_tensors')

    sum_of_embed = tensor_sum(embed_2d)
    square_of_embed = tensor_square(embed_2d)

    square_of_sum = Multiply()([sum_of_embed, sum_of_embed])
    sum_of_square = tensor_sum(square_of_embed)

    sub = Subtract()([square_of_sum, sum_of_square])
    sub = Lambda(lambda x: x*0.5)(sub)
    y_fm_2d = Reshape((1,), name = 'fm_2d_output')(tensor_sum(sub))

    return y_fm_2d, embed_2d

y_fm2_d, embed_2d = fm_2d(inputs, 10, 10, 10, 5)

In [13]:
def fm_model(n_uid, n_mid, n_genre, k, dnn_dr):

    inputs = define_input_layers()

    y_fm_1d = fm_1d(inputs, n_uid, n_mid, n_genre)
    y_fm_2d, embed_2d = fm_2d(inputs, n_uid, n_mid, n_genre, k)


    # combinded deep and fm parts
    y = Concatenate()([y_fm_1d, y_fm_2d])
    y = Dense(1, name = 'fm_output')(y)

    fm_model_1d = Model(inputs, y_fm_1d)
    fm_model_2d = Model(inputs, y_fm_2d)
    fm_model = Model(inputs, y)

    return fm_model_1d, fm_model_2d, fm_model

In [14]:
params = {
    'n_uid': df['user_id'].max(),
    'n_mid': df['movie_id'].max(),
    'n_genre': 18,
    'k':20,
    'dnn_dr': 0.5
}

fm_model_1d, fm_model_2d, fm_model = fm_model(**params)

In [15]:
def df2xy(ratings):
    x = [df['age'].values,
         df['user_id'].values,
         df['movie_id'].values,
         np.concatenate(df['movie_genre'].values).reshape(-1,3)]
    y = df['rating'].values
    return x,y

train_x, train_y = df2xy(train)
valid_x, valid_y = df2xy(val)

In [44]:
from tensorflow.keras.callbacks import  EarlyStopping, ModelCheckpoint
fm_model_2d.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(learning_rate=0.001)
)
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model_ckp = ModelCheckpoint(filepath='./models/deepfm_weights.h5',
                            monitor='val_loss',
                            save_weights_only=True,
                            save_best_only=True)
callbacks = [model_ckp,early_stop]
train_history = fm_model_2d.fit(train_x, train_y,
                                  epochs=100, batch_size=2048,
                                  validation_data=(valid_x, valid_y),
                                  callbacks = callbacks)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100


In [16]:
from tensorflow.keras.callbacks import  EarlyStopping, ModelCheckpoint
fm_model_2d.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(learning_rate=0.001)
)
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model_ckp = ModelCheckpoint(filepath='./models/deepfm_weights.h5',
                            monitor='val_loss',
                            save_weights_only=True,
                            save_best_only=True)
callbacks = [model_ckp,early_stop]
train_history = fm_model_2d.fit(train_x, train_y,
                                  epochs=100, batch_size=1024,
                                  validation_data=(valid_x, valid_y),
                                  callbacks = callbacks)





Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100


In [20]:
user_id = 1
topPopular = TopPopRecommender()
topPopular.fit(complete_train_og)
user_ages_lookup = df[['user_id', 'age']].drop_duplicates().set_index('uid')['age'].to_dict()




######################


unseen_movie_ids = np.array(topPopular.predict_top(user_id, at=200, remove_seen=True))
user_age = user_ages_lookup.get(user_id, 0)
age_array = np.array((len(unseen_movie_ids), 1), fill_value=user_age)
id_array = np.array((len(unseen_movie_ids), 1), fill_value=user_age)
#dummy_input = [np.array([25]), np.array([2592]), np.array([1101]), np.array([[1, 0, 0]])]
input = [age_array, ]

# Predict with the model
prediction = fm_model_2d.predict(dummy_input)

# Print the prediction
print(prediction)

[[2.5463617]]


0.4588

In [None]:




def get_recommendations(user_id):

    user_age = user_ages_lookup.get(user_id, 0)  # Default 0 if user_id is not found

    # Generate all possible unseen movie IDs using the topPopular recommender
    unseen_movie_ids = np.array(topPopular.predict_top(user_id, at=200, remove_seen=True))

    # Prepare the prediction input arrays
    sex_array = np.full((len(unseen_movie_ids), 1), fill_value=user_sex)
    age_array = np.full((len(unseen_movie_ids), 1), fill_value=user_age)
    genres_array = np.array([[genre_dict.get(genre, 0) for genre in genres_columns]
                    if (genre_dict := self.movie_genres_lookup.get(mid)) is not None
                    else np.zeros(len(genres_columns)) for mid in unseen_movie_ids])
    release_year_array = np.array([self.movie_release_year_lookup.get(mid, 0) for mid in unseen_movie_ids])
    user_ids_array = np.full((len(unseen_movie_ids), ), user_id)
    movie_ids_array = unseen_movie_ids  # Ensure it's in a format that supports reshape

    # Prepare the data in the correct format, incorporating all features
    predict_x = [user_ids_array.reshape(-1, 1), movie_ids_array.reshape(-1, 1),
                sex_array, release_year_array.reshape(-1, 1), age_array, genres_array]

    # Predict the ratings for all movies for the given user_id
    predicted_ratings = self.model.predict(predict_x).flatten()

    # Sort the predictions and select top N
    top_n_movie_indices = np.argsort(-predicted_ratings)[:top_n]
    top_n_movie_ids = unseen_movie_ids[top_n_movie_indices]

    return top_n_movie_ids