# Lesson 6b: Factorization Machines with Keras

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Factorization Machine Model

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pathlib import Path
from zipfile import ZipFile


In [None]:
## FACTORIZATION MODELS

# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# Use the ratings.csv file
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-1m.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-1m"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")


def load_ratings(movielens_dir):
    COL_NAME = ['uid','mid','rating','timestamp']
    df = pd.read_csv(movielens_dir / 'ratings.dat',sep='::', header=None, engine='python', names=COL_NAME)
    return df

def load_movies(movielens_dir):
    COL_NAME = ['mid','movie_name','movie_genre']
    df = pd.read_csv(movielens_dir / 'movies.dat',sep='::', header=None, engine='python', names=COL_NAME, encoding='latin-1')
    return df

def load_users(movielens_dir):
    COL_NAME = ['uid','user_fea1','user_fea2','user_fea3','user_fea4']
    df = pd.read_csv(movielens_dir / 'users.dat',sep='::', header=None, engine='python', names=COL_NAME)
    return df

def text2seq(text, n_genre):
    """ using tokenizer to encoded the multi-level categorical feature
    """
    tokenizer = Tokenizer(lower=True, split='|',filters='', num_words=n_genre)
    tokenizer.fit_on_texts(text)
    seq = tokenizer.texts_to_sequences(text)
    seq = pad_sequences(seq, maxlen=3,padding='post')
    return seq

n_genre = 15


#ratings = load_ratings(movielens_dir)
movies = load_movies(movielens_dir)
users = load_users(movielens_dir)


print("====== rating.dat ======")
#print(ratings.head())
print("===== movies.dat ======")
print(movies.head())
print("====== users.dat ======")
print(users.head())

movies['movie_genre'] = text2seq(movies.movie_genre.values, n_genre=n_genre).tolist()
print(movies.head())

#ratings = pd.merge(pd.merge(ratings, users), movies)

print("====== preprocessed data =======")
#ratings.head()

   mid                          movie_name                   movie_genre
0    1                    Toy Story (1995)   Animation|Children's|Comedy
1    2                      Jumanji (1995)  Adventure|Children's|Fantasy
2    3             Grumpier Old Men (1995)                Comedy|Romance
3    4            Waiting to Exhale (1995)                  Comedy|Drama
4    5  Father of the Bride Part II (1995)                        Comedy
   uid user_fea1  user_fea2  user_fea3 user_fea4
0    1         F          1         10     48067
1    2         M         56         16     70072
2    3         M         25         15     55117
3    4         M         45          7     02460
4    5         M         25         20     55455
   mid                          movie_name movie_genre
0    1                    Toy Story (1995)   [9, 2, 0]
1    2                      Jumanji (1995)   [7, 9, 0]
2    3             Grumpier Old Men (1995)   [2, 5, 0]
3    4            Waiting to Exhale (1995)   [2,

In [None]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/My Drive/MasterBarca/kaggle/input/train.csv')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# uid, mid,rating,user_fea1:sex,user_fea2:age,movie_genre
df = df.rename(columns={'user_id': 'uid','movie_id':'mid'})
df = df.rename(columns={'sex': 'age','age':'sex','title':'movie_name'})
df['movie_genre'] = text2seq(df.release_date.values, n_genre=n_genre).tolist()
#df['user_fea3']=8.2
df['sex'] = df['sex'].replace({'M': 1, 'F': 0})


In [None]:
ratings=df
ratings.head()

Unnamed: 0,uid,movie_name,mid,rating,release_date,age,sex,movie_genre
0,2592,Top Gun (1986),1101,4,Action|Romance,50,1,"[3, 6, 0]"
1,4318,12 Angry Men (1957),1203,4,Drama,25,1,"[2, 0, 0]"
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,1,"[3, 8, 5]"
3,1706,Modern Times (1936),3462,5,Comedy,25,1,"[1, 0, 0]"
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,0,"[1, 6, 0]"


# Vorbereiten der Daten

In [None]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(ratings, test_size=0.2, random_state=7)

## Define input layers
The dataset contains a **numeric** and **categerical** features, they need to be treated differently.

* **numeric features** can be concatenated to inputs, with shape (None, num_of_numeric)
* **categorical features** can be encoded individually to inputs, with shape (None, 1) each.

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *

# Input class is an input layer for neural net, we set the dim here
def define_input_layers():
    # numerical features
    age_input = Input((1,), name = 'age')
    num_inputs = [age_input]

    # single level categorical features
    uid_input = Input((1,), name = 'input_uid') #user_id
    mid_input = Input((1,), name= 'input_mid')  #movie_id
    sex_input = Input((1,), name= 'input_sex')  #sex
    cat_sl_inputs = [uid_input, mid_input
                     ,sex_input
                     ]

    # multi level categorical features (with 3 genres at most)
    # has 3 possible values
    genre_input = Input((3,), name = 'input_genre')
    cat_ml_inputs = [genre_input]

    inputs = num_inputs + cat_sl_inputs + cat_ml_inputs
    return inputs

inputs = define_input_layers()
print(inputs)

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_uid')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_mid')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_sex')>, <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'input_genre')>]


## 1st order factorization machines
1st order will require features to map to a scalar. so for:

* numeric feature: a dense layer will convert tensor to shape (None,1)
* categorical feature: a embedding layer will convert tensor to shape (None,1,1) and then reshape layer to reshape to (None,1)

In [None]:
def Tensor_Mean_Pooling(name = 'mean_pooling', keepdims = False):
    return Lambda(lambda x: K.mean(x, axis = 1, keepdims=keepdims), name = name)

# Input dimension of the different vectors
def fm_1d(inputs, n_uid, n_mid, n_genre):

    age_input, uid_input, mid_input,sex_input, genre_input = inputs

    # all tensors are reshape to (None, 1)
    num_dense_1d = [Dense(1, name = 'num_dense_1d_fea4')(age_input)]

    #Embedding: kategoriele Variablen into small vector dim
    cat_sl_embed_1d = [Embedding(n_uid + 1, 1, name = 'cat_embed_1d_uid')(uid_input),
                        Embedding(n_mid + 1, 1, name = 'cat_embed_1d_mid')(mid_input),
                        Embedding(2, 1, mask_zero=True, name = 'cat_embed_1d_sex')(sex_input)
                       ]
    cat_ml_embed_1d = [Embedding(n_genre + 1, 1, mask_zero=True, name = 'cat_embed_1d_genre')(genre_input)]

    cat_sl_embed_1d = [Reshape((1,))(i) for i in cat_sl_embed_1d]

    cat_ml_embed_1d = [Tensor_Mean_Pooling(name = 'embed_1d_mean')(i) for i in cat_ml_embed_1d]

    # here cat_ml_embed_1d has one dimension

    # add all tensors
    # I think thats the linear model
    y_fm_1d = Add(name = 'fm_1d_output')(num_dense_1d + cat_sl_embed_1d + cat_ml_embed_1d)

    return y_fm_1d

y_1d = fm_1d(inputs, 10, 10, 10)
#print(y_1d)

## 2nd order factorization machines
In 2nd order FM, each feature is map to shape (None, 1, k) and then stack to concat_embed_2d layer with shape (None, p, k).
k - matrix factorization latent dimension, p is feature dimension.

the calculation of interaction terms can be simplified, using
\begin{equation*} \sum{x_ix_j} = \frac{1}{2} \left((\sum{x})^2 - \sum({x}^2)\right) \end{equation*}

Hence, the sum of 2nd order interactions = square of sum of concat_embed_2d - sum of squared concat_embed_2d in p dimension, the resulting tensor will have a shape (None, k)



In [None]:
def fm_2d(inputs, n_uid, n_mid, n_genre, k):

    age_input, uid_input, mid_input,sex_input, genre_input = inputs

    num_dense_2d = [Dense(k, name = 'num_dense_2d_age')(age_input)] # shape (None, k)
    num_dense_2d = [Reshape((1,k))(i) for i in num_dense_2d] # shape (None, 1, k)

    cat_sl_embed_2d = [Embedding(n_uid + 1, k, name = 'cat_embed_2d_uid')(uid_input),
                       Embedding(n_mid + 1, k, name = 'cat_embed_2d_mid')(mid_input),
                        Embedding(2, k, name = 'cat_embed_2d_sex')(sex_input),
                       ] # shape (None, 1, k)

    cat_ml_embed_2d = [Embedding(n_genre + 1, k, name = 'cat_embed_2d_genre')(genre_input)] # shape (None, 3, k)
    cat_ml_embed_2d = [Tensor_Mean_Pooling(name = 'cat_embed_2d_genure_mean', keepdims=True)(i) for i in cat_ml_embed_2d] # shape (None, 1, k)

    # concatenate all 2d embed layers => (None, ?, k)
    embed_2d = Concatenate(axis=1, name = 'concat_embed_2d')(num_dense_2d + cat_sl_embed_2d + cat_ml_embed_2d)

    # calcuate the interactions by simplication
    # sum of (x1*x2) = sum of (0.5*[(xi)^2 - (xi^2)])
    tensor_sum = Lambda(lambda x: K.sum(x, axis = 1), name = 'sum_of_tensors')
    tensor_square = Lambda(lambda x: K.square(x), name = 'square_of_tensors')

    sum_of_embed = tensor_sum(embed_2d)
    square_of_embed = tensor_square(embed_2d)

    square_of_sum = Multiply()([sum_of_embed, sum_of_embed])
    sum_of_square = tensor_sum(square_of_embed)

    sub = Subtract()([square_of_sum, sum_of_square])
    sub = Lambda(lambda x: x*0.5)(sub)
    y_fm_2d = Reshape((1,), name = 'fm_2d_output')(tensor_sum(sub))

    return y_fm_2d, embed_2d

y_fm2_d, embed_2d = fm_2d(inputs, 10, 10, 10, 5)

## Put together

In [None]:
def fm_model(n_uid, n_mid, n_genre, k, dnn_dr):

    inputs = define_input_layers()

    y_fm_1d = fm_1d(inputs, n_uid, n_mid, n_genre)
    y_fm_2d, embed_2d = fm_2d(inputs, n_uid, n_mid, n_genre, k)


    # combinded deep and fm parts
    y = Concatenate()([y_fm_1d, y_fm_2d])
    y = Dense(1, name = 'fm_output')(y)

    fm_model_1d = Model(inputs, y_fm_1d)
    fm_model_2d = Model(inputs, y_fm_2d)
    fm_model = Model(inputs, y)

    return fm_model_1d, fm_model_2d, fm_model

In [None]:
ratings.head()

Unnamed: 0,uid,movie_name,mid,rating,release_date,age,sex,movie_genre
0,2592,Top Gun (1986),1101,4,Action|Romance,50,1,"[3, 6, 0]"
1,4318,12 Angry Men (1957),1203,4,Drama,25,1,"[2, 0, 0]"
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,1,"[3, 8, 5]"
3,1706,Modern Times (1936),3462,5,Comedy,25,1,"[1, 0, 0]"
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,0,"[1, 6, 0]"


In [None]:
params = {
    'n_uid': ratings.uid.max(),
    'n_mid': ratings.mid.max(),
    'n_genre': 14,
    'k':20,
    'dnn_dr': 0.5
}

fm_model_1d, fm_model_2d, fm_model = fm_model(**params)

In [None]:
params

{'n_uid': 6040, 'n_mid': 3952, 'n_genre': 14, 'k': 20, 'dnn_dr': 0.5}

## Prepare Data

### Split Data

In [None]:
# Schreibt das df in einzelne Spalten

# age_input, uid_input, mid_input,sex_input, genre_input = inputs
def df2xy(ratings):
    x = [
        ratings.age.values,
         ratings.uid.values,
         ratings.mid.values,
        ratings.sex.values,
         np.concatenate(ratings.movie_genre.values).reshape(-1,3)]
    y = ratings.rating.values
    return x,y

train_x, train_y = df2xy(train)
valid_x, valid_y = df2xy(val)

train.head()

Unnamed: 0,uid,movie_name,mid,rating,release_date,age,sex,movie_genre
271288,2878,Dial M for Murder (1954),1086,5,Mystery|Thriller,50,0,"[14, 4, 0]"
784783,3825,"Sixth Sense, The (1999)",2762,4,Thriller,18,1,"[4, 0, 0]"
425303,4470,"Brady Bunch Movie, The (1995)",585,3,Comedy,35,1,"[1, 0, 0]"
381119,4722,Die Hard (1988),1036,4,Action|Thriller,35,1,"[3, 4, 0]"
385640,2206,"Few Good Men, A (1992)",2268,5,Crime|Drama,45,1,"[8, 2, 0]"


## Train Model

In [None]:
from tensorflow.keras.callbacks import  EarlyStopping, ModelCheckpoint
# train  model
fm_model.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(learning_rate=0.001)
)
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model_ckp = ModelCheckpoint(filepath='./models/deepfm_weights.h5',
                            monitor='val_loss',
                            save_weights_only=True,
                            save_best_only=True)
callbacks = [model_ckp,early_stop]
train_history = fm_model.fit(train_x, train_y,
                                  epochs=30, batch_size=2048,
                                  validation_data=(valid_x, valid_y),
                                  callbacks = callbacks)




Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import average_precision_score

predictions = fm_model.predict(valid_x)
rmse = np.sqrt(mean_squared_error(valid_y, predictions))
print("RMSE:", rmse)

# MAP berechnen
# Annahme: Da es sich um ein Rating-Problem handelt, konvertieren Sie die Vorhersagen in binäre Werte (1 für positiv, 0 für negativ)
# Sie können hierfür z.B. einen Schwellenwert festlegen
threshold = 3.5  # Beispiel-Schwellenwert
binary_predictions = (predictions >= threshold).astype(int)
map_score = average_precision_score(valid_y > threshold, binary_predictions)
print("MAP Score:", map_score)



RMSE: 0.8875326322122863
MAP Score: 0.7181446021115474


In [None]:
# Make the model prediction
x_t,y_t = df2xy(ratings)
predictions = fm_model.predict(x_t)
ratings['predictions']=predictions
print(len(ratings))
print(len(predictions))
print(max(predictions))
print(min(predictions))



800167
800167
[5.668758]
[-0.9779109]


In [None]:
def predict_top(ratings,user_id, at=25):
  seen_items = ratings[ratings.uid==user_id].mid.values
  unseen_items = set(ratings.mid.values) - set(seen_items)
  unseen_ratings = ratings[ratings.mid.isin(unseen_items)]

  return unseen_ratings.nlargest(25, 'predictions')['mid'].tolist()

print("IDs der 25 größten Werte:", predict_top(ratings,850))



IDs der 25 größten Werte: [296, 844, 2198, 296, 527, 527, 527, 527, 296, 527, 260, 318, 260, 527, 50, 260, 1213, 260, 1213, 296, 527, 913, 50, 913, 50]


In [None]:
import csv
basic_solution = pd.read_csv('/content/drive/My Drive/MasterBarca/kaggle/input/kaggle_baseline.csv')

with open('/content/drive/My Drive/MasterBarca/solution.csv', 'w',encoding='UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(['user_id', 'prediction'])
    for user_id in basic_solution.user_id.unique(): # loop throgh test : user, list of predictions
        relevant_items = predict_top(ratings, user_id, at=25)
        list_relevants = ' '.join([str(elem) for elem in relevant_items])
        writer.writerow([str(user_id),list_relevants])

In [None]:
from tqdm import tqdm
def precision(recommended_items, relevant_items):
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score

def recall(recommended_items, relevant_items):
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score

def AP(recommended_items, relevant_items):

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    ap_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return ap_score

## Divide the data in two sets: training and test
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

def evaluate_algorithm_top(ratings,test, at=25, thr_relevant = 0.85):

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0


    for user_id in tqdm(test.uid.unique()):

        relevant_items = test[test.uid==user_id]
        thr = np.quantile(relevant_items.rating,thr_relevant)
        relevant_items = np.array(relevant_items[relevant_items.rating >=thr].mid.values)
        if len(relevant_items)>0:

            recommended_items = predict_top(ratings,user_id, at=at)
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_AP += AP(recommended_items, relevant_items)
    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    MAP = cumulative_AP / num_eval

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, MAP))

In [None]:
evaluate_algorithm_top(ratings,val)

100%|██████████| 6028/6028 [22:53<00:00,  4.39it/s]

Recommender results are: Precision = 0.0936, Recall = 0.0952, MAP = 0.0825



