# Factorization Machine example

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from reco.datasets import loadMovieLens100k
from reco.recommender import FM

### At first we'll test only with the bare minimum *userId*, *itemId* and *rating* columns.

In [3]:
train, test, _, _ = loadMovieLens100k(train_test_split=True)
print(train.head())

   userId  itemId  rating
0       1       1       5
1       1       2       3
2       1       3       4
3       1       4       3
4       1       5       3


So we have the user ids, item ids and the respective ratings in the 3 columns. Next we need to separate the rating column since we are going to predict that. Also we need to explicitly set the column data type to string for *userId* and *itemId* so that the model treats them as categorical variables, not integers. We'll do this for both the train and test sets.

In [4]:
y_train = train['rating']
train.drop(['rating'], axis=1, inplace=True)

train['userId'] = train['userId'].astype('str')
train['itemId'] = train['itemId'].astype('str')

y_test = test['rating']
test.drop(['rating'], axis=1, inplace=True)

test['userId'] = test['userId'].astype('str')
test['itemId'] = test['itemId'].astype('str')

Next we'll train the model. We choose 60 iterations here. Tweak the hyperparameters to get the best performance.

In [5]:
f = FM(k=10, iterations = 60, learning_rate = 0.003, regularizer=0.005)
f.fit(X=train, y=y_train)

epoch 0 time 1.3084863953578691 mse 1.103937278612045
epoch 1 time 1.3081417801617612 mse 0.9918084223513931
epoch 2 time 1.350830668491486 mse 0.9441643862460812
epoch 3 time 1.2941405570513194 mse 0.918135365419286
epoch 4 time 1.4166798860326555 mse 0.9012910466433169
epoch 5 time 1.263930385542353 mse 0.8891798658571386
epoch 6 time 1.3548125238632345 mse 0.8798711358070531
epoch 7 time 1.2486451517382644 mse 0.8723796737482616
epoch 8 time 1.334478039259876 mse 0.8661391582826142
epoch 9 time 1.3298616543206911 mse 0.8607936833117162
epoch 10 time 1.3456986371107167 mse 0.8560972256015189
epoch 11 time 1.3040315601878643 mse 0.8518732029213533
epoch 12 time 1.2945989499629675 mse 0.8479907528352713
epoch 13 time 1.2999260809521154 mse 0.8443463899115101
epoch 14 time 1.5280591527425713 mse 0.8408569059317575
epoch 15 time 1.2989221384813376 mse 0.8374490178145726
epoch 16 time 1.3151708361458674 mse 0.8340607647692536
epoch 17 time 1.3207630837991111 mse 0.8306367098314906
epoch 1

In [7]:
y_pred = f.predict(test)
print("MSE: {}".format(mean_squared_error(y_test, y_pred)))

MSE: 0.9257072902506759


### Now we'll try with all the columns and train our model on the whole dataset.

In [8]:
train, test, _, _ = loadMovieLens100k(train_test_split=True, all_columns=True)
print(train.head())

  userId itemId  rating  age gender  occupation  5  6  7  8 ...  14  15  16  \
0      1      1     5.0   24      M  technician  0  0  0  1 ...   0   0   0   
1      2      1     4.0   53      F       other  0  0  0  1 ...   0   0   0   
2      6      1     4.0   42      M   executive  0  0  0  1 ...   0   0   0   
3     10      1     4.0   53      M      lawyer  0  0  0  1 ...   0   0   0   
4     13      1     3.0   47      M    educator  0  0  0  1 ...   0   0   0   

   17  18  19  20  21  22  23  
0   0   0   0   0   0   0   0  
1   0   0   0   0   0   0   0  
2   0   0   0   0   0   0   0  
3   0   0   0   0   0   0   0  
4   0   0   0   0   0   0   0  

[5 rows x 25 columns]


This time, we also need to change the data type of the columns *gender* and *occupation* to string so that they are treated as categorical variables and hence one-hot encoded.

In [9]:
y_train = train['rating']
train.drop(['rating'], axis=1, inplace=True)
train['userId'] = train['userId'].astype('str')
train['itemId'] = train['itemId'].astype('str')
train['gender'] = train['gender'].astype('str')
train['occupation'] = train['occupation'].astype('str')


y_test = test['rating']
test.drop(['rating'], axis=1, inplace=True)
test['userId'] = test['userId'].astype('str')
test['itemId'] = test['itemId'].astype('str')
test['gender'] = test['gender'].astype('str')
test['occupation'] = test['occupation'].astype('str')

Before training, we also need to normalize the age column since the values are greatly different from the other columns and hence will hamper the performance of the model. We choose min-max normaliztion here.

In [11]:
train['age'] = (train['age']-train['age'].min())/(train['age'].max()-train['age'].min())
test['age'] = (test['age']-test['age'].min())/(test['age'].max()-test['age'].min())

In [17]:
f = FM(k=10, iterations = 60, learning_rate = 0.003, regularizer=0.005)
f.fit(X=train, y=y_train)

epoch 0 time 7.168705366406357 mse 0.9966038516961236
epoch 1 time 7.158128050254618 mse 0.9222644952229134
epoch 2 time 7.203841164850473 mse 0.8932051216741685
epoch 3 time 7.34335999451514 mse 0.8760120494763252
epoch 4 time 7.146919851257735 mse 0.8635113788610401
epoch 5 time 7.183824309702686 mse 0.8532038650239079
epoch 6 time 7.159351890041307 mse 0.8440652823004936
epoch 7 time 7.132119266761038 mse 0.8356230268739092
epoch 8 time 7.165921459096808 mse 0.8276217140381995
epoch 9 time 7.163955876126693 mse 0.8199056253557706
epoch 10 time 7.1612620428422815 mse 0.8123744204657366
epoch 11 time 7.17143894895662 mse 0.8049685822245547
epoch 12 time 7.152979973298443 mse 0.797652948928141
epoch 13 time 7.142680537337128 mse 0.7904183100351516
epoch 14 time 7.199183571956382 mse 0.7832648678385027
epoch 15 time 7.281502478494531 mse 0.7762022575076472
epoch 16 time 7.312662256321346 mse 0.7692416144569367
epoch 17 time 7.2228478781003105 mse 0.7623965802119932
epoch 18 time 7.25717

In [18]:
y_pred = f.predict(test)
print("MSE: {}".format(mean_squared_error(y_test, y_pred)))

MSE: 1.0619317734747407
