This is an example of how fastFM can be used to solve regression problems such as rating prediction.

# Imports and installations

In [23]:
pip install fastFM==0.2.9



In [24]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import OneHotEncoder
from fastFM.mcmc import FMRegression
from fastFM.datasets import make_user_item_regression
from sklearn.model_selection import train_test_split
from fastFM import als,sgd
from sklearn.metrics import mean_squared_error,roc_auc_score


# Regression with ALS

In [25]:
# This sets up a small synthetic user/item regression dataset.
X, y, _ = make_user_item_regression(label_stdev=.4)
# Seed the split so the RMSE reported for this cell is reproducible on a
# fresh kernel (an unseeded split gives a different test set on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Rank-2 factorization machine fitted with alternating least squares;
# l2_reg_w / l2_reg_V are passed as the regularisation settings.
fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [26]:
# Root-mean-squared error on the held-out split: the square root of the MSE.
# The cell displays the (label, value) tuple as its output.
'RMSE:', mean_squared_error(y_test, y_pred)**0.5

('RMSE:', 0.5259048855045719)

# Factorization machines - MovieLens

In [27]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

--2021-03-21 08:27:46--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2021-03-21 08:27:46 (16.4 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [28]:
!unzip ml-100k.zip

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [29]:
def load_movielens_100k(all_features=False, folder='ml-100k'):
    '''Load the MovieLens 100k ratings and split them into train/test sets.

    Standard test dataset for recommendation systems.
    From http://grouplens.org/datasets/movielens/

    Parameters
    ----------
    all_features : bool, default False
        If False, only the ``user`` and ``movie`` columns are used as
        features. If True, the user attributes from ``u.user`` (age, gender,
        occupation, zip) and the movie attributes from ``u.item`` (release
        year and the genre flags) are merged onto every rating.
    folder : str, default 'ml-100k'
        Directory containing the extracted ``u.data``, ``u.user`` and
        ``u.item`` files.

    Returns
    -------
    trainX, testX : pandas.DataFrame
        Feature frames in which every column holds integer codes: the rank of
        the value among the column's sorted unique values. Missing values
        share one extra code placed after all known values.
    trainY, testY : numpy.ndarray
        The ratings matching the rows of ``trainX`` / ``testX``.
    '''
    ratings = pd.read_csv(folder + '/u.data', sep='\t',
                          names=['user', 'movie', 'rating', 'timestamp'], header=None)
    ratings = ratings.drop('timestamp', axis=1)
    if all_features:
        users = pd.read_csv(folder + '/u.user', sep='|',
                            names=['user', 'age', 'gender', 'occupation', 'zip'],
                            header=None, encoding='latin-1')
        movies = pd.read_csv(folder + '/u.item', sep='|',
                             names=['movie', 'title', 'released', 'video_release', 'IMDb URL',
                                    'unknown', 'Action', 'Adventure', 'Animation',
                                    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
                                    'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                                    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
                             header=None, encoding='latin-1')

        movies = movies.drop(['title', 'IMDb URL', 'video_release'], axis=1)
        # Keep only the release year. A missing release date becomes NaN here
        # and is given its own code by the encoding loop below.
        movies['released'] = pd.to_datetime(movies['released']).dt.year
        ratings = pd.merge(pd.merge(ratings, users, on='user'), movies, on='movie')

    answers = ratings['rating'].values
    ratings = ratings.drop('rating', axis=1)

    # Replace every column by integer category codes. factorize(sort=True)
    # numbers the known values in sorted order and marks missing values with
    # -1; those are remapped to a single code after the known values. This
    # avoids np.unique's NumPy-version-dependent NaN handling (older releases
    # give every NaN a separate code).
    for feature in ratings.columns:
        codes, uniques = pd.factorize(ratings[feature], sort=True)
        ratings[feature] = np.where(codes < 0, len(uniques), codes)

    trainX, testX, trainY, testY = train_test_split(ratings, answers, train_size=0.75, random_state=42)
    return trainX, testX, trainY, testY

In [30]:
# Load MovieLens with all_features=True, i.e. the user and movie side
# information is merged in alongside the user/movie ids.
trainX, testX, trainY, testY = load_movielens_100k(True)

In [31]:
# Display the integer-encoded training features.
trainX

Unnamed: 0,user,movie,age,gender,occupation,zip,released,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
98980,692,1310,33,0,7,615,68,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
69824,931,528,48,1,3,59,57,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9928,216,553,12,1,13,110,67,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
75599,798,498,39,0,0,166,30,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
95621,910,547,27,0,20,397,68,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,658,268,21,1,3,406,69,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
54886,579,342,6,1,18,152,69,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
76820,681,46,13,1,14,425,66,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
860,720,305,14,0,5,107,69,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [32]:
def fitpredict_fastfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    """One-hot encode the features, fit an FM regressor and predict the test rows.

    Parameters
    ----------
    trainX, testX
        Categorical feature tables; the encoder is fitted on ``trainX`` only
        and categories unseen during fitting are ignored when transforming.
    trainY
        Training targets.
    classification
        Accepted for call compatibility but not used: a regression model is
        always fitted.
    rank, n_iter
        Forwarded unchanged to ``FMRegression``.

    Returns
    -------
    Whatever ``FMRegression.fit_predict`` returns for the encoded test rows.
    """
    one_hot = OneHotEncoder(handle_unknown='ignore')
    one_hot.fit(trainX)
    train_encoded = one_hot.transform(trainX)
    test_encoded = one_hot.transform(testX)

    model = FMRegression(rank=rank, n_iter=n_iter)
    predictions = model.fit_predict(train_encoded, trainY, test_encoded)
    return predictions

In [33]:
def test_on_dataset(trainX, testX, trainY, testY):
    """Fit/predict with ``fitpredict_fastfm`` and print wall time and test RMSE.

    The elapsed time covers the whole ``fitpredict_fastfm`` call. Nothing is
    returned; both figures are printed.
    """
    started_at = time.time()
    predicted = fitpredict_fastfm(trainX, trainY, testX)
    elapsed = time.time() - started_at
    print('time',elapsed)

    # Root of the mean squared difference between true and predicted ratings.
    squared_errors = (testY - predicted) ** 2
    rmse = np.mean(squared_errors) ** 0.5
    print('RMSE',rmse)

In [34]:
# Run the evaluation on the MovieLens split; prints elapsed time and RMSE.
test_on_dataset(trainX, testX, trainY, testY)

time 42.8170166015625
RMSE 0.8965433945577149
