# ML-Based Collaborative Filtering: Performance Evaluation
This notebook evaluates the performance of the SVD-based machine learning collaborative filtering model.

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../resype')
%load_ext autoreload
%autoreload 2 

## Prepare data

In [2]:
# Load the train and test transaction lists.
# Only the (user, item, rating) triplet is kept from each CSV, and the
# `userId` / `movieId` headers are renamed to `user_id` / `item_id`.
transaction_train = (
    pd.read_csv("../sample_data/train_set.csv")[['userId', 'movieId', 'rating']]
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
)
transaction_test = (
    pd.read_csv("../sample_data/test_set.csv")[['userId', 'movieId', 'rating']]
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
)

In [3]:
# Sanity check: (rows, columns) of the train and test transaction lists.
transaction_train.shape, transaction_test.shape

((3499, 3), (1501, 3))

In [4]:
# Preview the first rows of the training transactions (user_id, item_id, rating).
transaction_train.head()

Unnamed: 0,user_id,item_id,rating
0,1,2090,5.0
1,1,216,5.0
2,1,1967,4.0
3,1,1029,5.0
4,1,2253,2.0


In [5]:
# Preview the first rows of the held-out test transactions (user_id, item_id, rating).
transaction_test.head()

Unnamed: 0,user_id,item_id,rating
0,1,101,5.0
1,1,163,5.0
2,1,231,5.0
3,1,296,3.0
4,1,356,4.0


In [6]:
from collab_filtering import CollabFilteringModel

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [8]:
%%time

# Mask the held-out ratings: the test (user, item) pairs are still handed to
# the model, but with their rating replaced by NaN.
df_test_fin_nans = transaction_test.assign(rating=np.nan)

# Build the collaborative-filtering object from train rows + masked test rows.
# NOTE: the name `re` shadows the standard-library `re` module; it is kept
# because the evaluation cell further down refers to this object by that name.
re = CollabFilteringModel(pd.concat([transaction_train, df_test_fin_nans]))
utility_matrix = re.construct_utility_matrix()
print(utility_matrix.shape)

# Fixed random_state so the regressor is reproducible across runs.
rs_model = RandomForestRegressor(random_state=202109)

# With return_models=True the result is unpacked later into
# (imputed utility matrix, trained model).
# `d=20` is presumably the number of SVD components -- TODO confirm against
# CollabFilteringModel.train_model_svd.
outputs = re.train_model_svd(re.utility_matrix, rs_model, d=20, return_models=True)


(32, 2427)
Done training 100 out of 2427
Done training 200 out of 2427
Done training 300 out of 2427
Done training 400 out of 2427
Done training 500 out of 2427
Done training 600 out of 2427
Done training 700 out of 2427
Done training 800 out of 2427
Done training 900 out of 2427
Done training 1000 out of 2427
Done training 1100 out of 2427
Done training 1200 out of 2427
Done training 1300 out of 2427
Done training 1400 out of 2427
Done training 1500 out of 2427
Done training 1600 out of 2427
Done training 1700 out of 2427
Done training 1800 out of 2427
Done training 1900 out of 2427
Done training 2000 out of 2427
Done training 2100 out of 2427
Done training 2200 out of 2427
Done training 2300 out of 2427
Done training 2400 out of 2427
Done training 2427 out of 2427
CPU times: user 1h 42min 23s, sys: 7min 8s, total: 1h 49min 31s
Wall time: 6min 50s


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# `outputs` is the result of `re.train_model_svd(..., return_models=True)`;
# it unpacks into the imputed utility matrix and the trained model.
utility_matrix_imputed, trained_model = outputs
# presumably fills any entries that are still missing with a mean value
# -- TODO confirm against CollabFilteringModel.mean_filled_utilmat.
utility_matrix_imputed = re.mean_filled_utilmat(utility_matrix_imputed)

# Reshape the matrix into long format: one row per (row label, column label)
# pair. `stack().reset_index()` leaves the values in a column labelled `0`.
imputed_transactions = utility_matrix_imputed.stack().reset_index()

# The first two columns of the long table are the join keys.
# Assumes they are named like the key columns of `transaction_test`
# (user_id, item_id) -- TODO confirm against construct_utility_matrix.
key_columns = imputed_transactions.columns.tolist()[:2]

test_data = transaction_test.copy()
merged_with_gaps = test_data.merge(
    imputed_transactions,
    on=key_columns,
    how='left')
merged_test_data = merged_with_gaps.dropna()

# A left join followed by dropna() silently discards test rows that have no
# matching prediction. Report how many were excluded so the reader knows how
# much of the test set the metrics below actually cover.
n_excluded = len(merged_with_gaps) - len(merged_test_data)
if n_excluded:
    print(f'Warning: {n_excluded} of {len(merged_with_gaps)} test rows have '
          f'missing values after the merge and are excluded from the metrics')

# Column `rating` holds the true test ratings; column `0` holds the predictions.
mse = mean_squared_error(merged_test_data.rating, merged_test_data[0].values)
mae = mean_absolute_error(merged_test_data.rating, merged_test_data[0].values)

In [13]:
# Report the test-set error metrics (mean squared error and mean absolute error)
# computed in the evaluation cell.
print(f'MSE:{mse}, MAE:{mae}')

MSE:1.3433272948932116, MAE:0.9071479727188221
