In [1]:
!pip install surprise



In [2]:
import pandas as pd

from surprise import Dataset, KNNBasic, Reader
from surprise.model_selection import cross_validate

In [19]:
rating_df = pd.read_csv( "https://raw.githubusercontent.com/manaranjanp/ISB_MLUL2/main/cf/u.data"
                        , delimiter = "\t"
                        , header = None )

In [20]:
rating_df.columns = ["userID", "itemID", "rating", "timestamp"]

In [22]:
rating_df.head(10)

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [21]:
df = pd.DataFrame(rating_df)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

In [6]:
# Example using cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # compute  similarities between items
}
algo = KNNBasic(k = 50, sim_options=sim_options)

In [7]:
cv_results = cross_validate(algo, data, cv = 5, verbose=False)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [8]:
cv_results.keys()

dict_keys(['test_rmse', 'test_mae', 'fit_time', 'test_time'])

In [9]:
cv_results['test_rmse']

array([1.02460868, 1.01468733, 1.03163741, 1.02327947, 1.02585594])

In [10]:
cv_results['test_mae']

array([0.80932095, 0.80286221, 0.81635369, 0.81023554, 0.81044997])

In [11]:
cv_results['test_rmse'].mean()

1.0240137640870237

In [12]:
cv_results['test_rmse'].std()

0.005466060700452175

In [13]:
from surprise.model_selection import GridSearchCV

In [14]:
param_grid = {'k': [40, 50],
              'sim_options': {'name': ['cosine', 'pearson'],
                              'user_based': [True, False]}
              }

In [15]:
grid_cv = GridSearchCV(KNNBasic,
                       param_grid,
                       measures=['rmse'],
                       cv=5,
                       refit=True)

In [16]:
grid_cv.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing

In [17]:
# best RMSE score
print(grid_cv.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(grid_cv.best_params['rmse'])

1.0112764214686758
{'k': 50, 'sim_options': {'name': 'pearson', 'user_based': True}}


In [18]:
results_df = pd.DataFrame.from_dict(grid_cv.cv_results)
results_df[['param_k', 'param_sim_options', 'mean_test_rmse', 'rank_test_rmse']]

Unnamed: 0,param_k,param_sim_options,mean_test_rmse,rank_test_rmse
0,40,"{'name': 'cosine', 'user_based': True}",1.017058,4
1,40,"{'name': 'cosine', 'user_based': False}",1.026828,6
2,40,"{'name': 'pearson', 'user_based': True}",1.012075,2
3,40,"{'name': 'pearson', 'user_based': False}",1.041225,8
4,50,"{'name': 'cosine', 'user_based': True}",1.016337,3
5,50,"{'name': 'cosine', 'user_based': False}",1.024665,5
6,50,"{'name': 'pearson', 'user_based': True}",1.011276,1
7,50,"{'name': 'pearson', 'user_based': False}",1.036626,7
