In [1]:
# Uncomment the next line to install Surprise (distributed as the scikit-surprise package) if it is missing.
#!pip install scikit-surprise

Documentation for the Surprise package can be found at http://surprise.readthedocs.io/en/stable/index.html

In [2]:
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader
from surprise import KNNBasic
from surprise import SVD
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os

# Location of the '|'-separated ratings file. Set the RATINGS_DATA_PATH
# environment variable to point at your own copy of ratings.dat; when it is
# not set, the original author's local path is used, so existing runs behave
# exactly as before.
DEFAULT_RATINGS_PATH = "C:/Users/ohm/Downloads/Advanced Machine Learning Exercises/ratings.dat"
data_file_path = os.environ.get("RATINGS_DATA_PATH", DEFAULT_RATINGS_PATH)

# Echo the resolved path as the cell output so the reader can see which file is used.
data_file_path

'C:/Users/ohm/Downloads/Advanced Machine Learning Exercises/ratings.dat'

In [4]:
# We are loading a custom dataset, so we must define a Reader that tells
# Surprise how each line of the ratings file is laid out.
#
# The Reader class parses a file that holds exactly one rating per line, with
# the fields user, item, rating and an optional timestamp.
#
# In the data set used here every line is 'user|item|rating': the fields are
# separated by '|' characters, there is no timestamp column (it is optional),
# ratings lie on a 1-5 scale, and no header lines are skipped.
reader_object = Reader(
    line_format='user item rating',
    sep='|',
    rating_scale=(1, 5),
    skip_lines=0,
)

# Parse the ratings file into a Surprise Dataset using that layout.
data = Dataset.load_from_file(data_file_path, reader=reader_object)

In [5]:
import random

import numpy as np

# Fix the RNG seeds before the folds are created. The Surprise FAQ recommends
# seeding both Python's `random` and NumPy so that the shuffled folds and any
# randomly initialised models are the same on every Restart & Run All.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Split the data into five folds to perform cross-validation.
data.split(n_folds=5)

In [6]:
# First model: SVD (Singular Value Decomposition) with its default settings.
algo = SVD()

# Measure RMSE and MAE for the algorithm on every fold of the dataset.
# NOTE(review): newer Surprise releases replace `evaluate`/`print_perf` with
# `surprise.model_selection.cross_validate` — confirm the installed version.
perf = evaluate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
)

# Print the per-fold scores together with their means.
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.7964
MAE:  0.4840
------------
Fold 2
RMSE: 0.7857
MAE:  0.4771
------------
Fold 3
RMSE: 0.8653
MAE:  0.5220
------------
Fold 4
RMSE: 0.8157
MAE:  0.4989
------------
Fold 5
RMSE: 0.8025
MAE:  0.4775
------------
------------
Mean RMSE: 0.8131
Mean MAE : 0.4919
------------
------------
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.7964  0.7857  0.8653  0.8157  0.8025  0.8131  
MAE     0.4840  0.4771  0.5220  0.4989  0.4775  0.4919  


In [7]:
# Second model: KNNBasic, used here as user-based collaborative filtering.
# (The Surprise package offers a list of other algorithms that could be used instead.)
# Similarities are computed between users with the cosine measure.
similarity_params = dict(name='cosine', user_based=True)
algo1 = KNNBasic(sim_options=similarity_params)

# Measure RMSE and MAE for this algorithm on every fold of the dataset.
perf1 = evaluate(
    algo1,
    data,
    measures=['RMSE', 'MAE'],
)

# Print the per-fold scores together with their means.
print_perf(perf1)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9654
MAE:  0.6674
------------
Fold 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9144
MAE:  0.6416
------------
Fold 3
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9850
MAE:  0.6791
------------
Fold 4
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9654
MAE:  0.6644
------------
Fold 5
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9520
MAE:  0.6488
------------
------------
Mean RMSE: 0.9564
Mean MAE : 0.6603
------------
------------
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9654  0.9144  0.9850  0.9654  0.9520  0.9564  
MAE     0.6674  0.6416  0.6791  0.6644  0.6488  0.6603  


# Grid Search

In the context of machine learning, hyperparameters are parameters whose values are set before the learning process begins. By contrast, the values of other parameters are derived via training. Hyperparameter optimization, or model selection, is the problem of choosing a set of optimal hyperparameters for a learning algorithm so as to optimize a measure of the algorithm's performance on a data set.

The traditional way of performing hyperparameter optimization has been grid search, or a parameter sweep, which is simply an exhaustive searching through a manually specified subset of the hyperparameter space of a learning algorithm. A grid search algorithm must be guided by some performance metric, typically measured by cross-validation on the training set or evaluation on a held-out validation set.
Grid search suffers from the curse of dimensionality, since it has to evaluate every possible combination of the specified hyperparameters, so grid search must be used wisely.

In [8]:
from surprise import GridSearch

# Candidate KNNBasic hyperparameters: min_k from 1 to 5 and k from 35 to 40,
# i.e. 5 x 6 = 30 combinations to evaluate.
# NOTE(review): no sim_options are passed, so the similarity measure is the
# library default (the log output reports "msd"), not the cosine similarity
# used for algo1 — confirm this is the intended comparison.
param_grid = {
    'min_k': list(range(1, 6)),
    'k': list(range(35, 41)),
}

# Score every parameter combination on RMSE and MAE.
grid_search = GridSearch(KNNBasic, param_grid, measures=['RMSE', 'MAE'])
grid_search.evaluate(data)

Running grid search for the following parameter combinations:
{'min_k': 1, 'k': 35}
{'min_k': 1, 'k': 36}
{'min_k': 1, 'k': 37}
{'min_k': 1, 'k': 38}
{'min_k': 1, 'k': 39}
{'min_k': 1, 'k': 40}
{'min_k': 2, 'k': 35}
{'min_k': 2, 'k': 36}
{'min_k': 2, 'k': 37}
{'min_k': 2, 'k': 38}
{'min_k': 2, 'k': 39}
{'min_k': 2, 'k': 40}
{'min_k': 3, 'k': 35}
{'min_k': 3, 'k': 36}
{'min_k': 3, 'k': 37}
{'min_k': 3, 'k': 38}
{'min_k': 3, 'k': 39}
{'min_k': 3, 'k': 40}
{'min_k': 4, 'k': 35}
{'min_k': 4, 'k': 36}
{'min_k': 4, 'k': 37}
{'min_k': 4, 'k': 38}
{'min_k': 4, 'k': 39}
{'min_k': 4, 'k': 40}
{'min_k': 5, 'k': 35}
{'min_k': 5, 'k': 36}
{'min_k': 5, 'k': 37}
{'min_k': 5, 'k': 38}
{'min_k': 5, 'k': 39}
{'min_k': 5, 'k': 40}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing simil

Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd

In [9]:
# Summarise the grid search: for each measure, the best score reached and the
# parameter combination that produced it. Rows are printed in this order.
report = [
    # best RMSE score in the model
    ("Best RMSE Score is:{!r}", grid_search.best_score['RMSE']),
    # combination of parameters that gave the best RMSE score
    ("Parameter to achieve Best RMSE Score is:{!r}", grid_search.best_params['RMSE']),
    # best MAE score
    ("Best MAE Score is :{!r}", grid_search.best_score['MAE']),
    # combination of parameters that gave the best MAE score
    ("Parameter to achieve Best MAE Score is:{!r}", grid_search.best_params['MAE']),
]

for template, value in report:
    print(template.format(value))


Best RMSE Score is:0.9411718894835174
Parameter to achieve Best RMSE Score is:{'min_k': 3, 'k': 35}
Best MAE Score is :0.6494225741150481
Parameter to achieve Best MAE Score is:{'min_k': 3, 'k': 35}


In [10]:
# Even with the best KNNBasic parameters found by the grid search
# (RMSE ~0.941, MAE ~0.649), SVD still wins the race with a lower
# mean RMSE (~0.813) and mean MAE (~0.492).