# Model Based Collaborative Filtering

Model-based recommendation systems involve building a model based on the dataset of ratings. In other words, we extract some information from the dataset, and use that as a "model" to make recommendations without having to use the complete dataset every time.

In [1]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, KNNBasic, SVD, NMF
from surprise.model_selection import GridSearchCV, cross_validate
from surprise import accuracy
from math import sqrt
from sklearn.metrics import mean_squared_error
from surprise.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
movies_df = pd.read_csv(r'C:\Users\P Sai Deekshith\Recommender Systems\ml-100k\movies.csv')

ratings_df = pd.read_csv(r'C:\Users\P Sai Deekshith\Recommender Systems\ml-100k\ratings.csv')


In [3]:
reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df( ratings_df[['userId', 'movieId', 'rating']], reader = reader )

## K-Nearest Neighbours

In [4]:
sim_options = {'name' : 'msd'}

algo = KNNBasic(k=20, sim_options=sim_options )
cross_validate(algo=algo, data=data, measures=['RMSE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9382  0.9397  0.9408  0.9430  0.9406  0.9404  0.0016  
Fit time          0.54    0.42    0.52    0.45    0.50    0.49    0.05    
Test time         3.16    3.10    2.98    2.90    3.04    3.04    0.09    


{'test_rmse': array([0.93817595, 0.93967687, 0.94076974, 0.94296266, 0.94057133]),
 'fit_time': (0.5423369407653809,
  0.42111802101135254,
  0.5236999988555908,
  0.4490549564361572,
  0.5047211647033691),
 'test_time': (3.1559829711914062,
  3.1005027294158936,
  2.984384059906006,
  2.9035911560058594,
  3.0397303104400635)}

### Tuning the Parameters

In [5]:
n_neighbours = [10, 20, 30]
param_grid = {'n_neighbours' : n_neighbours}

gs = GridSearchCV(KNNBasic, measures=['RMSE'], param_grid=param_grid)
gs.fit(data)

def rmseKNN():
    print('\n\n###############')
    
    print('Best Score :', gs.best_score['rmse'])
    
    
    print('Best Parameters :', gs.best_params['rmse'])
    print('###############')
    
    return gs.best_score['rmse']

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [6]:
rmseKNN()



###############
Best Score : 0.9468788455294493
Best Parameters : {'n_neighbours': 10}
###############


0.9468788455294493

### SVD

In [7]:
algo = SVD()
cross_validate(algo=algo, data=data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8698  0.8787  0.8674  0.8748  0.8748  0.8731  0.0040  
Fit time          9.92    9.39    10.05   9.36    9.40    9.63    0.30    
Test time         0.48    0.40    0.40    0.25    0.36    0.38    0.07    


{'test_rmse': array([0.8698346 , 0.87874304, 0.86741597, 0.87480979, 0.87475223]),
 'fit_time': (9.921858310699463,
  9.394017696380615,
  10.048898696899414,
  9.364928007125854,
  9.39711332321167),
 'test_time': (0.48251891136169434,
  0.40403079986572266,
  0.4030294418334961,
  0.2531299591064453,
  0.3620266914367676)}

### Tuning the Parameters

In [8]:
param_grid = {'n_factors' : [50, 75], 'lr_all' : [0.5, 0.05], 'reg_all' : [0.06, 0.04]}

gs = GridSearchCV(algo_class=SVD, measures=['RMSE'], param_grid=param_grid)
gs.fit(data)


def rmseSVD():
    print('\n###############')
    print('Best Score :', gs.best_score['rmse'])

    print('Best Parameters :', gs.best_params['rmse'])
    print('###############')
    
    return gs.best_score['rmse']

In [9]:
rmseSVD()


###############
Best Score : 0.8630034332881612
Best Parameters : {'n_factors': 75, 'lr_all': 0.05, 'reg_all': 0.06}
###############


0.8630034332881612