In [None]:
# One-time setup: uncomment to install the Surprise library into this kernel's environment.
# %pip install surprise

In [1]:
import pandas as pd
import numpy as np

In [88]:
from surprise import KNNWithMeans, KNNBaseline, KNNWithZScore
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

In [3]:
# Read the movie metadata and the user ratings tables from the data directory.
movies, ratings = map(pd.read_csv, ('../data/movies.csv', '../data/ratings.csv'))

In [4]:
# Join every rating with its movie's metadata (merge defaults to an inner join
# on movieId, so movies without ratings are left out). Rows with missing
# values are dropped *before* the index is rebuilt, so the result always has a
# gap-free RangeIndex, and no frame is mutated in place.
movies_with_ratings = (
    movies
    .merge(ratings, on='movieId')
    .dropna()
    .reset_index(drop=True)
)
movies_with_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100836 non-null  int64  
 1   title      100836 non-null  object 
 2   genres     100836 non-null  object 
 3   userId     100836 non-null  int64  
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 4.6+ MB


In [5]:
# Three-column ratings table in (user id, item id, rating) order, which is the
# column order Dataset.load_from_df expects.
# NOTE(review): the movie title is used as the item id; if two different
# movieIds share a title their ratings are merged into one item - confirm
# this is intended.
dataset = pd.DataFrame({
    target: movies_with_ratings[source]
    for target, source in (('uid', 'userId'), ('iid', 'title'), ('rating', 'rating'))
})

In [6]:
# Baseline column dtypes and memory footprint, before any dtype downcasting.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   uid     100836 non-null  int64  
 1   iid     100836 non-null  object 
 2   rating  100836 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.3+ MB


In [7]:
# Number of distinct item ids (movie titles) present in the ratings table.
dataset['iid'].nunique()

9719

In [8]:
# Downcast the numeric columns to shrink the frame's memory footprint.
# A narrowing astype() is not range-checked, so out-of-range ids could wrap
# around and ratings could be rounded without any error. Verify the casts are
# lossless before applying them.
assert dataset['uid'].between(np.iinfo(np.int16).min, np.iinfo(np.int16).max).all(), \
    'uid values do not fit in int16'
assert dataset['rating'].astype(np.float16).eq(dataset['rating']).all(), \
    'rating values are not exactly representable in float16'

dataset['uid'] = dataset['uid'].astype(np.int16)
dataset['rating'] = dataset['rating'].astype(np.float16)

In [9]:
# Re-check dtypes and memory usage after downcasting uid and rating.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   uid     100836 non-null  int16  
 1   iid     100836 non-null  object 
 2   rating  100836 non-null  float16
dtypes: float16(1), int16(1), object(1)
memory usage: 1.2+ MB


In [10]:
# Titles repeat once per rating, so store them as a categorical to save memory.
dataset['iid'] = dataset['iid'].astype('category')

In [11]:
# Re-check dtypes and memory usage after converting iid to a categorical.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   uid     100836 non-null  int16   
 1   iid     100836 non-null  category
 2   rating  100836 non-null  float16 
dtypes: category(1), float16(1), int16(1)
memory usage: 924.9 KB


In [12]:
# Observed rating bounds as a (min, max) pair; used to choose the Reader's rating_scale.
ratings['rating'].min(), ratings['rating'].max()

(0.5, 5.0)

In [13]:
# The rating scale matches the observed min/max of the ratings (0.5 to 5.0).
reader = Reader(rating_scale=(0.5, 5.0))

# Pass the columns explicitly in (user, item, rating) order when handing the
# frame to Surprise.
data = Dataset.load_from_df(dataset[['uid', 'iid', 'rating']], reader)

In [14]:
# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split
# the same across runs.
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=1,
)

## User Based

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [92]:
# The cell that fitted the user-based model is missing from the notebook, so on
# a fresh kernel `model` is undefined here (and when cells are run out of order
# it is whichever model was fitted last). Fit it explicitly before evaluating.
# NOTE(review): the captured log for the missing cell shows ALS baseline
# estimation followed by a cosine similarity matrix, which points to
# KNNBaseline with 'cosine' similarity. The original k / min_k were not
# preserved and are left at the library defaults - confirm against the
# original experiment, since the reported RMSE may differ.
model = KNNBaseline(sim_options={
    'name': 'cosine',
    'user_based': True
}).fit(trainset)

# Score the held-out ratings and report the root-mean-squared error.
test_pred = model.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8778


0.8777886132535606

In [20]:
from surprise.model_selection import RandomizedSearchCV

In [62]:
# Search space for the user-based KNN hyper-parameter search: neighbourhood
# size (k), minimum number of neighbours (min_k) and the similarity measure.
params = dict(
    k=[10, 20, 30, 40, 50, 100],
    min_k=[1, 5, 10, 15, 20],
    sim_options=dict(
        name=['cosine', 'msd', 'pearson', 'pearson_baseline'],
        user_based=[True],
    ),
)

In [93]:
# Randomized hyper-parameter search for KNNBaseline over `params`, scored by
# RMSE with 5-fold cross-validation and a fixed seed. The search is run on the
# full `data` object rather than on the training split.
RS = RandomizedSearchCV(
    KNNBaseline,
    params,
    measures=['rmse'],
    cv=5,
    random_state=1,
)
RS.fit(data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
C

In [94]:
# Best cross-validated RMSE found by the randomized search (lower is better).
RS.best_score["rmse"]

0.859715516674467

In [95]:
# Hyper-parameter combination that achieved the best RMSE in the search.
RS.best_params["rmse"]

{'k': 20,
 'min_k': 20,
 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}

## Item Based

In [89]:
# Item-based KNN with baseline ratings: similarities are computed between
# items (user_based=False) using the pearson_baseline measure.
model = KNNBaseline(
    k=30,
    min_k=5,
    sim_options=dict(name='pearson_baseline', user_based=False),
).fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [90]:
# Predict the held-out ratings with the item-based model and report its RMSE.
test_pred = model.test(testset)
accuracy.rmse(predictions=test_pred, verbose=True)

RMSE: 0.8501


0.8501338230474232