# Load datasets

In [1]:
import numpy as np
import pandas as pd

ratings_df = pd.read_csv('./ratings_small.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [2]:
ratings_df.drop(columns=['timestamp'], inplace=True)
ratings_df.shape

(100004, 3)

# Data Inspection/Cleaning

In [3]:
# Checking for missing values column wise
ratings_df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [4]:
# Checking for duplicate records
dup_bool = ratings_df.duplicated(['userId','movieId','rating'])
print("Number of duplicate records:",sum(dup_bool))

Number of duplicate records: 0


In [5]:
print("Number of users rated:",ratings_df.userId.nunique())
print("Number of movies being rated:",ratings_df.movieId.nunique())

Number of users rated: 671
Number of movies being rated: 9066


In [6]:
ratings_per_user = ratings_df.groupby(by='userId')['rating'].count()
ratings_per_user.describe()

count     671.000000
mean      149.037258
std       231.226948
min        20.000000
25%        37.000000
50%        71.000000
75%       161.000000
max      2391.000000
Name: rating, dtype: float64

* Minimum number of ratings given by a user = 20
* Maximum number of ratings given by a user = 2391
* average ratings per user = 149

# User Based Collaborative filtering

### Build user based collaborative filtering from scratch.

In [7]:
# make a pivot table for utility matrix (row = userId, col = movieId)
rating_pivot = ratings_df.pivot_table(index = ["userId"],columns = ["movieId"],values = "rating")
rating_pivot.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,3.0,,,,,,,,,3.0,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,4.0,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [8]:
ratings_matrix = np.array(rating_pivot.fillna(0))
ratings_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

Although we filled in missing ratings as 0, we should not assume these values to truly be zero. More appropriately, they are just empty entries. We will split our data into training and test sets by removing 10 ratings per user from the training set and placing them in the test set.

In [9]:
def split_matrix(ratings_matrix):
    test = np.zeros(ratings_matrix.shape)
    train = ratings_matrix.copy()
    for user in range(ratings_matrix.shape[0]):
        np.random.seed(20)
        test_ratings = np.random.choice(ratings_matrix[user, :].nonzero()[0], 
                                        size=10, 
                                        replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings_matrix[user, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

train, test = split_matrix(ratings_matrix)

Construct an user-similarity matrix

In [10]:
# cosine similarity
def cos_sim(ratings, epsilon=1e-9):
    # epsilon -> small number for handling divided-by-zero errors
    sim = ratings.dot(ratings.T) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

user_sim = cos_sim(train)
print(user_sim[:4, :4])

[[1.00000000e+00 4.90408239e-12 6.10798793e-12 2.52633360e-02]
 [4.90408239e-12 1.00000000e+00 7.71017917e-02 9.81236371e-02]
 [6.10798793e-12 7.71017917e-02 1.00000000e+00 5.03430344e-02]
 [2.52633360e-02 9.81236371e-02 5.03430344e-02 1.00000000e+00]]


Predict the ratings that were not included with the data. We can then compare the preditions with the test data for validation.

In [11]:
def predict_userCF(ratings, similarity):
    return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T

user_pred = predict_userCF(train, user_sim)

In [12]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def get_rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

print('RMSE of User-based CF: ' + str(get_rmse(user_pred, test)))

RMSE of User-based CF: 3.188979432041803


### Matrix Factorization on user based collaborative filtering

In [13]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split

reader = Reader()
ratings = Dataset.load_from_df(ratings_df, reader)

train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state = 42)
print("Size of trainset: ", train_ratings.n_ratings)
print("Size of testset: ", len(test_ratings))

Size of trainset:  80003
Size of testset:  20001


In [14]:
from surprise import SVD
svd = SVD()
svd.fit(train_ratings)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1162927f0>

In [15]:
from surprise import accuracy
train_pred_svd = svd.test(train_ratings.build_testset())
test_pred_svd = svd.test(test_ratings)
print("RMSE on training data (SVD): ", accuracy.rmse(train_pred_svd, verbose = False))
print("RMSE on test data (SVD): ", accuracy.rmse(test_pred_svd, verbose = False))

RMSE on training data (SVD):  0.6406217136435836
RMSE on test data (SVD):  0.9031086921432482


### Use `KNNBasic` in `surprise` scikit for user based collaborative filtering.

In [16]:
from surprise import KNNBasic
knn_model = KNNBasic(random_state = 20,verbose = False)
knn_model.fit(train_ratings)

<surprise.prediction_algorithms.knns.KNNBasic at 0x116292ac8>

In [17]:
from surprise import accuracy
train_pred_knn = knn_model.test(train_ratings.build_testset())
test_pred_knn = knn_model.test(test_ratings)
print("RMSE on training data (KNN): ", accuracy.rmse(train_pred_knn, verbose = False))
print("RMSE on test data (KNN): ", accuracy.rmse(test_pred_knn, verbose = False))

RMSE on training data (KNN):  0.7160931907176622
RMSE on test data (KNN):  0.9662515187787728
