In [0]:
# Import Libraries
import pandas as pd 
import numpy as np 
import surprise
from surprise.accuracy import rmse
from matplotlib import pyplot as plt

In [0]:
# Load Dataset
movies_df = pd.read_csv('./movie_ratings.csv')
movies_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [0]:
# Preprocessing

# Remove timestamp column
movies_df = movies_df.drop('timestamp', axis=1)
movies_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [0]:
# Dataset Description
num_users = movies_df.userId.unique().shape[0]
num_movies = movies_df.movieId.unique().shape[0]
num_ratings = movies_df.shape[0]

print('Number of Users: {}'.format(num_users))
print('Number of Movies: {}'.format(num_movies))
print('Number of Ratings: {}'.format(num_ratings))

Number of Users: 610
Number of Movies: 9724
Number of Ratings: 100836


In [0]:
# Range of Movie Ratings
lower_rating = movies_df['rating'].min()
upper_rating = movies_df['rating'].max()
print('Rating ranges from {} to {}'.format(lower_rating, upper_rating))

Rating ranges from 0.5 to 5.0


In [0]:
# Load dataset into surprise
reader = surprise.Reader(rating_scale=(0.5, 5.0))
data = surprise.Dataset.load_from_df(movies_df, reader)

In [0]:
from surprise.model_selection import train_test_split

# Train-Test Split ratio
split = 0.2

train, test = train_test_split(data, test_size=split)

## 1. User User filtering

In [0]:
# User-User

def get_user_user_predictions(train, test, k):
    
    # K Nearest Neighbours
    K = k # Ranges from 1 to NUM_USERS

    # Configuration for similarity measure
    # PARAM: name takes values {'cosine', 'pearson'}
    # PARAM: user_based takes values {'True', 'False'}  If set to false then similarity is taken between items.
    options = {
        'name': 'cosine',
        'user_based': True
    }

    # Find user-user similarities
    obj = surprise.prediction_algorithms.KNNWithMeans(k=K, sim_options=options)
    output = obj.fit(train)

    # Predict on test dataset
    uu_predictions = obj.test(test)
    rmse(uu_predictions)
    return uu_predictions

In [0]:
uu_predictions = get_user_user_predictions(train, test, 10)
print(uu_predictions)

# in the result, Prediction(uid=x, iid=y) means item y is recommended for user x 

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9044
[Prediction(uid=241, iid=296, r_ui=4.5, est=3.9548671984253585, details={'was_impossible': False, 'actual_k': 10}), Prediction(uid=474, iid=7090, r_ui=4.0, est=3.620950441756016, details={'was_impossible': False, 'actual_k': 10}), Prediction(uid=51, iid=3681, r_ui=4.0, est=4.212922702501518, details={'was_impossible': False, 'actual_k': 10}), Prediction(uid=95, iid=6288, r_ui=4.0, est=4.487888144141617, details={'was_impossible': False, 'actual_k': 2}), Prediction(uid=66, iid=3252, r_ui=4.5, est=4.3019467787424714, details={'was_impossible': False, 'actual_k': 10}), Prediction(uid=260, iid=7361, r_ui=3.5, est=4.278190071940799, details={'was_impossible': False, 'actual_k': 10}), Prediction(uid=424, iid=1092, r_ui=4.0, est=2.9713226717799146, details={'was_impossible': False, 'actual_k': 10}), Prediction(uid=592, iid=589, r_ui=5.0, est=3.80781462840959, details={'was_impossible': False, 'actual_k': 

## 2. SVD Decomposition

In [0]:
algo = surprise.SVD()
algo.fit(train)
predictions = algo.test(test)

In [0]:
rmse(predictions)

RMSE: 0.8655


0.8654560359451237