In [12]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
import scipy.sparse.linalg as sa
from sklearn.model_selection import train_test_split

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import util

def load_data(size='small', test_size=0.3):
    ratings = util.read_files(size=size)[0]
    movie_counts = ratings.groupby('movieId').count()
    good_movies = set(movie_counts[movie_counts['userId'] > 20].index)
    # print(len(good_movies))
    ratings = ratings[ratings.movieId.isin(good_movies)]
    indices = np.unique(ratings['movieId'].values, return_inverse=True)[1]
    ratings['movieId'] = indices.astype('int32')
    return train_test_split(ratings, test_size=test_size)

train, test = load_data(size='small', test_size=0.1)

# Item-item collaborative filtering
First precompute similarity matrix between all items
using cosine distance between rows of utility (`item x user`) matrix

Then predict unknown score for given `user` and `movie` by taking
weighted average of all ratings of this user

In [9]:
def calculate_item_item_similarity(ratings):
    """
    Calculate matrix of cosine distances between movies
    treated as vectors of ratings 
    """
    # Calculate the dot-products of ratings for all movies
    # Use sparse matrix for calculation to accomodate large data-set (~260k users, ~53k movies)
    # Storing results as dense matrix, allows us to use in-place operations
    # and provides faster indexing
    ii_similarity = coo_matrix(
        (ratings['rating'], (ratings['movieId'], ratings['userId']))
        ).tocsr()
    ii_similarity = ii_similarity.dot(ii_similarity.transpose()).toarray()
    
    d = 1 / np.sqrt(ii_similarity.diagonal())
    ii_similarity *= d.reshape((-1, 1))
    ii_similarity *= d.reshape((1, -1))
    return ii_similarity

In [5]:
ii_similarity = calculate_item_item_similarity(train)

def predict(user, movie, ratings=train):
    """
    Predict rating of movie by user by calculating weighted average
    of ratings given by this user to other movies
    """
    other = ratings[ratings['userId'] == user]
    if other.empty:
        return 0
    similarities = ii_similarity[movie, other['movieId']]
    tmp = similarities * other['rating']
    tmp = np.sum(tmp) / similarities.sum()
    if np.isnan(tmp):
        print('NAN')
    return tmp

Asses rating prediction by calculating RMSE (Root Mean Square Error)

In [8]:
s = 0
num_samples = min(len(test.index), 10000)
for r in test.head(num_samples).itertuples():
    p = predict(r.userId, r.movieId)
    s += np.float64((p - r.rating) ** 2)
s /= num_samples
s = np.sqrt(s)
s

8570.820938778888


0.9257872832772596

# Singular Value Decomposition
Another approach to predicting recommendations is to factorize the utility
matrix using SVD. This allows us to extract latent features (either for users or movies)
and reduce the dimensionality of the problem

In [34]:
utility = coo_matrix(
        (train['rating'], (train['movieId'], train['userId']))
        ).tocsr()
u, s, vt = sa.svds(utility, k=100)
u = u @ np.diag(s)

def predict(user, movie):
    return u[movie, :] @ vt[:, user]

In [35]:
s = 0
num_samples = min(len(test.index), 10000)
for r in test.head(num_samples).itertuples():
    p = predict(r.userId, r.movieId)
    s += np.float64((p - r.rating) ** 2)
s /= num_samples
s = np.sqrt(s)
s

3.254033183472906