In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import scipy as sp
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [3]:
df.sample(3)

Unnamed: 0,userId,movieId,rating,timestamp
11280,73,40629,1.5,1256029392
32199,232,2949,4.0,955086826
73968,516,508,5.0,844687287


### Create the user-item matrix

In [4]:
def create_matrix(df, user_col, item_col, rating_col):
    # Reverse index look up user --> user index in the matrix
    user_list = df[user_col].unique()
    users = {}
    for i, user in enumerate(user_list):
        users[user] = i
    # Reverse index look up item --> item index in the matrix
    item_list = df[item_col].unique()
    items = {}
    for i, item in enumerate(item_list):
        items[item] = i
    # Initialize a matrix of zeros
    n_users = len(users)
    n_items = len(items)
    user_item_matrix = np.zeros((n_users, n_items))
    # Populate the matrix
    for _, row in df.iterrows():
        # Must explicitly cast to int
        user_id = int(row[user_col])
        item_id = int(row[item_col])
        user_item_matrix[users[user_id], items[item_id]] = row[rating_col]
    # Create the data frame
    return DataFrame(user_item_matrix, index=user_list, columns=item_list)

In [5]:
df_matrix = create_matrix(df, 'userId', 'movieId', 'rating')

In [6]:
# Verify
print(df_matrix.shape)
print(df_matrix.loc[73, 40629] == 1.5)
print(df_matrix.loc[232, 2949] == 4.0)
print(df_matrix.loc[516, 508] == 5.0)

(671, 9066)
True
True
True


### Pairwise similarity

In [7]:
user_similarity = pairwise_distances(df_matrix.values, metric='cosine')

In [8]:
movie_similarity = pairwise_distances(df_matrix.values.T, metric='cosine')

### User-based collaborative filtering

In [9]:
user_similarity.shape

(671, 671)

See memory-based of https://en.wikipedia.org/wiki/Collaborative_filtering

    for each item, the rating by user x is computed as:
        avg_rating_user_x + sum(sim(x, i) * (rating_user_i - avg_rating_user_i)) / sum(sim(x, i))

A simplified version without bias correction:

    for each item, the rating is:
        sum(sim(x, i) * rating_user_i) / sum(sim(x, i))

In [10]:
def user_based_cf(user_item_matrix, user_sim_matrix):
    # TODO: For movie ratings, should exclude users, whose ratings are 0 (unknown)
    #       from summing up the similarities; otherwise they will weigh down the ratings
    # Note: Some other cases, 0s are true values and should not be excluded
    new_ratings = np.array([np.dot(sims, ratings) / np.sum(sims)
                   for sims in user_sim_matrix
                   for ratings in user_item_matrix.T])
    return new_ratings.reshape(user_item_matrix.shape)

In [11]:
user_cf_matrix = user_based_cf(df_matrix.values, user_similarity)

In [12]:
df_user_cf = DataFrame(user_cf_matrix, index=df_matrix.index, columns=df_matrix.columns)

In [13]:
# The values are different because they are diluted by missing ratings. See TODO above.
print(df_matrix.loc[213, 37380], df_user_cf.loc[213, 37380])
print(df_matrix.loc[299, 2952], df_user_cf.loc[299, 2952])
print(df_matrix.loc[299, 7371], df_user_cf.loc[299, 7371])

2.5 0.00975278411432
3.5 0.0468919664056
4.5 0.0355327096075


In [14]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [15]:
rmse(user_cf_matrix, df_matrix.values)

3.387854860874772

### SVD

In [16]:
# get SVD components from train matrix. Choose k.
u, s, vt = svds(df_matrix.values, k = 20)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

In [17]:
X_pred.shape

(671, 9066)

In [18]:
rmse(X_pred, df_matrix.values)

2.431602197489613