In [1]:
import numpy as np
from numpy.linalg import multi_dot

import csv

import pandas as pd

from pathlib import Path

In [2]:
# Reading dataset.

__file__ = 'recommender.ipynb'
base_path = Path(__file__).parent

file_path = (base_path / '../ml-latest-small/ratings.csv').resolve()
with open(file_path) as f:
    ratings = [line for line in csv.reader(f)]
    
file_path = (base_path / '../ml-latest-small/movies.csv').resolve()
with open(file_path) as f:
    movies = [line for line in csv.reader(f)]

In [3]:
# Building rating and movies dataframes from dataset.

# Building dataframes, fixing types and dropping useless columns.
ratings_df = pd.DataFrame(ratings,columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']).iloc[1:]
ratings_df[['UserID', 'MovieID']] = ratings_df[['UserID', 'MovieID']].astype(int) - 1 # 0 index.
ratings_df[['Rating']] = ratings_df[['Rating']].astype(float)
ratings_df.drop(['Timestamp'], inplace = True, axis = 1)

movies_df = pd.DataFrame(movies, columns = ['MovieID', 'Title', 'Genres']).iloc[1:1000]
movies_df[['MovieID']] = movies_df[['MovieID']].astype(int) - 1 # 0 index.


# Movie index correction.
movie_index = pd.DataFrame([i for i in range(0, movies_df['MovieID'].shape[0])], columns = ['NewID'])
movie_index['MovieID'] = movies_df['MovieID'].to_numpy()

# Fix movies_df MovieIDs
movies_df = pd.merge(movie_index, movies_df, on = 'MovieID', how = 'inner').drop(['MovieID'], axis = 1)
movies_df.columns = ['MovieID', 'Title', 'Genres']

# Fix ratings_df MovieIDs
ratings_df = pd.merge(movie_index, ratings_df, on = 'MovieID', how = 'inner').drop(['MovieID'], axis = 1)
ratings_df.columns = ['MovieID', 'UserID', 'Rating']

In [6]:
# Creating R dataframe.

R_df = pd.merge(ratings_df, movies_df, on = "MovieID", how = "inner")

R_df = pd.pivot_table(R_df, index = ['MovieID', 'UserID', 'Genres', 'Title'])
R_df = pd.DataFrame(R_df.to_records())

R_df.head()

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
0,0,0,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
1,0,4,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
2,0,6,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5
3,0,14,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),2.5
4,0,16,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5


In [7]:
# Build R matrix (CAN IT BE DONE IN A LESS UGLY WAY?).

from scipy.sparse import coo_matrix, csr_matrix
import ast

R_users = R_df["UserID"].to_numpy().flatten()
R_movies = R_df["MovieID"].to_numpy().flatten()
R_ratings = R_df["Rating"].to_numpy().flatten()

# csr_matrix((dat, (row, col)))
R = csr_matrix((R_ratings, (R_users, R_movies)))
R = R.toarray()

np.shape(R)

(610, 999)

In [8]:
# Obtaining weights matrix C.

#M = 10
M = np.shape(R)[0]
#N = 20
N = np.shape(R)[1]

# Unobserbed weights.
w0 = 1

c = [ np.count_nonzero(R[:, i]) for i in range(0, np.shape(R)[1]) ]
C = R * c + w0

C

array([[8.610e+02, 1.000e+00, 2.090e+02, ..., 1.160e+02, 1.000e+00,
        1.000e+00],
       [1.000e+00, 1.000e+00, 1.000e+00, ..., 1.000e+00, 1.000e+00,
        1.000e+00],
       [1.000e+00, 1.000e+00, 1.000e+00, ..., 1.000e+00, 1.000e+00,
        1.000e+00],
       ...,
       [5.385e+02, 2.210e+02, 1.050e+02, ..., 1.000e+00, 1.000e+00,
        1.000e+00],
       [6.460e+02, 1.000e+00, 1.000e+00, ..., 1.000e+00, 1.000e+00,
        1.000e+00],
       [1.076e+03, 1.000e+00, 1.000e+00, ..., 1.000e+00, 1.000e+00,
        4.150e+01]])

In [9]:
# Building random X and Y matrices of proper dimension.

#K = 5
K = 100

X = np.random.rand(M, K)
Y = np.random.rand(N, K)

In [10]:
# Define functions to compute approximated ratings and error.

def approximation(X, Y):
    return np.dot(X, Y.T)

def error(approxRatings, ratings, w0):
    obs_idx = np.where(ratings > 0)
    nobs_idx = np.where(ratings == 0)
    obs_error = sum( (ratings[obs_idx] - approxRatings[obs_idx]) ** 2 )
    nobs_error = sum( (ratings[nobs_idx] - approxRatings[nobs_idx]) ** 2 )
    return obs_error + w0 * nobs_error

In [11]:
# Testing functions.

approx_ratings = approximation(X, Y)
w0 = 0.5
error(approx_ratings, R, w0)

196541399.97890928

In [12]:
def singlePassWALS(R, X, Y, C, reg_lambda):
    M = np.shape(X)[0]
    K = np.shape(X)[1]
    N = np.shape(Y)[0]
    
    #XX = np.dot(X.T, X)
    #YY = np.dot(Y.T, Y)

    for u in range(1, M):
        Cu = np.diag(C[u, :])
        # A X_u = b
        A = multi_dot([Y.T, Cu, Y]) + reg_lambda * np.eye(K)
        #A = YY + multi_dot([Y.T, Cu - np.eye(N), Y])
        b = multi_dot([Y.T, Cu, R[u, :]])
        X_u = np.linalg.solve(A, b)
        
        X[u,] = X_u
        
    for i in range(1, N):
        Ci = np.diag(C[:,i])
        # A Y_i = b
        A = multi_dot([X.T, Ci, X]) + reg_lambda * np.eye(K)
        #A = XX + multi_dot([X.T, Ci - np.eye(M), X])
        b = multi_dot([X.T, Ci, R[:, i]])
        Y_i = np.linalg.solve(A, b)
        
        Y[i,] = Y_i    
    

In [13]:
def WALS(R, X, Y, C, reg_lambda, n_iter):
    for j in range(1, n_iter):
        singlePassWALS(R, X, Y, C, reg_lambda)
        approx_ratings = approximation(X, Y)
        w0 = 0.5
        print("Error: " + str(error(approx_ratings, R, w0)))
        

In [14]:
# Compute WALS.

reg_lambda = 0.2
n_iter = 10


WALS(R, X, Y, C, reg_lambda, n_iter)

final_result = approximation(X, Y)

Error: 419367.50914227596
Error: 218575.4056366898
Error: 172838.3805848219
Error: 153896.26274904658
Error: 142885.04765450116
Error: 135512.04868495264
Error: 130169.47405344585
Error: 126066.84758166845
Error: 122777.4001680187


In [17]:
# Recommend first n results for given user.
def recommend(user, R, approx_R):
    pred = np.matrix.round(approx_R, 2)[user]
    # Unseen movies.
    idx = np.where(R[user] == 0)[0]
    movie_pred = list(zip(idx, pred[idx]))
    recom_df = pd.DataFrame(movie_pred, columns = ['MovieID', 'Prediction'])
    recom_df = pd.merge(recom_df, movies_df, on = "MovieID", how = "inner")
    recom_df = recom_df.sort_values(by = 'Prediction', ascending = False)
    return recom_df


In [37]:
recom_test = recommend(0, R, final_result)
recom_test.head(10)

Unnamed: 0,MovieID,Prediction,Title,Genres
475,507,24.36,Terminator 2: Judgment Day (1991),Action|Sci-Fi
259,277,24.05,"Shawshank Redemption, The (1994)",Crime|Drama
615,659,23.69,"Godfather, The (1972)",Crime|Drama
115,123,19.02,Apollo 13 (1995),Adventure|Drama|IMAX
300,322,17.59,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
474,506,17.48,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
311,334,17.41,Speed (1994),Action|Romance|Thriller
28,31,17.01,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
736,793,16.92,Die Hard (1988),Action|Crime|Thriller
314,337,16.35,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller


In [36]:
comparison = R_df.query('UserID == 0')
comparison.sort_values(by = "Rating", ascending = False).head(10)

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
21003,705,0,Drama|Mystery,Citizen Kane (1941),5.0
16952,520,0,Comedy|Crime|Drama|Thriller,Fargo (1996),5.0
23513,836,0,Children|Drama|Sci-Fi,E.T. the Extra-Terrestrial (1982),5.0
23362,831,0,Crime|Mystery|Thriller,Basic Instinct (1992),5.0
23147,828,0,Crime|Mystery|Thriller,Reservoir Dogs (1992),5.0
22930,820,0,Comedy,Monty Python's Life of Brian (1979),5.0
22701,815,0,Children|Comedy|Fantasy|Musical,Willy Wonka & the Chocolate Factory (1971),5.0
22527,801,0,Action|Adventure,"Ghost and the Darkness, The (1996)",5.0
22133,789,0,Adventure|Animation|Children|Fantasy|Musical,Alice in Wonderland (1951),5.0
22109,788,0,Adventure|Children|Musical,Bedknobs and Broomsticks (1971),5.0
