In [51]:
import numpy as np
from numpy.linalg import multi_dot

import csv

import pandas as pd

from pathlib import Path

In [52]:
# Reading dataset.

__file__ = 'recommender.ipynb'
base_path = Path(__file__).parent

file_path = (base_path / '../ml-latest-small/ratings.csv').resolve()
with open(file_path) as f:
    ratings = [line for line in csv.reader(f)]
    
file_path = (base_path / '../ml-latest-small/movies.csv').resolve()
with open(file_path) as f:
    movies = [line for line in csv.reader(f)]

In [53]:
# Building dataframes from dataset.

ratings_df = pd.DataFrame(ratings,columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']).iloc[1:]
movies_df = pd.DataFrame(movies, columns = ['MovieID', 'Title', 'Genres']).iloc[1:]

movies_df[['MovieID']] = movies_df[['MovieID']].astype(int)


# Merging and cleaning.
R_df = pd.merge(ratings_df, movies_df, on = "MovieID", how = "inner")
R_df = R_df.drop(columns = ['Timestamp'] , axis = 1)

# Type conversion.
R_df[['UserID', 'MovieID']] = R_df[['UserID', 'MovieID']].astype(int) - 1 # Takes care of 0 index.
R_df[['Rating']] = R_df[['Rating']].astype(float)
R_df[['Title', 'Genres']] = R_df[['Title', 'Genres']].astype(str)

R_df = pd.pivot_table(R_df, index = ['MovieID', 'UserID', 'Genres', 'Title'])
R_df = pd.DataFrame(R_df.to_records())


# Compute the number of distinct movies.
mvd = R_df[['MovieID']].to_numpy().flatten()


# Rewrite movie indices. UGLY, NEEDS TO BE REWRITTEN!
j = -1
new = []
for i in range(0, len(mvd)):
    if (mvd[i] != mvd[i-1]):
        j = j+1
    new.append(j)
    
mvd = np.array(new)

# Assign new indices to dataframe.
R_df = R_df.assign(MovieID = mvd)
R_df

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
0,0,0,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
1,0,4,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
2,0,6,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5
3,0,14,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),2.5
4,0,16,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5
5,0,17,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),3.5
6,0,18,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
7,0,20,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),3.5
8,0,26,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),3.0
9,0,30,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),5.0


In [54]:
# Build R matrix in a VERY UGLY WAY.

from scipy.sparse import coo_matrix, csr_matrix
import ast

R_users = R_df["UserID"].to_numpy().flatten()
R_movies = R_df["MovieID"].to_numpy().flatten()
R_ratings = R_df["Rating"].to_numpy().flatten()

R = csr_matrix((R_ratings, (R_users, R_movies)))  # csr_matrix((dat, (row, col)))
R = R.toarray()
np.shape(R) # Now correct dimensions.

(610, 9724)

In [56]:
# Obtaining coefficient matrix C.

#M = 10
M = np.shape(R)[0]
#N = 20
N = np.shape(R)[1]

# Obserbed and unobserbed weights.
wk = 0.7
w0 = 0.5

#R = np.random.choice([0, 1], size = (M, N), p = [3./5, 2./5])

c = [ sum(R[:, i]) for i in range(0, np.shape(R)[1]) ]
#C = R * c * wk + w0
C = 1 + 20*R
C

array([[ 81.,   1.,  81., ...,   1.,   1.,   1.],
       [  1.,   1.,   1., ...,   1.,   1.,   1.],
       [  1.,   1.,   1., ...,   1.,   1.,   1.],
       ...,
       [ 51.,  41.,  41., ...,   1.,   1.,   1.],
       [ 61.,   1.,   1., ...,   1.,   1.,   1.],
       [101.,   1.,   1., ...,   1.,   1.,   1.]])

In [58]:
# Building random X and Y matrices of proper dimension.

#K = 5
K = 100

X = np.random.rand(M, K)
Y = np.random.rand(N, K)

In [59]:
# Define functions to compute approximated ratings and error.

def approximation(X, Y):
    return np.dot(X, Y.T)

def error(approxRatings, ratings, w0):
    obs_idx = np.where(ratings > 0)
    nobs_idx = np.where(ratings == 0)
    obs_error = sum( (ratings[obs_idx] - approxRatings[obs_idx]) ** 2 )
    nobs_error = sum( (ratings[nobs_idx] - approxRatings[nobs_idx]) ** 2 )
    return obs_error + w0 * nobs_error

In [60]:
# Testing functions.

approx_ratings = approximation(X, Y)
w0 = 0.5
error(approx_ratings, R, w0)

1883579038.754299

In [61]:
def singlePassWALS(R, X, Y, C, reg_lambda):
    M = np.shape(X)[0]
    K = np.shape(X)[1]
    N = np.shape(Y)[0]

    for u in range(1, M):
        Cu = np.diag(C[u, :])
        # A X_u = b
        A = multi_dot([Y.T, Cu, Y]) + reg_lambda * np.eye(K)
        b = multi_dot([Y.T, Cu, R[u, :]])
        X_u = np.linalg.solve(A, b)
        
        X[u,] = X_u
        
    for i in range(1, N):
        Ci = np.diag(C[:,i])
        # A Y_i = b
        A = multi_dot([X.T, Ci, X]) + reg_lambda * np.eye(K)
        b = multi_dot([X.T, Ci, R[:, i]])
        Y_i = np.linalg.solve(A, b)
        
        Y[i,] = Y_i    
    

In [62]:
def WALS(R, X, Y, C, reg_lambda, n_iter):
    for j in range(1, n_iter):
        singlePassWALS(R, X, Y, C, reg_lambda)
        approx_ratings = approximation(X, Y)
        w0 = 0.5
        print("Error: " + str(error(approx_ratings, R, w0)))
        

In [63]:
# Compute WALS.

reg_lambda = 0.2
n_iter = 10


WALS(R, X, Y, C, reg_lambda, n_iter)

final_result = approximation(X, Y)

Error: 2742086.69867049
Error: 1221467.7864142098
Error: 943415.9545924983
Error: 847440.9083193926
Error: 798944.3478183385
Error: 769390.5400117529
Error: 749381.9207585008
Error: 734894.9514031764
Error: 723895.137569335


In [182]:
movies_df[['MovieID']] = movies_df[['MovieID']].astype(int) - 1


In [177]:
# Recommend first n results for given user.
def recommend(user, R, approx_R):
    pred = np.matrix.round(approx_R, 2)[user]
    # Unseen movies.
    idx = np.where(R[user] == 0)[0]
    movie_pred = list(zip(idx, T[idx]))
    recom_df = pd.DataFrame(movie_pred, columns = ['MovieID', 'Prediction'])
    recom_df = pd.merge(recom_df, movies_df, on = "MovieID", how = "inner")
    recom_df = recom_df.sort_values(by = 'Prediction', ascending = False)
    return recom_df


In [189]:
recom_test = recommend(43, R, final_result)
recom_test.head(10)

Unnamed: 0,MovieID,Prediction,Title,Genres
262,314,5.13,"Specialist, The (1994)",Action|Drama|Thriller
281,334,5.08,Underneath (1995),Mystery|Thriller
273,326,5.07,Tank Girl (1995),Action|Comedy|Sci-Fi
194,239,5.02,Hideaway (1995),Thriller
241,291,5.02,Outbreak (1995),Action|Drama|Sci-Fi|Thriller
178,221,4.99,Circle of Friends (1995),Drama|Romance
431,508,4.98,"Piano, The (1993)",Drama|Romance
63,77,4.98,"Crossing Guard, The (1995)",Action|Crime|Drama|Thriller
429,506,4.97,"Perfect World, A (1993)",Crime|Drama|Thriller
256,307,4.97,Three Colors: White (Trzy kolory: Bialy) (1994),Comedy|Drama


In [190]:
comparison = R_df.query('UserID == 43')
comparison.sort_values(by = "Rating", ascending = False).head(10)

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
18377,595,43,Action|Sci-Fi,Barb Wire (1996),5.0
3729,99,43,Action|Adventure|Comedy|Crime,Rumble in the Bronx (Hont faan kui) (1995),5.0
31961,1128,43,Comedy|Drama,Private Parts (1997),5.0
31682,1099,43,Action|Adventure|Comedy|Thriller,First Strike (Police Story 4: First Strike) (G...,5.0
25879,910,43,Action|Adventure|Sci-Fi,Star Wars: Episode VI - Return of the Jedi (1983),5.0
18135,592,43,Action|Adventure|Thriller,"Rock, The (1996)",5.0
6869,224,43,Action|Adventure|Sci-Fi,Star Wars: Episode IV - A New Hope (1977),5.0
33880,1230,43,Comedy|Drama|Romance,Chasing Amy (1997),5.0
3086,83,43,Comedy|Drama|Romance,Beautiful Girls (1996),4.0
31253,1072,43,Drama|Romance,Jerry Maguire (1996),4.0
