In [123]:
import numpy as np
from numpy.linalg import multi_dot

import csv

import pandas as pd

from pathlib import Path

In [124]:
# Reading dataset.

__file__ = 'recommender.ipynb'
base_path = Path(__file__).parent

file_path = (base_path / '../ml-latest-small/ratings.csv').resolve()
with open(file_path) as f:
    ratings = [line for line in csv.reader(f)]
    
file_path = (base_path / '../ml-latest-small/movies.csv').resolve()
with open(file_path) as f:
    movies = [line for line in csv.reader(f)]

In [125]:
# Building rating and movies dataframes from dataset.

# Building dataframes, fixing types and dropping useless columns.
ratings_df = pd.DataFrame(ratings,columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']).iloc[1:]
ratings_df[['UserID', 'MovieID']] = ratings_df[['UserID', 'MovieID']].astype(int) - 1 # 0 index.
ratings_df[['Rating']] = ratings_df[['Rating']].astype(float)
ratings_df.drop(['Timestamp'], inplace = True, axis = 1)

movies_df = pd.DataFrame(movies, columns = ['MovieID', 'Title', 'Genres']).iloc[1:1300]
movies_df[['MovieID']] = movies_df[['MovieID']].astype(int) - 1 # 0 index.


# Movie index correction.
movie_index = pd.DataFrame([i for i in range(0, movies_df['MovieID'].shape[0])], columns = ['NewID'])
movie_index['MovieID'] = movies_df['MovieID'].to_numpy()

# Fix movies_df MovieIDs
movies_df = pd.merge(movie_index, movies_df, on = 'MovieID', how = 'inner').drop(['MovieID'], axis = 1)
movies_df.columns = ['MovieID', 'Title', 'Genres']

# Fix ratings_df MovieIDs
ratings_df = pd.merge(movie_index, ratings_df, on = 'MovieID', how = 'inner').drop(['MovieID'], axis = 1)
ratings_df.columns = ['MovieID', 'UserID', 'Rating']

In [126]:
# Creating R dataframe.

R_df = pd.merge(ratings_df, movies_df, on = "MovieID", how = "inner")

R_df = pd.pivot_table(R_df, index = ['MovieID', 'UserID', 'Genres', 'Title'])
R_df = pd.DataFrame(R_df.to_records())

R_df.head()

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
0,0,0,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
1,0,4,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
2,0,6,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5
3,0,14,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),2.5
4,0,16,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5


In [127]:
# Build R matrix and dividing training/test sets.

from scipy.sparse import coo_matrix, csr_matrix

np.random.seed(17)

R_users = R_df["UserID"].to_numpy().flatten()
R_movies = R_df["MovieID"].to_numpy().flatten()
R_ratings = R_df["Rating"].to_numpy().flatten()

# csr_matrix((dat, (row, col)))
R = csr_matrix((R_ratings, (R_users, R_movies)))
R = R.toarray()

print("Shape of R matrix: {}".format(np.shape(R)))

# Dividing training and test set.

# Test set percentage.
p_test = 0.2

# Random boolean, TRUE for test set.
mask = np.random.choice(a = [False, True], size = R.size, p = [p_test, 1 - p_test]).reshape(R.shape)

R_test = np.zeros(R.shape)
R_test[mask] = R[mask]

R_train = np.zeros(R.shape)
R_train[np.invert(mask)] = R[np.invert(mask)]

np.array_equal(R, R_test + R_train)
R_train

Shape of R matrix: (610, 1299)


array([[0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [128]:
# Obtaining weights matrix C.

### Do the weights make sense? ###

#M = 10
M = np.shape(R_train)[0]
#N = 20
N = np.shape(R_train)[1]

# Unobserbed weights.
w0 = 1

c = [ np.count_nonzero(R_train[:, i]) for i in range(0, np.shape(R_train)[1]) ]
#C = R * c + w0
alpha = 10
C = 1 + alpha * R_train

C

array([[ 1.,  1., 41., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ...,
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.]])

In [129]:
# Building random X and Y matrices of proper dimension.

#K = 5
K = 100

X = np.random.rand(M, K)
Y = np.random.rand(N, K)

In [130]:
# Define functions to compute approximated ratings and error.

def predict(X, Y):
    return np.dot(X, Y.T)

def error(predicted_ratings, ratings, w0):
    obs_idx = np.where(ratings > 0)
    nobs_idx = np.where(ratings == 0)
    obs_error = sum( (ratings[obs_idx] - predicted_ratings[obs_idx]) ** 2 )
    nobs_error = sum( (ratings[nobs_idx] - predicted_ratings[nobs_idx]) ** 2 )
    return obs_error + w0 * nobs_error

In [131]:
# Testing functions. Error should be computer between the predicted ratings and the R_test set.

predicted_ratings = predict(X, Y)
w0 = 1
error(predicted_ratings, R_test, w0)

494173564.8164117

In [132]:
import scipy.optimize.nnls as nnls

### Should solutions be non-negative? (or further constrained to be [0.5, 5]) ##

def singlePassWALS(R, X, Y, C, reg_lambda):
    M = np.shape(X)[0]
    K = np.shape(X)[1]
    N = np.shape(Y)[0]
    
    #XX = np.dot(X.T, X)
    #YY = np.dot(Y.T, Y)

    for u in range(1, M):
        Cu = np.diag(C[u, :])
        # A X_u = b
        A = multi_dot([Y.T, Cu, Y]) + reg_lambda * np.eye(K)
        #A = YY + multi_dot([Y.T, Cu - np.eye(N), Y])
        b = multi_dot([Y.T, Cu, R[u, :]])
        X_u = np.linalg.solve(A, b)
        #X_u = nnls(A, b)[0]
        
        X[u,] = X_u
        
    for i in range(1, N):
        Ci = np.diag(C[:,i])
        # A Y_i = b
        A = multi_dot([X.T, Ci, X]) + reg_lambda * np.eye(K)
        #A = XX + multi_dot([X.T, Ci - np.eye(M), X])
        b = multi_dot([X.T, Ci, R[:, i]])
        Y_i = np.linalg.solve(A, b)
        #Y_i = nnls(A, b)[0]
        
        Y[i,] = Y_i    
    

In [133]:
def WALS(R_train, R_test, X, Y, C, reg_lambda, n_iter):
    for j in range(1, n_iter):
        singlePassWALS(R_train, X, Y, C, reg_lambda)
        predicted_ratings = predict(X, Y)
        w0 = 0.5
        print("Error: " + str(error(predicted_ratings, R_test, w0)))

In [134]:
# Compute WALS.

reg_lambda = 0.1
n_iter = 5


WALS(R_train, R_test, X, Y, C, reg_lambda, n_iter)

predicted_ratings = predict(X, Y)

Error: 476723.4078555144
Error: 459733.93029574456
Error: 456977.23521706834
Error: 455952.63266598555


In [174]:
# Recommend first n results for given user.
def recommendNew(user, R, approx_R):
    pred = np.matrix.round(approx_R, 2)[user]
    
    # Unseen movies.
    idx = np.where(R[user] == 0)[0]
    movie_pred = list(zip(idx, pred[idx]))
    
    # Build prediction dataframe.
    recom_df = pd.DataFrame(movie_pred, columns = ['MovieID', 'Prediction'])
    recom_df = pd.merge(recom_df, movies_df, on = "MovieID", how = "inner")
    recom_df = recom_df.sort_values(by = 'Prediction', ascending = False)
    
    # Add comparison with average ratings.
    avg_rat = ratings_df.groupby('MovieID').mean()
    recom_df = pd.merge(recom_df, avg_rat, on = "MovieID", how = "inner")
    recom_df.drop(['UserID'], inplace = True, axis = 1)
    recom_df.round({'Rating': 1})
    recom_df.rename(columns={'Rating':'AVG_Rating'}, inplace=True)
    
    return recom_df


In [175]:
recommendNew(6, R, predicted_ratings).head(10)

Unnamed: 0,MovieID,Prediction,Title,Genres,AVG_Rating
0,1261,1.47,Starship Troopers (1997),Action|Sci-Fi,3.33125
1,1067,1.34,Jaws (1975),Action|Horror,4.005495
2,378,1.32,Cliffhanger (1993),Action|Adventure|Thriller,3.034653
3,863,1.31,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,4.161765
4,1183,1.21,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi,3.487879
5,897,1.18,Cheech and Chong's Up in Smoke (1978),Comedy,3.590909
6,201,1.17,Ed Wood (1994),Comedy|Drama,3.678571
7,686,1.07,Rear Window (1954),Mystery|Thriller,4.261905
8,878,1.05,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,4.161765
9,960,1.02,Evil Dead II (Dead by Dawn) (1987),Action|Comedy|Fantasy|Horror,4.044118


In [170]:
comparison = R_df.query('UserID == 6').sort_values(by = "Rating", ascending = False)
print(len(comparison))
comparison.head(10)

36


Unnamed: 0,MovieID,UserID,Genres,Title,Rating
16231,510,6,Crime|Horror|Thriller,"Silence of the Lambs, The (1991)",5.0
33094,1187,6,Drama|Sci-Fi,Contact (1997),5.0
28569,969,6,Adventure|Comedy|Sci-Fi,Back to the Future (1985),5.0
27290,939,6,Action|Sci-Fi|Thriller,"Terminator, The (1984)",5.0
6855,224,6,Action|Adventure|Sci-Fi,Star Wars: Episode IV - A New Hope (1977),5.0
10021,314,6,Comedy|Drama|Romance|War,Forrest Gump (1994),5.0
12826,404,6,Action|Comedy|War,Hot Shots! Part Deux (1993),5.0
13087,418,6,Action|Adventure|Sci-Fi|Thriller,Jurassic Park (1993),5.0
26460,920,6,Crime|Horror,Psycho (1960),5.0
2,0,6,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5


In [151]:
# Compares recommendation on test set with actual ratings.
def recommendTest(user_id, R_test, predicted_ratings):
    pred = np.around(predicted_ratings, 2)[user_id]
    # Unseen movies (test set ones)
    idx = np.where(R_test[user_id] != 0)[0]
    # idx = np.where(R[user] == 0)[0]
    movie_pred = list(zip(idx, pred[idx]))
    recom_df = pd.DataFrame(movie_pred, columns = ['MovieID', 'Prediction'])

    recom_df = pd.merge(recom_df, R_df, on = "MovieID", how = "inner")
    recom_df = recom_df[(recom_df.UserID == user_id)]
    recom_df = recom_df.sort_values(by = 'Prediction', ascending = False)
    
    return recom_df

In [176]:
recommendTest(user_id, R_test, predicted_ratings)

Unnamed: 0,MovieID,Prediction,UserID,Genres,Title,Rating
319,911,0.29,100,Action|Adventure|Sci-Fi,Star Wars: Episode VI - Return of the Jedi (1983),4.0
189,832,0.17,100,Drama,"Doors, The (1991)",5.0
17,190,0.15,100,Comedy,Clerks (1994),4.0
104,199,-0.01,100,Drama,Exotica (1994),4.0
118,316,-0.01,100,Drama,Higher Learning (1995),2.0
136,328,-0.08,100,Action|Comedy,Naked Gun 33 1/3: The Final Insult (1994),2.0
481,1290,-0.16,100,Drama,"Sweet Hereafter, The (1997)",5.0
232,856,-0.27,100,Action|Adventure|Sci-Fi|Thriller,"Abyss, The (1989)",4.0
