In [98]:
import numpy as np
from numpy.linalg import multi_dot

import numpy.linalg as lin

import csv

import pandas as pd

from pathlib import Path
import time

In [99]:
# Define context manager to measure execution time.

class codeTimer:
    def __init__(self, name=None):
        self.name = "Executed '"  + name + "'. " if name else ""

    def __enter__(self):
        self.start = time.perf_counter()

    def __exit__(self, exc_type, exc_value, traceback):
        self.end = time.perf_counter()
        self.elapsed = (self.end - self.start)
        print('%s Elapsed time: %0.6fs' % (str(self.name), self.elapsed))

In [100]:
# Reading dataset.

__file__ = 'recommender.ipynb'
base_path = Path(__file__).parent

file_path = (base_path / '../ml-latest-small/ratings.csv').resolve()
with open(file_path) as f:
    ratings = [line for line in csv.reader(f)]
    
file_path = (base_path / '../ml-latest-small/movies.csv').resolve()
with open(file_path) as f:
    movies = [line for line in csv.reader(f)]

In [164]:
# Building rating and movies dataframes from dataset.

# Building dataframes, fixing types and dropping useless columns.
ratings_df = pd.DataFrame(ratings,columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']).iloc[1:]
ratings_df[['UserID', 'MovieID']] = ratings_df[['UserID', 'MovieID']].astype(int) - 1 # 0 index.
ratings_df[['Rating']] = ratings_df[['Rating']].astype(float)
ratings_df.drop(['Timestamp'], inplace = True, axis = 1)

movies_df = pd.DataFrame(movies, columns = ['MovieID', 'Title', 'Genres']).iloc[1:3000]
movies_df[['MovieID']] = movies_df[['MovieID']].astype(int) - 1 # 0 index.


# Movie index correction.
movie_index = pd.DataFrame([i for i in range(0, movies_df['MovieID'].shape[0])], columns = ['NewID'])
movie_index['MovieID'] = movies_df['MovieID'].to_numpy()

# Fix movies_df MovieIDs
movies_df = pd.merge(movie_index, movies_df, on = 'MovieID', how = 'inner').drop(['MovieID'], axis = 1)
movies_df.columns = ['MovieID', 'Title', 'Genres']

# Fix ratings_df MovieIDs
ratings_df = pd.merge(movie_index, ratings_df, on = 'MovieID', how = 'inner').drop(['MovieID'], axis = 1)
ratings_df.columns = ['MovieID', 'UserID', 'Rating']

In [102]:
# Creating R dataframe.

R_df = pd.merge(ratings_df, movies_df, on = "MovieID", how = "inner")

R_df = pd.pivot_table(R_df, index = ['MovieID', 'UserID', 'Genres', 'Title'])
R_df = pd.DataFrame(R_df.to_records())

R_df.head()

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
0,0,0,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
1,0,4,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.0
2,0,6,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5
3,0,14,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),2.5
4,0,16,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995),4.5


In [103]:
len(R_df)

59364

In [104]:
rrr = R_df["UserID"].to_numpy().flatten()
(rrr == 534).sum()

30

In [105]:
# Build R matrix and dividing training/test sets.

from scipy.sparse import coo_matrix, csr_matrix

np.random.seed(17)

R_users = R_df["UserID"].to_numpy().flatten()
R_movies = R_df["MovieID"].to_numpy().flatten()
R_ratings = R_df["Rating"].to_numpy().flatten()

# csr_matrix((dat, (row, col)))
R = csr_matrix((R_ratings, (R_users, R_movies)))
R = R.toarray()

print("Shape of R matrix: {}".format(np.shape(R)))

# Dividing training and test set.

# Test set percentage.
p_test = 0.2

# Random boolean, TRUE for test set.
mask = np.random.choice(a = [False, True], size = R.size, p = [p_test, 1 - p_test]).reshape(R.shape)

R_test = np.zeros(R.shape)
R_test[mask] = R[mask]

R_train = np.zeros(R.shape)
R_train[np.invert(mask)] = R[np.invert(mask)]

np.array_equal(R, R_test + R_train)
R_train

Shape of R matrix: (610, 2999)


array([[0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 5., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [111]:
# Obtaining weights matrix C.

### Do the weights make sense? ###


#M = 10
M = np.shape(R)[0]
#N = 20
N = np.shape(R)[1]

# Unobserbed weights.
w0 = 1

c = [ np.count_nonzero(R[:, i]) for i in range(0, np.shape(R)[1]) ]
#C = R * c + w0
alpha = 10
C = 1 + alpha * R

C

array([[41.,  1., 41., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ...,
       [26., 21., 21., ..., 51.,  1.,  1.],
       [31.,  1.,  1., ...,  1.,  1.,  1.],
       [51.,  1.,  1., ..., 46.,  1.,  1.]])

In [112]:
# Building random X and Y matrices of proper dimension.

#K = 5
K = 100

X = np.random.rand(M, K)
Y = np.random.rand(N, K)

In [113]:
# Define functions to compute approximated ratings and error.

def predict(X, Y):
    return np.dot(X, Y.T)

def error(predicted_ratings, ratings, w0):
    obs_idx = np.where(ratings > 0)
    nobs_idx = np.where(ratings == 0)
    obs_error = sum( (ratings[obs_idx] - predicted_ratings[obs_idx]) ** 2 )
    nobs_error = sum( (ratings[nobs_idx] - predicted_ratings[nobs_idx]) ** 2 )
    return obs_error + w0 * nobs_error

In [114]:
# Testing functions. Error should be computer between the predicted ratings and the R_test set.

predicted_ratings = predict(X, Y)
w0 = 1
error(predicted_ratings, R, w0)

1147079970.9557586

In [116]:
import scipy.optimize.nnls as nnls

### Should solutions be non-negative? (or further constrained to be [0.5, 5]) ##

def singlePassWALS(R, X, Y, C, reg_lambda):
    M = np.shape(X)[0]
    K = np.shape(X)[1]
    N = np.shape(Y)[0]
    
    #XX = np.dot(X.T, X)
    #YY = np.dot(Y.T, Y)

    for u in range(1, M):
        Cu = np.diag(C[u, :])
        # A X_u = b
        A = multi_dot([Y.T, Cu, Y]) + reg_lambda * np.eye(K)
        #A = YY + multi_dot([Y.T, Cu - np.eye(N), Y])
        b = multi_dot([Y.T, Cu, R[u, :]])
        X_u = np.linalg.solve(A, b)
        #X_u = nnls(A, b)[0]
        
        X[u,] = X_u
        
    for i in range(1, N):
        Ci = np.diag(C[:,i])
        # A Y_i = b
        A = multi_dot([X.T, Ci, X]) + reg_lambda * np.eye(K)
        #A = XX + multi_dot([X.T, Ci - np.eye(M), X])
        b = multi_dot([X.T, Ci, R[:, i]])
        Y_i = np.linalg.solve(A, b)
        #Y_i = nnls(A, b)[0]
        
        Y[i,] = Y_i    
    

In [117]:
def WALS(R_train, R_test, X, Y, C, reg_lambda, n_iter):
    for j in range(1, n_iter):
        singlePassWALS(R_train, X, Y, C, reg_lambda)
        predicted_ratings = predict(X, Y)
        # Here put to one, error is the same for observed and unobserved matrices.
        w0 = 1
        print("Test error: " + str(error(predicted_ratings, R_test, w0)))
        print("Train error: " + str(error(predicted_ratings, R_train, w0)))

In [118]:
# Compute WALS.

reg_lambda = 0.1
n_iter = 5

with codeTimer("WALS"):
    #WALS(R_train, R_test, X, Y, C, reg_lambda, n_iter)
    WALS(R, R, X, Y, C, reg_lambda, n_iter)


predicted_ratings = predict(X, Y)

Test error: 1872895.330897949
Train error: 1872895.330897949
Test error: 901766.9103060175
Train error: 901766.9103060175
Test error: 724445.8328305486
Train error: 724445.8328305486
Test error: 665610.7336245014
Train error: 665610.7336245014
Executed 'WALS'.  Elapsed time: 130.104824s


In [119]:
# Recommend first n results for given user.

def recommendNew(user, R, approx_R):
    pred = np.matrix.round(approx_R, 2)[user]
    
    # Unseen movies.
    idx = np.where(R[user] == 0)[0]
    movie_pred = list(zip(idx, pred[idx]))
    
    # Build prediction dataframe.
    recom_df = pd.DataFrame(movie_pred, columns = ['MovieID', 'Prediction'])
    recom_df = pd.merge(recom_df, movies_df, on = "MovieID", how = "inner")
    recom_df = recom_df.sort_values(by = 'Prediction', ascending = False)
    
    # Add comparison with average ratings.
    avg_rat = ratings_df.groupby('MovieID').mean()
    recom_df = pd.merge(recom_df, avg_rat, on = "MovieID", how = "inner")
    recom_df.drop(['UserID'], inplace = True, axis = 1)
    recom_df.round({'Rating': 1})
    recom_df.rename(columns={'Rating':'AVG_Rating'}, inplace=True)
    
    return recom_df


In [126]:
recommendNew(17, R, predicted_ratings).head(10)

Unnamed: 0,MovieID,Prediction,Title,Genres,AVG_Rating
0,2028,5.02,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical,3.861842
1,1187,4.39,Contact (1997),Drama|Sci-Fi,3.652439
2,962,4.22,"Deer Hunter, The (1978)",Drama|War,3.825581
3,2355,4.21,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.860825
4,2641,4.14,American Psycho (2000),Crime|Horror|Mystery|Thriller,3.788136
5,836,4.03,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi,3.766393
6,1000,3.85,Field of Dreams (1989),Children|Drama|Fantasy,3.517857
7,906,3.78,Lawrence of Arabia (1962),Adventure|Drama|War,4.3
8,1135,3.71,Liar Liar (1997),Comedy,3.033784
9,815,3.68,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,3.87395


In [128]:
comparison = R_df.query('UserID == 17').sort_values(by = "Rating", ascending = False)
print(len(comparison))
comparison.head(15)

192


Unnamed: 0,MovieID,UserID,Genres,Title,Rating
26626,922,17,Crime|Drama,"Godfather: Part II, The (1974)",5.0
25383,903,17,Action|Adventure|Western,"Good, the Bad and the Ugly, The (Buono, il bru...",5.0
8661,277,17,Crime|Drama,"Shawshank Redemption, The (1994)",5.0
25848,910,17,Action|Drama|Western,Once Upon a Time in the West (C'era una volta ...,5.0
21006,705,17,Drama|Mystery,Citizen Kane (1941),5.0
2386,46,17,Crime|Mystery|Thriller,"Usual Suspects, The (1995)",5.0
25462,905,17,Drama,12 Angry Men (1957),5.0
26074,913,17,Film-Noir|Mystery|Thriller,"Third Man, The (1949)",4.5
25870,911,17,Action|Adventure|Sci-Fi,Star Wars: Episode VI - Return of the Jedi (1983),4.5
25683,908,17,Drama,To Kill a Mockingbird (1962),4.5


In [129]:
# Compares recommendation on test set with actual ratings.

def recommendTest(user_id, R_test, predicted_ratings):
    pred = np.around(predicted_ratings, 2)[user_id]
    # Unseen movies (test set ones)
    idx = np.where(R_test[user_id] != 0)[0]
    # idx = np.where(R[user] == 0)[0]
    movie_pred = list(zip(idx, pred[idx]))
    recom_df = pd.DataFrame(movie_pred, columns = ['MovieID', 'Prediction'])

    recom_df = pd.merge(recom_df, R_df, on = "MovieID", how = "inner")
    recom_df = recom_df[(recom_df.UserID == user_id)]
    recom_df = recom_df.sort_values(by = 'Prediction', ascending = False)
    
    return recom_df

In [130]:
#user_id = 6
#recommendTest(user_id, R_test, predicted_ratings).head(10)

In [214]:
# Suggesting similar items.

def cosine_similarity(d_1, d_2):
    len_1 = lin.norm(d_1)
    len_2 = lin.norm(d_2)
    if len_1 == 0 or len_2 == 0:
        return -1
    return np.dot(d_1, d_2) / (len_1 * len_2)

def similar_items(movie_id, Y):
    # Y is the item embedding
    d_1 = Y[movie_id]
    similarity = [cosine_similarity(Y[movie_id], Y[i]) for i in range(0, np.shape(Y)[0])]
    return similarity

In [215]:
np.where(~Y.any(axis=1))[0]

array([ 816, 2211, 2499, 2587])

In [272]:
def suggestSimilar(movie_id, Y):
    similarities = pd.DataFrame(similar_items(movie_id, Y), columns = ["Similarity"])
    similarities_df = pd.concat([movies_df, similarities], axis = 1)
    return similarities_df.sort_values(by = 'Similarity', ascending = False).head(10)

# Some suggestions:
#
# 911: Star Wars Episode VI
# 786: Dumbo
# 957: The Shining
# 474: Blade Runner

suggestSimilar(474, Y)

Unnamed: 0,MovieID,Title,Genres,Similarity
474,474,Blade Runner (1982),Action|Sci-Fi|Thriller,1.0
706,706,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,0.652314
509,509,Batman (1989),Action|Crime|Thriller,0.620636
915,915,Alien (1979),Horror|Sci-Fi,0.59067
2636,2636,Predator (1987),Action|Sci-Fi|Thriller,0.568413
2038,2038,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi,0.567894
939,939,"Terminator, The (1984)",Action|Sci-Fi|Thriller,0.566122
1576,1576,Indiana Jones and the Temple of Doom (1984),Action|Adventure|Fantasy,0.564395
2743,2743,For a Few Dollars More (Per qualche dollaro in...,Action|Drama|Thriller|Western,0.555524
902,902,Aliens (1986),Action|Adventure|Horror|Sci-Fi,0.554188


## TODO
* New users
* Cold start