In [236]:
import numpy as np
from numpy.linalg import multi_dot

import csv

import pandas as pd

from pathlib import Path

In [237]:
# Reading dataset.

# A notebook has no real __file__, so one is set by hand. Because it is a bare
# filename, base_path is the current working directory and the CSV paths below
# are resolved relative to it.
__file__ = 'recommender.ipynb'
base_path = Path(__file__).parent

# newline='' lets the csv module do its own line-ending handling (required for
# quoted fields that contain line breaks). The explicit encoding keeps decoding
# independent of the platform locale -- the ml-latest-small files are expected
# to be UTF-8; adjust if your copy differs.
file_path = (base_path / '../ml-latest-small/ratings.csv').resolve()
with open(file_path, newline='', encoding='utf-8') as f:
    # Every row is a list of strings; nothing is type-converted at this stage.
    ratings = list(csv.reader(f))

file_path = (base_path / '../ml-latest-small/movies.csv').resolve()
with open(file_path, newline='', encoding='utf-8') as f:
    movies = list(csv.reader(f))

In [238]:
# Building dataframes from dataset.

# The row lists still contain the CSV header as row 0, so slicing from 1 skips
# it. The upper bounds deliberately keep only a small sample: the first 999
# ratings and the first 199 movies.
ratings_df = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'Rating', 'Timestamp']).iloc[1:1000]
movies_df = pd.DataFrame(movies, columns=['MovieID', 'Title', 'Genres']).iloc[1:200]

# Merging and cleaning. The inner join keeps only ratings whose movie is in the
# sampled movie list.
R_df = pd.merge(ratings_df, movies_df, on="MovieID", how="inner")
R_df = R_df.drop(columns=['Timestamp'])

# Type conversion (everything read from CSV is a string).
R_df[['UserID', 'MovieID']] = R_df[['UserID', 'MovieID']].astype(int) - 1  # Takes care of 0 index.
R_df[['Rating']] = R_df[['Rating']].astype(float)
R_df[['Title', 'Genres']] = R_df[['Title', 'Genres']].astype(str)

# pivot_table aggregates Rating (mean by default) per index combination and
# returns the rows sorted by the index; to_records() flattens the MultiIndex
# back into ordinary columns.
R_df = pd.pivot_table(R_df, index=['MovieID', 'UserID', 'Genres', 'Title'])
R_df = pd.DataFrame(R_df.to_records())


# Extract the (sparse, original) movie ids, one per rating row.
mvd = R_df['MovieID'].to_numpy()

# Rewrite movie indices as dense ranks 0..n_distinct-1 so they can be used as
# matrix column indices. np.unique's inverse maps every id to the position of
# that id among the sorted distinct ids; this does not rely on row order and is
# also correct when there is only one distinct movie.
mvd = np.unique(mvd, return_inverse=True)[1]

# Assign new indices to dataframe.
R_df = R_df.assign(MovieID=mvd)
R_df.dtypes

MovieID      int64
UserID       int64
Genres      object
Title       object
Rating     float64
dtype: object

In [241]:
# Assemble the dense rating matrix R (rows = users, columns = movies) from the
# long-format dataframe.

from scipy.sparse import coo_matrix, csr_matrix
import ast

# Three parallel 1-D arrays: entry k says that user R_users[k] gave movie
# R_movies[k] the rating R_ratings[k].
R_users, R_movies, R_ratings = (
    R_df[column].to_numpy().flatten()
    for column in ("UserID", "MovieID", "Rating")
)

# csr_matrix((data, (row, col))) infers the shape from the largest row/column
# index; cells without a rating become 0 once densified.
R = csr_matrix((R_ratings, (R_users, R_movies))).toarray()
#np.shape(R) # SOMETHING WRONG WITH DIMENSION, TOO MANY MOVIES

In [24]:
# Read the problem dimensions off R and build the coefficient (weight) matrix C.

#M = 10
M = np.shape(R)[0]  # number of rows of R (users)
#N = 20
N = np.shape(R)[1]  # number of columns of R (movies)

# Observed and unobserved weights.
wk = 0.7
w0 = 0.5

# Synthetic alternative to the real data, kept for quick experiments:
#R = np.random.choice([0, 1], size = (M, N), p = [3./5, 2./5])

# c[i] is the sum of all ratings in column i of R.
c = R.sum(axis=0)

# Element-wise: a cell with rating r in column i gets weight r * c[i] * wk + w0,
# so every zero (unobserved) cell ends up with exactly w0.
C = R * c * wk + w0

In [25]:
# Display the coefficient matrix C; NumPy abbreviates large arrays with "...".
C

array([[2.36090e+03, 5.00000e-01, 4.75100e+02, ..., 5.00000e-01,
        5.00000e-01, 5.00000e-01],
       [5.00000e-01, 5.00000e-01, 5.00000e-01, ..., 5.00000e-01,
        5.00000e-01, 5.00000e-01],
       [5.00000e-01, 5.00000e-01, 5.00000e-01, ..., 5.00000e-01,
        5.00000e-01, 5.00000e-01],
       ...,
       [1.47575e+03, 5.29000e+02, 2.37800e+02, ..., 5.00000e-01,
        5.00000e-01, 5.00000e-01],
       [1.77080e+03, 5.00000e-01, 5.00000e-01, ..., 5.00000e-01,
        5.00000e-01, 5.00000e-01],
       [2.95100e+03, 5.00000e-01, 5.00000e-01, ..., 5.00000e-01,
        5.00000e-01, 5.00000e-01]])

In [None]:
# Building random X and Y matrices of proper dimension.

#K = 5
K = 100  # number of latent factors, i.e. columns of both X and Y

# A seeded generator makes the random starting point -- and therefore every
# result computed from it -- reproducible across kernel restarts.
SEED = 0
rng = np.random.default_rng(SEED)

# Take the row counts straight from R so the factor matrices always line up
# with the rating matrix: one row of X per row of R, one row of Y per column.
n_users, n_movies = np.shape(R)
X = rng.random((n_users, K))   # uniform in [0, 1)
Y = rng.random((n_movies, K))  # uniform in [0, 1)

In [None]:
# Define functions to compute approximated ratings and error.

def approximation(X, Y):
    """Return the ratings predicted by the factorisation, i.e. ``X`` times ``Y`` transposed.

    Parameters
    ----------
    X : ndarray
        Row-factor matrix; for 2-D inputs of shape (M, K).
    Y : ndarray
        Column-factor matrix; for 2-D inputs of shape (N, K).

    Returns
    -------
    ndarray
        ``np.dot(X, Y.T)``; for the 2-D shapes above this is an (M, N) matrix.
    """
    item_factors_t = Y.T
    return np.dot(X, item_factors_t)

def error(approxRatings, ratings, w0):
    """Weighted sum of squared errors between predicted and actual ratings.

    Cells with a rating > 0 count as observed and contribute their squared
    residual with weight 1; cells equal to 0 count as unobserved and contribute
    with weight ``w0``. Cells with a negative rating fall in neither group and
    are ignored.

    Parameters
    ----------
    approxRatings : array-like
        Predicted ratings, same shape as ``ratings``.
    ratings : array-like
        Actual ratings, with 0 marking "no rating".
    w0 : float
        Weight applied to the unobserved-cell error.

    Returns
    -------
    float
        ``sum_observed (r - r_hat)**2 + w0 * sum_unobserved (r - r_hat)**2``.
    """
    # asarray lets plain nested lists work too (boolean masks need ndarrays).
    ratings = np.asarray(ratings)
    approxRatings = np.asarray(approxRatings)

    # Compute the residuals once, then pick each group with a boolean mask;
    # np.sum reduces in C instead of looping over elements in Python.
    squared_residuals = (ratings - approxRatings) ** 2
    obs_error = np.sum(squared_residuals[ratings > 0])
    nobs_error = np.sum(squared_residuals[ratings == 0])
    return obs_error + w0 * nobs_error

In [None]:
# Smoke test: score the current X and Y factors against R.

w0 = 0.5  # weight of the unobserved (zero) cells in the error
approx_ratings = approximation(X, Y)
error(approx_ratings, R, w0)

In [None]:
def singlePassWALS(R, X, Y, C, reg_lambda):
    """Run one weighted alternating-least-squares sweep, updating X and Y in place.

    With Y held fixed, every row ``X[u]`` is replaced by the solution of the
    regularised weighted normal equations

        (Y^T diag(C[u, :]) Y + reg_lambda * I) x = Y^T diag(C[u, :]) R[u, :]

    and then, with the freshly updated X held fixed, every row ``Y[i]`` is
    replaced by the solution of the analogous system built from column ``i``
    of ``C`` and ``R``.

    Parameters
    ----------
    R : ndarray, shape (M, N)
        Rating matrix.
    X : ndarray, shape (M, K)
        Row factors; overwritten in place.
    Y : ndarray, shape (N, K)
        Column factors; overwritten in place.
    C : ndarray, shape (M, N)
        Per-cell weights.
    reg_lambda : float
        Ridge regularisation strength added to the diagonal of each system.

    Returns
    -------
    None
        The results are written into ``X`` and ``Y``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If one of the K x K systems is singular.
    """
    M, K = np.shape(X)
    N = np.shape(Y)[0]
    ridge = reg_lambda * np.eye(K)  # identical for every system, so built once

    # Start at 0: every row, including the first one, must be re-solved.
    for u in range(M):
        # Scaling column i of Y.T by C[u, i] equals Y.T @ diag(C[u, :]) without
        # materialising the N x N diagonal matrix.
        weighted_Yt = Y.T * C[u, :]
        # A X_u = b
        A = weighted_Yt @ Y + ridge
        b = weighted_Yt @ R[u, :]
        X[u, :] = np.linalg.solve(A, b)

    for i in range(N):
        # Same trick for the column update: X.T @ diag(C[:, i]).
        weighted_Xt = X.T * C[:, i]
        # A Y_i = b
        A = weighted_Xt @ X + ridge
        b = weighted_Xt @ R[:, i]
        Y[i, :] = np.linalg.solve(A, b)
    

In [None]:
def WALS(R, X, Y, C, reg_lambda, n_iter, w0=0.5):
    """Run ``n_iter`` WALS sweeps, printing the reconstruction error after each.

    Parameters
    ----------
    R : ndarray
        Rating matrix.
    X, Y : ndarray
        Factor matrices; updated in place by every sweep.
    C : ndarray
        Per-cell weights passed through to ``singlePassWALS``.
    reg_lambda : float
        Regularisation strength passed through to ``singlePassWALS``.
    n_iter : int
        Number of sweeps to run; exactly this many are performed.
    w0 : float, optional
        Weight of the unobserved cells in the reported error (default 0.5).

    Returns
    -------
    None
        The fitted factors are left in ``X`` and ``Y``.
    """
    # range(n_iter), not range(1, n_iter): the latter performs one sweep too few.
    for _ in range(n_iter):
        singlePassWALS(R, X, Y, C, reg_lambda)
        approx_ratings = approximation(X, Y)
        print("Error: " + str(error(approx_ratings, R, w0)))
        

In [None]:
# Compute WALS.

reg_lambda = 0.2
n_iter = 10


WALS(R, X, Y, C, reg_lambda, n_iter)

final_result = approximation(X, Y)

In [None]:
# Predicted ratings for row index 4 of the result, rounded to one decimal place.
T = final_result.round(decimals=1)[4]

In [None]:
# Stack row 4 of R on top of T so the two can be compared side by side.
Test = np.array([R[4], T])

In [None]:
# Print the comparison array built from R[4] and T.
print(Test)