In [None]:
%matplotlib inline
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import pandas as pd
import matplotlib.pyplot as plt
import datetime
%load_ext autoreload
%autoreload 2
from helpers import *

In [None]:
# Load the training ratings into `ratings`.
# load_data is not defined in this notebook; presumably it comes from the
# star import of helpers — TODO confirm.
# NOTE(review): the path is relative, so the notebook presumably has to be run
# from the directory that contains data_train.csv.
dataset_file_path = "data_train.csv"
ratings = load_data(dataset_file_path)
# Only the shape is printed; the matrix itself is never dumped to the output.
print('Shape of ratings matrix:', ratings.shape)

In [None]:
# plot_raw_data lives in the project-local plots module.
from plots import plot_raw_data

# The helper returns two sequences. Going by their names they are the number of
# rated items per user and the number of rating users per item — TODO confirm
# against plots.plot_raw_data.
num_items_per_user, num_users_per_item = plot_raw_data(ratings)

# Report the smallest value of each of the two returned sequences.
print("min # of items per user = {}, min # of users per item = {}.".format(
        min(num_items_per_user), min(num_users_per_item)))

In [None]:
def init_MF_random(train, num_features):
    """
        Draw random starting factors W and Z for the factorization X ~ W * Z^T.

        Arguments:
            train: training set (matrix X); only train.shape[0] and
                   train.shape[1] are read
            num_features: number of latent variables, i.e. the number of
                          columns of both W and Z

        Returned value(s):
            item_features: matrix W of shape (train.shape[0], num_features)
            user_features: matrix Z of shape (train.shape[1], num_features)

        Both matrices are filled by np.random.random using numpy's global
        random state; W is drawn first, then Z, so seeding that state
        beforehand makes the result reproducible.
    """
    factors = []
    # axis 0 of X gives the rows of W, axis 1 of X gives the rows of Z
    for axis in (0, 1):
        factor_shape = (train.shape[axis], num_features)
        factors.append(np.random.random(factor_shape))

    item_features, user_features = factors
    return item_features, user_features

In [None]:
# define parameters
# (lambda_item / lambda_user are read as module-level names inside compute_ALS,
# where each one scales an identity matrix added to a Gram matrix; at 0.0 that
# term vanishes, i.e. the fit is unregularized)
num_features = 2 # number of latent features in matrix factorization
lambda_item = 0.0 # regularization parameter for item features
lambda_user = 0.0 # regularization parameter for user features
num_epochs = 20 # number of iterations of ALS

# set random seed (numpy's global state, which init_MF_random draws from)
np.random.seed(988)

# initialize matrices W and Z
item_features, user_features = init_MF_random(ratings, num_features)

# find the non-zero ratings indices in the training set
# (nz_train is a list of (row, col) index pairs into ratings)
nz_row, nz_col = ratings.nonzero()
nz_train = list(zip(nz_row, nz_col))

# initialize the 1-D buffers used to compute the training error, one slot per
# non-zero rating; compute_ALS fills them in place
train_label = np.zeros(len(nz_train))
prediction_label = np.zeros(len(nz_train))

# initialize accumulator for RMSE of every iteration
# (compute_ALS writes entry `it` after each epoch)
rmse_train = np.zeros(num_epochs)

In [None]:
def compute_ALS(ratings, item_features, user_features, num_features, num_epochs):
    """
        Run num_epochs rounds of alternating least squares and return the factors.

        Each epoch solves two least-squares problems in closed form:
            Z = X^T W (W^T W + lambda_user * I)^-1
            W = X   Z (Z^T Z + lambda_item * I)^-1
        Every entry of `ratings` takes part in the fit (zeros included); only
        the reported training error is restricted to the entries in nz_train.

        Arguments:
            ratings: matrix X with one row per row of item_features and one
                     column per row of user_features; it must support .T,
                     .dot(dense_array) and scalar indexing ratings[row, col]
            item_features: starting matrix W, shape (ratings.shape[0], num_features)
            user_features: starting matrix Z; kept for interface compatibility,
                           its incoming values are never read because the first
                           update recomputes Z from W
            num_features: number of latent features (columns of W and Z)
            num_epochs: number of ALS iterations to run

        Module-level names used:
            read:    lambda_user, lambda_item, nz_train, calculate_mse
            written: train_label, prediction_label, rmse_train (filled in place)

        Returned value(s):
            item_features: matrix W after the last epoch
            user_features: matrix Z after the last epoch
            The updates only rebind the local names, so the caller must use the
            returned pair to get the trained factors.

        Raises:
            numpy.linalg.LinAlgError: if a regularized Gram matrix is singular.
    """
    identity = np.identity(num_features)
    n_obs = len(nz_train)

    # Observed (row, col) pairs as index arrays for the vectorized prediction;
    # the reshape keeps an empty nz_train two-dimensional.
    nz_index = np.asarray(nz_train, dtype=int).reshape(-1, 2)
    nz_rows, nz_cols = nz_index[:, 0], nz_index[:, 1]

    # The observed ratings do not change between epochs, so read them once.
    for i, (row, col) in enumerate(nz_train):
        train_label[i] = ratings[row, col]

    for it in range(num_epochs):
        begin = datetime.datetime.now() # start time measurement

        # ratings is kept as the left operand of each product so that the
        # multiplication is dispatched by ratings' own dot method (dense arrays
        # and scipy.sparse matrices both provide one). np.linalg.solve replaces
        # the explicit inverse: same solution, better conditioned.
        gram_items = item_features.T.dot(item_features) + lambda_user * identity
        rhs_users = np.asarray(ratings.T.dot(item_features))
        user_features = np.linalg.solve(gram_items, rhs_users.T).T

        gram_users = user_features.T.dot(user_features) + lambda_item * identity
        rhs_items = np.asarray(ratings.dot(user_features))
        item_features = np.linalg.solve(gram_users, rhs_items.T).T

        end = datetime.datetime.now() # stop time measurement

        # predicted rating for every observed pair: row-wise dot product W[row] . Z[col]
        prediction_label[:n_obs] = np.einsum(
            "ij,ij->i", item_features[nz_rows], user_features[nz_cols])

        # store the training error of the current iteration
        # NOTE(review): calculate_mse is defined outside this notebook; its
        # result is stored and printed as "RMSE" — confirm the helper really
        # returns a root-mean-square error.
        rmse_train[it] = calculate_mse(train_label, prediction_label)
        print("Epoch: {}, RMSE on training set: {}".format(it, rmse_train[it]))

        # compute the time of the iteration (factor updates only)
        execution_time = (end - begin).total_seconds()
        print("Execution time:", execution_time)

        print("*" * 50)

    return item_features, user_features

In [None]:
# Keep the factors that compute_ALS hands back so the following cells work with
# the trained matrices; a None return leaves the current factors untouched.
als_factors = compute_ALS(ratings, item_features, user_features, num_features, num_epochs)
if als_factors is not None:
    item_features, user_features = als_factors

In [None]:
# Training error stored in rmse_train, one value per ALS epoch.
fig, ax = plt.subplots()
ax.plot(rmse_train)
ax.set_xlabel("Epoch")
ax.set_ylabel("Training error")
ax.set_title("ALS training error per epoch")
plt.show()

In [None]:
# Hand the factor matrices to create_csv_submission (not defined in this
# notebook; presumably provided by the star import of helpers).
# NOTE(review): the call passes user_features first and item_features second —
# confirm this matches the helper's parameter order. It reads the module-level
# user_features / item_features, so make sure they hold the trained factors
# rather than the random initialization before exporting.
create_csv_submission(user_features, item_features)