In [1]:
%matplotlib inline
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the data

In [2]:
from helpers import load_data, preprocess_data
# Location of the training ratings file, relative to the notebook's working directory.
path_dataset = "Datasets/data_train.csv"
# `ratings` is later indexed as ratings[d, n] and queried with .nonzero() / .getnnz(axis=...),
# so it is presumably a scipy sparse matrix -- TODO confirm against helpers.load_data.
ratings = load_data(path_dataset)

## SGD Matrix Factorization 

In [3]:
from helpers import init_MF, compute_error

### Implementation

In [4]:
def matrix_factorization_SGD(ratings, gamma, num_features, lambda_user, lambda_item, reg_step,
                             num_epochs=30):
    """Matrix factorization by stochastic gradient descent (SGD).

    Every observed (non-zero) entry ratings[d, n] is visited once per epoch, in a
    freshly shuffled order, and the two factor columns it involves are moved along
    the gradient of the regularized squared error.

    Args:
        ratings: 2-D ratings container supporting ``.nonzero()`` and ``ratings[d, n]``.
            Rows are used as item indices and columns as user indices.
        gamma: initial step size. It is divided by ``reg_step`` at the start of
            every epoch, including the first one.
        num_features: number of latent features, forwarded to ``init_MF``.
        lambda_user: L2 regularization weight applied to the user factors.
        lambda_item: L2 regularization weight applied to the item factors.
        reg_step: per-epoch divisor of the step size (values > 1 shrink it).
        num_epochs: number of passes over the observed ratings (default 30,
            the value that used to be hard-coded).

    Returns:
        (user_features, item_features) as produced by ``init_MF`` and updated
        in place; columns are indexed by user and by item respectively.
    """
    # set seed so that the initialization and the shuffling are reproducible
    np.random.seed(1)

    # init matrix
    user_features, item_features = init_MF(ratings, num_features)

    # find the non-zero ratings indices
    nz_row, nz_col = ratings.nonzero()
    nz_ratings = list(zip(nz_row, nz_col))

    for _ in range(num_epochs):
        # shuffle the ratings indices
        np.random.shuffle(nz_ratings)

        # decrease step size
        gamma /= reg_step

        for d, n in nz_ratings:
            # update W_d (item_features[:, d]) and Z_n (user_features[:, n])
            item_info = item_features[:, d]
            user_info = user_features[:, n]
            err = ratings[d, n] - user_info.T.dot(item_info)

            # Evaluate BOTH gradient steps before applying either one. The column
            # slices above alias the factor matrices when those are NumPy arrays,
            # so updating the item column first would silently feed the already
            # updated item vector into the user gradient.
            item_step = gamma * (err * user_info - lambda_item * item_info)
            user_step = gamma * (err * item_info - lambda_user * user_info)
            item_features[:, d] += item_step
            user_features[:, n] += user_step

    return user_features, item_features

### Training

In [5]:
from timeit import default_timer as timer

# The values for the hyperparameters have been chosen through bayesian optimization
# as computed in the bayesian_optimization_baselines.ipynb notebook
start = timer()
user_features_SGD, item_features_SGD = matrix_factorization_SGD(
    ratings,
    gamma=0.008768,
    num_features=50,
    lambda_user=0.0005,
    lambda_item=0.7,
    reg_step=1.1,
)
end = timer()

# report the wall-clock training time, converted from seconds to minutes
print("execution time: {} minutes.".format((end - start)/60))

execution time: 21.68439289166667 minutes.


## ALS Matrix Factorization

In [6]:
from helpers import update_user_feature, update_item_feature
from helpers import build_index_groups

### Implementation

In [7]:
def ALS(ratings, num_features, lambda_user, lambda_item, num_epochs=30):
    """Alternating Least Squares (ALS) algorithm.

    Starting from the factors returned by ``init_MF``, each epoch first
    recomputes the user factors from the current item factors and then the item
    factors from the freshly updated user factors.

    Args:
        ratings: ratings container exposing ``getnnz(axis=...)`` (e.g. a scipy
            sparse matrix). ``getnnz(axis=0)`` is used as the per-user count and
            ``getnnz(axis=1)`` as the per-item count, i.e. columns are treated
            as users and rows as items.
        num_features: number of latent features, forwarded to ``init_MF``.
        lambda_user: regularization weight forwarded to ``update_user_feature``.
        lambda_item: regularization weight forwarded to ``update_item_feature``.
        num_epochs: number of alternating update rounds (default 30, the value
            that used to be hard-coded).

    Returns:
        (user_features, item_features) after ``num_epochs`` rounds.
    """
    # set seed so that the initialization is reproducible
    np.random.seed(1)

    # init ALS
    user_features, item_features = init_MF(ratings, num_features)

    # get the number of non-zero ratings for each user and item
    nnz_items_per_user = ratings.getnnz(axis=0)
    nnz_users_per_item = ratings.getnnz(axis=1)

    # group the indices by row or column index (the flat index list is not needed here)
    _, nz_item_userindices, nz_user_itemindices = build_index_groups(ratings)

    for _ in range(num_epochs):
        # update user feature & item feature; the item update deliberately sees
        # the user factors computed in this same round
        user_features = update_user_feature(
            ratings, item_features, lambda_user,
            nnz_items_per_user, nz_user_itemindices)
        item_features = update_item_feature(
            ratings, user_features, lambda_item,
            nnz_users_per_item, nz_item_userindices)

    return user_features, item_features

### Training

In [9]:
# The values for the hyperparameters have been chosen through bayesian optimization
# as computed in the bayesian_optimization_baselines.ipynb notebook
start = timer()
user_features_ALS, item_features_ALS = ALS(
    ratings,
    num_features=5,
    lambda_user=0.0005,
    lambda_item=0.7,
)
end = timer()

# report the wall-clock training time, converted from seconds to minutes
print("execution time: {} minutes.".format((end - start)/60))

execution time: 14.110866665000003 minutes.


## Create submissions

In [10]:
from helpers import (
    read_txt,
    load_submission_indexes,
    get_coordinate_and_prediction,
    create_submission,
)

# From here on `path_dataset` points at the sample submission, not the training data.
path_dataset = "Datasets/sample_submission.csv"
sample_indexes = load_submission_indexes(path_dataset)

### SGD submission

In [11]:
import csv
import os

# create_submission (helpers) returns the two sequences that are zipped into CSV rows below
pos, predictions = create_submission(user_features_SGD, item_features_SGD, sample_indexes)

# Create the .csv submission file. The output directory is created first so that a
# missing "outputs/" folder cannot raise FileNotFoundError after the long training run.
submission_path = 'outputs/submission_SGD_tuned.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
# newline='' is what the csv module requires for files handed to csv.writer
with open(submission_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(zip(pos, predictions))

### ALS submission

In [12]:
import csv
import os

# create_submission (helpers) returns the two sequences that are zipped into CSV rows below
pos, predictions = create_submission(user_features_ALS, item_features_ALS, sample_indexes)

# Create the .csv submission file. The output directory is created first so that a
# missing "outputs/" folder cannot raise FileNotFoundError after the long training run.
submission_path = 'outputs/submission_ALS_tuned.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
# newline='' is what the csv module requires for files handed to csv.writer
with open(submission_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(zip(pos, predictions))