In [18]:
# Useful starting lines
%matplotlib inline

import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
from helpers import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from helpers import load_data, preprocess_data

path_dataset = "data_train.csv"
ratings = load_data(path_dataset)
path_test="sampleSubmission.csv"
test_submission=load_data(path_test)

number of items: 10000, number of users: 1000
number of items: 10000, number of users: 1000


## Split data


In [5]:
def split_data(ratings, num_items_per_user, num_users_per_item,
               min_num_ratings, p_test=0.1):
    """split the ratings to training data and test data.
    Args:
        min_num_ratings: 
            all users and items we keep must have at least min_num_ratings per user and per item. 
    """
    # set seed
    np.random.seed(2019)
    
    # select user and item based on the condition.
    valid_users = np.where(num_items_per_user >= min_num_ratings)[0]
    valid_items = np.where(num_users_per_item >= min_num_ratings)[0]
    valid_ratings = ratings[valid_items, :][: , valid_users]  
    
    # init
    num_rows, num_cols = valid_ratings.shape
    train = sp.lil_matrix((num_rows, num_cols))
    test = sp.lil_matrix((num_rows, num_cols))
    
    print("the shape of original ratings. (# of row, # of col): {}".format(
        ratings.shape))
    print("the shape of valid ratings. (# of row, # of col): {}".format(
        (num_rows, num_cols)))

    nz_items, nz_users = valid_ratings.nonzero()
   
    # split the data
    for user in set(nz_users):
        # randomly select a subset of ratings
        row, col = valid_ratings[:, user].nonzero()
        selects = np.random.choice(row, size=int(len(row) * p_test))
        residual = list(set(row) - set(selects))

        # add to train set
        train[residual, user] = valid_ratings[residual, user]

        # add to test set
        test[selects, user] = valid_ratings[selects, user]

    print("Total number of nonzero elements in origial data:{v}".format(v=ratings.nnz))
    print("Total number of nonzero elements in train data:{v}".format(v=train.nnz))
    print("Total number of nonzero elements in test data:{v}".format(v=test.nnz))
    return valid_ratings, train, test

## Global mean

In [7]:
from helpers import calculate_mse

def baseline_global_mean(train):
    """baseline method: use the global mean."""
    # find the non zero ratings in the train
    nonzero_train = train[train.nonzero()]

    # calculate the global mean
    global_mean_train = nonzero_train.mean()
    # predict the ratings as global mean
    mse = calculate_mse(nonzero_train.todense(), global_mean_train)
    rmse = np.sqrt(1.0 * mse / nonzero_train.todense().shape[1])
    print("test RMSE of baseline using the global mean: {v}.".format(v=rmse))
    return global_mean_train
global_mean_train=baseline_global_mean(ratings)


test RMSE of baseline using the global mean: [[1.11905673]].


In [11]:
num_items, num_users = test_submission.shape
prediction_globalmean=np.array([])
for item_user in range(num_users):
    test=test_submission[:,item_user]
    row,col=test.nonzero()
    prediction_globalmean=np.append(prediction_globalmean,global_mean_train*np.ones((1,len(row))))


In [15]:
from helpers import *
OUTPUT_PATH = 'global_mean.csv' 

prediction_globalmean=np.round(prediction_globalmean.flatten())


submission(path_test,prediction_globalmean,OUTPUT_PATH)

## User mean

In [16]:
def baseline_user_mean(ratings,test_submission):
    """baseline method: use the user means as the prediction."""
    mse = 0
    num_items, num_users = ratings.shape
    prediction=np.array([])
    for user_index in range(num_users):
        # find the non-zero ratings for each user in the training dataset
        train_ratings = ratings[:, user_index]
        nonzeros_train_ratings = train_ratings[train_ratings.nonzero()]
        
        # calculate the mean if the number of elements is not 0
        if nonzeros_train_ratings.shape[0] != 0:
            user_train_mean = nonzeros_train_ratings.mean()
        else:
            continue
        
        # find the non-zero ratings for each user in the test dataset
        test_ratings = test_submission[:, user_index]
       
       
        prediction=np.append(prediction,user_train_mean*np.ones(test_ratings.nnz))
       
    return prediction
prediction_usermean=baseline_user_mean(ratings, test_submission)

In [18]:
OUTPUT_PATH = 'user_mean.csv' 

prediction_usermean=np.round(prediction_usermean.flatten())


submission(path_test,prediction_usermean,OUTPUT_PATH)

## Item mean

In [19]:
def baseline_item_mean(train):
    """baseline method: use item means as the prediction."""
    mse = 0
    num_items, num_users = train.shape
    a=np.array([])
    for item_index in range(num_items):
        # find the non-zero ratings for each item in the training dataset
        train_ratings = train[item_index, :]
        nonzeros_train_ratings = train_ratings[train_ratings.nonzero()]
        # calculate the mean if the number of elements is not 0
        if nonzeros_train_ratings.shape[0] != 0:
            item_train_mean = nonzeros_train_ratings.mean()
        else:
            continue
        
        a=np.append(a,item_train_mean)
    return a 
mean_movie=baseline_item_mean(ratings)

In [27]:
num_items, num_users = test_submission.shape
prediction_item=np.array([])
for item_user in range(num_users):
    test=test_submission[:,item_user]
    row,col=test.nonzero()
    prediction_item=np.append(prediction_item,mean_movie[row])

In [30]:
OUTPUT_PATH = 'item_mean.csv' 

prediction_item=np.round(prediction_item.flatten())


submission(path_test,prediction_item,OUTPUT_PATH)

## Matrix Factorization using SGD

In [7]:
def init_MF(train, num_features):
    """init the parameter for matrix factorization."""
   
    #  return:
    #     user_features: shape = num_features, num_user
    #     item_features: shape = num_features, num_item
    # ***************************************************
    num_item, num_user = train.get_shape()

    user_features = np.random.rand(num_features, num_user)
    item_features = np.random.rand(num_features, num_item)

    # start by item features.
    item_nnz = train.getnnz(axis=1)
    item_sum = train.sum(axis=1)

    for ind in range(num_item):
        item_features[0, ind] = item_sum[ind, 0] / item_nnz[ind]
    return user_features, item_features

In [8]:
def compute_error(data, user_features, item_features, nz):
    """compute the loss (MSE) of the prediction of nonzero elements."""
    # ***************************************************
    # calculate rmse (we only consider nonzero entries.)
    # ***************************************************
    mse = 0
    for row, col in nz:
        item_info = item_features[:, row]
        user_info = user_features[:, col]
        mse += (data[row, col] - user_info.T.dot(item_info)) ** 2
    return np.sqrt(1.0 * mse / len(nz))

In [30]:
def matrix_factorization_SGD(train, test, gamma, num_features, num_epochs,lambda_item, lambda_user):
    """matrix factorization by SGD.
    num_features  K in the lecture notes(number of features for our matrices)
    num_epochs      number of full passes through the train set
    lambda_user , lambda_item : regularization parameters
    """
    
    errors = [0]
    
    # set seed
    np.random.seed(2019)

    # init matrix
    user_features, item_features = init_MF(train, num_features)
    
    # find the non-zero ratings indices 
    nz_row, nz_col = train.nonzero()
    nz_train = list(zip(nz_row, nz_col))
    nz_row, nz_col = test.nonzero()
    nz_test = list(zip(nz_row, nz_col))

    print("learn the matrix factorization using SGD...")
    for it in range(num_epochs):        
        # shuffle the training rating indices
        np.random.shuffle(nz_train)
        
        # decrease step size
        gamma /= 1.2
        
        for d, n in nz_train:
            # update W_d (item_features[:, d]) and Z_n (user_features[:, n])
            item_info = item_features[:, d]
            user_info = user_features[:, n]
            err = train[d, n] - user_info.T.dot(item_info)
    
            # calculate the gradient and update
            item_features[:, d] += gamma * (err * user_info - lambda_item * item_info)
            user_features[:, n] += gamma * (err * item_info - lambda_user * user_info)

        rmse = compute_error(train, user_features, item_features, nz_train)
        print("iter: {}, RMSE on training set: {}.".format(it, rmse))
        # evaluate the test error
        rmse = compute_error(test, user_features, item_features, nz_test)
        print("RMSE on test data: {}.".format(rmse))

        errors.append(rmse)
    return errors,item_features,user_features


In [31]:
def hyper_par(train, test, gammas, Ks, lambdas_user, lambdas_item, num_epochs):
    '''
    This method allow us to get the best hyper parameters
    :return: the best combination of parameters with the lowest rmse
    '''
    errors = {}
    for g in gammas:
        for k in Ks:
            for lambda_user in lambdas_user:
                for lambda_item in lambdas_item:
                    error, w, z = matrix_factorization_SGD(train, test, g, k, num_epochs,lambda_item,lambda_user)
                    error_min = min(error[1:]) #because error[0]=0
                    errors[(g, k, error.index(error_min), lambda_user, lambda_item)] = error_min
    (gamma_final, k_final, num_epochs_final, lambda_user, lambda_item) = min(errors, key=errors.get)  #to get the best combination
    return gamma_final, k_final, num_epochs_final, lambda_user, lambda_item

In [32]:
def train_sgd_grid_search(ratings,gammas, Ks, num_epochs, lambdas_user, lambdas_item):
    '''This method does the grid search on the parameters given and then generates a submission for the best combination,
    using all the methods described before
    '''
    import time
    start_time = time.time()
    

    # do statistics.
    num_items_per_user = np.array((ratings != 0).sum(axis=0)).flatten()
    num_users_per_item = np.array((ratings != 0).sum(axis=1).T).flatten()

    valid_ratings, train, test = split_data(
        ratings, num_items_per_user, num_users_per_item, min_num_ratings=0, p_test=0.1)
    print("--- %s seconds ---" % (time.time() - start_time))

    # Grid search
    gamma_final, k_final, num_epochs_final, lambda_user, lambda_item = hyper_par(train, test, gammas, Ks, lambdas_user,lambdas_item,num_epochs)
    print(gamma_final)
    print(k_final)
    print(num_epochs_final)
    print(lambda_item)
    print(lambda_user)
    errors, item_features,user_features = matrix_factorization_SGD(train, test, gamma=gamma_final, num_features=k_final,
                                              num_epochs=num_epochs_final, lambda_item=lambda_item,
                                              lambda_user=lambda_user)
    print("--- %s seconds ---" % (time.time() - start_time))

In [42]:
gammas=[0.001,0.005,0.009,0.05]
Ks=[5,15,25,30,35,40]
num_epochs=2
lambdas_user=[0.01,0.02,0.015,0.005]
lambdas_item=[0.015,0.24,0.1,0.3,0.4]

train_sgd_grid_search(ratings,gammas, Ks, num_epochs, lambdas_user, lambdas_item)

the shape of original ratings. (# of row, # of col): (10000, 1000)
the shape of valid ratings. (# of row, # of col): (10000, 1000)
Total number of nonzero elements in origial data:1176952
Total number of nonzero elements in train data:1065324
Total number of nonzero elements in test data:111628
--- 89.67246150970459 seconds ---
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: 1.0337956959632955.
RMSE on test data: 1.0437305902135077.
iter: 1, RMSE on training set: 1.0211454727504135.
RMSE on test data: 1.0320494011588743.
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: 1.0364825180948865.
RMSE on test data: 1.046409051437782.
iter: 1, RMSE on training set: 1.0268404643469666.
RMSE on test data: 1.0377612340831752.
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: 1.0345749904315813.
RMSE on test data: 1.0445096191489966.
iter: 1, RMSE on training set: 1.022656783200728.
RMSE on test data: 1.0335743055797302.




iter: 0, RMSE on training set: nan.
RMSE on test data: nan.
iter: 1, RMSE on training set: nan.
RMSE on test data: nan.
learn the matrix factorization using SGD...




iter: 0, RMSE on training set: nan.
RMSE on test data: nan.
iter: 1, RMSE on training set: nan.
RMSE on test data: nan.
learn the matrix factorization using SGD...




iter: 0, RMSE on training set: nan.
RMSE on test data: nan.
iter: 1, RMSE on training set: nan.
RMSE on test data: nan.
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: nan.
RMSE on test data: nan.
iter: 1, RMSE on training set: nan.
RMSE on test data: nan.
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: nan.
RMSE on test data: nan.
iter: 1, RMSE on training set: nan.
RMSE on test data: nan.
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: nan.
RMSE on test data: nan.
iter: 1, RMSE on training set: nan.
RMSE on test data: nan.
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: nan.
RMSE on test data: nan.
iter: 1, RMSE on training set: nan.
RMSE on test data: nan.
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: nan.
RMSE on test data: nan.
iter: 1, RMSE on training set: nan.
RMSE on test data: nan.
learn the matrix factorization using SGD...
iter: 0, RMSE on

In [43]:
import time
start_time=time.time()

num_items_per_user = np.array((ratings != 0).sum(axis=0)).flatten()
num_users_per_item = np.array((ratings != 0).sum(axis=1).T).flatten()

valid_ratings, train, test = split_data(
        ratings, num_items_per_user, num_users_per_item, min_num_ratings=0, p_test=0.1)
print("--- %s seconds ---" % (time.time() - start_time))
errors, item_features,user_features = matrix_factorization_SGD(train, test, gamma=0.005, num_features=5,
                                              num_epochs=100, lambda_item=0.015,
                                              lambda_user=0.02)

the shape of original ratings. (# of row, # of col): (10000, 1000)
the shape of valid ratings. (# of row, # of col): (10000, 1000)
Total number of nonzero elements in origial data:1176952
Total number of nonzero elements in train data:1065324
Total number of nonzero elements in test data:111628
--- 127.09667897224426 seconds ---
learn the matrix factorization using SGD...
iter: 0, RMSE on training set: 1.0177147905905786.
RMSE on test data: 1.0304529392857777.
iter: 1, RMSE on training set: 1.0086326527191274.
RMSE on test data: 1.0223327209887025.
iter: 2, RMSE on training set: 1.0036789192108178.
RMSE on test data: 1.0179152465096732.
iter: 3, RMSE on training set: 0.999991395992896.
RMSE on test data: 1.014302540788575.
iter: 4, RMSE on training set: 0.9981115929210609.
RMSE on test data: 1.0129684490644058.
iter: 5, RMSE on training set: 0.995478947463597.
RMSE on test data: 1.0105147403761665.
iter: 6, RMSE on training set: 0.9940271896241553.
RMSE on test data: 1.0094769633381706

In [44]:
def prediction(test,item_features,user_features):
    nz_row, nz_col = test.T.shape
    A=item_features.T.dot(user_features)
    prediction=np.array([])
    for user in range(nz_row):
        if user%100==0:
            print(user)
        test_ratings = test[:, user]
        item=test_ratings.nonzero()
        prediction=np.append(prediction,A[item[0],user])
    return prediction
prediction_sgd=prediction(test_submission,item_features,user_features)



0
100
200
300
400
500
600
700
800
900


In [45]:
from helpers import *
OUTPUT_PATH = 'SGD_best_combination2.csv' 

prediction_sgd=np.round(prediction_sgd.flatten())
a=np.where(prediction_sgd>5)[0]
prediction_sgd[a]=5

submission(path_test,prediction_sgd,OUTPUT_PATH)

## ALS

In [38]:
def update_user_feature(
        train, item_features, lambda_user,
        nnz_items_per_user, nz_user_itemindices):
    """update user feature matrix."""
    """the best lambda is assumed to be nnz_items_per_user[user] * lambda_user"""
    num_user = nnz_items_per_user.shape[0]
    num_feature = item_features.shape[0]
    lambda_I = lambda_user * sp.eye(num_feature)
    updated_user_features = np.zeros((num_feature, num_user))

    for user, items in nz_user_itemindices:
        # extract the columns corresponding to the prediction for given item
        M = item_features[:, items]
        
        # update column row of user features
        V = M @ train[items, user]
        A = M @ M.T + nnz_items_per_user[user] * lambda_I
        X = np.linalg.solve(A, V)
        updated_user_features[:, user] = np.copy(X.T)
    return updated_user_features

def update_item_feature(
        train, user_features, lambda_item,
        nnz_users_per_item, nz_item_userindices):
    """update item feature matrix."""
    """the best lambda is assumed to be nnz_items_per_item[item] * lambda_item"""
    num_item = nnz_users_per_item.shape[0]
    num_feature = user_features.shape[0]
    lambda_I = lambda_item * sp.eye(num_feature)
    updated_item_features = np.zeros((num_feature, num_item))

    for item, users in nz_item_userindices:
        # extract the columns corresponding to the prediction for given user
        M = user_features[:, users]
        V = M @ train[item, users].T
        A = M @ M.T + nnz_users_per_item[item] * lambda_I
        X = np.linalg.solve(A, V)
        updated_item_features[:, item] = np.copy(X.T)
    return updated_item_features

In [39]:
from helpers import build_index_groups


def ALS(train):
    """Alternating Least Squares (ALS) algorithm."""
    # define parameters
    num_features = 20   # K in the lecture notes
    lambda_user = 0.1
    lambda_item = 0.7
    stop_criterion = 1e-6
    change = 1
    error_list = [0, 0]
    
    # set seed
    np.random.seed(988)

    # init ALS
    user_features, item_features = init_MF(train, num_features)
    
    # get the number of non-zero ratings for each user and item
    nnz_items_per_user, nnz_users_per_item = train.getnnz(axis=0), train.getnnz(axis=1)
    
    # group the indices by row or column index
    nz_train, nz_item_userindices, nz_user_itemindices = build_index_groups(train)

    # run ALS
    print("\nstart the ALS algorithm...")
    while change > stop_criterion:
        # update user feature & item feature
        user_features = update_user_feature(
            train, item_features, lambda_user,
            nnz_items_per_user, nz_user_itemindices)
        item_features = update_item_feature(
            train, user_features, lambda_item,
            nnz_users_per_item, nz_item_userindices)

        error = compute_error(train, user_features, item_features, nz_train)
        print("RMSE on training set: {}.".format(error))
        error_list.append(error)
        change = np.fabs(error_list[-1] - error_list[-2])

    
    return user_features,item_features

In [40]:
user_features_als,item_features_als=ALS(ratings)
prediction_als=prediction(test_submission,item_features_als,user_features_als)


start the ALS algorithm...
RMSE on training set: 2.065063021490569.
RMSE on training set: 1.273463815376106.
RMSE on training set: 1.1449053870831276.
RMSE on training set: 1.0939015808006096.
RMSE on training set: 1.0684238701680566.
RMSE on training set: 1.0539746318308731.
RMSE on training set: 1.0450942951421904.
RMSE on training set: 1.039332330498928.
RMSE on training set: 1.035449129888575.
RMSE on training set: 1.0327601516258906.
RMSE on training set: 1.0308611744052287.
RMSE on training set: 1.0295006923357326.
RMSE on training set: 1.0285156463572684.
RMSE on training set: 1.0277968413905214.
RMSE on training set: 1.0272692744283838.
RMSE on training set: 1.0268804005494088.
RMSE on training set: 1.0265928420890558.
RMSE on training set: 1.0263796971005599.
RMSE on training set: 1.026221429358883.
RMSE on training set: 1.0261037548976675.
RMSE on training set: 1.0260161761245978.
RMSE on training set: 1.0259509481651852.
RMSE on training set: 1.0259023403447758.
RMSE on tra

In [41]:
OUTPUT_PATH = 'ALS_20.csv' 

prediction_als=np.round(prediction_als.flatten())
a=np.where(prediction_als>5)[0]
prediction_als[a]=5

submission(path_test,prediction_als,OUTPUT_PATH)

## Pytorch

In [3]:
import pandas as pd
import torch
from spotlight.interactions import Interactions
from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import rmse_score
torch.set_default_tensor_type('torch.DoubleTensor')

In [15]:
from helpers import *
LOSS = 'regression'  # Our chosen loss
K = 20  # Latent dimension of our matrix factorization
NB_EPOCHS = 30  # Number of times we go through our training set
BATCH_SIZE = 32  # The batch size of our optimization algorithm
L2 = 1e-5  # Our lambda ridge penalization
GAMMA = 1e-4  # Our optimization learning rate

# Loading train data
print("---------LOADING DATA-----------")
raw_data_train = pd.read_csv('data_train.csv')
raw_data_output = pd.read_csv('sampleSubmission.csv')

print("---------PREPROCESSING DATA-----------")
df_input = split_(raw_data_train, column='Id')
df_output = split_(raw_data_output, column='Id')
input_interaction = create_input(df_input['userid'].values,
                                               df_input['movieid'].values,
                                               df_input['rating'].values)

output_interaction = create_input(df_output['userid'].values,
                                                df_output['movieid'].values,
                                                df_output['rating'].values)

print("---------TRAINING THE MODEL-----------")
model = create_model(loss=LOSS, k=K, number_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, l2_penal=L2, gamma=GAMMA)
model = train_model(model, input_interaction)
print("---------CREATING THE SUBMISSION-----------")
df_submission = create_output_df(y_predictions=predict_output(model, output_interaction), test_df=raw_data_output)
create_submission_pd(df_submission)


---------LOADING DATA-----------
---------PREPROCESSING DATA-----------
---------TRAINING THE MODEL-----------
Epoch 0: loss 11.808978693339691
Epoch 1: loss 2.221023114873733
Epoch 2: loss 1.0749866440719946
Epoch 3: loss 1.014201435913752
Epoch 4: loss 1.0036618863256996
Epoch 5: loss 1.000469624169046
Epoch 6: loss 0.9990616741358778
Epoch 7: loss 0.9976488795650446
Epoch 8: loss 0.9965754535949215
Epoch 9: loss 0.995116572694359
Epoch 10: loss 0.9937046261423974
Epoch 11: loss 0.9917810907779826
Epoch 12: loss 0.9894441019312613
Epoch 13: loss 0.9870301735669972
Epoch 14: loss 0.9839572431582418
Epoch 15: loss 0.9809411778465155
Epoch 16: loss 0.9776719151656257
Epoch 17: loss 0.9741851131667383
Epoch 18: loss 0.9706778684514153
Epoch 19: loss 0.9672847675307024
Epoch 20: loss 0.9641072391291098
Epoch 21: loss 0.9607400285842037
Epoch 22: loss 0.9570136881005097
Epoch 23: loss 0.9534914778277003
Epoch 24: loss 0.9492016573743824
Epoch 25: loss 0.9450947575104222
Epoch 26: loss 0.94

In [7]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
import pandas as pd
from surprise import Reader
from helpers import *

# Load the data.
print("---------LOADING DATA-----------")
data_train = pd.read_csv('data_train.csv')
data_output = pd.read_csv('sampleSubmission.csv')

print("---------PREPROCESSING DATA-----------")
df_input = split_(data_train, column='Id')
df_output = split_(data_output, column='Id')
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_input[['userid', 'movieid', 'rating']], reader)
# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


---------LOADING DATA-----------
---------PREPROCESSING DATA-----------
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0277  1.0284  1.0277  1.0292  1.0238  1.0274  0.0019  
MAE (testset)     0.8197  0.8203  0.8204  0.8219  0.8180  0.8201  0.0013  
Fit time          91.58   91.77   94.90   92.62   92.40   92.65   1.19    
Test time         3.77    3.92    4.61    3.71    3.56    3.92    0.37    


{'test_rmse': array([1.02774782, 1.02836132, 1.02767626, 1.02919419, 1.02378757]),
 'test_mae': array([0.81973704, 0.82026217, 0.82037513, 0.82191   , 0.81800344]),
 'fit_time': (91.57523536682129,
  91.76902413368225,
  94.89803957939148,
  92.61508774757385,
  92.40403032302856),
 'test_time': (3.771996021270752,
  3.9249796867370605,
  4.612553119659424,
  3.705843925476074,
  3.56200909614563)}

In [15]:
from surprise.model_selection import GridSearchCV
data = Dataset.load_from_df(df_input[['userid', 'movieid', 'rating']], reader)
param_grid = {'n_epochs': [5,10], 'lr_all': [0.005,0.009],
              'reg_all': [0.4,0.2]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "c:\users\lamyae\appdata\local\continuum\anaconda3\envs\projectml2\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-fd44323cb2b8>", line 7, in <module>
    gs.fit(data)
  File "c:\users\lamyae\appdata\local\continuum\anaconda3\envs\projectml2\lib\site-packages\surprise\model_selection\search.py", line 90, in fit
    verbose=self.joblib_verbose)(delayed_list)
  File "c:\users\lamyae\appdata\local\continuum\anaconda3\envs\projectml2\lib\site-packages\joblib\parallel.py", line 1007, in __call__
    while self.dispatch_one_batch(iterator):
  File "c:\users\lamyae\appdata\local\continuum\anaconda3\envs\projectml2\lib\site-packages\joblib\parallel.py", line 835, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\users\lamyae\appdata\local\continuum\anaconda3\envs\projectml2\lib\site-packages\joblib\parallel.py", line 754, in _dispatch
  

KeyboardInterrupt: 

In [18]:
from surprise import SVDpp , SlopeOne ,NMF , NormalPredictor , KNNBaseline , KNNBasic , KNNWithMeans ,KNNWithZScore , BaselineOnly , CoClustering
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    print("_____step1____")
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')  

_____step1____
_____step1____
_____step1____
_____step1____
_____step1____
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
_____step1____
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
_____step1____
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
_____step1____
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing sim

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.999947,3.83272,6.362024
SlopeOne,1.001506,6.460053,60.607916
KNNBaseline,1.01197,120.580732,847.633581
CoClustering,1.013733,29.899863,6.54197
NMF,1.014375,77.733584,5.737661
KNNWithMeans,1.024882,111.466288,792.832491
SVDpp,1.024894,1674.849502,69.05394
KNNWithZScore,1.026487,115.130776,851.329665
KNNBasic,1.031897,100.597629,715.553122
SVD,1.032499,74.655963,7.463581


In [5]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
import pandas as pd
from surprise import Reader
from helpers import *
from surprise import SVDpp , SlopeOne ,NMF , NormalPredictor , KNNBaseline , KNNBasic , KNNWithMeans ,KNNWithZScore , BaselineOnly , CoClustering
print("---------LOADING DATA-----------")
data_train = pd.read_csv('data_train.csv')
data_output = pd.read_csv('sampleSubmission.csv')

print("---------PREPROCESSING DATA-----------")
df_input = split_(data_train, column='Id')
df_output = split_(data_output, column='Id')
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_input[['userid', 'movieid', 'rating']], reader)
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

---------LOADING DATA-----------
---------PREPROCESSING DATA-----------
Using ALS
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.99966153, 1.00081319, 1.00009591]),
 'fit_time': (2.2487738132476807, 3.047178030014038, 2.9119856357574463),
 'test_time': (5.219197988510132, 4.664770603179932, 5.323895454406738)}