In [None]:
%load_ext autoreload
%autoreload 2

# Directory path handed to every model below through its `prediction_path` argument.
PREDICTION_PATH = '../data/predictions/'

#### Load data

In [None]:
from datafile_methods.data_io import load_datasets, load_datasets_sur

In [None]:
# Load datasets: `folds` feeds cross_validation(), while `ratings` and
# `sample_submission` are passed to each model for its final (non-CV) run.
folds, ratings, sample_submission = load_datasets()

In [None]:
# Data for the Surprise models: `folds_tr` / `folds_te` are consumed pairwise by
# cross_validation_sur(); `ratings_sur` and `sample_submission_sur` are passed to
# each Surprise model for its final (non-CV) run.
folds_tr, folds_te, ratings_sur, sample_submission_sur = load_datasets_sur()

#### Cross-validation helpers (TODO: move these to a different file)
Functions to carry out the cross-validation

In [None]:
import numpy as np
from functools import reduce

def cross_validation(folds, prediction_model, args):
    """Run k-fold cross-validation of `prediction_model` over `folds`.

    Each fold is held out once as the test set while all remaining folds are
    combined with `+` (in their original order) to form the training set.

    Args:
        folds: sequence of folds (list, tuple, ...). The folds must support
            `+` with one another. The sequence itself is never modified.
        prediction_model: callable invoked as
            `prediction_model(train, test, True, **args)`; it must return a
            `(train_error, test_error)` pair.
        args: dict of extra keyword arguments forwarded to `prediction_model`.

    Returns:
        Two numpy arrays of length `len(folds)` holding the train and test
        errors of every fold, respectively.

    Raises:
        ValueError: if exactly one fold is given, because no training data
            would be left once that fold is held out.
    """
    n_folds = len(folds)
    if n_folds == 1:
        raise ValueError("cross_validation needs at least two folds to build a training set")

    train_rmse = np.zeros(n_folds)
    test_rmse = np.zeros(n_folds)
    for i, test in enumerate(folds):
        # Combine every fold except the held-out one; iterating instead of
        # copy()/pop() keeps the caller's sequence intact and works for tuples too.
        train = reduce(lambda x, y: x + y,
                       (fold for j, fold in enumerate(folds) if j != i))
        train_rmse[i], test_rmse[i] = prediction_model(train, test, True, **args)
    return train_rmse, test_rmse

In [None]:
def cross_validation_sur(folds_tr, folds_te, prediction_model, args):
    """Evaluate `prediction_model` on pre-split train/test fold pairs.

    Fold `i` of `folds_tr` is paired with fold `i` of `folds_te`, and the
    model is called as `prediction_model(train, test, True, **args)`.

    Returns two numpy arrays (one entry per fold in `folds_tr`) with the
    train and test errors respectively.
    """
    n_folds = len(folds_tr)
    train_rmse, test_rmse = np.zeros(n_folds), np.zeros(n_folds)
    for fold_idx in range(n_folds):
        fold_errors = prediction_model(folds_tr[fold_idx], folds_te[fold_idx], True, **args)
        train_rmse[fold_idx], test_rmse[fold_idx] = fold_errors
    return train_rmse, test_rmse

#### Define a dictionary to keep the train and test errors

In [None]:
# Maps a model name to [train_rmse, test_rmse], the two per-fold error arrays
# returned by the cross-validation helpers.
errors = {}

#### Train models

Baselines

In [None]:
from prediction_methods.baseline_model import model_baseline

In [None]:
# 5-fold cross-validation
train_rmse, test_rmse = cross_validation(folds, model_baseline, {'prediction_path': PREDICTION_PATH})
# Keep the per-fold train/test errors under this model's name
errors['baseline'] = [train_rmse, test_rmse]

In [None]:
# Train on entire training set. Predict for `submission` dataset
# (third argument is False here; cross_validation passes True instead)
model_baseline(ratings, sample_submission, False, prediction_path=PREDICTION_PATH)

Matrix factorization with ALS

In [None]:
from prediction_methods.mf_als_model import model_mf_als

# Keyword arguments forwarded to model_mf_als via **args
# NOTE(review): presumably `k` is the number of latent factors and
# `lambda_u` / `lambda_i` the user/item regularization — confirm in mf_als_model
args = {'prediction_path': PREDICTION_PATH,
        'k': 20,
        'lambda_u': .1,
        'lambda_i': .1,
        'tol': 1e-6,
        'max_iter': 100}

# 5-fold cross-validation
train_rmse, test_rmse = cross_validation(folds, model_mf_als, args)
# Keep the per-fold train/test errors under this model's name
errors['mf_als'] = [train_rmse, test_rmse]

In [None]:
# Train on entire training set. Predict for `submission` dataset
# (reuses the `args` dict defined in the ALS cross-validation cell)
model_mf_als(ratings, sample_submission, False, **args)

# Before refactoring achieved 0.9878 in training
# Other good parameter combinations are (k=20, lambdas=0.095) and (k=30, lambdas=0.095)

Matrix factorization with SVD (scipy)

In [None]:
from prediction_methods.mf_svd_model import model_mf_svd

# Keyword arguments forwarded to model_mf_svd via **args
# NOTE(review): 'fn_suffix' presumably tags the output of this variant so it does
# not clash with the sklearn variant below — confirm in mf_svd_model
args = {'prediction_path': PREDICTION_PATH,
        'k': 16,
        'library': 'scipy',
        'fn_suffix': 'sci_'}

# 5-fold cross-validation
train_rmse, test_rmse = cross_validation(folds, model_mf_svd, args)
# Keep the per-fold train/test errors under this model's name
errors['mf_svd_sci'] = [train_rmse, test_rmse]

In [None]:
# Train on entire training set. Predict for `submission` dataset
# (reuses the `args` dict defined in the scipy SVD cross-validation cell)
model_mf_svd(ratings, sample_submission, False, **args)

# Before refactoring achieved 0.99385 on test. It's better to average it with the next ones!

Matrix factorization with SVD (sklearn)

In [None]:
# Keyword arguments forwarded to model_mf_svd via **args
# (model_mf_svd itself was imported in the scipy SVD cell above)
args = {'prediction_path': PREDICTION_PATH,
        'k': 12,
        'n_iter': 50,
        'library': 'sklearn',
        'random_state': 70,
        'fn_suffix': 'skl_'}

# 5-fold cross-validation
train_rmse, test_rmse = cross_validation(folds, model_mf_svd, args)
# Keep the per-fold train/test errors under this model's name
errors['mf_svd_skl'] = [train_rmse, test_rmse]

In [None]:
# Train on entire training set. Predict for `submission` dataset
# (reuses the `args` dict defined in the sklearn SVD cross-validation cell)
model_mf_svd(ratings, sample_submission, False, **args)

# Before refactoring achieved 0.99386 (approx) on test
# Achieves 0.99082 in Kaggle

Recommend's ALS model

In [None]:
from prediction_methods.recommend_model import model_mf_als_recommend

# NOTE(review): assumes `ratings` is laid out as (items, users) — confirm against load_datasets
n_item, n_user = ratings.shape
# Keyword arguments forwarded to model_mf_als_recommend via **args
args = {'n_user': n_user,
        'n_item': n_item,
        'prediction_path': PREDICTION_PATH,
        'k': 20,
        'n_iter': 50,
        'reg': 85e-3}

# 5-fold cross-validation
train_rmse, test_rmse = cross_validation(folds, model_mf_als_recommend, args)
# Keep the per-fold train/test errors under this model's name
errors['mf_als_recommend'] = [train_rmse, test_rmse]

In [None]:
# Train on entire training set. Predict for `submission` dataset
# (reuses the `args` dict defined in the cross-validation cell above)
model_mf_als_recommend(ratings, sample_submission, False, **args)

# Recorded results:
#   Test RMSE of model_mf_als_recommend: 0.9975413590062877
#   0.98585 on Kaggle

Surprise models

* SlopeOne

In [None]:
from prediction_methods.surprise_models import model_slope_one

In [None]:
# Keyword arguments forwarded to model_slope_one via **args
args = {'prediction_path': PREDICTION_PATH}

# 5-fold cross-validation
# (cross_validation_sur takes the train folds and the test folds as two separate arguments)
train_rmse, test_rmse = cross_validation_sur(folds_tr, folds_te, model_slope_one, args)
# Keep the per-fold train/test errors under this model's name
errors['slope_one'] = [train_rmse, test_rmse]

In [None]:
# Same call pattern as the non-Surprise models above, but on `ratings_sur` /
# `sample_submission_sur`; the third argument is False (cross-validation passes True).
model_slope_one(ratings_sur, sample_submission_sur, False, **args)

* CoClustering

In [None]:
from prediction_methods.surprise_models import model_co_clustering

In [None]:
# Keyword arguments forwarded to model_co_clustering via **args
# NOTE(review): these key names end in 'd' ('n_cltr_ud', 'n_cltr_id', 'n_epochsd'),
# whereas Surprise's CoClustering uses n_cltr_u / n_cltr_i / n_epochs — confirm
# they match the signature of model_co_clustering.
args = {'prediction_path': PREDICTION_PATH,
        'n_cltr_ud': 75,
        'n_cltr_id': 3,
        'n_epochsd': 100}

# 5-fold cross-validation
# cross_validation_sur expects the train folds and the test folds as two separate
# arguments; the previously used `folds_sur` is not defined anywhere in this notebook.
train_rmse, test_rmse = cross_validation_sur(folds_tr, folds_te, model_co_clustering, args)
# Keep the per-fold train/test errors under this model's name
errors['co_clustering'] = [train_rmse, test_rmse]

In [None]:
# Same call pattern as the non-Surprise models above, but on `ratings_sur` /
# `sample_submission_sur`; the third argument is False (cross-validation passes True).
model_co_clustering(ratings_sur, sample_submission_sur, False, **args)

* KNN Baseline (user based)

In [None]:
from prediction_methods.surprise_models import model_knn_baseline

In [None]:
# Keyword arguments forwarded to model_knn_baseline via **args (user-based variant)
args = {'prediction_path': PREDICTION_PATH,
        'k': 300,
        'min_k': 20,
        'name': 'pearson_baseline',
        'user_based': True,
        'fn_suffix': 'u_'}

# 5-fold cross-validation
# cross_validation_sur expects the train folds and the test folds as two separate
# arguments; the previously used `folds_sur` is not defined anywhere in this notebook.
train_rmse, test_rmse = cross_validation_sur(folds_tr, folds_te, model_knn_baseline, args)
# Keep the per-fold train/test errors under this model's name
errors['knn_baseline_u'] = [train_rmse, test_rmse]

In [None]:
# Same call pattern as the non-Surprise models above, but on `ratings_sur` /
# `sample_submission_sur`; the third argument is False (cross-validation passes True).
# Reuses the user-based `args` dict from the previous cell.
model_knn_baseline(ratings_sur, sample_submission_sur, False, **args)

* KNN Baseline (item based)

In [None]:
# Keyword arguments forwarded to model_knn_baseline via **args (item-based variant;
# model_knn_baseline was imported in the user-based section above)
args = {'prediction_path': PREDICTION_PATH,
        'k': 120,
        'min_k': 20,
        'name': 'pearson_baseline',
        'user_based': False,
        'fn_suffix': 'i_'}

# 5-fold cross-validation
# cross_validation_sur expects the train folds and the test folds as two separate
# arguments; the previously used `folds_sur` is not defined anywhere in this notebook.
train_rmse, test_rmse = cross_validation_sur(folds_tr, folds_te, model_knn_baseline, args)
# Keep the per-fold train/test errors under this model's name
errors['knn_baseline_i'] = [train_rmse, test_rmse]

In [None]:
# Same call pattern as the non-Surprise models above, but on `ratings_sur` /
# `sample_submission_sur`; the third argument is False (cross-validation passes True).
# Reuses the item-based `args` dict from the previous cell.
model_knn_baseline(ratings_sur, sample_submission_sur, False, **args)

* NMF

In [None]:
from prediction_methods.surprise_models import model_nmf

In [None]:
# Keyword arguments forwarded to model_nmf via **args
args = {'prediction_path': PREDICTION_PATH,
        'biased': True,
        'k': 18,
        'reg_pu': 0.08,
        'reg_qi': 0.08,
        'reg_bu': 0.055,
        'reg_bi': 0.055,
        'n_epochs': 150}

# 5-fold cross-validation
# cross_validation_sur expects the train folds and the test folds as two separate
# arguments; the previously used `folds_sur` is not defined anywhere in this notebook.
train_rmse, test_rmse = cross_validation_sur(folds_tr, folds_te, model_nmf, args)
# Keep the per-fold train/test errors under this model's name
errors['nmf'] = [train_rmse, test_rmse]

In [None]:
# Same call pattern as the non-Surprise models above, but on `ratings_sur` /
# `sample_submission_sur`; the third argument is False (cross-validation passes True).
model_nmf(ratings_sur, sample_submission_sur, False, **args)

* SVD

In [None]:
from prediction_methods.surprise_models import model_svd

In [None]:
# Keyword arguments forwarded to model_svd via **args
args = {'prediction_path': PREDICTION_PATH,
        'biased': True,
        'k': 130,
        'reg_all': 0.065,
        'n_epochs': 50}

# 5-fold cross-validation
# cross_validation_sur expects the train folds and the test folds as two separate
# arguments; the previously used `folds_sur` is not defined anywhere in this notebook.
train_rmse, test_rmse = cross_validation_sur(folds_tr, folds_te, model_svd, args)
# Keep the per-fold train/test errors under this model's name
errors['sur_svd'] = [train_rmse, test_rmse]

In [None]:
# Same call pattern as the non-Surprise models above, but on `ratings_sur` /
# `sample_submission_sur`; the third argument is False (cross-validation passes True).
model_svd(ratings_sur, sample_submission_sur, False, **args)

#### Combine ratings

Load ratings for test and validation files

In [None]:
#TODO

Do (regularized?) linear regression on the test rating predictions to obtain the best weights

In [None]:
#TODO

Adjust the regularization to get similar errors on test and validation

In [None]:
#TODO
#Also ask a TA if this is ok. Also, maybe I should not be calling the sets `test` and `validation`

Load ratings for submission files

In [None]:
#TODO

Apply previous weights to obtain a final submission

In [None]:
#TODO