In [None]:
# Print the execution time of every cell after it runs
# (requires the `autotime` IPython extension to be installed).
%load_ext autotime

# Re-import modified modules before each cell executes, so that edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import scipy.sparse as sp
from datafile_methods.data_io import save_csv
# Relative path of the data directory.
DATA_PATH = '../data/'
# Relative path of the directory holding the per-model prediction CSV files
# that the cells below read with load_data().
PREDICTION_PATH = '../data/predictions/'

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

### Load data

In [None]:
from datafile_methods.data_io import load_datasets
from datafile_methods.data_processing import load_data

# Read the folds, the ratings and the sample submission in one call.
folds, ratings, sample_submission = load_datasets()

# Number of folds; every model has one prediction file per fold.
k_fold = len(folds)

# Names of the base models whose predictions are blended below.
models = [
    'baseline',
    'knn_baseline_i',
    'knn_baseline_u',
    'mf_svd_sci',
    'nmf',
    'slope_one',
    'sur_svd',
    'mf_als_recommend',
    'mf_als',
]

# predictions[j][f] holds what load_data() returns for the `_te_` file of
# model j on fold f (outer list follows `models`, inner list follows the folds).
predictions = [
    [
        load_data('{p}model_{m}_te_{i}.csv'.format(p=PREDICTION_PATH, m=name, i=fold))
        for fold in range(k_fold)
    ]
    for name in models
]

# Same data, addressable by model name.
predictions_dict = {name: per_fold for name, per_fold in zip(models, predictions)}

#### Divide the set into B disjoint subsets: Justification

Support: The support of a data point (u, i) is the
number of votes by user u. The blender can now base
the weighting of predictors on how many
ratings the user has given. RBMs tend to receive
a high weight when the user has only a few votes in the
data. SVDs are highly weighted when much information
about a user is available.

http://elf-project.sourceforge.net/CombiningPredictionsForAccurateRecommenderSystems.pdf

In [None]:
from plots import plot_raw_data
# Plot the raw ratings and keep the two values the helper returns.
# NOTE(review): judging by the names these are per-user and per-item rating
# counts — confirm against plots.plot_raw_data.
num_items_per_user, num_users_per_item = plot_raw_data(ratings)

## Determine a separate blending for each subset

### Option 1: Manually compute weights

In [None]:
from prediction_methods.create_ensemble import create_weighted_ensemble_submission

# Read each model's `_sub` prediction file (in `models` order) and index the
# results by model name.
predictions_sub = [
    load_data('{p}model_{m}_sub.csv'.format(p=PREDICTION_PATH, m=name))
    for name in models
]
predictions_sub_dict = {name: preds for name, preds in zip(models, predictions_sub)}

In [None]:
from prediction_methods.create_ensemble import evaluate_manual_weighted_ensemble
# Evaluate the manually weighted ensemble on the folds.
# NOTE(review): going by the names, the helper returns predictions and combined
# errors separately for the high- and low-support subsets — confirm in
# prediction_methods/create_ensemble.py.
(
    predictions_high_df,
    errors_comb_high,
    predictions_low_df,
    errors_comb_low,
) = evaluate_manual_weighted_ensemble(ratings, folds, predictions_dict)

#### Create ensemble

In [None]:
# Create the submission of the manually weighted ensemble from the per-model
# submission predictions.
# NOTE(review): presumably the output is written under PREDICTION_PATH —
# confirm in prediction_methods/create_ensemble.py.
create_weighted_ensemble_submission(ratings, predictions_sub_dict,
    prediction_path=PREDICTION_PATH)

### Option 2: Use Ridge Regression to find best weights

#### Testing...

Using only the base-model predictions as features is not enough: the resulting ensemble underfits.

We therefore add extra meta-features, following https://arxiv.org/pdf/0911.0460.pdf

In [None]:
from sklearn.linear_model import Ridge

In [None]:
from prediction_methods.create_ensemble import evaluate_meta_features_ensemble
# Build the regression inputs for the meta-feature ensemble.
# The cells below index each returned object by fold (`[i]`) and use the
# "predictors" as features and the "observations" as regression targets,
# separately for the high- and low-support subsets.
(
    predictors_high_df,
    observations_high_df,
    predictors_low_df,
    observations_low_df,
) = evaluate_meta_features_ensemble(ratings, folds, predictions_dict)

High support

In [None]:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
# Degree-2 polynomial feature expansion, applied to the predictors before each
# ridge fit in the cells below.
poly = PolynomialFeatures(2)

In [None]:
# Search for the ridge regularization strength on the HIGH-support subset.
# For every alpha and every fold, every other row starting from the first is
# used to fit the model and the remaining rows are used to test it. The RMSE
# of each fold is stored per alpha so the spread can be shown as a boxplot.
regularization_errors_h_tr = {}
regularization_errors_h_te = {}

for alpha in np.linspace(0, .001, 9):
    rmse_errors_tr = np.zeros(k_fold)
    rmse_errors_te = np.zeros(k_fold)
    for i in range(k_fold):
        # Get training data
        preds_train = predictors_high_df[i][::2]
        train = poly.fit_transform(preds_train)
        vals_train = observations_high_df[i][::2]
        # Create and fit model
        # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
        # removed in 1.2. Left untouched because replacing it with an explicit
        # scaler changes the effective scale of alpha — migrate and re-tune
        # when upgrading scikit-learn.
        clf = Ridge(alpha=alpha, normalize=True)
        clf.fit(train, vals_train)
        # Obtain predictions for training set, limited to the 1-5 rating range
        pred_train = np.clip(clf.predict(train), 1.0, 5.0)
        # Compute train error
        rmse_train = sqrt(mean_squared_error(vals_train, pred_train))

        # Use remaining data for testing; reuse the expansion fitted on the
        # training rows instead of re-fitting it on the test rows
        preds_test = predictors_high_df[i][1::2]
        test = poly.transform(preds_test)
        vals_test = observations_high_df[i][1::2]
        # Obtain predictions for test set, clipped exactly like the training
        # predictions so that both errors are measured on the same footing
        pred_test = np.clip(clf.predict(test), 1.0, 5.0)
        # Compute test error
        rmse_test = sqrt(mean_squared_error(vals_test, pred_test))

        # Save errors in arrays
        rmse_errors_tr[i] = rmse_train
        rmse_errors_te[i] = rmse_test
    # Save errors in dictionaries
    regularization_errors_h_tr[alpha] = rmse_errors_tr
    regularization_errors_h_te[alpha] = rmse_errors_te

# Train and test errors drawn on the same axes
plt.figure(figsize=(12,5))
pd.DataFrame(regularization_errors_h_tr).boxplot()
pd.DataFrame(regularization_errors_h_te).boxplot()
plt.title('High support: train and test RMSE per alpha')
plt.xlabel('alpha')
plt.ylabel('RMSE')
plt.show()

# Train errors only
plt.figure(figsize=(12,5))
pd.DataFrame(regularization_errors_h_tr).boxplot()
plt.title('High support: train RMSE per alpha')
plt.xlabel('alpha')
plt.ylabel('RMSE')
plt.show()

# Test errors only
plt.figure(figsize=(12,5))
pd.DataFrame(regularization_errors_h_te).boxplot()
plt.title('High support: test RMSE per alpha')
plt.xlabel('alpha')
plt.ylabel('RMSE')
plt.show()

# Best: alpha = .000125
# (value recorded before the test predictions were clipped — re-check it
# against the plots above)

Low support

In [None]:
# Search for the ridge regularization strength on the LOW-support subset.
# For every alpha and every fold, every other row starting from the first is
# used to fit the model and the remaining rows are used to test it. The RMSE
# of each fold is stored per alpha so the spread can be shown as a boxplot.
regularization_errors_l_tr = {}
regularization_errors_l_te = {}

for alpha in np.linspace(0, .001, 9):
    rmse_errors_tr = np.zeros(k_fold)
    rmse_errors_te = np.zeros(k_fold)
    for i in range(k_fold):
        # Get training data
        preds_train = predictors_low_df[i][::2]
        train = poly.fit_transform(preds_train)
        vals_train = observations_low_df[i][::2]
        # Create and fit model
        # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
        # removed in 1.2. Left untouched because replacing it with an explicit
        # scaler changes the effective scale of alpha — migrate and re-tune
        # when upgrading scikit-learn.
        clf = Ridge(alpha=alpha, normalize=True)
        clf.fit(train, vals_train)
        # Obtain predictions for training set, limited to the 1-5 rating range
        pred_train = np.clip(clf.predict(train), 1.0, 5.0)
        # Compute train error
        rmse_train = sqrt(mean_squared_error(vals_train, pred_train))

        # Use remaining data for testing; reuse the expansion fitted on the
        # training rows instead of re-fitting it on the test rows
        preds_test = predictors_low_df[i][1::2]
        test = poly.transform(preds_test)
        vals_test = observations_low_df[i][1::2]
        # Obtain predictions for test set, clipped exactly like the training
        # predictions so that both errors are measured on the same footing
        pred_test = np.clip(clf.predict(test), 1.0, 5.0)
        # Compute test error
        rmse_test = sqrt(mean_squared_error(vals_test, pred_test))

        # Save errors in arrays
        rmse_errors_tr[i] = rmse_train
        rmse_errors_te[i] = rmse_test
    # Save errors in dictionaries
    regularization_errors_l_tr[alpha] = rmse_errors_tr
    regularization_errors_l_te[alpha] = rmse_errors_te

# Train and test errors drawn on the same axes
plt.figure(figsize=(12,5))
pd.DataFrame(regularization_errors_l_tr).boxplot()
pd.DataFrame(regularization_errors_l_te).boxplot()
plt.title('Low support: train and test RMSE per alpha')
plt.xlabel('alpha')
plt.ylabel('RMSE')
plt.show()

# Train errors only
plt.figure(figsize=(12,5))
pd.DataFrame(regularization_errors_l_tr).boxplot()
plt.title('Low support: train RMSE per alpha')
plt.xlabel('alpha')
plt.ylabel('RMSE')
plt.show()

# Test errors only (same figure size as the other plots; the original height
# of 35 was out of line with every other figure in the notebook)
plt.figure(figsize=(12,5))
pd.DataFrame(regularization_errors_l_te).boxplot()
plt.title('Low support: test RMSE per alpha')
plt.xlabel('alpha')
plt.ylabel('RMSE')
plt.show()

# Best: alpha = .000375
# (value recorded before the test predictions were clipped — re-check it
# against the plots above)

#### Generate submission

In [None]:
# Read each model's `_sub` prediction file (in `models` order) and index the
# results by model name. This repeats the load done under "Option 1".
predictions_sub = [
    load_data('{p}model_{m}_sub.csv'.format(p=PREDICTION_PATH, m=name))
    for name in models
]
predictions_sub_dict = {name: preds for name, preds in zip(models, predictions_sub)}

In [None]:
from prediction_methods.create_ensemble import create_sklearn_ensemble_submission
# Create the submission of the scikit-learn based ensemble from the per-model
# submission predictions and the per-fold predictions.
# NOTE(review): where the result is written is decided inside the helper —
# confirm in prediction_methods/create_ensemble.py.
create_sklearn_ensemble_submission(ratings, predictions_sub_dict, predictions_dict)

# From here on I don't think it's useful
But I can mention in the report that the voting system didn't work :)

### Do some processing

In [None]:
# Largest distance allowed between a prediction and its nearest integer for
# the prediction to count as a vote (see decide_vote_gen).
tol=0.01
# Minimum number of valid votes an entry needs for its median vote to be kept.
min_v=15

#### Get valid votes

In [None]:
def decide_vote_gen(tol):
    """Build a voting rule for a given tolerance.

    Parameters
    ----------
    tol : float
        A prediction votes only if it lies strictly closer than ``tol`` to
        its rounded value.

    Returns
    -------
    callable
        ``decide_vote(pred)``, which returns the rounded prediction when the
        model gets to vote and ``np.nan`` when it abstains.
    """
    def decide_vote(pred):
        """Defines if a model gets to vote or not"""
        # round() raises on NaN and on +/-inf; a missing or unbounded
        # prediction cannot be a vote, so it simply abstains.
        if not np.isfinite(pred):
            return np.nan
        vote = round(pred)
        if abs(vote - pred) < tol:
            return vote
        return np.nan
    return decide_vote

In [None]:
# See if the predictions are valid votes or not
df_votes_tr = df_pred_tr.applymap(decide_vote_gen(tol))
df_votes_sub = df_pred_sub.applymap(decide_vote_gen(tol))

In [None]:
# Have each model vote as many times as specified
df_final_votes_tr = df_votes_tr[['nmf']]
df_final_votes_sub = df_votes_sub[['nmf']]
for m in df_votes_tr.columns:
    for i in range(model_weights['n_votes'][m]):
        df_final_votes_tr['{}_{}'.format(m,i)] = df_votes_tr[m]
        df_final_votes_sub['{}_{}'.format(m,i)] = df_votes_sub[m]

df_final_votes_tr = df_final_votes_tr.drop('nmf', axis=1)
df_final_votes_sub = df_final_votes_sub.drop('nmf', axis=1)

In [None]:
# Median of all (replicated) votes for each entry
df_pred_tr['median_vote'] = df_final_votes_tr.median(axis=1)
df_pred_sub['median_vote'] = df_final_votes_sub.median(axis=1)

# Number of valid (non-NaN) votes for each entry
n_votes_tr = df_final_votes_tr.notnull().sum(axis=1)
n_votes_sub = df_final_votes_sub.notnull().sum(axis=1)

# Discard the median where fewer than min_v votes were cast. A single .loc
# assignment is used because the chained form df[col][mask] = value raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
df_pred_tr.loc[n_votes_tr < min_v, 'median_vote'] = np.nan
df_pred_sub.loc[n_votes_sub < min_v, 'median_vote'] = np.nan

## Get weighted mean
# Add column with mean values (the stored values of the sparse mean matrices)
# NOTE(review): the new Series has a default 0..n-1 index, so this relies on
# df_pred_* having the same index and on sp.find() returning the values in row
# order — confirm. sp_mean_tr / sp_mean_sub are not created in any cell shown
# above.
df_pred_tr['mean'] = pd.Series(sp.find(sp_mean_tr)[2])
df_pred_sub['mean'] = pd.Series(sp.find(sp_mean_sub)[2])

# Combine decisions and means: take the median vote where one was kept,
# otherwise fall back to the mean
df_pred_tr['final'] = df_pred_tr['median_vote'].combine_first(df_pred_tr['mean'])
df_pred_sub['final'] = df_pred_sub['median_vote'].combine_first(df_pred_sub['mean'])

## Look at the results
# RMSE of the vote/mean combination and of the plain mean, stored under the
# key '<tol>_<min_v>'
# NOTE(review): e_round, e_mean and df_obs_tr are not created in any cell
# shown above.
e_round['{}_{}'.format(tol, min_v)] = np.sqrt(np.mean((df_obs_tr[0] - df_pred_tr['final'])**2))
e_mean['{}_{}'.format(tol, min_v)] = np.sqrt(np.mean((df_obs_tr[0] - df_pred_tr['mean'])**2))