# Hyperparam Search over CF Baselines
To ensure a fair search over the hyperparameters we use for the baselines, we do a random search over the parameters we can use.

We use two baselines from the [surprise](http://surpriselib.com/) package which are commonly used in collaborative filtering: the K-nearest-neighbours and the SVD algorithm.


In [9]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterSampler

# Imports from our utils
# Todo: factor out notebook utils, remove 
from src.demos import notebook_utils
from data.load_from_csv import get_content_datasets
from src.models.lang_model.w2v_averager_model import W2vAveragerModel

% matplotlib inline

In [10]:
def run_baseline(train_dataset, val_dataset, baseline_alg_str,
                 calibrate=False, cf_algo_params={}):
    """Given a dataset, run the baseline identified by
    baseline_alg_str on the dataset and return the
    val and train Brier loss. We pass the dictionary of 
    parameters in cf_algo_params through to the surprise 
    algorithm.
    
    train_dataset: ContentDataset with the train data
    val_dataset:   ContentDataset with the validation data
    baseline_alg_str:
                      One of 'KNN', 'SVD', 'NormalPredictor', 
                      'TwoClassMF', and 'logitSVD'. KNN, SVD and
                      NormalPredictor are as described in the surprise docs, 
                      logitSVD is the SVD where the inputs are logit-transformed
                      first, and TwoClassMF splits the questions into true/false
                      based on user majority vote and predicts the mean for true/false
                      , else 50%
    calibrate: whether to plot a calibration curve
    cf_algo_params: passed through to the collaborative filtering
                    algorithm
                      """

    from surprise import KNNBasic, SVD, NormalPredictor
    from src.models.content_aware.simple_baseline import BaselineMF 
    from torch.utils.data.sampler import SubsetRandomSampler
    from src.models.content_aware.sampler import SubsetDeterministicSampler
    
    if baseline_alg_str == 'KNN':
        baseline_cf_algo = KNNBasic(**cf_algo_params)
    elif baseline_alg_str == 'SVD':
        baseline_cf_algo = SVD(**cf_algo_params)
    elif baseline_alg_str == 'NormalPredictor':
        baseline_cf_algo = NormalPredictor
    elif baseline_alg_str == 'TwoClassMF':
        baseline_cf_algo = TwoClassMF
    elif baseline_alg_str == 'logitSVD':
        baseline_cf_algo = SVD(**cf_algo_params)
    
    # Load in data
    baseline = BaselineMF(baseline_cf_algo)
    train_idx = np.arange(len(train_dataset.ratings))
    val_idx = np.arange(len(val_dataset.ratings))

    # Train CF algo and evaluate
    baseline.fit(train_dataset, SubsetRandomSampler(train_idx))
    train_pred = np.array(baseline.predict(
        train_dataset, SubsetDeterministicSampler(train_idx)))
    train_ratings = np.array(train_dataset.ratings)[
        train_idx].reshape((len(train_idx)))
    train_loss = ((train_pred - train_ratings)**2).mean()

    # Evaluate on val
    val_pred = np.array(baseline.predict(
        val_dataset, SubsetDeterministicSampler(val_idx)))
    val_ratings = np.array(val_dataset.ratings)[
        val_idx].reshape((len(val_idx)))
    val_loss = ((val_pred - val_ratings)**2).mean()

    if calibrate == True:
        calibration_plot(val_ratings, val_pred)
    # Todo change the train methods in baselineMF so that they return the train loss
    return train_loss, val_loss

In [11]:
def run_baseline_search(baseline_alg_string, params_vals_dict,
                      flat_train_dataset, flat_test_dataset, n_iter=20,
                      verbose=False):
    """Takes a baseline_alg_string and a {parameter: [options]} dictionary
    which maps parameters that can be passed in the cf_algo_params argument.
    Returns the best paramters evaluated on the validation set after a random
    search over n_iter tests."""
    params_dict = ParameterSampler(params_vals_dict, n_iter)
    train_losses = []
    val_losses = []
    params_dicts = []
    for index, params in enumerate(list(params_dict)):
        if verbose:
            print('At index {}'.format(index))
            print("Running with {}".format(params))
        train_loss, val_loss = run_baseline(flat_train_dataset, flat_val_dataset, baseline_alg_string, calibrate=False, cf_algo_params=params)
        params_dicts.append(params)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
    best_val_score_index = np.argmin(val_losses)
    return train_losses[best_val_score_index], val_losses[best_val_score_index], params_dicts[best_val_score_index]

In [13]:
tr, val, ho = get_content_datasets('politifact', 'unmasked')
#train_dataset, val_dataset, test_dataset = load_dataset(task='fermi', sparsity='unmasked')
flat_train_dataset = tr.flatten()
flat_test_dataset = ho.flatten()
flat_val_dataset = val.flatten()

Loading w2v dict
Loaded Word2Vec dict: 6.79s
Number of words in corpus: 400001


# We set up the relevant params and their possible values

In [14]:
knn_params = {'k': range(30), 'min_k': range(1, 3), 'sim_options': [{'name': 'cosine'},
                                                                   {'name': 'MSD'},
                                                                   {'name': 'pearson', 'item_based': True},
                                                                   {'name': 'pearson_baseline'},
                                                                    {'name': 'MSD', 'item_based': True},
                                                                    {'name': 'cosine', 'item_based': True},
                                                                {'name': 'pearson_baseline', 'item_based': True}]}

svd_params = {'n_factors': [25, 50, 100, 200, 300, 400], 'n_epochs': [100, 400, 600, 800, 1200],
              'lr_all': [0.1, 0.01, 0.005, 0.003, 0.001, 0.0001],
             'reg_all': [0.3, 0.1, 0.05, 0.01, 0.003, 0.001]}

logitSVD_params = svd_params

In [15]:
knn_results = run_baseline_search('SVD', svd_params, flat_train_dataset, flat_val_dataset, 50)

In [16]:
print('Train loss was {},\nValidation loss was {},\nWith parameters {}'.format(knn_results[0], knn_results[1], knn_results[2]))
best_params = knn_results[2]
_, test_loss = run_baseline(flat_train_dataset, flat_test_dataset, 'KNN', calibrate=False, cf_algo_params=best_params)
print('Test loss was {}'.format(test_loss))

Train loss was 0.02843937263222587,
Validation loss was 0.12279561496985654,
With parameters {'reg_all': 0.3, 'lr_all': 0.001, 'n_epochs': 600, 'n_factors': 200}
Computing the msd similarity matrix...
Done computing similarity matrix.
Test loss was 0.12636679227604436



# Big search for all masking conditions

In [8]:
num_iters = 200
results = {}
for dataset in ['fermi', 'politifact']:
    for masking_condition in ['unmasked', 'light', 'heavy']:
        tr, val, ho = get_content_datasets(dataset, masking_condition, include_metadata=False)
        flat_train_dataset = tr.flatten()
        flat_test_dataset = ho.flatten()
        flat_val_dataset = val.flatten()
        raw_results = run_baseline_search('KNN', knn_params, flat_train_dataset, flat_val_dataset, num_iters)
        raw_results += (run_baseline(flat_train_dataset, flat_test_dataset, 'KNN', calibrate=False, cf_algo_params=raw_results[2])[1],)
        results[(dataset, masking_condition)] = raw_results

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Co

KeyboardInterrupt: 

In [None]:
for labels, outputs in results.items():
    print('For {} with masking {}, we had train performance {}, val performance {} and test performance {}.\nThe best set of parameters was {}\n'.format(labels[0], labels[1], outputs[0], outputs[1], outputs[3], outputs[2]))

Best params for SVD:
Politifact heavy: lr_all: 0.1, n_factors: 100, n_epochs: 400, reg_all: 0.001, train_loss: 0, val_loss: 0.117, test_loss: 0.1294

Politifact light: lr_all: 0.005, n_factors: 50, n_epochs: 100, reg_all: 0.001, train_loss: 0.018, val_loss: 0.1332, test_loss: 0.12478

Politifact unmasked: lr_all: 0.001, n_factors: 25, reg_all, 0.001, train_loss: 0.0566, val_loss: 0.12397, test_loss: 0.12467

Fermi unmasked: lr_all: 0.005, n_factors: 200, reg_all: 0.1, train_loss: 0.028196, val_loss: 0.1186, test_loss: 0.10196

Fermi light:  lr_all: 0.005, n_factors: 200, n_epochs: 400, train_loss: 0.01737, val_loss: 0.1200, test_loss: 0.11641

Fermi heavy: lr_all: 0.005, n_factors: 200, n_epochs: 400, reg_all: 0.1, train_loss: 0.0149, val_loss: 0.1144, test_loss: 0.122455

# Making a nice calibration curve

In [None]:
# Example calibration plot for the best SVD for the Fermi task
train_dataset, val_dataset, test_dataset = load_dataset(task='fermi', sparsity='heavy')
flat_train_dataset = train_dataset.flatten()
flat_test_dataset = test_dataset.flatten()
flat_val_dataset = val_dataset.flatten()
train_loss, test_loss = notebook_utils.run_baseline(flat_train_dataset, flat_test_dataset, 'SVD', calibrate=True, cf_algo_params={'lr_all':0.005,  'n_factors':200, 'n_epochs': 400, 'reg_all': 0.1})
zz = plt.gca()
for item in ([zz.title, zz.xaxis.label, zz.yaxis.label] +
             zz.get_xticklabels() + zz.get_yticklabels()):
    item.set_fontsize(20)