# Hyperparameter search
This notebook is about the hyperparameter search for our optimal collaborative filtering model.
We evaluated the following methods:
* Single Factorization
* Three-way Factorization
* Three-way connected Factorization
* Three-way graph Factorization
* Three-way graph connected Factorization
* Four-way graph connected Factorization

Each method is evaluated for different values of the hyperparameters lambda, gamma, and k (latent space dimension).

In [1]:
# reload all modules before executing code
%load_ext autoreload
%autoreload 2 

import pickle
import numpy as np

import DB.database as db
import src.preprocessing as pre
from DB.model import *
import src.Solver as Solver
import src.MF as MF

import time

## Querying the database

In [2]:
# Size limits handed to the database queries further down.
N = 10 ** 5  # Number of Users
M = 10 ** 5  # Number of Businesses

In [7]:
# Fetch the raw matrices for N users and M businesses in a single call.
# Word features are requested (add_words=True), which is why BW, UW and
# word_labels are part of the result.
(friends, relations, business_attributes,
 BW, UW, word_labels) = db.get_data(
    N, M,
    need_business=False,
    add_words=True,
    buss_conn_threshold=0,
)

## Test&train data splitting

`Solver.Dataset` keeps the data and splits its nonzero entries into training, validation, and test indices (80 % / 10 % / 10 %).

In [11]:
# Shrink the rating matrix (the call prints the shape before and after) and
# get the matching side matrices back alongside it.
# NOTE(review): min_entries=10 is presumably the minimum number of ratings a
# user/business needs in order to be kept -- confirm in src.preprocessing.
(relations, user_idx, bus_idx,
 friends, business_attributes, UW, BW) = pre.cold_start_preprocessing(
    relations, friends, business_attributes,
    UW=UW, BW=BW, min_entries=10,
)

Shape before: (100000, 100000)
Shape after: (808, 763)


In [12]:
# Wrap the filtered rating matrix. The resulting object provides the data
# (`.data`) plus the train/validation/test indices and the held-out values
# that the following cells read.
r_data = Solver.Dataset(relations)

Original size:(808, 763)
Nonzero entries:15863
Train:12690, Val:1587, Test:1586


Set the entries of the training data at the test and validation indices to 0.

The input data should be centered. 

In [13]:
# Start from a copy so that r_data.data itself is left untouched.
rel_data = r_data.data.copy()

# Blank out every held-out entry (test first, then validation).
for held_out_ind in (r_data.test_ind, r_data.val_ind):
    rel_data[held_out_ind] = 0

# Center the masked data; `means` is needed again to center the validation
# values and to report the final RMSE.
rel_data, means, _ = Solver.center(rel_data)

Center Validation Values

In [14]:
# Shift every validation value by the mean selected through the second
# component of its index (r_data.val_ind[1]).
validation_val = np.zeros(len(r_data.val_values))
for pos, (rating, mean_idx) in enumerate(zip(r_data.val_values, r_data.val_ind[1])):
    validation_val[pos] = rating - means[mean_idx]

### Construct business connections

In [16]:
# Build the business-business connection matrix from the centered training data.
# The last two arguments occupy the positions that the threshold search further
# down fills with `threshold` and `min_user`; here they are fixed to 0 and 2.
business_conn = pre.get_buss_conn_mat(M, rel_data, 0, 2)

## Hyperparameter search
Run a hyperparameter search across the different methods and hyperparameter values.

In [None]:
# Optimizer budget and early-stopping settings used by the runs below.
max_steps = 100
log_every = 10
eval_every = 1
patience = 10

# Candidate values for the grid.
lambdas = [0.5, 1, 2, 3]
ks = [32, 64, 128, 256]
gammas = [0.5, 1, 2, 3]

# Lowest validation loss of each run, keyed by (method, k, reg_lambda[, gamma]).
losses = {}

# Keyword arguments that every optimizer call below receives unchanged.
shared_kwargs = dict(
    val_idx=r_data.val_ind, val_values=validation_val,
    max_steps=max_steps, log_every=log_every, eval_every=eval_every)

for k in ks:
    for reg_lambda in lambdas:
        # The graph-based variants take gamma, so they run once per gamma value.
        for gamma in gammas:
            U_three_g, V_three_g, W_three, val_losses, train_loss, conv = \
                MF.three_latent_factor_graph_alternating_optimization(
                    friends, business_attributes, rel_data, business_conn, k,
                    reg_lambda=reg_lambda, gamma=gamma, patience=patience,
                    **shared_kwargs)
            losses[('three_graph', k, reg_lambda, gamma)] = min(val_losses)

            U_three_gc, V_three_gc, W_three, val_losses, train_loss, conv = \
                MF.three_latent_factor_connected_graph_alternating_optimization(
                    friends, business_attributes, rel_data, business_conn, k,
                    reg_lambda=reg_lambda, gamma=gamma, patience=patience,
                    **shared_kwargs)
            losses[('three_graph_connected', k, reg_lambda, gamma)] = min(val_losses)

            # NOTE(review): patience is fixed to 3 for this method instead of
            # using `patience` -- confirm that this is intentional.
            U_four_gc, V_four_gc, W_four, Z_four, val_losses, train_loss, conv = \
                MF.four_latent_factor_connected_graph_alternating_optimization(
                    friends, business_attributes, rel_data, business_conn, UW, BW, k,
                    reg_lambda=reg_lambda, gamma=gamma, patience=3,
                    **shared_kwargs)
            losses[('four_graph_connected', k, reg_lambda, gamma)] = min(val_losses)

        # The remaining variants have no gamma: one run per (k, reg_lambda).
        U_single, V_single, val_losses, _, _ = \
            MF.latent_factor_alternating_optimization(
                rel_data, r_data.train_ind, k,
                reg_lambda=reg_lambda, init='random', patience=patience,
                **shared_kwargs)
        losses[('single', k, reg_lambda)] = min(val_losses)

        U_three, V_three, W_three, val_losses, train_loss, conv = \
            MF.three_latent_factor_alternating_optimization(
                friends, business_attributes, rel_data, k,
                reg_lambda=reg_lambda, patience=patience,
                **shared_kwargs)
        losses[('three', k, reg_lambda)] = min(val_losses)

        U_three_c, V_three_c, W_three, val_losses, train_loss, conv = \
            MF.three_latent_factor_connected_alternating_optimization(
                friends, business_attributes, rel_data, business_conn, k,
                reg_lambda=reg_lambda, patience=patience,
                **shared_kwargs)
        losses[('three_connected', k, reg_lambda)] = min(val_losses)

Iteration 0, training_loss: 1209931.209537, review error: 425437.332791, validation loss: 52675.264628
Iteration 10, training_loss: 12757.101519, review error: 150.483343, validation loss: 1769.251283
Converged after 10 iterations
Iteration 0, training_loss: 1232821.482726, review error: 414139.841058, validation loss: 50317.496747
Iteration 10, training_loss: 374.866762, review error: 214.459267, validation loss: 1731.045163
Iteration 20, training_loss: 321.198603, review error: 196.471091, validation loss: 1712.564921
Iteration 30, training_loss: 307.251282, review error: 191.452766, validation loss: 1710.311207
Converged after 27 iterations
Iteration 0, training_loss: 4035689.987932, review error: 430318.366621, validation loss: 53617.940511


## Get the results

In [None]:
# Ten lowest validation losses over all evaluated configurations.
sorted(losses.values())[:10]

In [None]:
# Configurations that produced the ten lowest validation losses, best first.
# The keys are tuples of different lengths (gamma is only part of the key for
# the graph variants), so they are ranked with plain Python sorting instead of
# being packed into a NumPy array, which rejects such ragged input on
# NumPy >= 1.24.
sorted(losses, key=losses.get)[:10]

## Business-Business connections
For our business-business connection matrix we introduced two hyperparameters during the construction: one for the maximal rating distance between two reviews to form a connection, and one for the minimal number of users required to form a connection.

In the following we are performing a hyperparameter search for our best method from the previous search.

In [18]:
# Create the database object and enter its context manager by hand.
# NOTE(review): __exit__ is never called in this notebook, so whatever
# __enter__ acquires is not released here -- confirm whether the entities
# fetched below need it to stay open; otherwise prefer a `with` block.
database = db.Database()
database.__enter__()
# Fetch the raw entities, passing the same N / M limits as the first query.
users, businesses, reviews, category_names, cities = db.get_entities(database, N, M)

Got users
Got businesses
Got reviews


In [None]:
# Grid for the two construction parameters of the business-business matrix.
thresholds = [0, 1, 2, 3, 4]
min_users = [1, 2, 4, 7, 10, 15]

# Lowest validation loss per (min_user, threshold) pair.
thres_losses = {}

# Factors of the best run seen so far. min_loss starts at +inf so that the
# first finite loss always wins and a NaN loss can never be selected.
best_U = None
best_V = None
min_loss = float('inf')

# Query
# Uses the `db` alias from the imports cell; the bare package name `DB` is not
# bound by `import DB.database as db`.
# NOTE(review): assumes get_matrices lives in DB.database next to
# get_entities -- confirm.
friends, relations, business_attributes = \
    db.get_matrices(users, businesses, reviews, category_names, cities)

# Cut
relations_cut, _, bus_idx, friends, business_attributes = \
    pre.cold_start_preprocessing(relations, friends, business_attributes, min_entries=10)

# Split
r_data = Solver.Dataset(relations_cut)

# Copy before masking so that r_data.data keeps its original entries, exactly
# as in the first train/validation/test split of this notebook.
rel_data = r_data.data.copy()

rel_data[r_data.test_ind] = 0
rel_data[r_data.val_ind] = 0

# Center
rel_data, means, _ = Solver.center(rel_data)

# Center the validation values with the mean selected by their second index.
validation_val = np.zeros(len(r_data.val_values))
for i, val in enumerate(r_data.val_values):
    validation_val[i] = val - means[r_data.val_ind[1][i]]

for min_user in min_users:
    for threshold in thresholds:
        business_conn = pre.get_buss_conn_mat(M, rel_data, threshold, min_user)
        print('Min user %i; Threshold %i; Entries %i ' % (min_user, threshold, business_conn.nnz))

        # Evaluate
        # k=256, reg_lambda=2 and gamma=1 are fixed for every run of this search.
        U_three_gc, V_three_gc, W_three, val_losses, train_loss, conv = \
            MF.three_latent_factor_connected_graph_alternating_optimization(
                friends, business_attributes, rel_data, business_conn, 256,
                val_idx = r_data.val_ind, val_values = validation_val,
                reg_lambda=2, gamma=1, max_steps=100,
                log_every=1, patience = 10, eval_every = 1)

        loss = min(val_losses)
        thres_losses[(min_user, threshold)] = loss
        # Keep the factors of the run with the lowest validation loss.
        if loss < min_loss:
            best_U = U_three_gc
            best_V = V_three_gc
            min_loss = loss

### Check the results

In [None]:
# Ten lowest validation losses of the threshold search.
sorted(thres_losses.values())[:10]

In [None]:
# (min_user, threshold) pairs belonging to the ten lowest validation losses.
thres_keys = np.array(list(thres_losses.keys()))
thres_order = np.argsort(list(thres_losses.values()))[:10]
thres_keys[thres_order]

## Report RMSE

In [None]:
# Score the best factors from the threshold search on the held-out test entries.
# NOTE(review): `means` is presumably passed so the centering can be undone
# before comparing against the raw test values -- confirm in Solver.RMSE.
Solver.RMSE(best_U, best_V, r_data.test_ind, r_data.test_values, means)

Store result

In [None]:
# Persist the best user/business factors as one pickled tuple.
best_factors = (best_U, best_V)
with open('best_factorization.pickle', 'wb') as out_file:
    pickle.dump(best_factors, out_file)