In [93]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from process_data import *
from proj1_helpers import *
from implementations import *
from run import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [94]:
DATA_TRAIN_PATH = '../data/train.csv' 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [97]:
jet_num_col_index = 22 # Index of PRI_jet_num feature
jet_values = [0, 1, 2, 3] # Values taken by PRI_jet_num

# Array of indices of raw per jet value
subsets_indices_array = indices_split_dataset_jet_num(tX, jet_num_col_index, jet_values)

# Remove useless columns from each subset
tX_cleaned, all_useless_indices = clean_useless_columns_jet(tX, subsets_indices_array)

### Choose the subset you want to train : (ex: Set 0)

In [98]:
# Select the subset we want to train on. 
jet_number = 0 

In [99]:
tX_c = tX_cleaned[jet_number]
y_c = y[subsets_indices_array[jet_number]]

In [100]:
# Split the data in a train and test set (80% train , 20% test)
x_train, x_test, y_train, y_test = split_data(tX_c, y_c, 0.8)

In [104]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Cross validation function on processed dataset using ridge regression"""
    # Create indices and split data. 
    indices_train = k_indices[~(np.arange(k_indices.shape[0]) == k)].reshape(-1)
    indices_test = k_indices[k]
    
    x_tr = x[indices_train]
    x_te = x[indices_test]
    y_tr = y[indices_train]
    y_te = y[indices_test]
    
    # form data with polynomial degree
    tx_tr = full_process_data(x_tr, degree, DATA_TRAIN_PATH, all_useless_indices[jet_number], False)
    tx_te = full_process_data(x_te, degree, DATA_TRAIN_PATH, all_useless_indices[jet_number], False)
    
    # ridge regression
    w, _ = ridge_regression(y_tr, tx_tr, lambda_)

    loss_test = np.sqrt(2 * MSE_loss(y_te, tx_te, w))
    
    return loss_test, w

In [105]:
def find_best_lambda_and_degree(y, x, deg_inf, deg_sup, k_fold):
    """ Find the best lambda and degree using cross validation and ridge regression"""
    loss = float('inf')
    w_ = 0
    
    lambdas = np.logspace(-10, -1, 10)
    
    k_ind = build_k_indices(y, k_fold, 1)
    
    optimal_degree = 0
    optimal_lambda = 0
    
    for degree in range(deg_inf, (deg_sup+1)):
        print(degree)
        for l in lambdas:
            for k in range(k_fold):
                loss_test, w = cross_validation(y, x, k_ind, k, l, degree)
                if(loss > loss_test):
                    loss = loss_test
                    optimal_degree = degree
                    optimal_lambda = l
                    w_ = w
    return loss, w_ , optimal_degree, optimal_lambda

In [85]:
# Compute the loss and w as well as pick the best degree and lambda
loss, w_ , optimal_degree, optimal_lambda = find_best_lambda_and_degree(y_train, x_train, 1, 10, 4)   

6
7
8
9
10


In [86]:
print("optimal_degree = {d}".format(d=optimal_degree))
print("optimal_lambda = {d}".format(d=optimal_lambda))

optimal_degree = 9
optimal_lambda = 1e-06


In [87]:
def find_best_lambda(y, x, degree, lambda_, k_fold):
    """Find a better lambda using linspace from the previouly computed lambda and degree."""
    
    loss = float('inf')
    w_ = 0
    
    lambdas = np.linspace(lambda_/2, 3/2*lambda_, 11)
    k_ind = build_k_indices(y, k_fold, 1)

    optimal_lambda = 0
    
    for l in lambdas:
        for k in range(k_fold):
            loss_test, w = cross_validation(y, x, k_ind, k, l, degree)
            if(loss > loss_test):
                loss = loss_test
                optimal_lambda = l
                w_ = w
    return loss, w_, optimal_lambda

In [88]:
# Find the optimal w and loss for a more precise best lambda.
loss, optimal_w , optimal_lambda2 = find_best_lambda(y_train, x_train, optimal_degree, optimal_lambda, 4)

In [89]:
print("optimal_lambda ={d}".format(d=optimal_lambda2))

optimal_lambda =6e-07


In [90]:
# Process the test set data 
x_test_clean = full_process_data(x_test, optimal_degree, DATA_TRAIN_PATH, all_useless_indices[jet_number], False)
# Make a prediction with the optimal w 
y_pred_test = predict_labels(optimal_w, x_test_clean)

# Compute the accuracy between the prediction and actual y.
accuracy = compute_accuracy(y_test, y_pred_test)

In [91]:
# Compute the accuracy performed on the train set.
x_train_clean = full_process_data(x_train, optimal_degree, DATA_TRAIN_PATH, all_useless_indices[jet_number], False)
train_accuracy = compute_accuracy(y_train, predict_labels(optimal_w, x_train_clean))

In [92]:
print("Test accuracy = {a1}, Train accuracy = {a2}".format(a1=accuracy, a2=train_accuracy ))

Test accuracy = 0.8050164420658972, Train accuracy = 0.8057548158297735
