In [2]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
import math
import time
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns

from global_variables import *
from data_preparation import * 
from cost import * 
from cross_validation import *
from performances import * 
from proj1_helpers import * 

## Load the data 

In [None]:
# Load the dataset through the project helper. X and Y are aliased to x and y
# in the setup cells below and passed to every cross_validation call.
X, Y = load_data()

## Comparing performances for least-squares, Ridge-regression and logistic-regression Newton

### Without pre-processing

In [None]:
# Shared cross-validation setup for the "without pre-processing" comparison.
k_fold = 10  # ten fold CV
k = k_fold
x, y = X, Y
seed = 20
# 'raw' is the cleaning mode forwarded to cross_validation in this section.
clean_method = 'raw'
# Fold indices are built once here and reused by every cell of the section.
k_indices = build_k_indices(y, k_fold, seed)

In [None]:
# Least squares on the raw features. The four numeric arguments occupy the
# positions that other cells fill with batch_size, max_iters, gamma and lambda_.
method = 'least-squares'
err_cv_tr_ls, err_cv_te_ls, accuracy = cross_validation(
    y, x, k_indices, k, method,
    1, 1, 0, 0,
    clean_method,
)
# Mean and variance of the training error across folds, then the accuracy.
print(
    np.mean(err_cv_tr_ls),
    np.var(err_cv_tr_ls),
    accuracy,
)

In [None]:
# Grid-search the ridge penalty lambda with k-fold cross-validation and keep
# the value that minimises the mean validation error.
method = 'ridge-regression'
lambdas = np.logspace(-10, -8, 30) # Range found through "dichotomous" search
err_lambda = np.empty(len(lambdas))
for index_lambda, lambda_ in enumerate(lambdas):
    # batch_size, max_iters and gamma are not assigned anywhere above this cell,
    # so referencing them fails on a fresh top-to-bottom run. Pass the same
    # placeholders (1, 1, 0) that the least-squares call uses instead.
    # NOTE(review): presumably ridge regression ignores these -- confirm in cross_validation.
    _, err_cv_val, accuracy = cross_validation(y, x, k_indices, k, method, 1, 1, 0, lambda_, clean_method)
    err_lambda[index_lambda] = np.mean(err_cv_val)
# best_lambda chosen to minimize the mean generalization error
best_lambda = lambdas[np.argmin(err_lambda)]
print(best_lambda)
# Also report the mean validation error reached at best_lambda.
print(np.min(err_lambda))

In [None]:
# Probably would be smarter to get the errors directly above ...
# Re-run cross-validation at the selected lambda to report training error and accuracy.
method = 'ridge-regression'
lambda_ = best_lambda
# batch_size, max_iters and gamma are not assigned anywhere above this cell on a
# fresh top-to-bottom run; use the same placeholders (1, 1, 0) as the least-squares call.
# NOTE(review): presumably ridge regression ignores these -- confirm in cross_validation.
err_cv_tr_ridge, err_cv_te_ridge, accuracy = cross_validation(y, x, k_indices, k, method, 1, 1, 0, lambda_, clean_method)
print(np.mean(err_cv_tr_ridge), np.var(err_cv_tr_ridge), accuracy)

In [None]:
# Sweep gamma for the 'log-newton' method and keep the value with the lowest
# mean validation error. lambda_ is whatever an earlier cell left in the
# namespace (best_lambda when the ridge evaluation cell ran last).
method = 'log-newton'
max_iters = 10
gammas = np.logspace(-10, 0, 30)
err_gamma = np.empty(gammas.shape)
for index_gamma, gamma in enumerate(gammas):
    _, err_cv_val, accuracy = cross_validation(
        y, x, k_indices, k, method,
        1, max_iters, gamma, lambda_,
        clean_method,
    )
    err_gamma[index_gamma] = np.mean(err_cv_val)
# best_gamma chosen to minimize the mean generalization error
best_gamma = gammas[err_gamma.argmin()]
# The code should also provide the corresponding error values
print(best_gamma)

In [None]:
# Probably would be smarter to get the errors directly above ...
# Re-run cross-validation at the selected gamma to report training error and accuracy.
# max_iters and lambda_ are inherited from the cells above.
method = 'log-newton'  # restated so the cell does not depend on which cell ran last
gamma = best_gamma
# batch_size is not assigned anywhere above this cell on a fresh run; use the
# literal 1, exactly as the gamma search loop does.
err_cv_tr_newton, err_cv_te, accuracy = cross_validation(y, x, k_indices, k, method, 1, max_iters, gamma, lambda_, clean_method)
print(np.mean(err_cv_tr_newton), np.var(err_cv_tr_newton), accuracy)

In [None]:
## TODO:
# Do it for clean_method in ['raw', '', '0', 'mean', 'median']
# for all 3 methods ('least-squares', 'ridge-regression', 'log-newton')
# Add the accuracy
# Write a script working for all hyper-parameters (gamma for Newton and lambda for ridge)?


## Checking that all functions work (not to be included)

In [None]:
# Cross-validation setup reused by the function checks below.
k_fold = 10  # ten fold CV
k = k_fold
x, y = X, Y
seed = 20
# Fold indices are built once and shared by every check cell.
k_indices = build_k_indices(y, k_fold, seed)

In [None]:
# Variable to decide on cleaning method (see function clean_features).
# It is passed as the last argument of every cross_validation call below.
clean_method = 'median'

### Least squares (normal equations)

In [None]:
# Least-squares check. The four numeric arguments occupy the positions that
# later cells fill with batch_size, max_iters, gamma and lambda_.
method = 'least-squares'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    1, 1, 0, 0,
    clean_method,
)
# Mean and variance of the training error across folds, then the accuracy.
print(
    np.mean(err_cv_tr),
    np.var(err_cv_tr),
    accuracy,
)

### Least squares GD 

In [None]:
# Gradient-descent least squares. These hyper-parameters stay in the kernel
# namespace and are reused by the check cells that follow.
method = 'least-squares-GD'
batch_size, max_iters = 1, 50
gamma, lambda_ = 0.01, 0
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training error across folds, then the accuracy.
print(
    np.mean(err_cv_tr),
    np.var(err_cv_tr),
    accuracy,
)

### Least-squares SGD  

In [None]:
# Stochastic-gradient least squares, run with the hyper-parameters left in the
# namespace by the gradient-descent cell above.
method = 'least-squares-SGD'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training error across folds, then the accuracy.
print(
    np.mean(err_cv_tr),
    np.var(err_cv_tr),
    accuracy,
)

### Ridge-regression

In [None]:
# Ridge-regression check with a fixed penalty. lambda_ stays at 1 for the
# cells that follow unless they reassign it.
method, lambda_ = 'ridge-regression', 1
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training error across folds, then the accuracy.
print(
    np.mean(err_cv_tr),
    np.var(err_cv_tr),
    accuracy,
)

### Logistic regression

In [None]:
# 'log' method check, run with the hyper-parameters currently in the namespace.
method = 'log'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training error across folds, then the accuracy.
print(
    np.mean(err_cv_tr),
    np.var(err_cv_tr),
    accuracy,
)

In [None]:
# 'regularized-log' method check; lambda_ is the value left by an earlier cell.
method = 'regularized-log'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training error across folds, then the accuracy.
print(
    np.mean(err_cv_tr),
    np.var(err_cv_tr),
    accuracy,
)

In [None]:
# 'log-newton' method check, run with the hyper-parameters currently in the namespace.
method = 'log-newton'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training error across folds, then the accuracy.
print(
    np.mean(err_cv_tr),
    np.var(err_cv_tr),
    accuracy,
)

## Optimizing models with cross-validation

In [16]:
# CV testing gamma and lambda inside same loop
# Joint grid search for 'regularized-log': every (lambda, gamma) pair is scored
# by k-fold cross-validation, then the pair with the lowest mean validation
# error is selected.

#X, Y = load_data()
k = 10 # ten fold CV
k_fold = k
y = Y
x = X
seed = 20
k_indices = build_k_indices(y, k_fold, seed)
clean_method = 'raw'
batch_size = 1
max_iters = 30

method = 'regularized-log'
lambdas = np.logspace(-5, 3, 4)
gammas = np.logspace(-5, 0, 4)
# Rows index lambda, columns index gamma; every entry is filled by the loop.
err = np.empty([len(lambdas), len(gammas)])
acc = np.empty([len(lambdas), len(gammas)])
for index_lambda, lambda_ in enumerate(lambdas):
    print("L: " + str(index_lambda) + "  "+str(lambda_))
    for index_gamma, gamma in enumerate(gammas):
        print("G: " + str(index_gamma) + "  "+ str(gamma))
        err_cv_tr, err_cv_val, accuracy = cross_validation(y, x, k_indices, k, method, batch_size, max_iters, gamma, lambda_, clean_method)
        err[index_lambda, index_gamma] = np.mean(err_cv_val)
        print(np.mean(err_cv_val))
        acc[index_lambda, index_gamma] = accuracy
        print(accuracy)
print(err)
print(acc)
# Some settings diverge (the recorded errors reach 1e201), and a diverged run can
# yield NaN. np.min/np.max would then return NaN, the equality below would match
# nothing, and indexing the empty result would raise IndexError. Use the
# NaN-aware reductions so such entries are simply skipped.
if np.isnan(err).all():
    raise ValueError("every (lambda, gamma) combination produced a NaN validation error")
best_combination = np.argwhere(err == np.nanmin(err))
print("least error where :" + str(best_combination))
best_acc = np.argwhere(acc == np.nanmax(acc))
print("highest accuracy where :" + str(best_acc))
# On ties, keep the first match in row-major order.
best_lambda = lambdas[best_combination[0][0]]
best_gamma = gammas[best_combination[0][1]]
print("Best lambda: " + str(best_lambda))
print("Best gamma: " + str(best_gamma))
# Also report the mean validation error reached at the selected pair.
print("Best error: " + str(err[best_combination[0][0], best_combination[0][1]]))


L: 0  1e-05
G: 0  1e-05
1.2506788916662162
0.639
G: 1  0.0004641588833612782
1.3527819339136655
0.63296
G: 2  0.021544346900318846
1.4618734751319011
0.63244
G: 3  1.0
4.23688551058617
0.58632
L: 1  0.004641588833612777
G: 0  1e-05
1.3561565144631536
0.62804
G: 1  0.0004641588833612782
1.440862486321233
0.62064
G: 2  0.021544346900318846
1.4529618192209242
0.63244
G: 3  1.0
5.665679314474367
0.5966
L: 2  2.154434690031882
G: 0  1e-05
13.35259252568062
0.63328
G: 1  0.0004641588833612782
12.566112041360377
0.63152
G: 2  0.021544346900318846
0.7638800020398995
0.66008
G: 3  1.0
1.4286190065491022e+32
0.53692
L: 3  1000.0
G: 0  1e-05
1540.8577736573538
0.64956
G: 1  0.0004641588833612782
0.6931589756038757
0.66008
G: 2  0.021544346900318846
1.2867667667347054e+101
0.54736
G: 3  1.0
5.060383701684753e+201
0.50732
[[1.25067889e+000 1.35278193e+000 1.46187348e+000 4.23688551e+000]
 [1.35615651e+000 1.44086249e+000 1.45296182e+000 5.66567931e+000]
 [1.33525925e+001 1.25661120e+001 7.63880002e

In [None]:
# Re-evaluate the model tuned by the grid search above at its selected
# hyper-parameters. best_lambda and best_gamma were chosen for
# 'regularized-log', so that is the method evaluated here; the previous
# 'ridge-regression' label paired a different model with those values.
# NOTE(review): if a ridge evaluation was really intended, tune lambda for ridge first.
method = 'regularized-log'
lambda_ = best_lambda
# Without this, gamma would still hold the last value visited by the search loop.
gamma = best_gamma
err_cv_tr, err_cv_te, accuracy = cross_validation(y, x, k_indices, k, method, batch_size, max_iters, gamma, lambda_, clean_method)
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)