In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
import math
import time
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns

from global_variables import *
from data_preparation import * 
from cost import * 
from cross_validation import *
from performances import * 
from proj1_helpers import * 

## Load the data 

In [None]:
# Load the dataset through the project helper `load_data` (brought in by the star imports above).
# NOTE(review): presumably X holds the features and Y the labels — confirm against load_data.
X, Y = load_data()

## Comparing the performance of least squares, ridge regression and logistic regression (Newton's method)

### Without pre-processing

In [None]:
# Shared cross-validation setup for the comparison cells that follow.
clean_method = 'raw'  # this section runs under the "Without pre-processing" heading
seed = 20             # seed handed to build_k_indices
k = k_fold = 10       # ten fold CV
x, y = X, Y
k_indices = build_k_indices(y, k_fold, seed)

In [None]:
# Least squares under cross-validation. The four literals sit in the positions
# where the other cells pass batch_size, max_iters, gamma and lambda_.
method = 'least-squares'
err_cv_tr_ls, err_cv_te_ls, accuracy = cross_validation(
    y, x, k_indices, k, method,
    1, 1, 0, 0,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(*(stat(err_cv_tr_ls) for stat in (np.mean, np.var)), accuracy)

In [None]:
# Grid-search the ridge penalty lambda_ with k-fold cross-validation and keep
# the value with the smallest mean validation error.
# NOTE(review): batch_size, max_iters and gamma are not assigned by any earlier
# cell of this notebook; presumably they come from one of the star imports —
# confirm, otherwise this cell fails on a fresh kernel.
method = 'ridge-regression'
lambdas = np.logspace(-10, -8, 30) # Range through "dichotomous" search
# Pre-fill with NaN so entries that were never computed cannot be mistaken for results.
err_lambda = np.full(len(lambdas), np.nan)
for index_lambda, lambda_ in enumerate(lambdas):
    _, err_cv_val, accuracy = cross_validation(y, x, k_indices, k, method, batch_size, max_iters, gamma, lambda_, clean_method)
    err_lambda[index_lambda] = np.mean(err_cv_val)
# best_lambda chosen to minimize the mean generalization error.
# np.argmin would return the position of a NaN entry if any run produced one;
# np.nanargmin skips NaNs (and raises ValueError if every entry is NaN).
best_lambda_index = np.nanargmin(err_lambda)
best_lambda = lambdas[best_lambda_index]
print(best_lambda)
# Also report the corresponding error value.
print("Mean validation error for best lambda: " + str(err_lambda[best_lambda_index]))

In [None]:
# Re-run the cross-validation at the selected penalty to obtain its errors and accuracy.
# (Author's note kept: it would probably be smarter to get these directly from the search above.)
method, lambda_ = 'ridge-regression', best_lambda
err_cv_tr_ridge, err_cv_te_ridge, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(np.mean(err_cv_tr_ridge), np.var(err_cv_tr_ridge), accuracy)

In [None]:
# Grid-search the step size gamma for the 'log-newton' method with k-fold
# cross-validation and keep the value with the smallest mean validation error.
# lambda_ is whatever an earlier cell left in the kernel; the batch-size slot is fixed to 1.
max_iters = 10
method = 'log-newton'
gammas = np.logspace(-10, 0, 30)
# Pre-fill with NaN so entries that were never computed cannot be mistaken for results.
err_gamma = np.full(len(gammas), np.nan)
for index_gamma, gamma in enumerate(gammas):
    _, err_cv_val, accuracy = cross_validation(y, x, k_indices, k, method, 1, max_iters, gamma, lambda_, clean_method)
    err_gamma[index_gamma] = np.mean(err_cv_val)
# best_gamma chosen to minimize the mean generalization error.
# np.argmin would return the position of a NaN entry if any run produced one;
# np.nanargmin skips NaNs (and raises ValueError if every entry is NaN).
best_gamma_index = np.nanargmin(err_gamma)
best_gamma = gammas[best_gamma_index]
print(best_gamma)
# Also report the corresponding error value.
print("Mean validation error for best gamma: " + str(err_gamma[best_gamma_index]))

In [None]:
# Probably would be smarter to get the errors directly above ...
# Re-run the cross-validation at the selected step size to obtain its errors and accuracy.
# Set the method explicitly (as the ridge cell does) rather than relying on
# whatever value an earlier cell left in `method`.
method = 'log-newton'
gamma = best_gamma
err_cv_tr_newton, err_cv_te_newton, accuracy = cross_validation(y, x, k_indices, k, method, batch_size, max_iters, gamma, lambda_, clean_method)
# Previous name of the test-error result, kept so anything still reading it keeps working.
err_cv_te = err_cv_te_newton
# Mean and variance of the training errors, followed by the accuracy.
print(np.mean(err_cv_tr_newton), np.var(err_cv_tr_newton), accuracy)

In [None]:
## TO DO :
# Repeat the comparison for each clean_method in ['raw', '', '0', 'mean', 'median']
# and for all 3 methods ('least-squares', 'ridge-regression', 'log-newton')
# Report the accuracy as well
# Write a script that tunes every hyper-parameter (gamma for Newton and lambda for ridge)?


## Checking if all functions work well but not to be included 

In [None]:
# Cross-validation setup for the sanity-check cells below.
seed = 20        # seed handed to build_k_indices
k = k_fold = 10  # ten fold CV
x, y = X, Y
k_indices = build_k_indices(y, k_fold, seed)

In [None]:
# Variable selecting the feature-cleaning method (see function clean_features).
# Other values used or mentioned in this notebook: 'raw', '', '0', 'mean'.
clean_method = 'median'

### Least squares (normal equations)

In [None]:
# Sanity check of 'least-squares'. The four literals sit in the positions
# where the other cells pass batch_size, max_iters, gamma and lambda_.
method = 'least-squares'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    1, 1, 0, 0,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(*(stat(err_cv_tr) for stat in (np.mean, np.var)), accuracy)

### Least squares GD 

In [None]:
# Sanity check of 'least-squares-GD'. The hyper-parameters assigned here stay
# in the kernel and are reused by the cells that follow.
method = 'least-squares-GD'
max_iters, gamma = 50, 0.01
batch_size, lambda_ = 1, 0
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

### Least-squares SGD  

In [None]:
# Sanity check of 'least-squares-SGD'; batch_size, max_iters, gamma and lambda_
# are whatever the previous cells left in the kernel.
method = 'least-squares-SGD'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(*(stat(err_cv_tr) for stat in (np.mean, np.var)), accuracy)

### Ridge-regression

In [None]:
# Sanity check of 'ridge-regression' with a fixed penalty of 1.
# lambda_ stays at 1 for the cells that follow.
method, lambda_ = 'ridge-regression', 1
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

### Logistic regression

In [None]:
# Sanity check of the 'log' method, reusing the hyper-parameters left in the
# kernel by the previous cells.
method = 'log'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(*(stat(err_cv_tr) for stat in (np.mean, np.var)), accuracy)

In [None]:
# Sanity check of the 'regularized-log' method; lambda_ and the other
# hyper-parameters are taken as left in the kernel by the previous cells.
method = 'regularized-log'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

In [None]:
# Sanity check of the 'log-newton' method, again with the hyper-parameters
# left in the kernel by the previous cells.
method = 'log-newton'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(*(stat(err_cv_tr) for stat in (np.mean, np.var)), accuracy)

## Optimizing models with cross-validation

In [None]:
# CV testing gamma and lambda inside same loop:
# for every (lambda_, gamma) pair, run the k-fold cross-validation of the
# 'regularized-log' method and record the mean validation error and the accuracy.

X, Y = load_data()
k = 10 # ten fold CV
k_fold = k
y = Y
x = X
seed = 20
k_indices = build_k_indices(y, k_fold, seed)
clean_method = '0'
batch_size = 1
max_iters = 100

method = 'regularized-log'
lambdas = np.logspace(-5, 3, 8)
gammas = np.logspace(-20, 2, 13)
# Pre-fill with NaN (instead of np.empty's arbitrary memory) so that grid
# points never reached, e.g. after an interrupted run, are not mistaken for results.
err = np.full([len(lambdas), len(gammas)], np.nan)
acc = np.full([len(lambdas), len(gammas)], np.nan)
for index_lambda, lambda_ in enumerate(lambdas):
    print("L: " + str(index_lambda) + "  "+str(lambda_))
    for index_gamma, gamma in enumerate(gammas):
        print("G: " + str(index_gamma) + "  "+ str(gamma))
        err_cv_tr, err_cv_val, accuracy = cross_validation(y, x, k_indices, k, method, batch_size, max_iters, gamma, lambda_, clean_method)
        err[index_lambda, index_gamma] = np.mean(err_cv_val)
        print(np.mean(err_cv_val))
        acc[index_lambda, index_gamma] = accuracy
        print(accuracy)
print(err)
print(acc)
# np.min/np.max return NaN as soon as one entry is NaN, and `err == nan`
# matches nothing, which made best_combination[0][0] raise IndexError.
# The nan-aware reductions ignore such entries.
if np.all(np.isnan(err)):
    raise ValueError("every (lambda, gamma) combination produced a NaN validation error")
best_combination = np.argwhere(err == np.nanmin(err))
print("least error where :" + str(best_combination))
best_acc = np.argwhere(acc == np.nanmax(acc))
print("highest accuracy where :" + str(best_acc))
# On ties, the first (row-major) minimum is used.
best_lambda = lambdas[best_combination[0][0]]
best_gamma = gammas[best_combination[0][1]]
print("Best lambda: " + str(best_lambda))
print("Best gamma: " + str(best_gamma))
# Also report the corresponding error value.
print("Best mean validation error: " + str(err[best_combination[0][0], best_combination[0][1]]))


L: 0  1e-05
G: 0  1e-20
1.334083679995146
0.63732
G: 1  6.812920690579594e-19
1.3896232079883268
0.63452
G: 2  4.6415888336127915e-17
1.3235079688244678
0.64688
G: 3  3.1622776601683794e-15
1.334666427045938
0.62844
G: 4  2.154434690031878e-13
1.4139273031236523
0.63384
G: 5  1.4677992676220676e-11
1.3103149777789938
0.6392
G: 6  1e-09
1.3788497132209465
0.63776
G: 7  6.812920690579594e-08
1.3985606213192072
0.63004
G: 8  4.641588833612773e-06
1.3058494780947647
0.63156
G: 9  0.00031622776601683794
1.4006873461660843
0.6318
G: 10  0.021544346900318777
1.7803239305650482
0.63504
G: 11  1.4677992676220615
4.6434891224379395
0.60944
G: 12  100.0
4.585964258206543
0.60976
L: 1  0.00013894954943731373
G: 0  1e-20
1.3190401480150595
0.63156
G: 1  6.812920690579594e-19
1.3136751053047582
0.63456
G: 2  4.6415888336127915e-17
1.323269815714609
0.63168
G: 3  3.1622776601683794e-15
1.37062609698761
0.62128
G: 4  2.154434690031878e-13
1.3431545547722046
0.63668
G: 5  1.4677992676220676e-11
1.42112

In [None]:
# Ridge regression evaluated at the best_lambda currently in the kernel.
# NOTE(review): the grid-search cell above tunes best_lambda for 'regularized-log',
# not for ridge regression — confirm that reusing it here is intended.
method, lambda_ = 'ridge-regression', best_lambda
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the training errors, followed by the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)