In [6]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
import math
import time
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns

from global_variables import *
from data_preparation import * 
from cost import * 
from cross_validation import *
from performances import * 
from proj1_helpers import * 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the data 

In [7]:
# Load the dataset through the project helper `load_data` (made available by
# one of the star-imports in the first cell). X and Y are presumably the
# feature matrix and the labels -- TODO confirm against the helper.
X, Y = load_data()

## Comparing the performance of least squares, ridge regression and logistic regression (Newton's method)

### Without pre-processing

In [None]:
# Shared cross-validation settings for the comparison cells below.
k = k_fold = 10                  # ten-fold CV
seed = 20                        # seed handed to build_k_indices
x, y = X, Y                      # lower-case aliases used by every call below
k_indices = build_k_indices(y, k_fold, seed)
clean_method = 'raw'             # cleaning method forwarded to cross_validation

In [None]:
# Least squares, evaluated by cross-validation with the current clean_method.
method = 'least-squares'
err_cv_tr_ls, err_cv_te_ls, accuracy = cross_validation(
    y, x, k_indices, k, method,
    1, 1, 0, 0,          # fills the batch_size, max_iters, gamma, lambda_ slots
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr_ls), np.var(err_cv_tr_ls), accuracy)

In [None]:
# Grid-search the ridge penalty lambda_ with k-fold cross-validation and keep
# the value whose mean validation error is the smallest.
method = 'ridge-regression'
lambdas = np.logspace(-10, -8, 30) # Range through "dichotomous" search
err_lambda = np.empty(len(lambdas))
for index_lambda, lambda_ in enumerate(lambdas):
    # batch_size, max_iters and gamma are not assigned in any cell above this
    # one, so on a fresh kernel they would be undefined. Pass the same explicit
    # placeholders (1, 1, 0) as the least-squares call instead.
    # NOTE(review): assumes 'ridge-regression' ignores these three arguments
    # (closed-form solve) -- confirm in cross_validation.
    _, err_cv_val, accuracy = cross_validation(y, x, k_indices, k, method, 1, 1, 0, lambda_, clean_method)
    err_lambda[index_lambda] = np.mean(err_cv_val)
# best_lambda chosen to minimize the mean generalization error
best_index = np.argmin(err_lambda)
best_lambda = lambdas[best_index]
# Report the selected lambda together with its mean validation error.
print(best_lambda, err_lambda[best_index])

In [None]:
# Re-run cross-validation with the selected lambda to obtain its error arrays.
# Probably would be smarter to get the errors directly above ...
method = 'ridge-regression'
lambda_ = best_lambda
# batch_size, max_iters and gamma are not assigned in any cell above this one,
# so explicit placeholders (1, 1, 0) are passed, as in the least-squares call.
# NOTE(review): assumes 'ridge-regression' ignores these three arguments
# (closed-form solve) -- confirm in cross_validation.
err_cv_tr_ridge, err_cv_te_ridge, accuracy = cross_validation(y, x, k_indices, k, method, 1, 1, 0, lambda_, clean_method)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr_ridge), np.var(err_cv_tr_ridge), accuracy)

In [None]:
# Cross-validated grid search over gamma for the 'log-newton' method.
method = 'log-newton'
max_iters = 10
gammas = np.logspace(-10, 0, 30)
n_gammas = gammas.shape[0]
err_gamma = np.empty(n_gammas)
for index_gamma in range(n_gammas):
    gamma = gammas[index_gamma]
    _, err_cv_val, accuracy = cross_validation(
        y, x, k_indices, k, method,
        1, max_iters, gamma, lambda_,
        clean_method,
    )
    # Score each candidate by its mean validation error across the folds.
    err_gamma[index_gamma] = np.mean(err_cv_val)
# best_gamma chosen to minimize the mean generalization error
best_gamma = gammas[err_gamma.argmin()]
# The code should also provide the corresponding error values
print(best_gamma)

In [None]:
# Re-run cross-validation with the selected gamma to obtain its error arrays.
# Probably would be smarter to get the errors directly above ...
method = 'log-newton'  # set explicitly so the cell does not depend on the previous cell's value
gamma = best_gamma
# The gamma search passed a literal batch size of 1, and batch_size is not
# assigned in any cell above this one, so pass the same literal here.
err_cv_tr_newton, err_cv_te, accuracy = cross_validation(y, x, k_indices, k, method, 1, max_iters, gamma, lambda_, clean_method)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr_newton), np.var(err_cv_tr_newton), accuracy)

In [None]:
## TODO:
# Repeat the comparison for every clean_method in ['raw', '', '0', 'mean', 'median']
# and for all 3 methods ('least-squares', 'ridge-regression', 'log-newton').
# Report the accuracy as well.
# Write a script that tunes all hyper-parameters (gamma for Newton and lambda for ridge)?


## Checking that all the functions work well (not to be included)

In [3]:
# Cross-validation settings reused by the sanity-check cells below.
seed = 20                        # seed handed to build_k_indices
k = k_fold = 10                  # ten-fold CV
y, x = Y, X                      # lower-case aliases used by every call below
k_indices = build_k_indices(y, k_fold, seed)

In [25]:
# Cleaning method forwarded as the last argument of every cross_validation
# call below (see function clean_features for the accepted values).
clean_method = 'median'

### Least squares (normal equations)

In [26]:
# Least squares, evaluated by cross-validation with the current clean_method.
method = 'least-squares'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    1, 1, 0, 0,          # fills the batch_size, max_iters, gamma, lambda_ slots
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

0.1050349859132332 4.4506333242015705e-06 0.6604


### Least squares GD 

In [22]:
# Least squares via 'least-squares-GD'; the hyper-parameters set here are
# reused by the cells that follow.
method = 'least-squares-GD'
max_iters, gamma, batch_size, lambda_ = 50, 0.01, 1, 0
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

2.485690163205983 0.000652823197192353 0.53872


### Least-squares SGD  

In [17]:
# Same hyper-parameters as the previous cell, but with 'least-squares-SGD'.
method = 'least-squares-SGD'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

4.982328643014851 101.6781758771682 0.54836


### Ridge-regression

In [19]:
# Ridge regression with a fixed penalty of 1.
lambda_ = 1
method = 'ridge-regression'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

0.14997944722255724 2.3106544162739332e-08 0.66008


### Logistic regression

In [27]:
# Cross-validate the 'log' method with the hyper-parameters currently in scope.
method = 'log'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

1.410827435481886 0.006138371482016846 0.63748


In [28]:
# Cross-validate the 'regularized-log' method with the hyper-parameters
# currently in scope.
method = 'regularized-log'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

1.4768643895425262 0.006898119087476211 0.62632


In [29]:
# Cross-validate the 'log-newton' method with the hyper-parameters currently
# in scope.
method = 'log-newton'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)

1.3539844281288227 0.006609383366124287 0.62844


## Optimizing models with cross-validation

In [None]:
# Cross-validated grid search over the ridge penalty lambda_.
method = 'ridge-regression'
lambdas = np.logspace(-4, 0, 30)
n_lambdas = lambdas.shape[0]
err_lambda = np.empty(n_lambdas)
for index_lambda in range(n_lambdas):
    lambda_ = lambdas[index_lambda]
    err_cv_tr, err_cv_val, accuracy = cross_validation(
        y, x, k_indices, k, method,
        batch_size, max_iters, gamma, lambda_,
        clean_method,
    )
    # Score each candidate by its mean validation error across the folds.
    err_lambda[index_lambda] = np.mean(err_cv_val)
# Keep the candidate with the smallest mean validation error.
best_lambda = lambdas[err_lambda.argmin()]
# The code should also provide the corresponding error values
print(best_lambda)

In [None]:
# Re-run cross-validation with the penalty selected by the search above.
lambda_ = best_lambda
method = 'ridge-regression'
err_cv_tr, err_cv_te, accuracy = cross_validation(
    y, x, k_indices, k, method,
    batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the first returned error array, then the accuracy.
print(np.mean(err_cv_tr), np.var(err_cv_tr), accuracy)