In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
import math
import time
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns

from global_variables import *
from data_preparation import * 
from cost import * 
from cross_validation import *
from performances import * 
from proj1_helpers import * 

## Load the data 

In [2]:
# Load the dataset as the pair (X, Y); later cells alias these as x and y.
# load_data is not defined in this notebook — it comes from one of the
# star-imported project modules.
X, Y = load_data()

## Comparing the performance of least squares, ridge regression and logistic regression (Newton's method)

### Without pre-processing

In [13]:
# Cross-validation setup shared by the comparison cells of this section.
seed = 20               # passed to build_k_indices
clean_method = 'raw'    # cleaning option forwarded to cross_validation
x, y = X, Y             # working aliases for the loaded data
k = k_fold = 10         # ten-fold CV
k_indices = build_k_indices(y, k_fold, seed)

In [14]:
# Least-squares baseline. The literals None, 1, 1, 0, 0 occupy the positional
# slots that other cells fill with initial_w, batch_size, max_iters, gamma, lambda_.
method = 'least-squares'
err_cv_tr_ls, err_cv_te_ls = cross_validation(
    y, x, k_indices, k, method,
    None, 1, 1, 0, 0,
    clean_method,
)
# Mean and variance of the returned training errors.
print(np.mean(err_cv_tr_ls), np.var(err_cv_tr_ls))

0.10578006553351726 1.4943921777054187e-05


In [17]:
# Grid search over the ridge penalty: keep the lambda with the lowest mean
# cross-validated validation error.
method = 'ridge-regression'
lambdas = np.logspace(-10, -8, 30)  # range obtained through a "dichotomous" search
err_lambda = np.empty(len(lambdas))
for index_lambda, lambda_ in enumerate(lambdas):
    # The iterative-solver slots (initial_w, batch_size, max_iters, gamma) get the
    # same placeholders as the least-squares call, instead of notebook globals that
    # are only assigned further down the notebook; the cell then no longer depends
    # on out-of-order execution.
    # NOTE(review): assumes cross_validation ignores these for ridge regression — confirm.
    _, err_cv_val = cross_validation(y, x, k_indices, k, method, None, 1, 1, 0, lambda_, clean_method)
    err_lambda[index_lambda] = np.mean(err_cv_val)
# best_lambda is chosen to minimize the mean validation error
best_lambda = lambdas[np.argmin(err_lambda)]
# TODO: also report the error value reached at best_lambda.
# NOTE(review): the recorded run selected 1e-10, the lower edge of this grid, so the
# minimum may lie below the searched range.
print(best_lambda)

1e-10


In [18]:
# Re-run the cross-validation at the selected lambda to get the training/test errors.
# (It would be cheaper to keep the errors from the search loop directly.)
method = 'ridge-regression'
lambda_ = best_lambda
# Placeholders for initial_w, batch_size, max_iters, gamma (same as the least-squares
# call) so the cell does not read globals that are only assigned further down.
# NOTE(review): assumes cross_validation ignores these for ridge regression — confirm.
err_cv_tr_ridge, err_cv_te_ridge = cross_validation(y, x, k_indices, k, method, None, 1, 1, 0, lambda_, clean_method)
print(np.mean(err_cv_tr_ridge), np.var(err_cv_tr_ridge))

0.1057800691212261 1.4943838914801378e-05


In [20]:
# Grid search over the step size gamma for the 'log-newton' method: keep the
# gamma with the lowest mean cross-validated validation error.
method = 'log-newton'
max_iters = 500
# Set the solver inputs here rather than relying on values left in the kernel by
# cells that appear further down the notebook (same values as those cells use).
initial_w = np.zeros(x.shape[1])
batch_size = 1
gammas = np.logspace(-10, 0, 30)
err_gamma = np.empty(len(gammas))
# NOTE(review): 30 step sizes, each cross-validated over k folds with max_iters = 500;
# the recorded run of this cell was interrupted (KeyboardInterrupt), so best_gamma
# was never produced. lambda_ is whatever the preceding ridge cell left behind.
for index_gamma, gamma in enumerate(gammas):
    _, err_cv_val = cross_validation(y, x, k_indices, k, method, initial_w, batch_size, max_iters, gamma, lambda_, clean_method)
    err_gamma[index_gamma] = np.mean(err_cv_val)
# best_gamma is chosen to minimize the mean validation error
best_gamma = gammas[np.argmin(err_gamma)]
# TODO: also report the error value reached at best_gamma.
print(best_gamma)

KeyboardInterrupt: 

In [None]:
# Re-run the cross-validation at the selected step size to get the training/test errors.
# (It would be cheaper to keep the errors from the search loop directly.)
# method is set explicitly, as in the ridge cell, instead of relying on the value
# left by the search cell; initial_w, batch_size, max_iters and lambda_ are still
# the ones left in the namespace by the cells above.
method = 'log-newton'
gamma = best_gamma
# The test-error name follows the *_ls / *_ridge convention of the sibling cells.
err_cv_tr_newton, err_cv_te_newton = cross_validation(y, x, k_indices, k, method, initial_w, batch_size, max_iters, gamma, lambda_, clean_method)
print(np.mean(err_cv_tr_newton), np.var(err_cv_tr_newton))

In [None]:
## TODO:
# Repeat this for clean_method in ['raw', '', '0', 'mean', 'media']
# and for all 3 methods ('least-squares', 'ridge-regression', 'log-newton')
# Add the accuracy
# Write a script that works for all hyper-parameters (gamma for Newton and lambda for ridge)?


## Checking that all functions work (not to be included)

In [3]:
# Fold setup for the checks below (same seed and fold count as the comparison section).
seed = 20
x, y = X, Y             # working aliases for the loaded data
k = k_fold = 10         # ten-fold CV
k_indices = build_k_indices(y, k_fold, seed)

In [4]:
# Cleaning method forwarded to cross_validation by the cells below
# (see the clean_features function for the available options).
clean_method = '0'

### Least squares (normal equations)

In [5]:
# Least squares via the normal equations. The literals None, 1, 1, 0, 0 occupy the
# positional slots that the later cells fill with initial_w, batch_size, max_iters,
# gamma, lambda_.
method = 'least-squares'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    None, 1, 1, 0, 0,
    clean_method,
)
# Mean and variance of the returned training errors.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

0.10628603762801525 5.516271597259191e-06


### Least squares GD 

In [6]:
# Least squares with gradient descent. The settings assigned here stay in the
# namespace and are reused by the cells that follow.
method = 'least-squares-GD'
initial_w = None
batch_size = 1
max_iters = 50
gamma = 0.01
lambda_ = 0
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the returned training errors.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

1.9897983158778125 0.01956881224911784


### Least-squares SGD  

In [7]:
# Least squares with stochastic gradient descent; initial_w, batch_size, max_iters,
# gamma and lambda_ are the values set in the gradient-descent cell.
method = 'least-squares-SGD'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the returned training errors.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

1.1656511456357348 0.5725855745062298


### Ridge-regression

In [8]:
# Ridge regression with a fixed penalty of 1; the remaining solver arguments are
# the ones already in the namespace.
lambda_ = 1
method = 'ridge-regression'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the returned training errors.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

0.1500700532153169 3.042030522074035e-08


### Logistic regression

In [9]:
# Logistic regression ('log' method) started from an all-zero weight vector with
# one entry per column of x.
initial_w = np.zeros(x.shape[1], dtype=float)
method = 'log'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the returned training errors.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

1.9842606890968806 0.00592770602653977


In [10]:
# Regularized logistic regression from an all-zero start. lambda_ is not set here:
# it still holds the value 1 assigned in the ridge-regression check.
initial_w = np.zeros(x.shape[1], dtype=float)
method = 'regularized-log'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the returned training errors.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

3.4687536518687443 0.10673300434066402


## Optimizing models with cross-validation

In [11]:
# Grid search over the ridge penalty: one mean validation error per candidate lambda.
method = 'ridge-regression'
lambdas = np.logspace(-4, 0, 30)
err_lambda = np.empty(lambdas.shape)
for index_lambda, lambda_ in enumerate(lambdas):
    err_cv_tr, err_cv_val = cross_validation(
        y, x, k_indices, k, method,
        initial_w, batch_size, max_iters, gamma, lambda_,
        clean_method,
    )
    err_lambda[index_lambda] = np.mean(err_cv_val)
# Keep the candidate with the smallest mean validation error.
best_lambda = lambdas[err_lambda.argmin()]
# TODO: also report the error value reached at best_lambda.
print(best_lambda)

0.0001


In [12]:
# Re-run the cross-validation at the penalty selected by the grid search.
lambda_ = best_lambda
method = 'ridge-regression'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
    clean_method,
)
# Mean and variance of the returned training errors.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

0.10631253516169892 5.341317015924967e-06
