In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
import math
import time
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns

from global_variables import *
from data_preparation import * 
from cost import * 
from cross_validation import *
from performances import * 

## Load the data 

In [2]:
# Load the data set through the project helper `load_data` (it is not defined
# in this notebook, so it comes from one of the star-imported local modules).
# X is indexed as a 2-D array below (X[:, 22]); Y is later passed to the
# cross-validation helpers as the target vector.
X, Y = load_data()

In [3]:
# Split the samples into four groups according to the value found in
# column 22 (0, 1, 2 or 3), then print each group's share of all rows in %.
X0, X1, X2, X3 = (X[X[:, 22] == value] for value in (0, 1, 2, 3))
print(*(group.shape[0] / X.shape[0] * 100 for group in (X0, X1, X2, X3)))

39.9652 31.0176 20.1516 8.8656


In [4]:
# For every group, count the entries equal to -999 in each column ...
invalids0, invalids1, invalids2, invalids3 = (
    np.count_nonzero(group == -999, axis=0) for group in (X0, X1, X2, X3)
)

# ... and print those counts as a percentage of the group's rows,
# one array per group (groups 0 to 3, in that order).
print(
    *(
        counts / group.shape[0] * 100
        for counts, group in zip(
            (invalids0, invalids1, invalids2, invalids3), (X0, X1, X2, X3)
        )
    ),
    sep="\n",
)

[ 26.1457468   0.          0.          0.        100.        100.
 100.          0.          0.          0.          0.          0.
 100.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.        100.
 100.        100.        100.        100.        100.          0.       ]
[  9.7518828   0.          0.          0.        100.        100.
 100.          0.          0.          0.          0.          0.
 100.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.        100.        100.        100.          0.       ]
[5.85958435 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]
[6.66395957 0.      

## Doing cross-validation to evaluate the different models

In [5]:
# Shared setup for the baseline runs: every method is tested on the data
# as loaded, without any pre-processing.
k = k_fold = 10  # ten-fold cross-validation
x, y = X, Y  # plain aliases of the loaded arrays (no copy is made)
seed = 20  # handed to build_k_indices together with the fold count
k_indices = build_k_indices(y, k_fold, seed)

### Least squares (normal equations)

In [6]:
# Least squares baseline. The optimiser-related arguments are given fixed
# dummy values here (presumably unused by this method - confirm in
# cross_validation).
method = 'least-squares'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w=None, batch_size=1, max_iters=1, gamma=0, lambda_=0,
)
# Mean and variance of the training errors returned by cross_validation.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

0.14357066574693128 1.5762202551039412e-08


### Least squares GD 

In [7]:
# Least squares fitted with gradient descent.
method = 'least-squares-GD'
# Optimiser settings. These names stay defined in the kernel and are reused
# as-is by the cells that follow.
initial_w, batch_size = None, 1
max_iters, gamma = 50, 0.01
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method, initial_w, batch_size, max_iters, gamma
)
# Mean and variance of the training errors returned by cross_validation.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

2.03517588710915 2.054764826565965e-06


### Least-squares SGD  

In [8]:
# Least squares fitted with stochastic gradient descent. initial_w,
# batch_size, max_iters and gamma are the values set for the
# gradient-descent run.
method = 'least-squares-SGD'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method, initial_w, batch_size, max_iters, gamma
)
# Mean and variance of the training errors returned by cross_validation.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

1.6592329031661646 0.8999233018292113


### Ridge-regression

In [9]:
# Ridge regression with lambda_ fixed to 1. The other arguments are the
# values left over from the previous cells.
lambda_ = 1
method = 'ridge-regression'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
)
# Mean and variance of the training errors returned by cross_validation.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

0.16009767629944566 1.6876341250484806e-08


### Logistic regression

In [10]:
# Logistic regression (method 'log'), started from an all-zero weight vector
# with one entry per column of x.
# NOTE(review): the recorded run of this cell printed "nan nan" next to
# warnings raised on the np.log terms of the loss - probably log(0) on these
# unscaled features; confirm in the loss implementation.
method = 'log'
initial_w = np.zeros(x.shape[1])
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
)
# Mean and variance of the training errors returned by cross_validation.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

  loss = - (y.T @ np.log(predictions) + (1-y).T @ np.log(1-predictions))
  loss = - (y.T @ np.log(predictions) + (1-y).T @ np.log(1-predictions))


nan nan


In [11]:
# Regularised logistic regression, started from an all-zero weight vector
# with one entry per column of x; lambda_ keeps the value set earlier.
# NOTE(review): the recorded run of this cell printed "nan nan" next to
# warnings raised on the np.log terms of the loss - probably log(0) on these
# unscaled features; confirm in the loss implementation.
method = 'regularized-log'
initial_w = np.zeros(x.shape[1])
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
)
# Mean and variance of the training errors returned by cross_validation.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

  loss = - (y.T @ np.log(predictions) + (1-y).T @ np.log(1-predictions)) + lambda_ * w.T @ w
  loss = - (y.T @ np.log(predictions) + (1-y).T @ np.log(1-predictions)) + lambda_ * w.T @ w


nan nan


## Optimizing models with cross-validation

In [12]:
# Grid search for the ridge penalty: try 30 log-spaced lambdas between 1e-4
# and 1 and keep the one with the lowest mean validation error.
method = 'ridge-regression'
lambdas = np.logspace(-4, 0, 30)
err_lambda = np.empty(len(lambdas))
for index_lambda, lambda_ in enumerate(lambdas):
    err_cv_tr, err_cv_val = cross_validation(
        y, x, k_indices, k, method,
        initial_w, batch_size, max_iters, gamma, lambda_,
    )
    # Score each candidate by its mean validation error.
    err_lambda[index_lambda] = np.mean(err_cv_val)

best_index = np.argmin(err_lambda)
best_lambda = lambdas[best_index]
print(best_lambda)
# Report the error reached by the selected lambda as well, so the choice can
# be compared with the other models.
print("mean validation error:", err_lambda[best_index])
# A minimum on the first or last grid point means the search did not bracket
# the optimum; say so instead of silently returning the edge value.
if best_index in (0, len(lambdas) - 1):
    print("note: best lambda lies on the edge of the searched grid; "
          "consider widening the range")

0.0001


In [13]:
# Run ridge regression once more, this time with the lambda picked by the
# grid search (best_lambda).
lambda_ = best_lambda
method = 'ridge-regression'
err_cv_tr, err_cv_te = cross_validation(
    y, x, k_indices, k, method,
    initial_w, batch_size, max_iters, gamma, lambda_,
)
# Mean and variance of the training errors returned by cross_validation.
print(np.mean(err_cv_tr), np.var(err_cv_tr))

0.14372634577696095 1.5447761705408536e-08
