In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from model import RidgeRegression
from selection import KFoldCV, NestedCV

## Procediamo a fare il training e valutiamo i risultati con cross validation

Carichiamo il dataset elaborato ottenuto dal notebook elaboration e definiamo la funzione di perdita

In [2]:
# Load the preprocessed dataset (produced by the "elaboration" notebook, per the text above).
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load pickle files that come from a trusted source.
data = pd.read_pickle("./elaborated.pkl")

def quadratic_loss(y_true, y_pred):
    """Return the squared difference between a target and its prediction.

    The computation is purely element-wise: it works on anything that supports
    subtraction and exponentiation (plain numbers, NumPy arrays, pandas
    objects) and performs no aggregation over the result.

    Args:
        y_true: Observed value(s).
        y_pred: Predicted value(s).

    Returns:
        ``(y_true - y_pred) ** 2`` with the same type/shape the operands yield.
    """
    residual = y_true - y_pred
    return residual ** 2

Definiamo il numero di fold e l'insieme di iperparametri

In [3]:
# Number of folds for the plain k-fold cross-validation.
folds = 5
# print=True presumably enables the per-fold "Scaled error" lines seen in the
# outputs of the following cells — TODO confirm in selection.KFoldCV.
kfolds = KFoldCV(folds, print=True)
# Candidate `alfa` values.
# NOTE(review): this list is not read by the k-fold cells that follow (they use a
# fixed alfa) and `alfas` is reassigned before the nested CV is run.
alfas = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 50, 100]

Facciamo training del modello di regressione e valutiamo i risultati con cross validation

In [4]:
# Ridge model with a fixed alfa=20 (the value the nested-CV log further down
# reports as best on the internal folds of the first external fold).
ridge = RidgeRegression(alfa=20)

# Cross-validate against the "median_house_value" column (presumably the
# regression target). Per the captured output, the call prints one
# "Scaled error" per fold and returns a single number equal to their mean.
errors = kfolds.cross_validate(ridge, data, "median_house_value", quadratic_loss)
errors

Scaled error: 0.3427984533378825
Scaled error: 0.2675400399572118
Scaled error: 0.27673962045109407
Scaled error: 0.33503643449006953
Scaled error: 0.2475441081196953


0.2939317312711907

## Proviamo con PCA 

In [5]:
# Load the second dataset; presumably the PCA-transformed version of the data
# (inferred from the file name and section title) — TODO confirm.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load pickle files that come from a trusted source.
dataPCA = pd.read_pickle("./pca.pkl")
# Same k-fold evaluation, reusing the `ridge` instance and `kfolds` splitter
# from the earlier cells. This overwrites the previous value of `errors`.
errors = kfolds.cross_validate(ridge, dataPCA, "median_house_value", quadratic_loss)
errors

Scaled error: 0.4647432639238174
Scaled error: 0.3646435413062233
Scaled error: 0.3160491323534456
Scaled error: 0.5022527737879804
Scaled error: 0.32577089729390135


0.3946919217330736

## Proviamo a fare una stima migliore dell'errore con Nested CV

Definiamo il numero di fold interni ed esterni e l'insieme di iperparametri

In [6]:
# Fold count for the inner loop of the nested cross-validation.
internal_folds = 3
# Presumably NestedCV(external_folds, internal_folds): the outer count reuses
# `folds` from the k-fold setup — TODO confirm argument order in selection.NestedCV.
nestedKFolds = NestedCV(folds, internal_folds)
# Candidate `alfa` values tried by the nested CV. Replaces the earlier `alfas`
# list (same values without 1e-15 and 1e-10).
alfas = [1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 50, 100]

Facciamo la stima dell'errore con Nested Cross Validation

In [7]:
# The RidgeRegression class itself (not an instance) is passed together with the
# candidate alfas. Per the captured log, for every external fold each candidate
# is scored on the internal folds, and the best one is then evaluated on that
# external fold; the call returns a single number.
error = nestedKFolds.cross_validate(RidgeRegression, data, "median_house_value", quadratic_loss, alfas)
error

External fold num: 1
Testing with value 1e-08 with error : 0.29721157002230725
Testing with value 0.0001 with error : 0.2972149340175818
Testing with value 0.001 with error : 0.2972149065363545
Testing with value 0.01 with error : 0.2972146370416495
Testing with value 1 with error : 0.2971858474995795
Testing with value 5 with error : 0.29708615712183123
Testing with value 10 with error : 0.2969957405170694
Testing with value 20 with error : 0.2969108718263865
Testing with value 50 with error : 0.2971803471509279
Testing with value 100 with error : 0.2984750290498222
The best parameter on interal folds is 20.0, with error on external fold: 0.3427984533378825

External fold num: 2
Testing with value 1e-08 with error : 0.32151280441087227
Testing with value 0.0001 with error : 0.3215100900517808
Testing with value 0.001 with error : 0.3215101078763025
Testing with value 0.01 with error : 0.32151028882843774
Testing with value 1 with error : 0.3215303794789185
Testing with value 5 with er

0.29369169477494145