In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from model import RidgeRegression
from selection import KFoldCV

## Elaborazione del dataset

Leggiamo i dati dal file CSV

In [2]:
# Load the housing dataset from the CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer="cal-housing.csv")
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


Dalle analisi precedenti, sostituiamo nei record il valore ISLAND con NEAR OCEAN

In [3]:
# Fold the "ISLAND" category into "NEAR OCEAN" (the replacement is applied
# frame-wide), then display how many records each category has.
data = data.replace(to_replace="ISLAND", value="NEAR OCEAN")
data["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2663
NEAR BAY      2290
Name: ocean_proximity, dtype: int64

Scegliamo come trattare i valori mancanti (NaN): eliminare le righe che li contengono oppure sostituirli con la mediana della colonna

In [4]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how="any")

In [5]:
# Fill missing values with the per-column median.
# `numeric_only=True` is required because `data` still contains the string
# column `ocean_proximity`: on pandas >= 2.0 a plain `data.median()` raises a
# TypeError, while older versions silently skipped non-numeric columns.
# NOTE: when run right after the `dropna()` cell above, no NaN is left and this
# statement leaves `data` unchanged.
data = data.fillna(data.median(numeric_only=True))

Aggiungiamo delle feature derivandole da quelle che già abbiamo

In [6]:
# Build `data1` as a new frame (leaving `data` untouched) with three ratio
# features appended after the existing columns.
data1 = data.assign(
    rooms_per_household=data["total_rooms"] / data["households"],
    population_per_household=data["population"] / data["households"],
    bedrooms_per_room=data["total_bedrooms"] / data["total_rooms"],
)
data1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,2.555556,0.146591
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,2.109842,0.155797
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,2.802260,0.129516
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,2.547945,0.184458
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,2.181467,0.172096
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,5.045455,2.560606,0.224625
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,6.114035,3.122807,0.215208
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,5.205543,2.325635,0.215173
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,5.329513,2.123209,0.219892


Trasformiamo le colonne con distribuzione asimmetrica (skewed) applicando il logaritmo naturale

In [7]:
# Columns that get the natural-log transform; the list includes the target
# column `median_house_value`.
features = [
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
    "rooms_per_household",
    "population_per_household",
    "bedrooms_per_room",
]
# Overwrite the listed columns of `data1` with their logarithm in one
# vectorised assignment. Re-running this cell applies the log a second time.
data1[features] = np.log(data1[features])
data1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,41.0,6.779922,4.859812,5.774552,4.836282,2.119287,13.022764,NEAR BAY,1.943640,0.938270,-1.920110
1,-122.22,37.86,21.0,8.867709,7.008505,7.783641,7.037028,2.116424,12.789684,NEAR BAY,1.830682,0.746613,-1.859204
2,-122.24,37.85,52.0,7.290975,5.247024,6.206576,5.176150,1.982022,12.771671,NEAR BAY,2.114825,1.030426,-2.043951
3,-122.25,37.85,52.0,7.149917,5.459586,6.324359,5.389072,1.730434,12.740517,NEAR BAY,1.760845,0.935287,-1.690331
4,-122.25,37.85,52.0,7.394493,5.634790,6.336826,5.556828,1.347086,12.743151,NEAR BAY,1.837665,0.779998,-1.759704
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,7.417580,5.924256,6.739337,5.799093,0.444878,11.265745,INLAND,1.618488,0.940244,-1.493325
20636,-121.21,39.49,18.0,6.546785,5.010635,5.874931,4.736198,0.938756,11.252859,INLAND,1.810587,1.138732,-1.536150
20637,-121.22,39.43,17.0,7.720462,6.184149,6.914731,6.070738,0.530628,11.432799,INLAND,1.649724,0.843993,-1.536313
20638,-121.32,39.43,18.0,7.528332,6.013715,6.608001,5.855072,0.624440,11.346871,INLAND,1.673260,0.752929,-1.514617


Trasformiamo la colonna ocean_proximity con il one-hot encoding

In [8]:
# One-hot encode the non-numeric column(s) of `data1` (here `ocean_proximity`).
# The dtype is pinned to uint8, the default before pandas 2.0, so the indicator
# columns are 0/1 integers on every pandas version; pandas >= 2.0 would
# otherwise produce boolean columns.
x = pd.get_dummies(data1, dtype=np.uint8)
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,6.779922,4.859812,5.774552,4.836282,2.119287,13.022764,1.943640,0.938270,-1.920110,0,0,1,0
1,-122.22,37.86,21.0,8.867709,7.008505,7.783641,7.037028,2.116424,12.789684,1.830682,0.746613,-1.859204,0,0,1,0
2,-122.24,37.85,52.0,7.290975,5.247024,6.206576,5.176150,1.982022,12.771671,2.114825,1.030426,-2.043951,0,0,1,0
3,-122.25,37.85,52.0,7.149917,5.459586,6.324359,5.389072,1.730434,12.740517,1.760845,0.935287,-1.690331,0,0,1,0
4,-122.25,37.85,52.0,7.394493,5.634790,6.336826,5.556828,1.347086,12.743151,1.837665,0.779998,-1.759704,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,7.417580,5.924256,6.739337,5.799093,0.444878,11.265745,1.618488,0.940244,-1.493325,0,1,0,0
20636,-121.21,39.49,18.0,6.546785,5.010635,5.874931,4.736198,0.938756,11.252859,1.810587,1.138732,-1.536150,0,1,0,0
20637,-121.22,39.43,17.0,7.720462,6.184149,6.914731,6.070738,0.530628,11.432799,1.649724,0.843993,-1.536313,0,1,0,0
20638,-121.32,39.43,18.0,7.528332,6.013715,6.608001,5.855072,0.624440,11.346871,1.673260,0.752929,-1.514617,0,1,0,0


Normalizziamo il dataset e scaliamo i valori di median_house_value

In [9]:
def normalize(X):
    """Standardise ``X`` by subtracting its mean and dividing by its std.

    The statistics come from ``X.mean()`` and ``X.std()``; for a pandas
    DataFrame these are per-column values, so each column is scaled
    independently.

    A zero standard deviation (constant data) is replaced by 1 so that the
    result is 0 instead of NaN/inf produced by a division by zero.

    Parameters
    ----------
    X : object exposing ``mean()`` and ``std()`` and supporting arithmetic
        (the notebook passes a pandas DataFrame).

    Returns
    -------
    Same kind of object as ``X - X.mean()``, holding the standardised values.
    """
    centered = X - X.mean()
    scale = X.std()
    if np.ndim(scale) == 0:
        # Scalar statistic (e.g. Series input): guard the single value.
        if scale == 0:
            scale = 1.0
    else:
        # One statistic per column: neutralise only the zero entries.
        scale = scale.where(scale != 0, 1.0)
    return centered / scale
# Standardise every column of `x`, i.e. the one-hot indicator columns and
# `median_house_value` as well as the numeric features.
x = x.pipe(normalize)
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327281,1.051692,0.982139,-1.129029,-1.625647,-1.691208,-1.560734,1.858461,1.647690,1.075632,-0.386808,-1.400977,-0.890217,-0.682697,2.828592,-0.384596
1,-1.322290,1.042330,-0.606195,1.647970,1.306579,1.028805,1.440765,1.852378,1.238219,0.664507,-1.094021,-1.151797,-0.890217,-0.682697,2.828592,-0.384596
2,-1.332272,1.037649,1.855723,-0.449269,-1.097237,-1.106310,-1.097204,1.566803,1.206573,1.698679,-0.046750,-1.907645,-0.890217,-0.682697,2.828592,-0.384596
3,-1.337263,1.037649,1.855723,-0.636892,-0.807164,-0.946849,-0.806809,1.032234,1.151844,0.410329,-0.397813,-0.460893,-0.890217,-0.682697,2.828592,-0.384596
4,-1.337263,1.037649,1.855723,-0.311578,-0.568070,-0.929971,-0.578013,0.217704,1.156470,0.689924,-0.970832,-0.744713,-0.890217,-0.682697,2.828592,-0.384596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.288528,-0.280869,-0.173049,-0.385030,-0.247600,-1.699286,-1.439011,-0.107796,-0.379522,0.345114,-0.890217,1.464708,-0.353516,-0.384596
20636,-0.818192,1.805314,-0.844446,-1.439127,-1.419826,-1.555309,-1.697234,-0.649904,-1.461651,0.591370,0.352900,0.169904,-0.890217,1.464708,-0.353516,-0.384596
20637,-0.823183,1.777229,-0.923862,0.121998,0.181616,-0.147572,0.122885,-1.517086,-1.145533,0.005891,-0.734688,0.169238,-0.890217,1.464708,-0.353516,-0.384596
20638,-0.873094,1.777229,-0.844446,-0.133557,-0.050967,-0.562839,-0.171252,-1.317757,-1.296491,0.091553,-1.070716,0.258003,-0.890217,1.464708,-0.353516,-0.384596


In [10]:
# Target vector: taken from `data1`, so at this point the values are already
# log-transformed. Rescale to [0, 1] with min-max scaling, then centre on zero.
y = data1["median_house_value"].copy()
y_min, y_max = y.min(), y.max()
y = (y - y_min) / (y_max - y_min)
y -= y.mean()
y

0        0.267466
1        0.200997
2        0.195860
3        0.186976
4        0.187727
           ...   
20635   -0.233591
20636   -0.237266
20637   -0.185952
20638   -0.210456
20639   -0.195055
Name: median_house_value, Length: 20433, dtype: float64

Rimuoviamo gli outliers

In [11]:
# Outlier filter: keep only the rows in which every standardised column of `x`
# lies within `threshold` (in absolute value).
threshold = 3

buoni = x.abs().le(threshold).all(axis=1)
# Apply the same row mask to features and target so they stay aligned.
x = x.loc[buoni]
y = y.loc[buoni]
len(x)

19614

In [12]:
# Put the filtered features and the rescaled target side by side.
# NOTE(review): `x` already has a `median_house_value` column and `y` carries
# the same name, so the result holds two columns with that label; the cell
# that follows rebinds `data1` to `x`, discarding this frame.
data1 = pd.concat(objs=[x, y], axis="columns", sort=False)
data1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value.1
0,-1.327281,1.051692,0.982139,-1.129029,-1.625647,-1.691208,-1.560734,1.858461,1.647690,1.075632,-0.386808,-1.400977,-0.890217,-0.682697,2.828592,-0.384596,0.267466
1,-1.322290,1.042330,-0.606195,1.647970,1.306579,1.028805,1.440765,1.852378,1.238219,0.664507,-1.094021,-1.151797,-0.890217,-0.682697,2.828592,-0.384596,0.200997
2,-1.332272,1.037649,1.855723,-0.449269,-1.097237,-1.106310,-1.097204,1.566803,1.206573,1.698679,-0.046750,-1.907645,-0.890217,-0.682697,2.828592,-0.384596,0.195860
3,-1.337263,1.037649,1.855723,-0.636892,-0.807164,-0.946849,-0.806809,1.032234,1.151844,0.410329,-0.397813,-0.460893,-0.890217,-0.682697,2.828592,-0.384596,0.186976
4,-1.337263,1.037649,1.855723,-0.311578,-0.568070,-0.929971,-0.578013,0.217704,1.156470,0.689924,-0.970832,-0.744713,-0.890217,-0.682697,2.828592,-0.384596,0.187727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.288528,-0.280869,-0.173049,-0.385030,-0.247600,-1.699286,-1.439011,-0.107796,-0.379522,0.345114,-0.890217,1.464708,-0.353516,-0.384596,-0.233591
20636,-0.818192,1.805314,-0.844446,-1.439127,-1.419826,-1.555309,-1.697234,-0.649904,-1.461651,0.591370,0.352900,0.169904,-0.890217,1.464708,-0.353516,-0.384596,-0.237266
20637,-0.823183,1.777229,-0.923862,0.121998,0.181616,-0.147572,0.122885,-1.517086,-1.145533,0.005891,-0.734688,0.169238,-0.890217,1.464708,-0.353516,-0.384596,-0.185952
20638,-0.873094,1.777229,-0.844446,-0.133557,-0.050967,-0.562839,-0.171252,-1.317757,-1.296491,0.091553,-1.070716,0.258003,-0.890217,1.464708,-0.353516,-0.384596,-0.210456


In [13]:
# Rebind `data1` to the filtered feature frame `x` (same object, no copy).
# This replaces the concatenated frame built in the previous cell, so the
# separately rescaled target `y` is not part of `data1` from here on.
data1 = x
data1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327281,1.051692,0.982139,-1.129029,-1.625647,-1.691208,-1.560734,1.858461,1.647690,1.075632,-0.386808,-1.400977,-0.890217,-0.682697,2.828592,-0.384596
1,-1.322290,1.042330,-0.606195,1.647970,1.306579,1.028805,1.440765,1.852378,1.238219,0.664507,-1.094021,-1.151797,-0.890217,-0.682697,2.828592,-0.384596
2,-1.332272,1.037649,1.855723,-0.449269,-1.097237,-1.106310,-1.097204,1.566803,1.206573,1.698679,-0.046750,-1.907645,-0.890217,-0.682697,2.828592,-0.384596
3,-1.337263,1.037649,1.855723,-0.636892,-0.807164,-0.946849,-0.806809,1.032234,1.151844,0.410329,-0.397813,-0.460893,-0.890217,-0.682697,2.828592,-0.384596
4,-1.337263,1.037649,1.855723,-0.311578,-0.568070,-0.929971,-0.578013,0.217704,1.156470,0.689924,-0.970832,-0.744713,-0.890217,-0.682697,2.828592,-0.384596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.288528,-0.280869,-0.173049,-0.385030,-0.247600,-1.699286,-1.439011,-0.107796,-0.379522,0.345114,-0.890217,1.464708,-0.353516,-0.384596
20636,-0.818192,1.805314,-0.844446,-1.439127,-1.419826,-1.555309,-1.697234,-0.649904,-1.461651,0.591370,0.352900,0.169904,-0.890217,1.464708,-0.353516,-0.384596
20637,-0.823183,1.777229,-0.923862,0.121998,0.181616,-0.147572,0.122885,-1.517086,-1.145533,0.005891,-0.734688,0.169238,-0.890217,1.464708,-0.353516,-0.384596
20638,-0.873094,1.777229,-0.844446,-0.133557,-0.050967,-0.562839,-0.171252,-1.317757,-1.296491,0.091553,-1.070716,0.258003,-0.890217,1.464708,-0.353516,-0.384596


Salviamo il risultato in un file

In [14]:
# Persist the processed frame as a pickle file in the working directory.
data1.to_pickle(path="./elaborated.pkl")