In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
def normalize(X):
    """Standardize ``X`` to zero mean and unit standard deviation.

    The statistics come from ``X.mean()`` and ``X.std()``; for a pandas
    DataFrame these are computed column by column.
    """
    centered = X - X.mean()
    return centered / X.std()

def norm_dataset(data):
    """Standardize the feature columns of a housing frame.

    The ``ocean_proximity`` and ``median_house_value`` columns are taken out
    before scaling and appended, unchanged, after the standardized columns
    (in that order).
    """
    proximity = data['ocean_proximity']
    target = data["median_house_value"]
    scaled = normalize(data.drop(columns=['ocean_proximity', 'median_house_value']))
    return pd.concat([scaled, proximity, target], axis=1)

def remove_outliers(data):
    """Drop every row that has an outlier in at least one numeric column.

    A value is an outlier when it lies more than 1.5 interquartile ranges
    below the first quartile or above the third quartile of its column.
    Non-numeric columns (such as ``ocean_proximity``) are ignored while
    looking for outliers but are kept in the returned rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` with no outlying value, original index preserved.
    """
    # numeric_only=True is required on recent pandas versions, which no longer
    # skip string columns silently when computing quantiles.
    q1 = data.quantile(0.25, numeric_only=True)
    q3 = data.quantile(0.75, numeric_only=True)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # Compare only the columns the quantiles were computed for.
    numeric = data[q1.index]
    is_outlier = ((numeric < lower) | (numeric > upper)).any(axis=1)
    return data[~is_outlier]

def new_features(data):
    """Return a copy of ``data`` with three ratio features appended.

    Added columns, in this order:

    * ``rooms_per_household``      = ``total_rooms`` / ``households``
    * ``population_per_household`` = ``population`` / ``households``
    * ``bedrooms_per_room``        = ``total_bedrooms`` / ``total_rooms``

    The input frame is left untouched: ``assign`` builds a new frame, which
    avoids writing into a filtered slice of another DataFrame.
    """
    return data.assign(
        rooms_per_household=data['total_rooms'] / data['households'],
        population_per_household=data['population'] / data['households'],
        bedrooms_per_room=data['total_bedrooms'] / data['total_rooms'],
    )

In [3]:
# Passed as `drop_first` to every pd.get_dummies call in this notebook:
# False keeps one indicator column per ocean_proximity category.
drop = False

## Elaborazione del dataset

Leggiamo i dati dal file CSV

In [4]:
# Load the raw housing table; the path is relative to the current working directory.
data = pd.read_csv("cal-housing.csv")

Dalle precedenti analisi, sostituiamo nei record il valore ISLAND con NEAR OCEAN

In [5]:
# Merge the "ISLAND" category into "NEAR OCEAN". replace() is applied to the
# whole frame, so any cell equal to "ISLAND" is rewritten.
data = data.replace("ISLAND", "NEAR OCEAN")

Scegliamo cosa fare dei valori NaN

In [6]:
# Rows with at least one missing value are discarded.
# Alternative kept for reference: impute with the column medians instead,
# i.e. data = data.fillna(data.median())
data = data.dropna()

Rimuoviamo gli outliers

In [7]:
# Outlier-free version of the cleaned data, stored under a new name because
# later cells still start from `data`.
data1 = remove_outliers(data)

Normalizziamo il dataset

In [8]:
# Standardize the feature columns; ocean_proximity and median_house_value are
# carried over unscaled by norm_dataset.
data_norm1 = norm_dataset(data1)
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,NEAR BAY,352100.0
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,NEAR BAY,341300.0
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,NEAR BAY,342200.0
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,NEAR BAY,269700.0
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,NEAR BAY,299200.0
...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,INLAND,78100.0
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,INLAND,77100.0
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,INLAND,92300.0
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,INLAND,84700.0


Trasformiamo la colonna ocean_proximity con il one-hot encoding

In [9]:
# One-hot encode ocean_proximity (the only non-numeric column left);
# `drop` controls whether the first category's indicator is omitted.
data_norm1 = pd.get_dummies(data_norm1, drop_first=drop)

Salviamo il risultato in un file

In [10]:
# Persist the outlier-free baseline dataset, then display it.
data_norm1.to_pickle("./base_no_out.pkl")
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,352100.0,0,0,1,0
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,341300.0,0,0,1,0
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,342200.0,0,0,1,0
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,269700.0,0,0,1,0
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,299200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,78100.0,0,1,0,0
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,77100.0,0,1,0,0
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,92300.0,0,1,0,0
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,84700.0,0,1,0,0


In [11]:
# Baseline dataset: standardize and one-hot encode `data` directly, i.e.
# without removing outliers, and save it as base.pkl.
# NOTE: this rebinds data_norm1, which until now held the outlier-free variant.
data_norm1 = norm_dataset(data)
data_norm1 = pd.get_dummies(data_norm1, drop_first=drop)
data_norm1.to_pickle("./base.pkl")
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327281,1.051692,0.982139,-0.803793,-0.970301,-0.973296,-0.976809,2.345106,452600.0,0,0,1,0
1,-1.322290,1.042330,-0.606195,2.042080,1.348243,0.861318,1.670332,2.332575,358500.0,0,0,1,0
2,-1.332272,1.037649,1.855723,-0.535176,-0.825541,-0.819749,-0.843406,1.782896,352100.0,0,0,1,0
3,-1.337263,1.037649,1.855723,-0.623495,-0.718750,-0.765037,-0.733544,0.932947,341300.0,0,0,1,0
4,-1.337263,1.037649,1.855723,-0.461959,-0.611959,-0.758860,-0.628914,-0.013143,342200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.288528,-0.444570,-0.388886,-0.511774,-0.443196,-1.216697,78100.0,0,1,0,0
20636,-0.818192,1.805314,-0.844446,-0.887535,-0.920466,-0.943292,-1.008198,-0.692027,77100.0,0,1,0,0
20637,-0.823183,1.777229,-0.923862,-0.175038,-0.125468,-0.368817,-0.173773,-1.143143,92300.0,0,1,0,0
20638,-0.873094,1.777229,-0.844446,-0.355336,-0.305826,-0.603549,-0.393497,-1.055110,84700.0,0,1,0,0


## Secondo dataset - nuove features

In [12]:
# Second variant: start from an independent copy of the cleaned data,
# then drop the rows containing outliers.
data2 = data.copy()
data2 = remove_outliers(data2)

Aggiungiamo delle feature derivando da quelle che già abbiamo

In [13]:
# Append the ratio features computed by new_features
# (rooms/household, population/household, bedrooms/room).
data2 = new_features(data2)

In [14]:
# Standardize, one-hot encode and save the feature-augmented dataset.
data_norm2 = norm_dataset(data2)
data_norm2 = pd.get_dummies(data_norm2, drop_first=drop)
data_norm2.to_pickle("./add_features.pkl")
data_norm2

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,1.328392,-0.148532,-1.552226,352100.0,0,0,1,0
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,0.226272,-0.388755,-0.560015,341300.0,0,0,1,0
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,0.433468,-0.734927,-0.783271,342200.0,0,0,1,0
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,-0.244631,-0.774195,0.294458,269700.0,0,0,1,0
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,-0.168690,-0.785050,-0.407578,299200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,-0.118041,-0.376796,0.165352,78100.0,0,1,0,0
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,0.358611,0.154255,-0.004703,77100.0,0,1,0,0
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,-0.046632,-0.598748,-0.005335,92300.0,0,1,0,0
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,0.008666,-0.789958,0.079894,84700.0,0,1,0,0


## Terzo dataset - no skewed

In [12]:
# Third variant: add the ratio features to a copy of the cleaned data.
# Outliers are not removed at this point.
data3 = data.copy()
data3 = new_features(data3)

Trasformiamo le colonne skewed

In [13]:
# Every column is log-transformed except the categorical one, the two
# coordinates and the target.
features = data3.columns.to_list()
for excluded in ("ocean_proximity", "longitude", "latitude", "median_house_value"):
    features.remove(excluded)

# Natural logarithm, applied one column at a time.
for col in features:
    data3[col] = np.log(data3[col])

data3

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,3.713572,6.779922,4.859812,5.774552,4.836282,2.119287,452600.0,NEAR BAY,1.943640,0.938270,-1.920110
1,-122.22,37.86,3.044522,8.867709,7.008505,7.783641,7.037028,2.116424,358500.0,NEAR BAY,1.830682,0.746613,-1.859204
2,-122.24,37.85,3.951244,7.290975,5.247024,6.206576,5.176150,1.982022,352100.0,NEAR BAY,2.114825,1.030426,-2.043951
3,-122.25,37.85,3.951244,7.149917,5.459586,6.324359,5.389072,1.730434,341300.0,NEAR BAY,1.760845,0.935287,-1.690331
4,-122.25,37.85,3.951244,7.394493,5.634790,6.336826,5.556828,1.347086,342200.0,NEAR BAY,1.837665,0.779998,-1.759704
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,3.218876,7.417580,5.924256,6.739337,5.799093,0.444878,78100.0,INLAND,1.618488,0.940244,-1.493325
20636,-121.21,39.49,2.890372,6.546785,5.010635,5.874931,4.736198,0.938756,77100.0,INLAND,1.810587,1.138732,-1.536150
20637,-121.22,39.43,2.833213,7.720462,6.184149,6.914731,6.070738,0.530628,92300.0,INLAND,1.649724,0.843993,-1.536313
20638,-121.32,39.43,2.890372,7.528332,6.013715,6.608001,5.855072,0.624440,84700.0,INLAND,1.673260,0.752929,-1.514617


In [14]:
# Remove outliers from the log-transformed frame, then standardize, one-hot
# encode and save it. The frame passed to norm_dataset must be the one
# produced on the line above; the name previously used here (`data3bis2`)
# is not defined anywhere in the notebook and fails on a fresh kernel.
data3 = remove_outliers(data3)
data_norm3 = norm_dataset(data3)
data_norm3 = pd.get_dummies(data_norm3, drop_first=drop)
data_norm3.to_pickle("./log.pkl")
data_norm3

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.306675,1.022539,0.938360,-1.672028,-2.314275,-2.448277,-2.290105,2.167859,1.498249,-0.486494,-1.607179,452600.0,0,0,1,0
1,-1.301687,1.013261,-0.526967,2.226779,1.717343,1.305520,1.902616,2.160928,0.946454,-1.347543,-1.317624,358500.0,0,0,1,0
2,-1.311662,1.008623,1.458899,-0.717670,-1.587745,-1.641079,-1.642610,1.835541,2.334478,-0.072466,-2.195943,352100.0,0,0,1,0
3,-1.316650,1.008623,1.458899,-0.981087,-1.188913,-1.421013,-1.236965,1.226448,0.605307,-0.499893,-0.514772,341300.0,0,0,1,0
4,-1.316650,1.008623,1.458899,-0.524357,-0.860176,-1.397720,-0.917366,0.298365,0.980568,-1.197557,-0.844580,342200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.738068,1.764747,-0.145105,-0.481242,-0.317047,-0.645665,-0.455819,-1.885873,-0.090100,-0.477624,0.421833,78100.0,0,1,0,0
20636,-0.797922,1.769386,-0.864582,-2.107395,-2.031284,-2.260728,-2.480778,-0.690197,0.848293,0.414117,0.218233,77100.0,0,1,0,0
20637,-0.802910,1.741553,-0.989768,0.084369,0.170593,-0.317957,0.061702,-1.678273,0.062487,-0.910047,0.217460,92300.0,0,1,0,0
20638,-0.852787,1.741553,-0.864582,-0.274422,-0.149193,-0.891055,-0.349171,-1.451155,0.177458,-1.319169,0.320607,84700.0,0,1,0,0


## Drop features

In [18]:
# Fourth variant: copy the cleaned data, drop the outlier rows,
# then add the ratio features.
data4 = data.copy()
data4 = remove_outliers(data4)
data4 = new_features(data4)

Rimuoviamo le features correlate

In [19]:
# Discard the correlated columns in a single call.
data4 = data4.drop(columns=["total_bedrooms", "longitude", "households", "population"])

In [20]:
# Standardize, one-hot encode and save the reduced-feature dataset.
data_norm4 = norm_dataset(data4)
data_norm4 = pd.get_dummies(data_norm4, drop_first=drop)
data_norm4.to_pickle("./removed.pkl")
data_norm4

Unnamed: 0,latitude,housing_median_age,total_rooms,median_income,rooms_per_household,population_per_household,bedrooms_per_room,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2,0.993310,1.842965,-0.622635,2.550540,1.328392,-0.148532,-1.552226,352100.0,0,0,1,0
3,0.993310,1.842965,-0.799910,1.432248,0.226272,-0.388755,-0.560015,341300.0,0,0,1,0
4,0.993310,1.842965,-0.475671,0.187461,0.433468,-0.734927,-0.783271,342200.0,0,0,1,0
5,0.993310,1.842965,-1.125986,0.319497,-0.244631,-0.774195,0.294458,269700.0,0,0,1,0
6,0.988695,1.842965,0.358348,0.057849,-0.168690,-0.785050,-0.407578,299200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.745630,-0.367534,-0.440767,-1.396077,-0.118041,-0.376796,0.165352,78100.0,0,1,0,0
20636,1.750245,-0.940626,-1.329898,-0.705760,0.358611,0.154255,-0.004703,77100.0,0,1,0,0
20637,1.722552,-1.022496,0.100243,-1.299301,-0.046632,-0.598748,-0.005335,92300.0,0,1,0,0
20638,1.722552,-0.940626,-0.261655,-1.183474,0.008666,-0.789958,0.079894,84700.0,0,1,0,0


## PCA

In [15]:
from transform import PCA
from model import RidgeRegression
from selection import KFoldCV
import pandas as pd

def normalize(X):
    """Standardize ``X``: subtract its mean and divide by its standard deviation."""
    # NOTE(review): same body as the `normalize` helper defined near the top of
    # the notebook; keep the two in sync or drop one of them.
    return (X - X.mean())/X.std()


def prepare_for_pca(data):
    """Build the PCA input from a pickled dataset.

    The target column ``median_house_value`` is removed and every one-hot
    ``ocean_proximity_*`` column that is present is standardized, so the
    indicator columns are on the same scale as the other (already
    standardized) features.

    Only the dummy columns that actually exist are processed, so the function
    also works when the datasets were encoded with ``drop_first=True`` and
    one category column is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing ``median_house_value``; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without the target and with standardized dummy columns.
    """
    # drop() returns a new frame, so the assignments below never touch the caller's data.
    data = data.drop("median_house_value", axis=1)
    dummy_cols = [c for c in data.columns if str(c).startswith("ocean_proximity_")]
    for c in dummy_cols:
        # Cast first so integer or boolean indicators are standardized as floats.
        data[c] = normalize(data[c].astype(float))
    return data

def mean_squared_error(y_true, y_pred):
    """Mean of the squared differences between ``y_true`` and ``y_pred``.

    Both arguments are converted to NumPy arrays and the mean is taken along
    axis 0, so 2-D inputs yield one value per column.
    """
    import numpy as np
    residuals = np.array(y_true) - np.array(y_pred)
    squared = residuals ** 2
    return np.mean(squared, axis=0)

def r2_score(y_true, y_pred):
    """Coefficient of determination: 1 - SS_residual / SS_total.

    Sums are taken along axis 0 in float64; the total sum of squares is
    measured around the axis-0 average of ``y_true``.
    """
    import numpy as np
    observed = np.array(y_true)
    predicted = np.array(y_pred)
    residual_ss = np.sum((observed - predicted) ** 2, axis=0, dtype=np.float64)
    centered = observed - np.average(observed, axis=0)
    total_ss = np.sum(centered ** 2, axis=0, dtype=np.float64)
    return 1 - (residual_ss / total_ss)

# (label, DataFrame) pairs, one per dataset variant saved earlier in the notebook.
datasets = [
    ("base", pd.read_pickle("./base.pkl")),
    ("base_no_out", pd.read_pickle("./base_no_out.pkl")),
    ("add_features", pd.read_pickle("./add_features.pkl")),
    ("log", pd.read_pickle("./log.pkl")),
    ("removed", pd.read_pickle("./removed.pkl")),
]

# Individual names are kept because later cells refer to them directly.
dataA, dataB, dataC, dataD, dataE = datasets

In [16]:
# Number of columns in the "log" dataset (target and dummy columns included).
len(dataD[1].columns)

16

In [19]:
# Candidate numbers of principal components for each dataset, in the same
# order as `datasets`: from the minimum number of eigenvalues > 1 up to the
# number explaining 99% of the variability (range() excludes its end value).
rangeA = range(5, 9)
rangeB = range(5, 9)
rangeC = range(5, 12)
rangeD = range(5, 10)
rangeE = range(5, 11)
ranges = [rangeA, rangeB, rangeC, rangeD, rangeE]

In [20]:
# 5-fold cross-validation of a ridge regressor (alfa=0) on the first k
# principal components of every dataset, for each candidate k in `ranges`.
folds = 5
kfolds = KFoldCV(folds, print=False, shuffle=100)
ridge = RidgeRegression(alfa=0)

for (name, frame), candidates in zip(datasets, ranges):
    print(name)
    # Target and PCA input do not depend on the number of components,
    # so they are built once per dataset instead of once per candidate.
    y = frame["median_house_value"].reset_index(drop=True)
    x = prepare_for_pca(frame)

    errors = []
    for el in candidates:
        # NOTE(review): PCA is rebuilt for every candidate because its internals
        # are not visible here; if get_principal_components() does not alter the
        # object, it can be constructed once per dataset.
        pca = PCA(x)
        components = pd.DataFrame(pca.get_principal_components(el))
        # Named `pca_data` so the cleaned `data` frame used by the earlier
        # cells is not overwritten.
        pca_data = pd.concat([components, y], axis=1)

        err = kfolds.cross_validate(ridge, pca_data, "median_house_value", mean_squared_error)
        print("{}\t{}".format(el, round(err, 5)))
        errors.append(err)

    # The candidate with the lowest cross-validated MSE wins (first one on ties).
    min_value = min(errors)
    idx_min = errors.index(min_value)
    print("best num. components:\t{}\n".format(candidates[idx_min]))

base
5	48727110384.92291
6	48537161197.68485
7	48127124891.60181
8	47861999139.10091
best num. components:	8

base_no_out
5	39165866076.81865
6	39137372715.84244
7	38891456415.993
8	38659655204.28965
best num. components:	8

add_features
5	39312366652.87154
6	39302406063.58332
7	39250143385.8801
8	39245126260.5108
9	39245397166.12166
10	38547861328.37031
11	38448156031.06733
best num. components:	11

log
5	41021912210.77733
6	40863108590.13454
7	40859940770.30056
8	40860786381.88616
9	40480819401.24464
best num. components:	9

removed
5	39393707471.63579
6	39401316691.1337
7	39404134583.20962
8	39318437220.4476
9	39311959139.80148
10	38633170626.56415
best num. components:	10



Creiamo quindi le varie versioni da usare successivamente con il numero di componenti migliore trovato dalla precedente analisi

In [21]:
# Best number of components per dataset, copied from the cross-validation
# output above; same order as `datasets`.
n_comp = [8, 8, 11, 9, 10]
assert len(n_comp) == len(datasets), "one component count is required per dataset"

for (name, frame), k in zip(datasets, n_comp):
    y = frame["median_house_value"]
    x = prepare_for_pca(frame)
    pca = PCA(x)
    components = pd.DataFrame(pca.get_principal_components(k))
    # The target index is reset so it lines up with the 0..n-1 index of the
    # freshly built components frame. The result is named `pca_data` so the
    # cleaned `data` frame used by the earlier cells is not overwritten.
    pca_data = pd.concat([components, y.reset_index(drop=True)], axis=1)
    pca_data.to_pickle("PCA{}.pkl".format(name))