In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [3]:
def normalize(X):
    return (X - X.mean())/X.std()

def norm_dataset(data):
    cat_vals = data['ocean_proximity']
    data_norm = normalize(data.drop(['ocean_proximity'], axis=1))
    data_norm = pd.concat([data_norm, cat_vals], axis=1)
    return data_norm

def remove_outliers(data):
    # calcoliamo il range interquantile
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    # rimuoviamo i valori fuoi dal range interquanitle*1.5
    return data[~((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)]

def new_features(data):
    data['rooms_per_household']=data['total_rooms']/data['households']
    data['population_per_household']=data['population']/data['households']
    data['bedrooms_per_room']=data['total_bedrooms']/data['total_rooms']
    return data

## Elaborazione del dataset

Leggiamo i dati dal file CSV

In [30]:
# read data from csv file
data = pd.read_csv("cal-housing.csv")

Dalla precedenti analisi sostituiamo i rercord con ISLAN con NEAR OCEAN

In [31]:
data = data.replace("ISLAND", "NEAR OCEAN")

Scegliamo cosa fare dei valori NaN

In [32]:
#data = data.fillna(data.median())
data = data.dropna()

Rimuoviamo gli outliers

In [6]:
data1 = remove_outliers(data)

Normalizziamo il dataset

In [7]:
data_norm1 = norm_dataset(data1)
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,1.760601,NEAR BAY
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,1.645393,NEAR BAY
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,1.654993,NEAR BAY
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,0.881601,NEAR BAY
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,1.196292,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,-1.162288,INLAND
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,-1.172955,INLAND
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,-1.010810,INLAND
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,-1.091883,INLAND


Trasformiamo la colonna ocean_proximity con one hot encode

In [8]:
data_norm1 = pd.get_dummies(data_norm1, drop_first=True)

Salviamo il risultato in un file

In [9]:
data_norm1.to_pickle("./base_no_out.pkl")
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,1.760601,0,1,0
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,1.645393,0,1,0
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,1.654993,0,1,0
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,0.881601,0,1,0
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,1.196292,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,-1.162288,1,0,0
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,-1.172955,1,0,0
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,-1.010810,1,0,0
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,-1.091883,1,0,0


In [10]:
data_norm1 = norm_dataset(data)
data_norm1 = pd.get_dummies(data_norm1, drop_first=True)
data_norm1.to_pickle("./base.pkl")
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327281,1.051692,0.982139,-0.803793,-0.970301,-0.973296,-0.976809,2.345106,2.128767,0,1,0
1,-1.322290,1.042330,-0.606195,2.042080,1.348243,0.861318,1.670332,2.332575,1.313594,0,1,0
2,-1.332272,1.037649,1.855723,-0.535176,-0.825541,-0.819749,-0.843406,1.782896,1.258152,0,1,0
3,-1.337263,1.037649,1.855723,-0.623495,-0.718750,-0.765037,-0.733544,0.932947,1.164593,0,1,0
4,-1.337263,1.037649,1.855723,-0.461959,-0.611959,-0.758860,-0.628914,-0.013143,1.172390,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.288528,-0.444570,-0.388886,-0.511774,-0.443196,-1.216697,-1.115465,1,0,0
20636,-0.818192,1.805314,-0.844446,-0.887535,-0.920466,-0.943292,-1.008198,-0.692027,-1.124128,1,0,0
20637,-0.823183,1.777229,-0.923862,-0.175038,-0.125468,-0.368817,-0.173773,-1.143143,-0.992452,1,0,0
20638,-0.873094,1.777229,-0.844446,-0.355336,-0.305826,-0.603549,-0.393497,-1.055110,-1.058290,1,0,0


## Secondo dataset - nuove features

In [44]:
data2 = data.copy()
# data2 = remove_outliers(data2)

In [39]:
data2["median_house_value"] = remove_outliers(data2["median_house_value"])

ValueError: No axis named 1 for object type <class 'pandas.core.series.Series'>

Aggiungiamo delle feature derivando da quelle che già abbiamo

In [45]:
data2 = new_features(data2)

In [46]:
data_norm2 = norm_dataset(data2)
data_norm2 = pd.get_dummies(data_norm2, drop_first=True)
data_norm2.to_pickle("./add_features.pkl")
data_norm2

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,1.760601,1.328392,-0.148532,-1.552226,0,1,0
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,1.645393,0.226272,-0.388755,-0.560015,0,1,0
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,1.654993,0.433468,-0.734927,-0.783271,0,1,0
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,0.881601,-0.244631,-0.774195,0.294458,0,1,0
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,1.196292,-0.168690,-0.785050,-0.407578,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,-1.162288,-0.118041,-0.376796,0.165352,1,0,0
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,-1.172955,0.358611,0.154255,-0.004703,1,0,0
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,-1.010810,-0.046632,-0.598748,-0.005335,1,0,0
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,-1.091883,0.008666,-0.789958,0.079894,1,0,0


## Terzo dataset - no skewed

In [14]:
data3 = data.copy()
data3 = new_features(data3)

Trasformiamo le colonne skewed

In [15]:
features= data3.columns.to_list()
features.remove("ocean_proximity")
features.remove("longitude")
features.remove("latitude")

for i in features:
    data3[i] = np.log(data3[i])
    
data3

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,3.713572,6.779922,4.859812,5.774552,4.836282,2.119287,13.022764,NEAR BAY,1.943640,0.938270,-1.920110
1,-122.22,37.86,3.044522,8.867709,7.008505,7.783641,7.037028,2.116424,12.789684,NEAR BAY,1.830682,0.746613,-1.859204
2,-122.24,37.85,3.951244,7.290975,5.247024,6.206576,5.176150,1.982022,12.771671,NEAR BAY,2.114825,1.030426,-2.043951
3,-122.25,37.85,3.951244,7.149917,5.459586,6.324359,5.389072,1.730434,12.740517,NEAR BAY,1.760845,0.935287,-1.690331
4,-122.25,37.85,3.951244,7.394493,5.634790,6.336826,5.556828,1.347086,12.743151,NEAR BAY,1.837665,0.779998,-1.759704
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,3.218876,7.417580,5.924256,6.739337,5.799093,0.444878,11.265745,INLAND,1.618488,0.940244,-1.493325
20636,-121.21,39.49,2.890372,6.546785,5.010635,5.874931,4.736198,0.938756,11.252859,INLAND,1.810587,1.138732,-1.536150
20637,-121.22,39.43,2.833213,7.720462,6.184149,6.914731,6.070738,0.530628,11.432799,INLAND,1.649724,0.843993,-1.536313
20638,-121.32,39.43,2.890372,7.528332,6.013715,6.608001,5.855072,0.624440,11.346871,INLAND,1.673260,0.752929,-1.514617


In [16]:
data_norm3 = norm_dataset(data3)
data_norm3 = pd.get_dummies(data_norm3, drop_first=True)
data_norm3.to_pickle("./log.pkl")
data_norm3

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327281,1.051692,0.860345,-1.129029,-1.625647,-1.691208,-1.560734,1.858461,1.647690,1.075632,-0.386808,-1.400977,0,1,0
1,-1.322290,1.042330,-0.317148,1.647970,1.306579,1.028805,1.440765,1.852378,1.238219,0.664507,-1.094021,-1.151797,0,1,0
2,-1.332272,1.037649,1.278635,-0.449269,-1.097237,-1.106310,-1.097204,1.566803,1.206573,1.698679,-0.046750,-1.907645,0,1,0
3,-1.337263,1.037649,1.278635,-0.636892,-0.807164,-0.946849,-0.806809,1.032234,1.151844,0.410329,-0.397813,-0.460893,0,1,0
4,-1.337263,1.037649,1.278635,-0.311578,-0.568070,-0.929971,-0.578013,0.217704,1.156470,0.689924,-0.970832,-0.744713,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.010295,-0.280869,-0.173049,-0.385030,-0.247600,-1.699286,-1.439011,-0.107796,-0.379522,0.345114,1,0,0
20636,-0.818192,1.805314,-0.588446,-1.439127,-1.419826,-1.555309,-1.697234,-0.649904,-1.461651,0.591370,0.352900,0.169904,1,0,0
20637,-0.823183,1.777229,-0.689042,0.121998,0.181616,-0.147572,0.122885,-1.517086,-1.145533,0.005891,-0.734688,0.169238,1,0,0
20638,-0.873094,1.777229,-0.588446,-0.133557,-0.050967,-0.562839,-0.171252,-1.317757,-1.296491,0.091553,-1.070716,0.258003,1,0,0


## Drop features

In [40]:
data4 = data.copy()
#data4 = remove_outliers(data4)
data4 = new_features(data4)

Rimuoviamo le features correlate

In [41]:
data4.drop(["total_bedrooms"], axis=1, inplace=True)
data4.drop(["longitude"], axis=1, inplace=True)
data4.drop(["households"], axis=1, inplace=True)
data4.drop(["population"], axis=1, inplace=True)

In [42]:
data_norm4 = norm_dataset(data4)
data_norm4 = pd.get_dummies(data_norm4, drop_first=True)
data_norm4.to_pickle("./removed.pkl")
data_norm4

Unnamed: 0,latitude,housing_median_age,total_rooms,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,1.051692,0.982139,-0.803793,2.345106,2.128767,0.625379,-0.049431,-1.145996,0,1,0
1,1.042330,-0.606195,2.042080,2.332575,1.313594,0.324934,-0.092131,-0.987230,0,1,0
2,1.037649,1.855723,-0.535176,1.782896,1.258152,1.150565,-0.025797,-1.440479,0,1,0
3,1.037649,1.855723,-0.623495,0.932947,1.164593,0.155464,-0.050160,-0.492913,0,1,0
4,1.037649,1.855723,-0.461959,-0.013143,1.172390,0.342540,-0.085269,-0.706124,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
20635,1.800633,-0.288528,-0.444570,-1.216697,-1.115465,-0.155416,-0.048947,0.199815,1,0,0
20636,1.805314,-0.844446,-0.887535,-0.692027,-1.124128,0.274952,0.004912,0.037411,1,0,0
20637,1.777229,-0.923862,-0.175038,-1.143143,-0.992452,-0.090941,-0.071458,0.036807,1,0,0
20638,1.777229,-0.844446,-0.355336,-1.055110,-1.058290,-0.041012,-0.090851,0.118202,1,0,0


## PCA

In [36]:
from transform import PCA
from model import RidgeRegression
from selection import KFoldCV

def normalize(X):
    return (X - X.mean())/X.std()

def prepare_for_pca(data):
    data = data.drop("median_house_value", axis =1)
    cols = ["ocean_proximity_NEAR BAY", "ocean_proximity_INLAND", "ocean_proximity_NEAR OCEAN"]
    for c in cols:
        data[c] = normalize(data[c])
    return data

def quadratic_loss(y_true, y_pred):
    return (y_true-y_pred)**2

dataA = ("base", pd.read_pickle("./base.pkl"))
dataB = ("base_no_out", pd.read_pickle("./base_no_out.pkl"))
dataC = ("add_features", pd.read_pickle("./add_features.pkl"))
dataD = ("log", pd.read_pickle("./log.pkl"))
dataE = ("removed", pd.read_pickle("./removed.pkl"))

datasets = [dataA, dataB, dataC, dataD, dataE]

In [37]:
# the number of components
rangeA = [4,5,6,7,8,9,10,11]
rangeB = rangeA
rangeC = [5,6,7,8,9,10,11,12,13,14]
rangeD = rangeC
rangeE = [4,5,6,7,8,9,10]
ranges = [rangeA, rangeB, rangeC, rangeD, rangeE]

In [38]:
folds = 10
kfolds = KFoldCV(folds, print=False)
ridge = RidgeRegression(alfa=0)

for i, d in enumerate(datasets):
    print(d[0])
    errors = []
    for el in ranges[i]:
        y = (d[1])["median_house_value"]
        x = prepare_for_pca(d[1])
        pca = PCA(x)
        x = pd.DataFrame(pca.get_principal_components(el))
        data = pd.concat([x,y.reset_index(drop=True)], axis= 1)
        err = kfolds.cross_validate(ridge, data, "median_house_value", quadratic_loss)
        print("{}\t{}".format(el, round(err, 5)))
        errors.append(err)
    min_value = min(errors)
    idx_min = errors.index(min_value)
    print("best num. components:\t{}\n".format(ranges[i][idx_min]))

base
4	0.49002
5	0.49037
6	0.44247
7	0.44799
8	0.42436
9	0.40481
10	0.39473
11	0.39406
best num. components:	11

base_no_out
4	0.53125
5	0.54862
6	0.50496
7	0.50703
8	0.47346
9	0.43755
10	0.43048
11	0.42255
best num. components:	11

add_features
5	0.58443
6	0.59662
7	0.58928
8	0.60071
9	0.51896
10	0.45307
11	0.43865
12	0.43066
13	0.41535
14	0.41619
best num. components:	13

log
5	0.49031
6	0.48475
7	0.48729
8	0.39415
9	0.39118
10	0.36017
11	0.34198
12	0.34973
13	0.34971
14	0.34973
best num. components:	11

removed
4	0.55919
5	0.56525
6	0.5629
7	0.56345
8	0.56904
9	0.48967
10	0.42671
best num. components:	10



Creiamo quindi le varie versioni da usare successivamente con il numero di componenti migliore trovato dalla precedente analisi

In [39]:
n_comp =[11, 11, 13, 11, 10]
for d, n in zip(datasets, n_comp):
    y = (d[1])["median_house_value"]
    x = prepare_for_pca(d[1])
    pca = PCA(x)
    x = pd.DataFrame(pca.get_principal_components(n))
    data = pd.concat([x,y.reset_index(drop=True)], axis= 1)
    data.to_pickle("PCA{}.pkl".format(d[0]))