In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
def normalize(X):
    """Standardise ``X``: subtract its mean, then divide by its standard deviation.

    ``X`` must provide ``mean()`` and ``std()`` and support arithmetic with
    their results; ``norm_dataset`` calls it with a pandas DataFrame, in which
    case every column is scaled independently.
    """
    centered = X - X.mean()
    return centered / X.std()

def norm_dataset(data):
    """Standardise every column except the category and the target.

    ``ocean_proximity`` and ``median_house_value`` are taken out before
    ``normalize`` is applied and are appended again, unchanged and in that
    order, after the scaled columns.

    Returns a new DataFrame; ``data`` is not modified.
    """
    passthrough = ['ocean_proximity', 'median_house_value']
    scaled = normalize(data.drop(passthrough, axis=1))
    return pd.concat(
        [scaled, data['ocean_proximity'], data["median_house_value"]],
        axis=1,
    )

def remove_outliers(data):
    """Drop every row that holds an IQR outlier in at least one numeric column.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its own column. Only numeric columns are inspected:
    non-numeric ones (e.g. ``ocean_proximity``) are never passed to
    ``quantile`` or to the comparisons, which would otherwise fail on strings
    in pandas versions that no longer skip such columns silently.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with all original columns and index labels.
    """
    numeric = data.select_dtypes(include="number")
    # interquartile range of each numeric column
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    # flag the values outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
    is_outlier = (numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))
    # keep the rows where no numeric column is flagged
    return data[~is_outlier.any(axis=1)]

def new_features(data):
    """Return a copy of ``data`` extended with three ratio features.

    Added columns (appended in this order):

    * ``rooms_per_household``      = ``total_rooms / households``
    * ``population_per_household`` = ``population / households``
    * ``bedrooms_per_room``        = ``total_bedrooms / total_rooms``

    The input frame is left untouched: ``assign`` builds a new DataFrame, so
    calling this on a slice of another frame neither alters that frame nor
    triggers pandas' SettingWithCopy warning.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``total_rooms``, ``households``, ``population`` and
        ``total_bedrooms``.

    Returns
    -------
    pandas.DataFrame
        New frame with the three extra columns.
    """
    return data.assign(
        rooms_per_household=data['total_rooms'] / data['households'],
        population_per_household=data['population'] / data['households'],
        bedrooms_per_room=data['total_bedrooms'] / data['total_rooms'],
    )

In [4]:
# Forwarded as `drop_first` to every pd.get_dummies call in this notebook;
# False keeps one indicator column per ocean_proximity category
drop = False

## Elaborazione del dataset

Leggiamo i dati dal file CSV

In [4]:
# Load the raw housing table from the CSV file in the working directory
data = pd.read_csv("cal-housing.csv")

Dalle precedenti analisi: sostituiamo il valore ISLAND con NEAR OCEAN nei record che lo contengono

In [5]:
# Replace every cell equal to "ISLAND" with "NEAR OCEAN"
# (the replacement is applied to the whole frame, not to a single column)
data = data.replace("ISLAND", "NEAR OCEAN")

Scegliamo cosa fare dei valori NaN

In [6]:
# Alternative (disabled): impute missing values with the column medians,
# i.e. data.fillna(data.median())
# Current choice: drop every row that contains at least one NaN
data = data.dropna()

Rimuoviamo gli outliers

In [7]:
# Keep only the rows without IQR outliers; `data` itself is left untouched
data1 = remove_outliers(data)

Normalizziamo il dataset

In [8]:
# Standardise the features of the outlier-free frame
# (ocean_proximity and median_house_value pass through unchanged)
data_norm1 = norm_dataset(data1)
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,NEAR BAY,352100.0
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,NEAR BAY,341300.0
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,NEAR BAY,342200.0
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,NEAR BAY,269700.0
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,NEAR BAY,299200.0
...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,INLAND,78100.0
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,INLAND,77100.0
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,INLAND,92300.0
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,INLAND,84700.0


Trasformiamo la colonna ocean_proximity con il one-hot encoding

In [9]:
# One-hot encode the categorical column; `drop` is passed as drop_first
data_norm1 = pd.get_dummies(data_norm1, drop_first=drop)

Salviamo il risultato in un file

In [10]:
# Persist the outlier-free dataset; the PCA section reloads this file as dataB
data_norm1.to_pickle("./base_no_out.pkl")
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,352100.0,0,0,1,0
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,341300.0,0,0,1,0
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,342200.0,0,0,1,0
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,269700.0,0,0,1,0
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,299200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,78100.0,0,1,0,0
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,77100.0,0,1,0,0
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,92300.0,0,1,0,0
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,84700.0,0,1,0,0


In [11]:
# Same steps applied to `data`, i.e. with the outliers kept.
# Note: this rebinds data_norm1, which held the outlier-free frame until here.
data_norm1 = norm_dataset(data)
data_norm1 = pd.get_dummies(data_norm1, drop_first=drop)
# Saved as the "base" dataset; the PCA section reloads this file as dataA
data_norm1.to_pickle("./base.pkl")
data_norm1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327281,1.051692,0.982139,-0.803793,-0.970301,-0.973296,-0.976809,2.345106,452600.0,0,0,1,0
1,-1.322290,1.042330,-0.606195,2.042080,1.348243,0.861318,1.670332,2.332575,358500.0,0,0,1,0
2,-1.332272,1.037649,1.855723,-0.535176,-0.825541,-0.819749,-0.843406,1.782896,352100.0,0,0,1,0
3,-1.337263,1.037649,1.855723,-0.623495,-0.718750,-0.765037,-0.733544,0.932947,341300.0,0,0,1,0
4,-1.337263,1.037649,1.855723,-0.461959,-0.611959,-0.758860,-0.628914,-0.013143,342200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.288528,-0.444570,-0.388886,-0.511774,-0.443196,-1.216697,78100.0,0,1,0,0
20636,-0.818192,1.805314,-0.844446,-0.887535,-0.920466,-0.943292,-1.008198,-0.692027,77100.0,0,1,0,0
20637,-0.823183,1.777229,-0.923862,-0.175038,-0.125468,-0.368817,-0.173773,-1.143143,92300.0,0,1,0,0
20638,-0.873094,1.777229,-0.844446,-0.355336,-0.305826,-0.603549,-0.393497,-1.055110,84700.0,0,1,0,0


## Secondo dataset - nuove features

In [12]:
# Second dataset: start from an outlier-free copy of the cleaned data
data2 = remove_outliers(data.copy())

Aggiungiamo delle feature derivandole da quelle che già abbiamo

In [13]:
# Add rooms_per_household, population_per_household and bedrooms_per_room
data2 = new_features(data2)

In [14]:
# Standardise, one-hot encode and persist the dataset with the extra features;
# the PCA section reloads this file as dataC
data_norm2 = norm_dataset(data2)
data_norm2 = pd.get_dummies(data_norm2, drop_first=drop)
data_norm2.to_pickle("./add_features.pkl")
data_norm2

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2,-1.314208,0.993310,1.842965,-0.622635,-1.154543,-1.164920,-1.167436,2.550540,1.328392,-0.148532,-1.552226,352100.0,0,0,1,0
3,-1.319197,0.993310,1.842965,-0.799910,-0.951177,-1.061904,-0.962969,1.432248,0.226272,-0.388755,-0.560015,341300.0,0,0,1,0
4,-1.319197,0.993310,1.842965,-0.475671,-0.747811,-1.050274,-0.768239,0.187461,0.433468,-0.734927,-0.783271,342200.0,0,0,1,0
5,-1.319197,0.993310,1.842965,-1.125986,-1.050600,-1.302828,-1.089544,0.319497,-0.244631,-0.774195,0.294458,269700.0,0,0,1,0
6,-1.319197,0.988695,1.842965,0.358348,0.196710,-0.171319,0.473164,0.057849,-0.168690,-0.785050,-0.407578,299200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.740477,1.745630,-0.367534,-0.440767,-0.323002,-0.585042,-0.422594,-1.396077,-0.118041,-0.376796,0.165352,78100.0,0,1,0,0
20636,-0.800344,1.750245,-0.940626,-1.329898,-1.335312,-1.397536,-1.474135,-0.705760,0.358611,0.154255,-0.004703,77100.0,0,1,0,0
20637,-0.805333,1.722552,-1.022496,0.100243,0.178633,-0.315873,0.078836,-1.299301,-0.046632,-0.598748,-0.005335,92300.0,0,1,0,0
20638,-0.855223,1.722552,-0.940626,-0.261655,-0.164829,-0.757843,-0.330097,-1.183474,0.008666,-0.789958,0.079894,84700.0,0,1,0,0


## Terzo dataset - no skewed

In [15]:
# Third dataset: copy of the cleaned data (outliers kept) plus the ratio features
data3 = new_features(data.copy())

Trasformiamo le colonne skewed

In [16]:
# Columns to log-transform: everything except the category, the coordinates
# and the target
features = data3.columns.to_list()
for excluded in ("ocean_proximity", "longitude", "latitude", "median_house_value"):
    features.remove(excluded)

# natural log of all selected columns in one vectorised assignment
data3[features] = np.log(data3[features])

data3

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,3.713572,6.779922,4.859812,5.774552,4.836282,2.119287,452600.0,NEAR BAY,1.943640,0.938270,-1.920110
1,-122.22,37.86,3.044522,8.867709,7.008505,7.783641,7.037028,2.116424,358500.0,NEAR BAY,1.830682,0.746613,-1.859204
2,-122.24,37.85,3.951244,7.290975,5.247024,6.206576,5.176150,1.982022,352100.0,NEAR BAY,2.114825,1.030426,-2.043951
3,-122.25,37.85,3.951244,7.149917,5.459586,6.324359,5.389072,1.730434,341300.0,NEAR BAY,1.760845,0.935287,-1.690331
4,-122.25,37.85,3.951244,7.394493,5.634790,6.336826,5.556828,1.347086,342200.0,NEAR BAY,1.837665,0.779998,-1.759704
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,3.218876,7.417580,5.924256,6.739337,5.799093,0.444878,78100.0,INLAND,1.618488,0.940244,-1.493325
20636,-121.21,39.49,2.890372,6.546785,5.010635,5.874931,4.736198,0.938756,77100.0,INLAND,1.810587,1.138732,-1.536150
20637,-121.22,39.43,2.833213,7.720462,6.184149,6.914731,6.070738,0.530628,92300.0,INLAND,1.649724,0.843993,-1.536313
20638,-121.32,39.43,2.890372,7.528332,6.013715,6.608001,5.855072,0.624440,84700.0,INLAND,1.673260,0.752929,-1.514617


In [17]:
# Standardise, one-hot encode and persist the log-transformed dataset;
# the PCA section reloads this file as dataD
data_norm3 = norm_dataset(data3)
data_norm3 = pd.get_dummies(data_norm3, drop_first=drop)
data_norm3.to_pickle("./log.pkl")
data_norm3

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327281,1.051692,0.860345,-1.129029,-1.625647,-1.691208,-1.560734,1.858461,1.075632,-0.386808,-1.400977,452600.0,0,0,1,0
1,-1.322290,1.042330,-0.317148,1.647970,1.306579,1.028805,1.440765,1.852378,0.664507,-1.094021,-1.151797,358500.0,0,0,1,0
2,-1.332272,1.037649,1.278635,-0.449269,-1.097237,-1.106310,-1.097204,1.566803,1.698679,-0.046750,-1.907645,352100.0,0,0,1,0
3,-1.337263,1.037649,1.278635,-0.636892,-0.807164,-0.946849,-0.806809,1.032234,0.410329,-0.397813,-0.460893,341300.0,0,0,1,0
4,-1.337263,1.037649,1.278635,-0.311578,-0.568070,-0.929971,-0.578013,0.217704,0.689924,-0.970832,-0.744713,342200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.010295,-0.280869,-0.173049,-0.385030,-0.247600,-1.699286,-0.107796,-0.379522,0.345114,78100.0,0,1,0,0
20636,-0.818192,1.805314,-0.588446,-1.439127,-1.419826,-1.555309,-1.697234,-0.649904,0.591370,0.352900,0.169904,77100.0,0,1,0,0
20637,-0.823183,1.777229,-0.689042,0.121998,0.181616,-0.147572,0.122885,-1.517086,0.005891,-0.734688,0.169238,92300.0,0,1,0,0
20638,-0.873094,1.777229,-0.588446,-0.133557,-0.050967,-0.562839,-0.171252,-1.317757,0.091553,-1.070716,0.258003,84700.0,0,1,0,0


## Drop features

In [18]:
# Fourth dataset: outlier-free copy of the cleaned data plus the ratio features
data4 = new_features(remove_outliers(data.copy()))

Rimuoviamo le features correlate

In [19]:
# Remove the correlated features in a single non-mutating call and rebind the
# name, instead of altering the frame in place four times
data4 = data4.drop(columns=["total_bedrooms", "longitude", "households", "population"])

In [20]:
# Standardise, one-hot encode and persist the reduced-feature dataset;
# the PCA section reloads this file as dataE
data_norm4 = norm_dataset(data4)
data_norm4 = pd.get_dummies(data_norm4, drop_first=drop)
data_norm4.to_pickle("./removed.pkl")
data_norm4

Unnamed: 0,latitude,housing_median_age,total_rooms,median_income,rooms_per_household,population_per_household,bedrooms_per_room,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2,0.993310,1.842965,-0.622635,2.550540,1.328392,-0.148532,-1.552226,352100.0,0,0,1,0
3,0.993310,1.842965,-0.799910,1.432248,0.226272,-0.388755,-0.560015,341300.0,0,0,1,0
4,0.993310,1.842965,-0.475671,0.187461,0.433468,-0.734927,-0.783271,342200.0,0,0,1,0
5,0.993310,1.842965,-1.125986,0.319497,-0.244631,-0.774195,0.294458,269700.0,0,0,1,0
6,0.988695,1.842965,0.358348,0.057849,-0.168690,-0.785050,-0.407578,299200.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.745630,-0.367534,-0.440767,-1.396077,-0.118041,-0.376796,0.165352,78100.0,0,1,0,0
20636,1.750245,-0.940626,-1.329898,-0.705760,0.358611,0.154255,-0.004703,77100.0,0,1,0,0
20637,1.722552,-1.022496,0.100243,-1.299301,-0.046632,-0.598748,-0.005335,92300.0,0,1,0,0
20638,1.722552,-0.940626,-0.261655,-1.183474,0.008666,-0.789958,0.079894,84700.0,0,1,0,0


## PCA

In [3]:
from transform import PCA
from model import RidgeRegression
from selection import KFoldCV
import pandas as pd

def normalize(X):
    """Standardise ``X``: subtract its mean, then divide by its standard deviation.

    Same formula as the ``normalize`` defined in the first section of the
    notebook; ``prepare_for_pca`` applies it to one column at a time.
    """
    centered = X - X.mean()
    return centered / X.std()

def prepare_for_pca(data):
    """Build the PCA input: drop the target and standardise the dummy columns.

    Every column whose name starts with ``ocean_proximity_`` is standardised
    with ``normalize``. The columns are discovered from the frame rather than
    taken from a fixed list of four names, so a frame encoded with
    ``drop_first=True`` (one indicator fewer) no longer raises ``KeyError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``median_house_value``; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without the target column.
    """
    # drop() returns a new frame, so the assignments below never touch the input
    data = data.drop("median_house_value", axis=1)
    cols = [c for c in data.columns if str(c).startswith("ocean_proximity_")]
    for c in cols:
        # cast first: recent pandas versions emit bool indicators from get_dummies
        data[c] = normalize(data[c].astype(float))
    return data

def quadratic_loss(y_true, y_pred):
    """Squared error between ``y_true`` and ``y_pred`` (element-wise for arrays/Series)."""
    residual = y_true - y_pred
    return residual ** 2

# (name, frame) pairs reloaded from the pickles written by the
# dataset-preparation cells of this notebook
datasets = [
    ("base", pd.read_pickle("./base.pkl")),
    ("base_no_out", pd.read_pickle("./base_no_out.pkl")),
    ("add_features", pd.read_pickle("./add_features.pkl")),
    ("log", pd.read_pickle("./log.pkl")),
    ("removed", pd.read_pickle("./removed.pkl")),
]

# individual handles used by the cells below
dataA, dataB, dataC, dataD, dataE = datasets

In [4]:
# Column count of the "log" frame (the target column is included)
len(dataD[1].columns)

16

In [5]:
# candidate numbers of principal components for each dataset:
# 5 up to the frame's column count minus one
ranges = [range(5, len(frame.columns)) for _, frame in datasets]
rangeA, rangeB, rangeC, rangeD, rangeE = ranges

In [18]:
# Use the "log" dataset (dataD) to compare scikit-learn's PCA with the
# project PCA in the cells below
d = dataD[1]
y = (d)["median_house_value"]
x = prepare_for_pca(d)

# number of principal components requested from both PCA implementations
n = 15

folds = 5

# NOTE(review): shuffle=100 looks like a shuffling seed and alfa=0 like
# "no regularisation" — confirm against selection.KFoldCV / model.RidgeRegression
kfolds = KFoldCV(folds, print=False, shuffle=100)
ridge = RidgeRegression(alfa=0)
x.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-1.433214e-12,-7.008686e-14,-5.013743e-14,-1.350646e-15,-4.615734e-14,1.707216e-14,-5.982536e-15,-1.978714e-14,3.801976e-14,9.737128e-15,1.409032e-14,-1.96147e-14,2.192583e-15,2.698087e-15,-1.667198e-14
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.385388,-1.447902,-5.675353,-9.225142,-8.257619,-8.021746,-8.156723,-4.117763,-6.606475,-5.20593,-2.965787,-0.8902173,-0.6826966,-0.3535155,-0.384596
25%,-1.112665,-0.7972585,-0.5884457,-0.4647729,-0.4922366,-0.4813001,-0.4716852,-0.6441781,-0.5718786,-0.5741738,-0.6662696,-0.8902173,-0.6826966,-0.3535155,-0.384596
50%,0.5393794,-0.6427892,0.2509165,0.04485955,0.03313758,0.0509081,0.0451145,0.03933333,0.02348674,-0.0266281,-0.0657507,-0.8902173,-0.6826966,-0.3535155,-0.384596
75%,0.7789508,0.9767974,0.679679,0.5642234,0.5749063,0.5787856,0.5768259,0.6634723,0.554482,0.5358214,0.6129322,1.123266,1.464708,-0.3535155,-0.384596
max,2.625647,2.956812,1.278635,3.924834,3.711859,4.682533,3.72667,3.109466,12.03649,22.44428,6.454706,1.123266,1.464708,2.828592,2.600004


In [22]:
# Reference run with scikit-learn's PCA, imported under the alias `pcab`
# (the name PCA is bound to the project class imported from transform)
from sklearn.decomposition import PCA as pcab
pca1 = pcab(n_components=n)
principalComponents = pca1.fit_transform(x)
x1 = pd.DataFrame(data = principalComponents)
# y's index is reset so that it lines up with the fresh 0..len-1 index of x1
data1 = pd.concat([x1,y.reset_index(drop=True)], axis= 1)

In [23]:
# Quadratic-loss result of cross_validate for the ridge model on the
# scikit-learn components
err = kfolds.cross_validate(ridge, data1, "median_house_value", quadratic_loss)
err

50388236149.416534

In [9]:
# Variance explained by each of the n scikit-learn components
pca1.explained_variance_

array([3.94662413e+00, 2.75322221e+00, 2.37962201e+00, 1.68181476e+00,
       1.30724778e+00, 9.90821554e-01, 7.48209607e-01, 6.81174681e-01,
       3.30648234e-01, 1.55638348e-01, 2.49766849e-02, 1.64397639e-30,
       1.41237429e-30, 3.30027668e-31, 1.82570569e-31])

In [19]:
# Same experiment with the project PCA class imported from transform
pca2 = PCA(x)
x2 = pd.DataFrame(pca2.get_principal_components(n))
# y's index is reset so that it lines up with the fresh 0..len-1 index of x2
data2 = pd.concat([x2,y.reset_index(drop=True)], axis= 1)

In [20]:
# Eigenvalues exposed by the project PCA, to compare with the scikit-learn
# explained variances
pca2.eigenvalues

array([ 3.94662413e+00,  2.75322221e+00,  2.37962201e+00,  1.68181476e+00,
        1.30724778e+00,  9.90821554e-01,  7.48209607e-01,  6.81174681e-01,
        3.30648234e-01,  1.55638348e-01,  2.49766849e-02,  7.72031614e-16,
        3.84329916e-16, -2.23028814e-16, -5.96106543e-16])

In [21]:
# Quadratic-loss result of cross_validate for the ridge model on the
# project-PCA components
err = kfolds.cross_validate(ridge, data2, "median_house_value", quadratic_loss)
err

5185062422.714296

In [30]:
folds = 5
kfolds = KFoldCV(folds, print=False, shuffle=100)
ridge = RidgeRegression(alfa=0)

# For every dataset, cross-validate the ridge model for each candidate number
# of principal components and report the candidate with the lowest error.
# Note: `data`, `x` and `y` are rebound here, as in the original cell.
for i, d in enumerate(datasets):
    print(d[0])
    # The target, the standardised inputs and the PCA object do not depend on
    # the number of components, so they are built once per dataset instead of
    # once per candidate.
    # NOTE(review): assumes PCA.get_principal_components() does not alter the
    # PCA object between calls — confirm in transform.py
    y = (d[1])["median_house_value"]
    pca_input = prepare_for_pca(d[1])
    pca = PCA(pca_input)
    errors = []
    for el in ranges[i]:
        x = pd.DataFrame(pca.get_principal_components(el))
        # reset y's index so it lines up with the fresh 0..len-1 index of x
        data = pd.concat([x, y.reset_index(drop=True)], axis=1)

        err = kfolds.cross_validate(ridge, data, "median_house_value", quadratic_loss)
        print("{}\t{}".format(el, round(err, 5)))
        errors.append(err)
    min_value = min(errors)
    idx_min = errors.index(min_value)
    print("best num. components:\t{}\n".format(ranges[i][idx_min]))

base
5	48727110384.92296
6	48537161197.68483
7	48127124891.60178
8	47861999139.10093
9	47692805690.06791
10	47600495276.59445
11	47592701235.93602
12	15455769706.5532
best num. components:	12

base_no_out
5	39165866076.81865
6	39137372715.84243
7	38891456415.993
8	38659655204.28956
9	38432743894.03438
10	38381399335.8666
11	38345580761.98764
12	21732606979.81872
best num. components:	12

add_features
5	39312366652.87155
6	39302406063.58328
7	39250143385.88006
8	39245126260.51076
9	39245397166.12169
10	38547861328.37034
11	38448156031.06728
12	38408358888.04289
13	38312066939.85928
14	38315536472.41677
15	36872946397.6464
best num. components:	15

log
5	49115248506.0834
6	48931444101.8196
7	48927089616.07238
8	48931401965.51237
9	48519724913.94859
10	48199254146.76548
11	48030405137.55788
12	5187637308.686
13	5186423684.22964
14	5186532159.39939
15	5185062422.7143
best num. components:	15

removed
5	39393707471.63576
6	39401316691.1337
7	39404134583.20958
8	39318437220.4476
9	3931195913

Creiamo quindi le varie versioni da usare successivamente con il numero di componenti migliore trovato dalla precedente analisi

In [5]:
# Project every dataset onto n_comp principal components and save the result
# as "PCA<name>.pkl"; the projected frames are also collected in `da`.
# NOTE(review): the text above this cell says the best number of components
# found by the previous analysis is used, but n_comp is fixed at 5 for every
# dataset — confirm which value is intended.
n_comp =5
da = list()
for d in datasets:
    y = (d[1])["median_house_value"]
    x = prepare_for_pca(d[1])
    pca = PCA(x)
    x = pd.DataFrame(pca.get_principal_components(n_comp))
    # reset y's index so it lines up with the fresh 0..len-1 index of x
    data = pd.concat([x,y.reset_index(drop=True)], axis= 1)
    data.to_pickle("PCA{}.pkl".format(d[0]))
    da.append(data)