In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from transform import PCA

## Elaborazione del dataset

Leggiamo i dati dal file CSV

In [2]:
# Load the housing dataset from the CSV file in the working directory
# into a DataFrame; the bare `data` expression displays it as the cell output.
data = pd.read_csv("cal-housing.csv")
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


Dalle precedenti analisi sostituiamo nei record il valore ISLAND con NEAR OCEAN

In [3]:
# Fold the "ISLAND" category into "NEAR OCEAN", then show how many
# records remain in each ocean_proximity category.
island_mapping = {"ocean_proximity": {"ISLAND": "NEAR OCEAN"}}
data = data.replace(island_mapping)
data["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2663
NEAR BAY      2290
Name: ocean_proximity, dtype: int64

Gestiamo i valori NaN sostituendoli con la mediana della rispettiva colonna

In [4]:
# Fill missing values with the median of their column.
# `numeric_only=True` keeps the string column `ocean_proximity` out of the
# median computation: without it, pandas >= 2.0 raises a TypeError on the
# non-numeric column, while older versions silently skipped it.
data = data.fillna(data.median(numeric_only=True))

Aggiungiamo delle feature derivandole da quelle che già abbiamo

In [5]:
# Build a new frame with three ratio features appended after the
# original columns; `assign` returns a copy, so `data` is left untouched.
data1 = data.assign(
    rooms_per_household=data['total_rooms'] / data['households'],
    population_per_household=data['population'] / data['households'],
    bedrooms_per_room=data['total_bedrooms'] / data['total_rooms'],
)
data1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,2.555556,0.146591
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,2.109842,0.155797
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,2.802260,0.129516
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,2.547945,0.184458
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,2.181467,0.172096
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,5.045455,2.560606,0.224625
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,6.114035,3.122807,0.215208
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,5.205543,2.325635,0.215173
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,5.329513,2.123209,0.219892


Trasformiamo le colonne skewed applicando il logaritmo naturale

In [6]:
# Columns that receive a natural-log transform.
features = [
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'rooms_per_household',
    'population_per_household',
    'bedrooms_per_room',
]
# Apply the log to all listed columns in one vectorised assignment.
data1[features] = np.log(data1[features])

Trasformiamo la colonna ocean_proximity con il one-hot encoding

In [7]:
# One-hot encode the categorical column; each category becomes an
# `ocean_proximity_<category>` indicator column appended after the others.
data1 = pd.get_dummies(data1, columns=['ocean_proximity'])

Normalizziamo il dataset

In [8]:
def normalize(X):
    """Standardise X by subtracting its mean and dividing by its standard deviation.

    The statistics come from ``X.mean()`` and ``X.std()``, so for a pandas
    DataFrame they are computed per column.

    Parameters
    ----------
    X : object supporting ``mean()``, ``std()`` and arithmetic
        (the notebook passes a pandas DataFrame).

    Returns
    -------
    The centred and scaled data, of the same kind as ``X``.
    """
    centred = X - X.mean()
    scale = X.std()
    return centred / scale

# Standardise every column of the encoded dataset and display the result.
data1 = data1.pipe(normalize)
data1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327803,1.052523,0.982119,-1.129227,-1.634187,-1.691975,-1.561273,1.858741,1.647905,1.077249,-0.388000,-1.350893,-0.891134,-0.681873,2.830673,-0.384872
1,-1.322812,1.043159,-0.607004,1.648799,1.312839,1.028436,1.440819,1.852658,1.238371,0.665970,-1.095945,-1.110907,-0.891134,-0.681873,2.830673,-0.384872
2,-1.332794,1.038478,1.856137,-0.449216,-1.103109,-1.106991,-1.097651,1.567092,1.206721,1.700529,-0.047591,-1.838866,-0.891134,-0.681873,2.830673,-0.384872
3,-1.337785,1.038478,1.856137,-0.636909,-0.811572,-0.947507,-0.807199,1.032541,1.151982,0.411698,-0.399017,-0.445496,-0.891134,-0.681873,2.830673,-0.384872
4,-1.337785,1.038478,1.856137,-0.311474,-0.571272,-0.930626,-0.578358,0.218039,1.156610,0.691397,-0.972628,-0.718844,-0.891134,-0.681873,2.830673,-0.384872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758808,1.801603,-0.289180,-0.280754,-0.174256,-0.385606,-0.247879,-1.698888,-1.439271,-0.106621,-0.380708,0.330772,-0.891134,1.466479,-0.353256,-0.384872
20636,-0.818702,1.806285,-0.845373,-1.439441,-1.427326,-1.556056,-1.697800,-0.649541,-1.461914,0.592807,0.352472,0.162026,-0.891134,1.466479,-0.353256,-0.384872
20637,-0.823693,1.778194,-0.924829,0.122262,0.182199,-0.148113,0.122679,-1.516694,-1.145748,0.007109,-0.736241,0.161385,-0.891134,1.466479,-0.353256,-0.384872
20638,-0.873605,1.778194,-0.845373,-0.133387,-0.051559,-0.563441,-0.171517,-1.317372,-1.296729,0.092802,-1.072616,0.246875,-0.891134,1.466479,-0.353256,-0.384872


In [9]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# not executed; the cell only echoes the string. It would min-max scale
# `median_house_value` and then centre it on its mean.
'''
y = data1["median_house_value"].copy()
y = (y - y.min())/(y.max() - y.min())
y = y - y.mean()
y
'''

'\ny = data1["median_house_value"].copy()\ny = (y - y.min())/(y.max() - y.min())\ny = y - y.mean()\ny\n'

Rimuoviamo gli outliers

In [10]:
# Drop outlier rows: a row is kept only if the absolute value of every
# column is at most `threshold`.
threshold = 3

buoni = data1.abs().le(threshold).all(axis=1)  # True for rows to keep
data1 = data1.loc[buoni]
len(data1)

19784

Salviamo il risultato in un file

In [11]:
# Persist the preprocessed, outlier-filtered frame to a pickle file
# in the current directory.
data1.to_pickle("./elaborated.pkl")

## PCA

Usiamo la PCA per ridurre la dimensione del dataset: dalle precedenti analisi risulta che con 8 componenti riusciamo a spiegare il 90% della varianza del dataset

In [12]:
# Separate the target from the features before the reduction.
target = data1["median_house_value"].copy()
data2 = data1.drop(columns=["median_house_value"])

# PCA comes from the local `transform` module; presumably
# get_principal_components(8) returns the data projected onto the first
# 8 components -- TODO confirm against transform.PCA.
pca = PCA(data2)
mat_reduced = pca.get_principal_components(8)

# Re-attach the target. Its index is reset so it lines up positionally with
# the freshly built frame rather than with data1's filtered row labels.
principal_df = pd.DataFrame(mat_reduced)
target_realigned = target.reset_index(drop=True)
principal_df = pd.concat([principal_df, target_realigned], axis=1)
principal_df

Unnamed: 0,0,1,2,3,4,5,6,7,median_house_value
0,3.886217,-0.999883,2.467829,2.147896,0.140422,-0.460955,-0.912723,-1.035052,1.647905
1,-0.891984,-4.091257,0.631903,2.511098,0.087095,-0.126467,-0.909246,-1.111115,1.238371
2,3.237974,-1.550007,2.511398,2.405453,0.054578,-1.300365,-1.240556,-0.089454,1.206721
3,3.097929,-1.068674,0.680424,2.535607,0.016036,-0.829969,-1.171175,-0.297641,1.151982
4,2.896971,-1.376928,0.383572,2.485920,0.121305,-0.470301,-1.478559,-0.074083,1.156610
...,...,...,...,...,...,...,...,...,...
19779,1.731325,-1.199188,-1.460065,-1.603710,-0.488312,0.619868,0.290549,0.394080,-1.439271
19780,3.421723,-0.312679,0.362702,-2.073060,-0.511892,0.558822,0.870312,-0.487486,-1.461914
19781,1.058237,-1.817701,-1.358985,-1.627500,-0.411196,1.017995,0.319969,0.132137,-1.145748
19782,1.548289,-1.607404,-1.121617,-1.563860,-0.287870,1.394804,0.184682,0.110734,-1.296729


In [13]:
# Persist the reduced components together with the target column
# to a pickle file in the current directory.
principal_df.to_pickle("./pca.pkl")