In [1]:
import numpy
import pandas
import seaborn
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import learning_curve, validation_curve, cross_val_score, train_test_split, KFold, GridSearchCV

from plotting import Plotting
from cholesky import Cholesky
from lasso import Lasso

column_to_predict = 'median_house_value'
categories_columns = ['ocean_proximity']
numerics_columns = ["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]



In [2]:
data_frame = pandas.read_csv(filepath_or_buffer='cal-housing.csv')

outliers = data_frame[data_frame[column_to_predict] == 500001].index

print(f'There are {len(outliers)} outliers')

data_frame.drop(outliers, inplace=True)

for c in data_frame.columns:
    if data_frame[c].hasnans:
        m = data_frame[c].mean()
        data_frame[c].fillna(value=m, inplace=True)
        
useLabelEncoder = False

if useLabelEncoder:
    labelencoder = LabelEncoder()

    for c in categories_columns:
        c_name = c + '_cat'
        data_frame[c_name] = labelencoder.fit_transform(data_frame[c])
        numerics_columns.append(c_name)

    data_frame.drop(columns=categories_columns, inplace=True)    
else:
    data_frame['ocean_proximity'].replace(['INLAND', '<1H OCEAN', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'], [1, 10, 50, 100, 500], inplace=True)
    numerics_columns.append('ocean_proximity')

data_frameOriginal = data_frame.copy(deep=True)  

columns_to_remove = []
columns_to_use = list(data_frame.columns)

for u in columns_to_remove:
    columns_to_use.remove(u)
    if numerics_columns.count(u) > 0:
        numerics_columns.remove(u)
        
data_frame.drop(columns=columns_to_remove, inplace=True)

X = data_frame[numerics_columns]
y = data_frame[column_to_predict]

useMinMaxScaler = True
if useMinMaxScaler:
    column_to_predict_idx = data_frame.columns.get_loc(column_to_predict)
    cols = list(range(0, data_frame.shape[1]))
    cols.remove(column_to_predict_idx)

    scaler = MinMaxScaler()
    scaler.fit(data_frame)
    data_frame = scaler.transform(data_frame)    
    
data_frame = pandas.DataFrame(data_frame, columns=columns_to_use)

X = data_frame[numerics_columns]
y = data_frame[column_to_predict]



There are 965 outliers


In [3]:
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,0.211155,0.567481,0.784314,0.022331,0.019711,0.008941,0.020395,0.539668,0.198397
1,0.212151,0.565356,0.392157,0.180503,0.171349,0.067210,0.186842,0.538027,0.198397
2,0.210159,0.564293,1.000000,0.037260,0.029179,0.013818,0.028783,0.466028,0.198397
3,0.209163,0.564293,1.000000,0.032352,0.036163,0.015555,0.035691,0.354699,0.198397
4,0.209163,0.564293,1.000000,0.041330,0.043148,0.015752,0.042270,0.230776,0.198397
...,...,...,...,...,...,...,...,...,...
19670,0.324701,0.737513,0.470588,0.042296,0.057737,0.023599,0.053947,0.073130,0.000000
19671,0.312749,0.738576,0.333333,0.017676,0.022971,0.009894,0.018421,0.141853,0.000000
19672,0.311753,0.732200,0.313725,0.057277,0.074965,0.028140,0.070888,0.082764,0.000000
19673,0.301793,0.732200,0.333333,0.047256,0.063169,0.020684,0.057072,0.094295,0.000000


In [4]:
y

0        0.902268
1        0.708248
2        0.695052
3        0.672784
4        0.674640
           ...   
19670    0.130105
19671    0.128043
19672    0.159383
19673    0.143713
19674    0.153404
Name: median_house_value, Length: 19675, dtype: float64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

pandas.Series(pca.explained_variance_ratio_ * 100)

0    49.666685
1    34.384375
2     6.578611
3     5.738423
4     2.427058
5     0.936327
6     0.118943
7     0.106750
8     0.042829
dtype: float64

In [6]:
alphas = numpy.sort(numpy.linspace(1, 0, 20, False))

for C in range(1, 10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    pandas.DataFrame(X_train)
    pca = PCA(n_components=C)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    pandas.Series(pca.explained_variance_ratio_ * 100)

    cholesky = Cholesky()
    cholesky.calculateScoring(alphas, X_train, y_train, X_test, y_test)
    print('Componenti', C)
    cholesky.printBestScores()

Componenti 1
Cholesky:
-best ɑ: 0.04999999999999993
-best MSE: 0.040279874366137704
-best R²: 0.0031805808765876797
Componenti 2
Cholesky:
-best ɑ: 0.04999999999999993
-best MSE: 0.03976858524730072
-best R²: 0.008212004825167374
Componenti 3
Cholesky:
-best ɑ: 0.04999999999999993
-best MSE: 0.01811645504372248
-best R²: 0.5394231009851475
Componenti 4
Cholesky:
-best ɑ: 0.04999999999999993
-best MSE: 0.020392140652673005
-best R²: 0.5151721011439927
Componenti 5
Cholesky:
-best ɑ: 0.04999999999999993
-best MSE: 0.017831901285628067
-best R²: 0.5550920275467899
Componenti 6
Cholesky:
-best ɑ: 0.04999999999999993
-best MSE: 0.018595444006374653
-best R²: 0.5519344391850293
Componenti 7
Cholesky:
-best ɑ: 0.04999999999999993
-best MSE: 0.017096217124522663
-best R²: 0.565834047879429
Componenti 8
Cholesky:
-best ɑ: 0.09999999999999998
-best MSE: 0.01676032197292137
-best R²: 0.5888655881752769
Componenti 9
Cholesky:
-best ɑ: 0.1499999999999999
-best MSE: 0.016259365051663196
-best R²: 0.