## Boston Housing Challenge - Spiced 2022 - Valentin Lorenzen


In [1]:
# loading libraries

from sklearn.datasets import fetch_california_housing

import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

import statistics
import math



In [2]:

housing = fetch_california_housing()

In [3]:
df = pd.DataFrame(housing.data, columns=housing.feature_names)

In [4]:
df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
X = df
y = housing.target

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [7]:
initial_transformation = ColumnTransformer(
    [
        (("StandardScaler"), StandardScaler(), X.columns)
    ]
)

In [8]:
X_train_FE = pd.DataFrame(initial_transformation.fit_transform(X_train), columns=initial_transformation.get_feature_names_out())
X_test_FE = pd.DataFrame(initial_transformation.transform(X_test), columns=initial_transformation.get_feature_names_out())

In [12]:
X_train_FE

Unnamed: 0,StandardScaler__MedInc,StandardScaler__HouseAge,StandardScaler__AveRooms,StandardScaler__AveBedrms,StandardScaler__Population,StandardScaler__AveOccup,StandardScaler__Latitude,StandardScaler__Longitude
0,0.176489,0.666407,-0.060854,-0.281118,-0.496544,-0.048283,-0.860741,0.730999
1,0.771374,1.062889,0.383521,-0.020378,-0.411672,-0.058729,0.709168,-1.196710
2,0.021424,0.587111,0.278523,-0.084625,-0.626915,-0.072838,1.313701,-1.551288
3,1.309004,0.269925,0.233072,-0.243135,-0.234928,-0.069768,-0.696721,0.586171
4,-0.793136,-0.919520,-0.264184,-0.139881,-0.631290,-0.085141,1.369936,-0.872096
...,...,...,...,...,...,...,...,...
15475,1.308847,0.507814,0.286357,-0.388828,-0.673288,-0.006223,-0.874800,0.810904
15476,-0.435101,0.349222,0.591764,0.393370,0.284806,0.066712,-0.762329,1.075590
15477,-0.495824,0.587111,-0.598322,-0.039164,0.287431,0.018854,-0.757643,0.601153
15478,0.966991,-1.078113,0.396334,-0.065869,0.305805,0.006042,0.905993,-1.186722


In [9]:
models = [
LinearRegression(),
Ridge(),
Lasso(alpha=0.1),
ElasticNet(),
PoissonRegressor()
]

In [10]:
# Cross-validation for different models

for model in models:

    cv = KFold(n_splits=5, shuffle=True, random_state=1)

    scores = cross_val_score(model, X_train_FE, y_train, scoring='r2', cv=cv, n_jobs=-1)

    scores = abs(scores)

    print(f'Validation accuracy scores for {model}: {np.round(scores,2)}')
    print(f'mean: {scores.mean():.2}')
    print(f'std: {scores.std():.2}\n')

Validation accuracy scores for LinearRegression(): [0.6  0.63 0.61 0.61 0.59]
mean: 0.61
std: 0.014

Validation accuracy scores for Ridge(): [0.6  0.63 0.61 0.61 0.59]
mean: 0.61
std: 0.014

Validation accuracy scores for Lasso(alpha=0.1): [0.48 0.51 0.49 0.51 0.48]
mean: 0.49
std: 0.014

Validation accuracy scores for ElasticNet(): [0.21 0.21 0.2  0.2  0.21]
mean: 0.21
std: 0.0016

Validation accuracy scores for PoissonRegressor(): [0.44 0.46 0.44 0.45 0.43]
mean: 0.45
std: 0.012



In [11]:
# using GridSeachCV with different models and parameters

models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'PoissonRegressor': PoissonRegressor(),
    'Elastic_Net': ElasticNet(),
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor()
}

params = {
    'Ridge': {
        'alpha': [0.1,0.3,0.5,0.7,2,5,10,20,50,100]
    },
    'Lasso': {
        'max_iter': [100,500,1000],
        'alpha': [0.1,0.3,0.5,0.7,2,5,10,20,50,100]

    },
    "PoissonRegressor": {
        'max_iter': [100,500,1000],
        'alpha': [0.1,0.3,0.5,0.7,2,5,10,20,50,100]
    },
    "Elastic_Net": {
        'l1_ratio': [0.1,0.3,0.5,0.7,0.9],
        'alpha': [1,2,5,8,10,20,30]
    },
    'LinearRegression': {
    },
    'RandomForestRegressor': {
        'max_depth': [15],
        'n_estimators': [20],
        'min_samples_split': [2,3]
    }
}

def fit(train_features, train_actuals):
        for name in models.keys():
            est = models[name]
            est_params = params[name]
            gscv = GridSearchCV(estimator=est, param_grid=est_params, cv=5, scoring="r2")
            gscv.fit(train_features, train_actuals)
            print("best parameters are: {}".format(gscv.best_estimator_))
            print("best score is: {}\n".format(gscv.best_score_))

fit(X_train_FE, y_train)

best parameters are: Ridge(alpha=10)
best score is: 0.6071667315464089

best parameters are: Lasso(alpha=0.1, max_iter=100)
best score is: 0.4936902501001167

best parameters are: PoissonRegressor(alpha=0.1)
best score is: 0.49870861701902164

best parameters are: ElasticNet(alpha=1, l1_ratio=0.1)
best score is: 0.3474960789416772

best parameters are: LinearRegression()
best score is: 0.607159968693202

best parameters are: RandomForestRegressor(max_depth=15, min_samples_split=3, n_estimators=20)
best score is: 0.7911230672169487



In [21]:
rf_m = RandomForestRegressor(n_estimators=700, max_depth=15, min_samples_split=3, n_jobs=-1)
rf_m.fit(X_train_FE, y_train)

RandomForestRegressor(max_depth=15, min_samples_split=3, n_estimators=700,
                      n_jobs=-1)

In [22]:
rf_m.score(X_train_FE, y_train)

0.9556598846728522

In [23]:
rf_m.score(X_test_FE,y_test)

0.8060434817847564

In [28]:
#cv = KFold(n_splits=5, shuffle=True, random_state=42)
#scores = cross_val_score(rf_m, X_train, y_train, cv=cv, scoring="r2")

print(f'Validation-scores: {np.round(scores,2)}')
print(f'mean: {scores.mean():.2}')
print(f'standard diviation: {scores.std():.2}')

Validation-scores: [0.81 0.8  0.79 0.81 0.81]
mean: 0.8
standard diviation: 0.0087
