In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Binarizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [51]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [52]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [53]:
train['LogSalePrice'] = np.log(train['SalePrice'])
test['LogSalePrice'] = np.log(test['SalePrice'])

In [54]:
categorical = train.select_dtypes(['object','bool']).columns.to_list() + ['MSSubClass']

In [55]:
binary = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

In [56]:
def my_binarizer(y):
    if y > 0:
        return 1
    else:
        return 0

In [57]:
for col in binary:
    train[col] = train.apply(lambda x: my_binarizer(x[col]), axis=1)

In [58]:
for col in binary:
    test[col] = test.apply(lambda x: my_binarizer(x[col]), axis=1)

In [59]:
X_train = train.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_train = train['LogSalePrice']
X_test = test.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_test = test['LogSalePrice']

### Note we are keeping various features related to the sale. The justification of some is clearer than others.

## We first build a single basic model to be trained on all of X_train. 

## (all) Lasso for selection.

In [39]:
def Lasso_select(X, y, alpha):

    pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
    X = pipe.fit_transform(X)

    cv = KFold(n_splits=4, shuffle=True, random_state=42)

    cross = cross_val_score(Lasso(alpha=alpha, max_iter=10000), X, y, scoring='r2', cv=cv, n_jobs=-1)
    
    selector = SelectFromModel(Lasso(alpha=alpha, max_iter=10000))
    selector.fit(X,y)
    num_features = np.sum(selector.get_support())
    
    return cross, num_features

In [40]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X_train, y_train, alpha))

(array([0.93032442, 0.91956625, 0.93190727, 0.9443367 ]), 232)
(array([0.93236323, 0.92815946, 0.93383035, 0.94442936]), 224)
(array([0.93904337, 0.93584105, 0.94207856, 0.94622555]), 166)
(array([0.91723667, 0.91723178, 0.93191566, 0.93144335]), 48)
(array([0.67913054, 0.6968    , 0.70571303, 0.70438037]), 6)


In [41]:
for alpha in [0.006, 0.0062, 0.0064, 0.0066, 0.0068]:
    print(Lasso_select(X_train,y_train,alpha))

(array([0.93028147, 0.92783025, 0.94005857, 0.94031129]), 61)
(array([0.92976211, 0.92730672, 0.93976631, 0.93990238]), 61)
(array([0.9292223 , 0.92676318, 0.93946141, 0.93949715]), 59)
(array([0.9286454 , 0.92620633, 0.93914331, 0.93908662]), 57)
(array([0.92804877, 0.92564332, 0.93882027, 0.93866268]), 55)


## (all) We will try to keep the number of features below 60 so let's move ahead with Lasso_alpha = 0.0064.

### What are those features?

In [42]:
pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
X = pipe.fit_transform(X_train)
y = y_train

selector = SelectFromModel(Lasso(alpha=0.0064, max_iter=5000))
selector.fit(X,y)
mask = selector.get_support()
feat_names = pipe.named_steps['transformer'].get_feature_names()
names = [name for name, boo in zip(feat_names, mask) if boo]
names

['Cat__x0_C (all)',
 'Cat__x0_RM',
 'Cat__x2_Reg',
 'Cat__x5_CulDSac',
 'Cat__x7_ClearCr',
 'Cat__x7_Crawfor',
 'Cat__x7_Edwards',
 'Cat__x7_GrnHill',
 'Cat__x7_MeadowV',
 'Cat__x7_NoRidge',
 'Cat__x7_NridgHt',
 'Cat__x7_SawyerW',
 'Cat__x7_Somerst',
 'Cat__x7_StoneBr',
 'Cat__x8_Feedr',
 'Cat__x8_Norm',
 'Cat__x10_1Fam',
 'Cat__x10_Twnhs',
 'Cat__x12_Flat',
 'Cat__x14_BrkFace',
 'Cat__x14_PreCast',
 'Cat__x15_AsbShng',
 'Cat__x17_PConc',
 'Cat__x21_N',
 'Cat__x23_Maj1',
 'Cat__x23_Maj2',
 'Cat__x23_Typ',
 'Cat__x24_Attchd',
 'Cat__x28_Normal',
 'Cat__x29_30',
 'Cat__x29_160',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'PavedDrive',
 'OpenPorchSF',
 'Scree

## (all) Train a Ridge regression on the selected features.

In [43]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('ridge', Ridge())])


param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 0.001}
0.9399219121369596


In [44]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('ridge', Ridge())])


param_grid = {'ridge__alpha':np.linspace(0.0001, 0.01, 1000)}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 0.00011981981981981982}
0.939921922356055


## The model does not find a Ridge penalty useful (or does it now?). We will try Lasso instead.

In [47]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('lasso', Lasso())])


param_grid = {'lasso__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.001}
0.9400982382738722


In [48]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('lasso', Lasso())])


param_grid = {'lasso__alpha':np.linspace(0.0001, 0.01, 1000)}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.001100900900900901}
0.9401003840836091


## The gridsearch has selected a (update this)

### We now evaluate the chosen model on the test data.

In [66]:
pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.1, max_iter=10000))),
                 ('ridge', Ridge(alpha=0.001))])

pipe.fit(X_train, y_train)

print(f'The train score is {pipe.score(X_train, y_train)}')
print(f'The test score is {pipe.score(X_test, y_test)}')

The train score is 0.8298680385285429
The test score is 0.7941944633274168


In [61]:
pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('lasso', Lasso(alpha=0.0011))])

pipe.fit(X_train, y_train)

print(f'The train score is {pipe.score(X_train, y_train)}')
print(f'The test score is {pipe.score(X_test, y_test)}')

The train score is 0.9478405823119486
The test score is 0.9125538341505728


## This represents a sensible baseline. A Lasso model with 58 features (after one-hot encoding) and no engineering has a test score of 91.1%