In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [4]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [5]:
train['LogSalePrice'] = np.log(train['SalePrice'])
test['LogSalePrice'] = np.log(test['SalePrice'])

In [6]:
categorical = train.select_dtypes(['object','bool']).columns.to_list()

In [7]:
X_train = train.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_train = train['LogSalePrice']
X_test = test.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_test = test['LogSalePrice']

### Note we are keeping various features related to the sale. The justification of some is clearer than others.

## We first build a single basic model to be trained on all of X_train. 

## (all) Lasso for selection.

In [8]:
def Lasso_select(X, y, alpha):

    pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
    X = pipe.fit_transform(X)

    cv = KFold(n_splits=4, shuffle=True, random_state=42)

    cross = cross_val_score(Lasso(alpha=alpha, max_iter=5000), X, y, scoring='r2', cv=cv, n_jobs=-1)
    
    selector = SelectFromModel(Lasso(alpha=alpha, max_iter=5000))
    selector.fit(X,y)
    num_features = np.sum(selector.get_support())
    
    return cross, num_features

In [9]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X_train, y_train, alpha))

(array([0.93126349, 0.91993158, 0.93507105, 0.94147087]), 217)
(array([0.93221843, 0.92667699, 0.93687772, 0.94177989]), 204)
(array([0.93793251, 0.93477841, 0.94416997, 0.94436756]), 150)
(array([0.91648923, 0.91656477, 0.9321301 , 0.92995493]), 44)
(array([0.67913054, 0.69680002, 0.70571302, 0.7043804 ]), 6)


In [10]:
for alpha in [0.006, 0.0062, 0.0064, 0.0066, 0.0068]:
    print(Lasso_select(X_train,y_train,alpha))

(array([0.92854759, 0.92693649, 0.94027579, 0.93824267]), 63)
(array([0.92803659, 0.92643237, 0.93995957, 0.9378606 ]), 61)
(array([0.92750585, 0.92591258, 0.93964194, 0.93749254]), 58)
(array([0.92695612, 0.9253717 , 0.93932038, 0.93713178]), 56)
(array([0.92638571, 0.9248221 , 0.93899835, 0.93675714]), 55)


## (all) We will try to keep the number of features below 60 so let's move ahead with Lasso_alpha = 0.0064.

### What are those features?

In [11]:
pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
X = pipe.fit_transform(X_train)
y = y_train

selector = SelectFromModel(Lasso(alpha=0.01, max_iter=5000))
selector.fit(X,y)
mask = selector.get_support()
feat_names = pipe.named_steps['transformer'].get_feature_names()
names = [name for name, boo in zip(feat_names, mask) if boo]
names

['Cat__x0_C (all)',
 'Cat__x0_RM',
 'Cat__x2_Reg',
 'Cat__x7_ClearCr',
 'Cat__x7_Crawfor',
 'Cat__x7_Edwards',
 'Cat__x7_GrnHill',
 'Cat__x7_MeadowV',
 'Cat__x7_NoRidge',
 'Cat__x7_StoneBr',
 'Cat__x8_Norm',
 'Cat__x10_1Fam',
 'Cat__x10_Twnhs',
 'Cat__x14_BrkFace',
 'Cat__x14_PreCast',
 'Cat__x17_PConc',
 'Cat__x21_N',
 'Cat__x23_Typ',
 'Cat__x28_Normal',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'PavedDrive',
 'ScreenPorch']

## (all) Train a Ridge regression on the selected features.

In [12]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('ridge', Ridge())])


param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 0.001}
0.9383125666822049


In [13]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('ridge', Ridge())])


param_grid = {'ridge__alpha':[0.000001, 0.00001, 0.0001, 0.001]}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 1e-06}
0.938312581078951


## The model does not find a Ridge penalty useful. We will try Lasso instead.

In [14]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('lasso', Lasso())])


param_grid = {'lasso__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.001}
0.9385903006865761


In [15]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('lasso', Lasso())])


param_grid = {'lasso__alpha':[0.000001, 0.00001, 0.0001, 0.001]}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.001}
0.9385903006865761


In [16]:
X = X_train
y = y_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('lasso', Lasso())])


param_grid = {'lasso__alpha':np.linspace(0.0001, 0.01, 1000)}

cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.00119009009009009}
0.9385980623449289


## The gridsearch has selected a Lasso penalty with alpha=0.0012.

### We now evaluate the chosen model on the test data.

In [17]:
pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('selector', SelectFromModel(Lasso(alpha=0.0064, max_iter=10000))),
                 ('lasso', Lasso(alpha=0.0012))])

pipe.fit(X_train, y_train)

print(f'The train score is {pipe.score(X_train, y_train)}')
print(f'The test score is {pipe.score(X_test, y_test)}')

The train score is 0.9470352051499253
The test score is 0.9109436400780235


## This represents a sensible baseline. A Lasso model with 58 features (after one-hot encoding) and no engineering has a test score of 91.1%