In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [4]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [5]:
train['LogSalePrice'] = np.log(train['SalePrice'])
test['LogSalePrice'] = np.log(test['SalePrice'])

In [6]:
comp_dict = train.groupby(['Neighborhood', 'BedroomAbvGr', 'BldgType',
               'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars']).mean()['LogSalePrice'].to_dict()

In [7]:
train['Comp'] = train.apply(lambda x: comp_dict[(x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars'])], axis=1)

In [8]:
alt_dict = train.groupby('Neighborhood').mean()['LogSalePrice'].to_dict()

In [9]:
def test_comp(x):
    if (x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars']) in comp_dict.keys():
        return comp_dict[(x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars'])]
    else:
        return alt_dict[x['Neighborhood']]    

In [10]:
test['Comp'] = test.apply(lambda x: test_comp(x), axis=1)

In [11]:
X_train = train.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_train = train['LogSalePrice']
X_test = test.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_test = test['LogSalePrice']

In [12]:
print(X_train.shape)
print(y_train.shape)
print('\n')
print(X_test.shape)
print(y_test.shape)

(1871, 78)
(1871,)


(624, 78)
(624,)


In [14]:
categorical = train.select_dtypes(['object','bool']).columns.to_list()

## Lasso for selection.

In [16]:
def Lasso_select(X, y, alpha):

    pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
    X = pipe.fit_transform(X)

    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    cross = cross_val_score(Lasso(alpha=alpha, max_iter=5000), X, y, scoring='r2', cv=cv, n_jobs=-1)
    
    selector = SelectFromModel(Lasso(alpha=alpha, max_iter=5000))
    selector.fit(X,y)
    num_features = np.sum(selector.get_support())
    
    return cross, num_features

In [17]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X_train, y_train, alpha))

(array([0.96362675, 0.96098576, 0.96806863, 0.96801297, 0.95841441]), 222)
(array([0.96433379, 0.96144082, 0.96887477, 0.96885509, 0.95873809]), 207)
(array([0.96537433, 0.96217806, 0.97125059, 0.97002611, 0.95872382]), 134)
(array([0.9544198 , 0.95258039, 0.96265516, 0.96051672, 0.94733634]), 15)
(array([0.88160681, 0.87137783, 0.88883576, 0.87369729, 0.86858259]), 1)


In [19]:
for alpha in [0.008, 0.009, 0.01, 0.011, 0.012]:
    print(Lasso_select(X_train,y_train,alpha))

(array([0.95628956, 0.95418546, 0.96418006, 0.96214877, 0.94920834]), 21)
(array([0.95528527, 0.9534063 , 0.96327352, 0.96122883, 0.94820191]), 17)
(array([0.9544198 , 0.95258039, 0.96265516, 0.96051672, 0.94733634]), 15)
(array([0.9536512 , 0.95174698, 0.96203864, 0.95982093, 0.94645027]), 12)
(array([0.95280215, 0.95086563, 0.96135495, 0.95910569, 0.94567129]), 11)


## Will move forward on fifteen features corresponding to alpha=0.01.

In [20]:
pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
X = pipe.fit_transform(X_train)
y = y_train

selector = SelectFromModel(Lasso(alpha=0.01, max_iter=5000))
selector.fit(X,y)
mask = selector.get_support()
feat_names = pipe.named_steps['transformer'].get_feature_names()
names = [name for name, boo in zip(feat_names, mask) if boo]
names

['OverallQual',
 'YearRemodAdd',
 'ExterQual',
 'BsmtQual',
 'BsmtFinSF1',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageArea',
 'WoodDeckSF',
 'Comp']

## Notice no categorical selected among the fifteen.

## Ridge for robustness.

In [21]:
X = X_train[names]
y = y_train

pipe = Pipeline([('scaler', StandardScaler()),
                 ('ridge', Ridge())])


param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.cv_results_['mean_test_score'])
print(grid.best_params_)
print(grid.best_score_)

[0.95950596 0.95950565 0.9595013  0.95932458]
{'ridge__alpha': 0.001}
0.9595059596811562


In [23]:
X = X_train[names]
y = y_train

pipe = Pipeline([('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':np.linspace(0.00000001, 0.01, 1000)}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 1e-08}
0.9595059626202914


## It seems to function best without a penalty. We will see when we run on test...

In [24]:
X_train_ = X_train[names]
X_test_ = X_test[names] 

pipe = Pipeline([('scaler', StandardScaler()),
                 ('ols', LinearRegression())])

pipe.fit(X_train_, y_train)
print(f'Train score is {pipe.score(X_train_, y_train)}')
print(f'Train score is {pipe.score(X_test_, y_test)}')

Train score is 0.9601654790194283
Train score is 0.8140708068282201


## What happens if we add a Ridge penalty?

In [32]:
X_train_ = X_train[names]
X_test_ = X_test[names] 

pipe = Pipeline([('scaler', StandardScaler()),
                 ('ridge', Ridge(alpha=1))])

pipe.fit(X_train_, y_train)
print(f'Train score is {pipe.score(X_train_, y_train)}')
print(f'Test score is {pipe.score(X_test_, y_test)}')

Train score is 0.9601643985198131
Test score is 0.8148869860526615


## What happens if we jettison the comp?

In [30]:
names_nocomp = [name for name in names if name != 'Comp']

## First go back to OLS.

In [31]:
X_train_ = X_train[names_nocomp]
X_test_ = X_test[names_nocomp] 

pipe = Pipeline([('scaler', StandardScaler()),
                 ('ols', LinearRegression())])

pipe.fit(X_train_, y_train)
print(f'Train score is {pipe.score(X_train_, y_train)}')
print(f'Test score is {pipe.score(X_test_, y_test)}')

Train score is 0.8975330681352833
Train score is 0.8620809626608303


## Ceiling is lower now, but test score increased.