In [1]:
## Load packages

In [2]:
import pandas as pd
import numpy as np
import helper
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# from sklearn.feature_selection import SelectFromModel

In [3]:
# Five custom hex colours, installed as the default seaborn palette.
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [4]:
# Widen pandas display limits so wide frames are shown without truncation:
# up to 500 columns and 100 rows.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

In [5]:
# Read the raw data; the first CSV column is used as the row index.
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0,low_memory = False)


# Project helper that returns the train/test frames used below.
# NOTE(review): presumably num_to_cat_list names numeric-coded columns that the
# helper converts to categorical -- confirm against helper.data_processing_wrapper.
train, test = helper.data_processing_wrapper(housing,
                                               num_to_cat_list = ['MSSubClass','MoSold'])





In [6]:
# Columns with object or bool dtype are treated as categorical; this list is
# later handed to the one-hot encoder.
cat_feats = train.select_dtypes(['object','bool']).columns.to_list()
cat_feats


['MSSubClass',
 'MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'CentralAir',
 'Electrical',
 'Functional',
 'GarageType',
 'Fence',
 'MiscFeature',
 'MoSold',
 'SaleType',
 'SaleCondition']

In [7]:
# Numeric (float64 / int64) predictor columns, in frame order.
num_cols = train.select_dtypes(['float64','int64']).columns.to_list()
# The target is not a predictor; remove() raises ValueError if it is absent.
num_cols.remove('SalePrice')

In [8]:
num_cols

['GrLivArea',
 'LotFrontage',
 'LotArea',
 'Alley',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'MiscVal',
 'YrSold']

Define the functions that add engineered features

In [9]:
def add_year_since_feature(df):
    """Add "years elapsed at sale" columns to ``df`` in place and return it.

    For each of the construction, remodel and garage build years, the new
    column holds ``YrSold`` minus that year. Negative differences are replaced
    by 0; missing values stay missing.
    """
    year_sources = {
        'year_since_built': 'YearBuilt',
        'year_since_remod': 'YearRemodAdd',
        'year_since_garage': 'GarageYrBlt',
    }
    for new_col, year_col in year_sources.items():
        elapsed = df['YrSold'] - df[year_col]
        # Floor at zero: a sale year earlier than the build/remodel year
        # would otherwise give a negative age.
        df[new_col] = elapsed.clip(lower=0)
    return df

In [10]:
def add_score_feature(df):
    """Add quality-times-condition score columns to ``df`` in place and return it.

    Creates ``overall_score``, ``exter_score``, ``bsmt_score`` and
    ``garage_score`` as the product of the matching ``*Qual`` and ``*Cond``
    columns.
    """
    # (new column prefix, source column stem)
    score_sources = (
        ('overall', 'Overall'),
        ('exter', 'Exter'),
        ('bsmt', 'Bsmt'),
        ('garage', 'Garage'),
    )
    for prefix, stem in score_sources:
        df[prefix + '_score'] = df[stem + 'Qual'] * df[stem + 'Cond']
    return df


In [11]:
def add_non_linear_transformed_features(df,cols):
    """Return a new frame: ``df`` plus squared, cubed and square-root columns.

    For every name in ``cols`` three columns are appended, in this order:
    ``<col>_squared``, ``<col>_cubed``, ``<col>_square_root``. The input frame
    is not modified.
    """
    transforms = (('_squared', 2), ('_cubed', 3), ('_square_root', 0.5))
    pieces = [df]
    for col in cols:
        source = df[col]
        for suffix, exponent in transforms:
            pieces.append((source ** exponent).rename(col + suffix))
    # A single concat at the end keeps the original columns first.
    return pd.concat(pieces, axis=1)

In [12]:
def add_price_comp_feature(train_, test_,comp_feature):
    """Attach the per-group median sale price as a ``<comp_feature>_comp`` column.

    The median of ``SalePrice`` is computed per value of ``comp_feature`` using
    the rows of ``train_`` only, then left-merged onto both frames.

    Parameters
    ----------
    train_ : DataFrame containing ``comp_feature`` and ``SalePrice``.
    test_ : DataFrame containing ``comp_feature``.
    comp_feature : name of the grouping column.

    Returns
    -------
    (train_, test_) : new frames with the extra column. Rows whose
    ``comp_feature`` value does not occur in ``train_`` get NaN.
    """
    # Group the frame that was passed in, not the notebook-level ``train``
    # global, so the result depends only on the arguments.
    temp = train_.groupby(comp_feature).agg({'SalePrice':'median'})
    temp.columns = [comp_feature+'_comp']
    train_ = train_.merge(temp, how='left', on=comp_feature)
    test_ = test_.merge(temp, how='left', on=comp_feature)
    return train_, test_


In [13]:
def add_price_comp_log_feature(train_, test_,comp_feature):
    """Attach the per-group median log sale price as ``<comp_feature>_log_comp``.

    The median of ``log(SalePrice)`` is computed per value of ``comp_feature``
    from ``train_`` only and left-merged onto both frames, which are returned
    as new objects ``(train_, test_)``.
    """
    with_log_price = train_.assign(log_SalePrice=np.log(train_['SalePrice']))
    group_medians = (
        with_log_price
        .groupby(comp_feature)['log_SalePrice']
        .median()
        .to_frame(comp_feature + '_log_comp')
    )
    merged_train = train_.merge(group_medians, how='left', on=comp_feature)
    merged_test = test_.merge(group_medians, how='left', on=comp_feature)
    return merged_train, merged_test

In [14]:
# Columns that receive squared / cubed / square-root expansions.
# The engineered names (year_since_*, *_score, *_comp, *_log_comp) must already
# exist on the frame by the time this list is used.
num_feat_test = [
    # raw numeric / ordinal-encoded columns
    'GrLivArea', 'LotFrontage', 'LotArea', 'Alley',
    'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'HeatingQC', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
    'Fireplaces', 'FireplaceQu',
    'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
    'GarageQual', 'GarageCond', 'PavedDrive',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'PoolQC', 'MiscVal', 'YrSold',
    # engineered age and score columns
    'year_since_built', 'year_since_remod',
    'overall_score', 'exter_score', 'bsmt_score', 'garage_score',
    # price-comparison columns: all "_comp" names first, then all "_log_comp"
    *[
        base + suffix
        for suffix in ('_comp', '_log_comp')
        for base in ('Neighborhood', 'GarageCars', 'BldgType', 'MSZoning')
    ],
]

In [15]:
# Columns for which a train-set median price ("<col>_comp") and median log
# price ("<col>_log_comp") are merged onto both frames.
comp_features = [
    'Neighborhood', 'GarageCars', 'BldgType', 'MSZoning', 'HouseStyle',
    'FullBath', 'MSSubClass', 'LotShape', 'LotConfig', 'Condition1',
    'MasVnrType', 'Foundation', 'BsmtFinType1', 'GarageType', 'Fence',
]

# NOTE(review): not idempotent -- running this on frames that already carry
# these columns should make merge emit suffixed duplicates; re-run from the
# data-loading cell instead.

# All raw-price comps are added first, then all log-price comps, which keeps
# the resulting column order the same as adding them one call at a time.
for comp_feature in comp_features:
    train, test = add_price_comp_feature(train, test, comp_feature)

for comp_feature in comp_features:
    train, test = add_price_comp_log_feature(train, test, comp_feature)






In [16]:
# Engineered features for the training frame: ages at sale, quality*condition
# scores, then power/root expansions of every column in num_feat_test.
train = (
    train
    .pipe(add_year_since_feature)
    .pipe(add_score_feature)
    .pipe(add_non_linear_transformed_features, num_feat_test)
)

In [17]:
# Same feature steps, in the same order, for the held-out frame.
test = (
    test
    .pipe(add_year_since_feature)
    .pipe(add_score_feature)
    .pipe(add_non_linear_transformed_features, num_feat_test)
)

In [18]:
# Median price per overall_score value; this has to run after the score
# features were added, since overall_score does not exist before that.
train, test = add_price_comp_feature(train, test,'overall_score')

In [19]:
# Rebuild the numeric column list now that engineered columns exist; it is
# used further down to label the model coefficients.
num_cols = train.select_dtypes(['float64','int64']).columns.to_list()
num_cols.remove('SalePrice')

In [20]:

# with_mean=False: features are scaled but not centred.
scaler = StandardScaler(with_mean=False)
lasso = Lasso()

# Design matrix: every column except the target.
X = train.drop(['SalePrice'],axis=1)
# One-hot encode the categorical columns; all other columns pass through
# unchanged. handle_unknown='ignore' keeps transform from raising on
# categories that were not seen during fit.
transformer = ColumnTransformer([("Cat", 
                                  OneHotEncoder(handle_unknown = 'ignore'), 
                                  cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)
X = scaler.fit_transform(X)
# The model is fit on log(SalePrice), so scores are on the log scale.
y = np.log(train['SalePrice'])

# Grid Search set up.
# NOTE(review): the encoder and scaler above are fit on the full training set
# before cross-validation, so validation-fold rows influence the preprocessing.
# Wrapping the steps in a Pipeline inside GridSearchCV would avoid that.

# 20 evenly spaced alpha values running from 0.01 down to 0.005.
alphas = np.linspace(.01, .005, 20)
tuned_parameters = [{'alpha': alphas}]
clf = GridSearchCV(lasso, tuned_parameters, cv=3)
clf.fit(X, y)

GridSearchCV(cv=3, estimator=Lasso(),
             param_grid=[{'alpha': array([0.01      , 0.00973684, 0.00947368, 0.00921053, 0.00894737,
       0.00868421, 0.00842105, 0.00815789, 0.00789474, 0.00763158,
       0.00736842, 0.00710526, 0.00684211, 0.00657895, 0.00631579,
       0.00605263, 0.00578947, 0.00552632, 0.00526316, 0.005     ])}])

In [21]:
clf.best_score_

0.9399658873687192

In [22]:
# Apply the encoder and scaler fitted on the training data (transform only,
# no refit) to the held-out frame.
X_tst = test.drop(['SalePrice'],axis=1)
X_tst = transformer.transform(X_tst)
X_tst = scaler.transform(X_tst)
y_tst = np.log(test['SalePrice'])

# Held-out score of the grid-search result, measured against log(SalePrice).
clf.score(X_tst,y_tst)

0.9157672483579035

In [23]:
# Recover the names of the one-hot encoded columns. get_feature_names was
# removed from scikit-learn in 1.2, so prefer get_feature_names_out and only
# fall back to the old method on versions that lack it.
encoder = transformer.named_transformers_['Cat']
if hasattr(encoder, 'get_feature_names_out'):
    columns_transformed = encoder.get_feature_names_out(cat_feats)
else:
    columns_transformed = encoder.get_feature_names(input_features=cat_feats)
# NOTE(review): assumes the passthrough columns are exactly num_cols, in the
# same order -- the DataFrame constructor below raises if the lengths differ.
new_columns = list(columns_transformed)+num_cols

coef_df = pd.DataFrame({'features':new_columns,'coefs':clf.best_estimator_.coef_})
# Keep only features the Lasso retained; .copy() so the column added next is
# written to an independent frame rather than a slice of the original.
coef_df = coef_df[coef_df['coefs']!=0].copy()
coef_df['coefs_abs'] = coef_df['coefs'].abs()
# Largest-magnitude coefficients first.
coef_df = coef_df.sort_values('coefs_abs',ascending=False).reset_index(drop=True)
coef_df


Unnamed: 0,features,coefs,coefs_abs
0,GrLivArea_square_root,0.1228605,0.1228605
1,overall_score_square_root,0.06252124,0.06252124
2,year_since_built_square_root,-0.04600754,0.04600754
3,TotalBsmtSF,0.03393217,0.03393217
4,LotArea_square_root,0.02856147,0.02856147
5,Neighborhood_log_comp,0.02587961,0.02587961
6,OverallQual_cubed,0.01893318,0.01893318
7,BsmtFinSF1_square_root,0.01762729,0.01762729
8,MSZoning_log_comp,0.01464141,0.01464141
9,KitchenQual_cubed,0.01414586,0.01414586


In [24]:
coef_df.head(20)

Unnamed: 0,features,coefs,coefs_abs
0,GrLivArea_square_root,0.12286,0.12286
1,overall_score_square_root,0.062521,0.062521
2,year_since_built_square_root,-0.046008,0.046008
3,TotalBsmtSF,0.033932,0.033932
4,LotArea_square_root,0.028561,0.028561
5,Neighborhood_log_comp,0.02588,0.02588
6,OverallQual_cubed,0.018933,0.018933
7,BsmtFinSF1_square_root,0.017627,0.017627
8,MSZoning_log_comp,0.014641,0.014641
9,KitchenQual_cubed,0.014146,0.014146


In [25]:
# helper.lasso_model_score(
#         .01,
#     train, 
#     test,
#     'SalePrice', 
#     cat_feats,
#     drop_cols = ['SalePrice'],
# )

In [26]:
#squared

# bsmt_score

In [27]:
# cubed

# bsmt_score
# OverallQual
# ExterQual
# BsmtQual
# GarageCond
# KitchenQual

In [28]:
# square_root

# overall_score
# OverallQual
# OverallCond
# ExterQual
# FireplaceQu

In [29]:
# helper.plot_lasso_grid_search(    
#     0.0005, .005, 20,
#     train, 
#     test,
#     'SalePrice', 
#     cat_feats,
#     drop_cols = ['SalePrice'],
#                              )

In [30]:
# helper.lasso_train_test_graph(
#     0.0005, .005, 50,
#     train, 
#     test,
#     'SalePrice', 
#     cat_feats,
#     drop_cols = ['SalePrice'],
#                )

In [31]:
# helper.lasso_model_score(
#         .005,
#     train, 
#     test,
#     'SalePrice', 
#     cat_feats,
#     drop_cols = ['SalePrice'],
# )