In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
import helper
style.use('fivethirtyeight')

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

## Data processing

In [3]:
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0,low_memory = False)


train, test = helper.data_processing_wrapper(housing,
                                               num_to_cat_list = ['MSSubClass'])


In [4]:
cat_feats = train.select_dtypes(['object','bool']).columns.to_list()


In [5]:
num_cols = train.select_dtypes(['float64','int64']).columns.to_list()
num_cols.remove('SalePrice')


## Simple linear model

In [6]:
def simple_linear_model_score(train, test, cols, target):
    ols = linear_model.LinearRegression()
    X = train[cols]
    cat_feats = X.select_dtypes(['object','bool']).columns.to_list()
    transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
    X = transformer.fit_transform(X)
    y = np.log(train[target])
    
    ols.fit(X, y)
    train_score = ols.score(X, y)
    
    X_tst = test[cols]
    X_tst = transformer.transform(X_tst)
    y_tst = np.log(test[target])
    test_score = ols.score(X_tst, y_tst)
    
    return train_score, test_score

The kitchen sink model of every feature over learns. And has a negative test score

In [7]:
simple_linear_model_score(train, test, cat_feats+num_cols, 'SalePrice')

(0.9577939177156539, 0.9049441938900812)

The numerical kitchen sink model is very successful

In [8]:
simple_linear_model_score(train, test, num_cols, 'SalePrice')

(0.9290445015300175, 0.8849212267144517)

The numerical kitchen sink model gets better when we selectively pick our favorite categorical features

In [9]:
fav_cat = ['YearBuilt', 'ExterQual', 'Neighborhood', 'KitchenQual']
simple_linear_model_score(train, test, num_cols+fav_cat, 'SalePrice')

(0.9418792481441322, 0.8992313787558742)

We can see that as we add more and more numerical features our model gets better.

In [10]:
k=100
cols = train.corr().nlargest(k, 'SalePrice')['SalePrice'].index
cols = cols[1:]
for num_features in range(1,len(cols)):
    print(cols[num_features])
    print(simple_linear_model_score(train, test, cols[1:1+num_features], 'SalePrice'))

GrLivArea
(0.5361242929283243, 0.5052486795731859)
ExterQual
(0.7034962809602388, 0.6644905975921633)
TotalBsmtSF
(0.7792598241555013, 0.7229657933399387)
KitchenQual
(0.7974247390402703, 0.7489550650148744)
1stFlrSF
(0.7975069805128663, 0.7497354661616471)
GarageArea
(0.8182544807598195, 0.779205489561503)
GarageCars
(0.8229613212881246, 0.7862240548493213)
BsmtQual
(0.8367316241545875, 0.8093688234853456)
YearBuilt
(0.8496799637760847, 0.8201252110298075)
FullBath
(0.8515427358211811, 0.8194532379214836)
GarageFinish
(0.8546721816101726, 0.8239315804336104)
FireplaceQu
(0.862791589110423, 0.8355690508933211)
MasVnrArea
(0.863014924192381, 0.8357178472479138)
TotRmsAbvGrd
(0.8634271653730479, 0.8341419606401707)
YearRemodAdd
(0.8695869148482882, 0.8386413512424243)
Fireplaces
(0.8735302259811796, 0.8403365999459429)
BsmtFinSF1
(0.8805143918287985, 0.8482424745087166)
HeatingQC
(0.8819825209355477, 0.8512038129217325)
BsmtExposure
(0.8837381955399547, 0.8524812209245907)
LotFrontage
(0

## Lasso

lasso with all columns 

In [11]:
def lasso_linear_model_score(train_, test_, alpha, target, 
                             categorical_features,
                             drop_cols = ['SalePrice', 'TotalBsmtSF']):
    scaler = StandardScaler(with_mean=False)
    ols = linear_model.LinearRegression()
    
    
    X = train_.drop(drop_cols,axis=1)
    transformer = ColumnTransformer([("Cat", 
                                      OneHotEncoder(handle_unknown = 'ignore'), 
                                      categorical_features)], remainder='passthrough')
    X = transformer.fit_transform(X)
    X = scaler.fit_transform(X)
    y = np.log(train_[target])
    lasso = linear_model.Lasso(alpha=alpha)
    selector = SelectFromModel(estimator=lasso)
    X = selector.fit_transform(X, y)
    
    ols.fit(X,y)
    train_score = ols.score(X,y)

    X_tst = test_.drop(drop_cols,axis=1)
    X_tst = transformer.transform(X_tst)
    X_tst = scaler.transform(X_tst)
    y_tst = np.log(test_[target])
    X_tst = selector.transform(X_tst)
    test_score = ols.score(X_tst,y_tst)
    
    
    feat_names = transformer.get_feature_names()
    mask = selector.get_support()
    lasso_feats = [a for a, b in zip(feat_names, mask) if b]
    
    return train_score, test_score,lasso_feats

##########

lasso_linear_model_score(train, test, 0.1, 'SalePrice', 
                             cat_feats,
                             drop_cols = ['SalePrice', 'TotalBsmtSF'])




(0.874653311752264,
 0.8379021385124785,
 ['GrLivArea',
  'OverallQual',
  'YearBuilt',
  'BsmtQual',
  '1stFlrSF',
  'KitchenQual',
  'GarageCars',
  'GarageArea'])

In [12]:
lasso_linear_model_score(train, test, 0.1, 'SalePrice', 
                             cat_feats,
                             drop_cols = ['SalePrice', 'TotalBsmtSF'])

(0.874653311752264,
 0.8379021385124785,
 ['GrLivArea',
  'OverallQual',
  'YearBuilt',
  'BsmtQual',
  '1stFlrSF',
  'KitchenQual',
  'GarageCars',
  'GarageArea'])

In [13]:
lasso_linear_model_score(train, test, 0.01, 'SalePrice', 
                             cat_feats,
                             drop_cols = ['SalePrice', 'TotalBsmtSF'])

(0.9460319639800638,
 0.9083007695396004,
 ['Cat__x0_160',
  'Cat__x0_30',
  'Cat__x1_C (all)',
  'Cat__x1_RM',
  'Cat__x8_ClearCr',
  'Cat__x8_Crawfor',
  'Cat__x8_Edwards',
  'Cat__x8_GrnHill',
  'Cat__x8_MeadowV',
  'Cat__x8_Somerst',
  'Cat__x8_StoneBr',
  'Cat__x9_Norm',
  'Cat__x11_1Fam',
  'Cat__x11_Twnhs',
  'Cat__x15_BrkFace',
  'Cat__x15_PreCast',
  'Cat__x18_PConc',
  'Cat__x22_N',
  'Cat__x24_Typ',
  'Cat__x25_Attchd',
  'Cat__x29_Normal',
  'GrLivArea',
  'LotFrontage',
  'LotArea',
  'OverallQual',
  'OverallCond',
  'YearBuilt',
  'YearRemodAdd',
  'MasVnrArea',
  'ExterQual',
  'BsmtQual',
  'BsmtExposure',
  'BsmtFinSF1',
  'BsmtFinSF2',
  'HeatingQC',
  '1stFlrSF',
  'BsmtFullBath',
  'KitchenQual',
  'Fireplaces',
  'FireplaceQu',
  'GarageFinish',
  'GarageCars',
  'GarageArea',
  'PavedDrive',
  'ScreenPorch'])

In [14]:
lasso_linear_model_score(train, test, 0.001, 'SalePrice', 
                             cat_feats,
                             drop_cols = ['SalePrice', 'TotalBsmtSF'])

(0.9561244295278687,
 0.9063054954999941,
 ['Cat__x0_150',
  'Cat__x0_160',
  'Cat__x0_20',
  'Cat__x0_30',
  'Cat__x0_45',
  'Cat__x0_50',
  'Cat__x0_60',
  'Cat__x0_70',
  'Cat__x0_85',
  'Cat__x0_90',
  'Cat__x1_A (agr)',
  'Cat__x1_C (all)',
  'Cat__x1_I (all)',
  'Cat__x1_RH',
  'Cat__x1_RL',
  'Cat__x1_RM',
  'Cat__x2_Grvl',
  'Cat__x3_IR2',
  'Cat__x3_IR3',
  'Cat__x4_Bnk',
  'Cat__x4_HLS',
  'Cat__x4_Low',
  'Cat__x6_Corner',
  'Cat__x6_CulDSac',
  'Cat__x6_FR2',
  'Cat__x6_FR3',
  'Cat__x7_Mod',
  'Cat__x8_BrDale',
  'Cat__x8_BrkSide',
  'Cat__x8_ClearCr',
  'Cat__x8_CollgCr',
  'Cat__x8_Crawfor',
  'Cat__x8_Edwards',
  'Cat__x8_Greens',
  'Cat__x8_GrnHill',
  'Cat__x8_IDOTRR',
  'Cat__x8_Landmrk',
  'Cat__x8_MeadowV',
  'Cat__x8_NAmes',
  'Cat__x8_NWAmes',
  'Cat__x8_NoRidge',
  'Cat__x8_NridgHt',
  'Cat__x8_OldTown',
  'Cat__x8_SWISU',
  'Cat__x8_SawyerW',
  'Cat__x8_Somerst',
  'Cat__x8_StoneBr',
  'Cat__x8_Veenker',
  'Cat__x9_Artery',
  'Cat__x9_Feedr',
  'Cat__x9_Norm',


In [17]:

lasso_scores_train = []

lasso_scores_test  = []

alphas = np.linspace(0.01, 1, 100)

for alpha in alphas:
    print(alpha)
    try:
        train_score, test_score,lasso_feats = lasso_linear_model_score(
            train, 
           test,
           alpha, 
           'SalePrice', 
           cat_feats,
           drop_cols = ['SalePrice', 'TotalBsmtSF']
        )
        lasso_scores_train.append(train_score)
        lasso_scores_test.append(test_score)
    except:
    lasso_scores_train.append(train_score)
    lasso_scores_test.append(test_score)

lasso_scores_train = np.array(lasso_scores_train) 
lasso_scores_test  = np.array(lasso_scores_test)

0.01
0.02
0.03
0.04
0.05
0.060000000000000005
0.06999999999999999
0.08
0.09
0.09999999999999999
0.11
0.12
0.13
0.14
0.15000000000000002
0.16
0.17
0.18000000000000002
0.19
0.2
0.21000000000000002
0.22
0.23
0.24000000000000002
0.25
0.26
0.27
0.28
0.29000000000000004
0.3
0.31
0.32


  warn("No features were selected: either the data is"


ValueError: Found array with 0 feature(s) (shape=(1871, 0)) while a minimum of 1 is required.

In [18]:
train_score, test_score,lasso_feats = lasso_linear_model_score(
    train, 
   test,
   alpha, 
   'SalePrice', 
   cat_feats,
   drop_cols = ['SalePrice', 'TotalBsmtSF']
)

  warn("No features were selected: either the data is"


ValueError: Found array with 0 feature(s) (shape=(1871, 0)) while a minimum of 1 is required.

In [16]:
alphas

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
       0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
       0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55,
       0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
       0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77,
       0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
       0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99,
       1.  ])

In [None]:
plt.plot(alphas, lasso_scores_train, label=r'$train\ R^2$')
plt.plot(alphas, lasso_scores_test, label=r'$test\ R^2$')
plt.title(r'Lasso Train-Test $R^2$ Comparison')
lasso_underfit = lasso_scores_train < lasso_scores_test
last_underfit  = np.max(alphas[lasso_underfit])
plt.axvline(last_underfit, linestyle='--', color='g', label='optimal lambda', alpha=0.4)
plt.legend(loc=1)
plt.xlabel(r'hyperparameter $\lambda$')
plt.ylabel(r'$R^2$')