In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [57]:
data = pd.read_csv('../communal/Ames_Housing_Price_Data_cleaned_2.csv', header = [0])

pd.set_option("display.max_columns", None)

data.drop(['PID', 'lot_bucket', 'mean_LotFrontage', 'Prop_Addr'], axis = 1, inplace = True)

In [39]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

## Lasso with all features

In [25]:
y = data['SalePrice'].apply(lambda x: np.log(x))

In [26]:
x = data.drop(['SalePrice'], axis = 1)

In [27]:
x = pd.get_dummies(x)

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [30]:
x_scaled = MinMaxScaler()
x_train = x_scaled.fit_transform(x_train)

In [33]:
lasso_model = linear_model.Lasso()

params = {'alpha': np.linspace(1e-6, 100, 1000)}

grid_search = GridSearchCV(linear_model.Lasso(max_iter = 10000),
                          params,
                          cv = 10)

In [34]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=Lasso(max_iter=10000),
             param_grid={'alpha': array([1.00000000e-06, 1.00101099e-01, 2.00201198e-01, 3.00301297e-01,
       4.00401396e-01, 5.00501495e-01, 6.00601595e-01, 7.00701694e-01,
       8.00801793e-01, 9.00901892e-01, 1.00100199e+00, 1.10110209e+00,
       1.20120219e+00, 1.30130229e+00, 1.40140239e+00, 1.50150249e+00,
       1.60160259e+00, 1.70170268e+00, 1....
       9.76976977e+01, 9.77977978e+01, 9.78978979e+01, 9.79979980e+01,
       9.80980981e+01, 9.81981982e+01, 9.82982983e+01, 9.83983984e+01,
       9.84984985e+01, 9.85985986e+01, 9.86986987e+01, 9.87987988e+01,
       9.88988989e+01, 9.89989990e+01, 9.90990991e+01, 9.91991992e+01,
       9.92992993e+01, 9.93993994e+01, 9.94994995e+01, 9.95995996e+01,
       9.96996997e+01, 9.97997998e+01, 9.98998999e+01, 1.00000000e+02])})

In [35]:
grid_search.score(x_train, y_train)

0.9470027502469196

In [37]:
x_test = x_scaled.fit_transform(x_test)

In [38]:
grid_search.score(x_test, y_test)

0.8737392196604432

In [41]:
coefs = grid_search.best_estimator_.coef_

In [42]:
coef_list = list(zip(coefs, x.columns))

In [45]:
sorted(coef_list, key = lambda x: abs(x[0]), reverse = True)

[(1.0227896111702939, 'GrLivArea'),
 (-0.9322499107188041, 'PoolArea'),
 (-0.8494515184180251, 'Condition2_PosN'),
 (0.5216894867315413, 'LotArea'),
 (-0.5191034472027893, 'PoolQC_None'),
 (0.44778432031299154, 'OverallQual'),
 (0.4260191493288459, 'Neighborhood_GrnHill'),
 (0.4248753206780711, 'BsmtQual_Po'),
 (-0.3892061358564747, 'MiscFeature_TenC'),
 (-0.3436225990878493, 'GarageYrBlt_1895.0'),
 (0.334890226665268, 'OverallCond'),
 (0.33148884223217195, 'Exterior1st_PreCast'),
 (-0.32385325332055465, 'GarageQual_Po'),
 (-0.3148420715687064, 'Functional_Sal'),
 (0.29976473664976544, 'PoolQC_Fa'),
 (-0.26197045389252677, 'MSZoning_C (all)'),
 (-0.2508344458861888, 'PoolQC_Ex'),
 (0.2452632133272495, 'GarageQual_Ex'),
 (0.23427507101708944, 'PoolQC_Gd'),
 (0.2328740031165434, 'TotalBsmtSF'),
 (0.21945668402789476, 'SaleType_Con'),
 (0.21869446051988103, 'GarageYrBlt_2010.0'),
 (0.20390691522261306, 'YearBuilt'),
 (-0.19618975332422278, 'MiscFeature_Othr'),
 (-0.18971899733436953, 'Gar

## Lasso with feature selection

In [58]:
data.drop(['YearRemodAdd', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2',
           'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
           'TotRmsAbvGrd', 'GarageCars'], axis = 1, inplace = True)
data['Total_Porch'] = data['OpenPorchSF'] + data['EnclosedPorch'] + \
                      data['3SsnPorch'] + data['ScreenPorch']
data.drop(['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'],
           axis = 1, inplace = True)

In [59]:
data

Unnamed: 0,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,RoofStyle,RoofMatl,Exterior1st,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,lat,long,Total_Porch
0,856,126000,30,RL,64.9,7890,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,Norm,Norm,1Fam,1Story,6,6,1939,Gable,CompShg,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,Rec,Unf,856.0,GasA,TA,Y,SBrkr,1.0,0.0,1,0,2,1,TA,Typ,1,Gd,Detchd,1939.0,Unf,399.0,TA,TA,Y,0,0,,,,0,3,2010,WD,Normal,42.017780,-93.651452,166
1,1049,139500,120,RL,42.0,4235,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,TwnhsE,1Story,5,5,1984,Gable,CompShg,HdBoard,BrkFace,149.0,Gd,TA,CBlock,Gd,TA,Mn,GLQ,ALQ,1049.0,GasA,TA,Y,SBrkr,1.0,0.0,2,0,2,1,Gd,Typ,0,,Attchd,1984.0,Fin,266.0,TA,TA,Y,0,0,,,,0,2,2009,WD,Normal,42.024697,-93.664186,105
2,1049,139500,120,RL,42.0,4235,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,TwnhsE,1Story,5,5,1984,Gable,CompShg,HdBoard,BrkFace,149.0,Gd,TA,CBlock,Gd,TA,Mn,GLQ,ALQ,1049.0,GasA,TA,Y,SBrkr,1.0,0.0,2,0,2,1,Gd,Typ,0,,Attchd,1984.0,Fin,266.0,TA,TA,Y,0,0,,,,0,2,2009,WD,Normal,42.024697,-93.664186,105
3,1001,124900,30,C (all),60.0,6060,Pave,,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,Norm,1Fam,1Story,5,9,1930,Hip,CompShg,MetalSd,,0.0,Gd,TA,BrkTil,TA,TA,No,ALQ,Unf,837.0,GasA,Ex,Y,SBrkr,0.0,0.0,1,0,2,1,Gd,Typ,0,,Detchd,1930.0,Unf,216.0,TA,Po,N,154,0,,,,0,11,2007,WD,Normal,42.021389,-93.614855,128
4,1039,114000,70,RL,80.0,8146,Pave,,Reg,Lvl,AllPub,Corner,Gtl,OldTown,Norm,Norm,1Fam,2Story,4,8,1900,Gable,CompShg,MetalSd,,0.0,Gd,Gd,BrkTil,Fa,TA,No,Unf,Unf,405.0,GasA,Gd,Y,SBrkr,0.0,0.0,1,0,2,1,TA,Typ,0,,Detchd,1940.0,Unf,281.0,TA,TA,N,0,0,,,,0,5,2009,WD,Normal,42.038070,-93.612065,279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2619,952,121000,30,RL,68.7,8854,Pave,,Reg,Lvl,AllPub,Inside,Gtl,BrkSide,Norm,Norm,1Fam,1.5Unf,6,6,1916,Gable,CompShg,Wd Sdng,,0.0,TA,TA,BrkTil,TA,TA,No,Unf,Unf,952.0,Grav,Fa,N,FuseF,0.0,0.0,1,0,2,1,Fa,Typ,1,Gd,Detchd,1916.0,Unf,192.0,Fa,Po,P,0,0,,,,0,5,2009,WD,Normal,42.031937,-93.626510,138
2620,1733,139600,20,RL,87.2,13680,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Edwards,Norm,Norm,1Fam,1Story,3,5,1955,Hip,CompShg,BrkFace,,0.0,TA,TA,Slab,,,,,,0.0,GasA,Ex,Y,FuseA,0.0,0.0,2,0,4,1,TA,Min2,1,Gd,Attchd,1955.0,Unf,452.0,TA,TA,Y,0,0,,,,0,6,2009,WD,Normal,42.027798,-93.666899,0
2621,2002,145000,90,RH,82.0,6270,Pave,,Reg,HLS,AllPub,Inside,Gtl,Crawfor,Norm,Norm,Duplex,2Story,5,6,1949,Gable,CompShg,MetalSd,,0.0,TA,TA,CBlock,TA,TA,No,BLQ,Unf,1001.0,GasA,TA,N,FuseA,0.0,0.0,2,0,4,2,TA,Typ,0,,2Types,1949.0,Unf,871.0,TA,TA,Y,0,0,,,,0,8,2007,WD,Normal,42.019944,-93.643206,0
2622,1842,217500,60,RL,68.7,8826,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2000,Gable,CompShg,VinylSd,BrkFace,144.0,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,985.0,GasA,Ex,Y,SBrkr,1.0,0.0,2,1,3,1,Gd,Typ,1,TA,Attchd,2000.0,Fin,486.0,TA,TA,Y,193,0,,,,0,7,2007,WD,Normal,42.016826,-93.690382,96


In [60]:
y = data['SalePrice'].apply(lambda x: np.log(x))

In [61]:
x = data.drop(['SalePrice'], axis = 1)
x = pd.get_dummies(x)

In [62]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [63]:
x_scaled = MinMaxScaler()
x_train = x_scaled.fit_transform(x_train)

In [64]:
lasso_model = linear_model.Lasso()

params = {'alpha': np.linspace(1e-6, 100, 1000)}

grid_search = GridSearchCV(linear_model.Lasso(max_iter = 10000),
                          params,
                          cv = 10)

In [65]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=Lasso(max_iter=10000),
             param_grid={'alpha': array([1.00000000e-06, 1.00101099e-01, 2.00201198e-01, 3.00301297e-01,
       4.00401396e-01, 5.00501495e-01, 6.00601595e-01, 7.00701694e-01,
       8.00801793e-01, 9.00901892e-01, 1.00100199e+00, 1.10110209e+00,
       1.20120219e+00, 1.30130229e+00, 1.40140239e+00, 1.50150249e+00,
       1.60160259e+00, 1.70170268e+00, 1....
       9.76976977e+01, 9.77977978e+01, 9.78978979e+01, 9.79979980e+01,
       9.80980981e+01, 9.81981982e+01, 9.82982983e+01, 9.83983984e+01,
       9.84984985e+01, 9.85985986e+01, 9.86986987e+01, 9.87987988e+01,
       9.88988989e+01, 9.89989990e+01, 9.90990991e+01, 9.91991992e+01,
       9.92992993e+01, 9.93993994e+01, 9.94994995e+01, 9.95995996e+01,
       9.96996997e+01, 9.97997998e+01, 9.98998999e+01, 1.00000000e+02])})

In [66]:
grid_search.score(x_train, y_train)

0.9496280385644401

In [67]:
x_test = x_scaled.fit_transform(x_test)

In [68]:
grid_search.score(x_test, y_test)

0.8339111838624171

In [69]:
coefs = grid_search.best_estimator_.coef_

In [70]:
coef_list = list(zip(coefs, x.columns))

In [71]:
sorted(coef_list, key = lambda x: abs(x[0]), reverse = True)

[(1.2091651662270784, 'GrLivArea'),
 (-0.7423038158168658, 'PoolArea'),
 (0.6995836157124519, 'LotArea'),
 (0.5922379582158414, 'Neighborhood_GrnHill'),
 (-0.5261602078728371, 'PoolQC_None'),
 (0.4400515052562592, 'OverallQual'),
 (-0.41867325257045424, 'GarageCond_Ex'),
 (-0.41811688553999415, 'GarageQual_Po'),
 (0.4095385523292571, 'OverallCond'),
 (-0.3980979854326751, 'Condition2_RRAe'),
 (-0.37144466345037397, 'Functional_Sal'),
 (0.36720004183360805, 'GarageQual_Ex'),
 (-0.34223069617233914, 'MiscFeature_TenC'),
 (0.3167413410944346, 'RoofStyle_Shed'),
 (0.3134303754519263, 'TotalBsmtSF'),
 (0.2863204007698607, 'Exterior1st_PreCast'),
 (-0.25645125242791794, 'PoolQC_Ex'),
 (0.2303412854490046, 'GarageYrBlt_1927.0'),
 (0.21163543772674237, 'RoofMatl_Membran'),
 (0.21077631813398398, 'GarageCond_Po'),
 (0.2077977997027606, 'KitchenQual_Po'),
 (0.20739041579984985, 'GarageYrBlt_1943.0'),
 (0.2026323355110993, 'YearBuilt'),
 (-0.2010275734573655, 'GarageYrBlt_None'),
 (-0.19952234497