In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
import sys

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0,'../charles/helpers')
import helper
import joblib

In [50]:
data = pd.read_csv('../communal/Ames_Housing_Price_Data_cleaned_2.csv', header = [0])

pd.set_option("display.max_columns", None)

data.drop(['PID', 'lot_bucket', 'mean_LotFrontage', 'Prop_Addr', 'GarageYrBlt', 'lat', 'long'], axis = 1, inplace = True)

data.drop(['YearRemodAdd', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2',
           'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
           'TotRmsAbvGrd', 'GarageCars'], axis = 1, inplace = True)
data['Total_Porch'] = data['OpenPorchSF'] + data['EnclosedPorch'] + \
                      data['3SsnPorch'] + data['ScreenPorch']
data.drop(['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'],
           axis = 1, inplace = True)

In [51]:
y = data['SalePrice'].apply(lambda x: np.log(x))
x = data.drop(['SalePrice'], axis = 1)
x = pd.get_dummies(x, drop_first = True)

In [4]:
np.random.seed(10)
x_train, x_test, y_train, y_test = train_test_split(x,y)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

params = {'alpha': np.linspace(1e-7,1e-3,5000)}

lasso_gs = GridSearchCV(Lasso(max_iter = 500000),param_grid=params, cv = 5)

In [5]:
lasso_gs = helper.runGridSearch(lasso_gs, x_train, x_test, y_train, y_test, 'lasso_all', 'all features')

joblib imported
change made


In [6]:
best_alpha = lasso_gs.best_estimator_.alpha

In [9]:
train_score = lasso_gs.score(x_train, y_train)
train_score

0.9321263141199571

In [7]:
best_alpha

0.00016271626325265053

In [10]:
test_score = lasso_gs.score(x_test, y_test)
test_score

0.9324663984703787

In [11]:
lasso_gs_df = pd.DataFrame(x_train, columns = x.columns)

In [12]:
coefs = lasso_gs.best_estimator_.coef_
coef_list = list(zip(coefs, x.columns))
sorted_coef_list = sorted(coef_list, key = lambda x: abs(x[0]), reverse = True)

In [42]:
sorted_coef_list

[(1.147970286814866, 'GrLivArea'),
 (0.4824339476796453, 'OverallQual'),
 (-0.4062136528465338, 'Condition2_PosN'),
 (0.3884218476419442, 'OverallCond'),
 (0.37864726014202876, 'Neighborhood_GrnHill'),
 (0.3129003099725433, 'TotalBsmtSF'),
 (0.22178498063690183, 'YearBuilt'),
 (0.1775419286786149, 'LotArea'),
 (0.1733904334117891, 'GarageArea'),
 (-0.15202287881449397, 'Functional_Maj2'),
 (-0.15011758943749773, 'SaleType_Oth'),
 (-0.12944707994430363, 'MSZoning_C (all)'),
 (-0.12098438577461096, 'Neighborhood_MeadowV'),
 (0.1206261444327479, 'SaleCondition_Partial'),
 (0.11588865995777407, 'Total_Porch'),
 (-0.11077516427539928, 'BldgType_Twnhs'),
 (0.10903554359077493, 'SaleCondition_Alloca'),
 (0.10802260786348304, 'Neighborhood_StoneBr'),
 (0.1035816046001243, 'BsmtFullBath'),
 (0.094892293294999, 'Neighborhood_Crawfor'),
 (0.0945190695585236, 'Neighborhood_Somerst'),
 (-0.09386958341284432, 'KitchenQual_Fa'),
 (0.09099138055491214, 'SaleCondition_Normal'),
 (0.08897835661542626, '

In [13]:
top_10_features = [sorted_coef_list[i][1] for i in list(range(10))]
top_10_features

['GrLivArea',
 'OverallQual',
 'Condition2_PosN',
 'OverallCond',
 'Neighborhood_GrnHill',
 'TotalBsmtSF',
 'YearBuilt',
 'LotArea',
 'GarageArea',
 'Functional_Maj2']

In [14]:
x_train_df = pd.DataFrame(x_train, columns = x.columns)

In [15]:
x_test_df = pd.DataFrame(x_test, columns = x.columns)

In [16]:
def sub_lists (lst):
    lists = [[]]
    for i in range(len(lst) + 1):
        for j in range(i):
            lists.append(lst[j:i])
    return lists

In [20]:
top_10_subsets = sub_lists(top_10_features)[1:]

In [21]:
subsets = []
train_scores = []
test_scores = []

In [24]:
for combo in top_10_subsets:
    lasso_model = linear_model.Lasso()
    params = {'alpha': [best_alpha]}
    grid_search = GridSearchCV(linear_model.Lasso(max_iter = 500000), params, cv = 5)
    grid_search.fit(x_train_df[combo], y_train)
    train_score = grid_search.score(x_train_df[combo], y_train)
    train_scores.append(train_score)
    grid_search.fit(x_test_df[combo], y_test)
    test_score = grid_search.score(x_test_df[combo], y_test)
    test_scores.append(test_score)
    subsets.append(combo)

In [25]:
subset_scores = list(zip(subsets, train_scores, test_scores))
subset_scores_sorted = sorted(subset_scores, key = lambda x: x[2], reverse = True)

In [26]:
subset_scores_sorted

[(['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF',
   'YearBuilt',
   'LotArea',
   'GarageArea'],
  0.07319532618911795,
  0.8978284642964416),
 (['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF',
   'YearBuilt',
   'LotArea',
   'GarageArea',
   'Functional_Maj2'],
  0.39661820860071473,
  0.8978284642964416),
 (['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF',
   'YearBuilt',
   'LotArea'],
  0.3468915880493626,
  0.8900943656700789),
 (['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF',
   'YearBuilt'],
  0.4128569521292931,
  0.8829977842012982),
 (['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF'],
  0.0015690572823721105,
  0.8325621762458021),
 (['Over

In [33]:
top_10_subset_list = []
for i in range(len(top_10_features)):
    top_10_subset_list.append(top_10_features[:i+1])

In [38]:
subsets = []
train_scores = []
test_scores = []
alphas = []

In [39]:
for subset in top_10_subset_list:
    lasso_model = linear_model.Lasso()
    params = {'alpha': np.linspace(1e-7,1e-3,5000)}
    grid_search = GridSearchCV(linear_model.Lasso(max_iter = 500000), params, cv = 5)
    grid_search.fit(x_train_df[subset], y_train)
    train_score = grid_search.score(x_train_df[subset], y_train)
    train_scores.append(train_score)
    grid_search.fit(x_test_df[subset], y_test)
    test_score = grid_search.score(x_test_df[subset], y_test)
    test_scores.append(test_score)
    subsets.append(subset)
    alphas.append(grid_search.best_estimator_.alpha)

In [40]:
subset_scores = list(zip(subsets, train_scores, test_scores, alphas))
subset_scores_sorted = sorted(subset_scores, key = lambda x: x[2], reverse = True)

In [41]:
subset_scores_sorted

[(['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF',
   'YearBuilt',
   'LotArea',
   'GarageArea'],
  0.8765215415035408,
  0.8965088110929862,
  0.0006373637327465493),
 (['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF',
   'YearBuilt',
   'LotArea',
   'GarageArea',
   'Functional_Maj2'],
  0.8768107237399386,
  0.8965088110929862,
  0.0006373637327465493),
 (['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF',
   'YearBuilt',
   'LotArea'],
  0.87039887377645,
  0.8880177055456079,
  0.0007899789957991598),
 (['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Neighborhood_GrnHill',
   'TotalBsmtSF',
   'YearBuilt'],
  0.8618939930785323,
  0.8830348512763158,
  6.950694138827766e-05),
 (['GrLivArea',
   'OverallQual',
   'Condition2_PosN',
   'OverallCond',
   'Ne

In [52]:
train_scores = []
test_scores = []
alphas = []
feature_positions = {}

In [53]:
for i in range(20):
    np.random.seed(i)
    x_train, x_test, y_train, y_test = train_test_split(x,y)
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    params = {'alpha': np.linspace(1e-7,1e-3,5000)}
    model = GridSearchCV(Lasso(max_iter = 500000), params, cv = 5)
    model.fit(x_train, y_train)
    train_score = model.score(x_train, y_train)
    train_scores.append(train_score)
    model.fit(x_test, y_test)
    test_score = model.score(x_test, y_test)
    test_scores.append(test_score)
    alphas.append(model.best_estimator_.alpha)
    coefs = model.best_estimator_.coef_
    coef_list = list(zip(coefs, x.columns))
    sorted_coef_list = sorted(coef_list, key = lambda x: abs(x[0]), reverse = True)
    for i in range(len(sorted_coef_list)):
        feature = sorted_coef_list[i][1]
        if feature in feature_positions.keys():
            feature_positions[feature] += i
        else:
            feature_positions[feature] = i

In [66]:
feature_positions_sorted = sorted(feature_positions.items(), key=lambda item: item[1])
feature_positions_sorted

[('GrLivArea', 0),
 ('OverallQual', 23),
 ('TotalBsmtSF', 56),
 ('OverallCond', 60),
 ('YearBuilt', 127),
 ('GarageArea', 136),
 ('BsmtFullBath', 286),
 ('Neighborhood_Crawfor', 326),
 ('CentralAir_Y', 332),
 ('Neighborhood_Somerst', 381),
 ('BldgType_Twnhs', 469),
 ('BsmtExposure_Gd', 512),
 ('Neighborhood_Edwards', 622),
 ('KitchenQual_TA', 624),
 ('Neighborhood_MeadowV', 654),
 ('Condition1_Norm', 686),
 ('Neighborhood_NridgHt', 695),
 ('Fireplaces', 717),
 ('MSZoning_RL', 722),
 ('HeatingQC_TA', 785),
 ('Total_Porch', 808),
 ('BsmtFinType1_Unf', 827),
 ('BsmtQual_TA', 879),
 ('Foundation_PConc', 886),
 ('MSSubClass', 893),
 ('BldgType_Duplex', 939),
 ('Exterior1st_BrkFace', 986),
 ('MSZoning_C (all)', 1018),
 ('PavedDrive_Y', 1102),
 ('GarageType_Attchd', 1110),
 ('WoodDeckSF', 1120),
 ('Functional_Typ', 1138),
 ('BsmtQual_Gd', 1177),
 ('BsmtExposure_No', 1181),
 ('MasVnrArea', 1195),
 ('HalfBath', 1197),
 ('Neighborhood_StoneBr', 1213),
 ('BsmtFinType1_LwQ', 1257),
 ('Neighborhood

In [55]:
train_scores

[0.9295974445056083,
 0.9394254409413796,
 0.9397181243760504,
 0.9185843886080564,
 0.9444995296785396,
 0.9415599323734648,
 0.9364578661484178,
 0.9300814575064233,
 0.9387728571777866,
 0.9269279215371702,
 0.9321263141199571,
 0.928402500654327,
 0.9196502379893325,
 0.9293498012278066,
 0.9368400718104588,
 0.9264309883791253,
 0.9429262774189149,
 0.9366886924755927,
 0.9305201567851774,
 0.9281600333117719]

In [56]:
test_scores

[0.9495550992458056,
 0.9474606719251077,
 0.9182210236670545,
 0.9492983192815208,
 0.9090340105599725,
 0.9373339068082505,
 0.936353458007518,
 0.9384571721342032,
 0.9149427095587997,
 0.941521351798595,
 0.9420360713225522,
 0.9508503385815343,
 0.9429657647147932,
 0.9482824898289033,
 0.9452238304667182,
 0.9452757529280581,
 0.9138862644750525,
 0.9270338179750128,
 0.959360716307723,
 0.9350316218170271]

In [57]:
alphas

[0.00041094108821764354,
 0.0005043504300860171,
 0.001,
 0.0005291529105821164,
 0.001,
 0.0007235723544708941,
 0.0003717371674334867,
 0.0006457645729145828,
 0.001,
 0.0006455645529105821,
 0.0008421842168433686,
 0.0003695369473894779,
 0.000795779575915183,
 0.00027852784556911384,
 0.0005291529105821164,
 0.00044274426885377075,
 0.001,
 0.001,
 0.00019231922384476896,
 0.0003849384876975395]