In [3]:
import sklearn
import warnings
import numpy as np
import sklearn.model_selection as ms
from sklearn import linear_model
import pandas as pd
# Load the compiled dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='compiled_with_season.csv')

In [51]:
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns',None)

# Replace every missing value with 0 so the estimators below receive no NaNs.
data = data.fillna(0)

# The preview must be the cell's last expression to be rendered; placed in the
# middle of the cell its result was silently discarded. It now shows the frame
# after the fill.
data.head()

In [58]:
# Separate the predictors from the response: every column except
# 'LogSalePrice' is a feature, and 'LogSalePrice' itself is the target.
features = data.drop('LogSalePrice', axis=1)
target = data['LogSalePrice']

In [18]:
# Base Lasso estimator handed to the hyper-parameter searches below:
# normalisation switched on and a generous iteration cap.
lasso = linear_model.Lasso(normalize=True, max_iter=100000)
# Echo the configured estimator as the cell output.
lasso

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=100000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Exhaustive grid search for the Lasso penalty: alpha candidates are taken in
# steps of 1e-4 starting at 1e-4 (np.arange with stop 1e-3), scored by R^2.
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

param_grid = [{'alpha': list(np.arange(.0001, .001, .0001))}]

# fit() returns the fitted search object itself, so it can be bound directly.
para_search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    scoring='r2',
    return_train_score=True,
).fit(features, target)
               

In [37]:
# Randomized search for the Lasso penalty: 500 alpha candidates are sampled
# from a fine grid (steps of 1e-6 starting at 1e-6, np.arange with stop 1e-3)
# and scored by R^2.
from sklearn.model_selection import RandomizedSearchCV
param_distributions = [{'alpha':list(np.arange(.000001,.001,.000001))}]

# random_state pins which candidates are sampled. Without it every run draws a
# different set of alphas, so best_score_ / best_params_ (and everything derived
# from them further down) could not be reproduced.
random_search = RandomizedSearchCV(estimator = lasso, param_distributions = param_distributions, scoring = 'r2', return_train_score = True, n_iter = 500, random_state = 0)
random_search = random_search.fit(features,target)

In [38]:
# Summary of the randomized search, one value per line:
# best score, the parameters that achieved it, and the index of that candidate.
print(
    random_search.best_score_,
    random_search.best_params_,
    random_search.best_index_,
    sep='\n',
)

0.8793459006658522
{'alpha': 4.399999999999999e-05}
145


In [59]:
# Coefficients of the Lasso refit on the full data with the alpha selected by
# the randomized search.
#
# The alpha is read from random_search.best_params_ instead of being pasted in
# by hand, so it cannot go stale when the search is re-run. max_iter matches the
# estimator that was cross-validated (100000), so the refit is the same model
# that the reported CV score describes rather than one capped at the default.
best_lasso = linear_model.Lasso()
best_lasso.set_params(normalize = True, max_iter = 100000, **random_search.best_params_)
best_lasso.fit(features,target)
best_lasso.coef_

array([-0.00000000e+00,  2.73208000e-02,  0.00000000e+00,  6.50577317e-03,
        4.28902542e-02, -2.53529229e-02, -0.00000000e+00, -3.87079947e-04,
       -6.95618646e-02, -5.46840226e-02,  8.47352218e-03,  4.97035984e-02,
        6.87041639e-03,  1.45439138e-01, -7.05652049e-02, -0.00000000e+00,
       -1.12598067e-02, -1.75583829e-01, -6.78617900e-03, -0.00000000e+00,
       -2.77962137e-02, -2.94635017e-03,  7.22359259e-02,  1.01533721e-01,
       -3.67766384e-02, -0.00000000e+00,  3.55076871e-03,  6.63828064e-02,
        1.23000260e-01,  3.77775229e-03,  4.18334388e-02, -0.00000000e+00,
        5.91331071e-02,  1.26970634e-02, -1.12381315e-02, -1.69735163e-02,
        4.09744347e-02, -0.00000000e+00,  0.00000000e+00,  1.02893108e-06,
       -8.08729370e-05,  1.80206343e-02,  1.04355518e-04,  8.24818660e-05,
        6.94768825e-05,  1.74188904e-04,  2.74658309e-04, -9.85545731e-02,
       -0.00000000e+00,  2.52603906e-05,  0.00000000e+00,  5.47498383e-02,
       -0.00000000e+00,  

In [69]:
# Pair each feature column name with its fitted Lasso coefficient.
[(column, weight) for column, weight in zip(features.columns, best_lasso.coef_)]

[('LotShape_IR1', -0.0),
 ('LotShape_IR2', 0.027320799960905407),
 ('LotShape_Reg', 0.0),
 ('LotConfig_Corner', 0.006505773167635052),
 ('LotConfig_CulDSac', 0.04289025419643521),
 ('LotConfig_FR2', -0.025352922931321138),
 ('LotConfig_Inside', -0.0),
 ('Neighborhood_Blmngtn', -0.00038707994672150373),
 ('Neighborhood_Blueste', -0.06956186463524947),
 ('Neighborhood_BrDale', -0.05468402261326364),
 ('Neighborhood_BrkSide', 0.008473522181700296),
 ('Neighborhood_ClearCr', 0.04970359844134408),
 ('Neighborhood_CollgCr', 0.0068704163855524665),
 ('Neighborhood_Crawfor', 0.14543913787941565),
 ('Neighborhood_Edwards', -0.07056520493060316),
 ('Neighborhood_Gilbert', -0.0),
 ('Neighborhood_IDOTRR', -0.011259806662641317),
 ('Neighborhood_MeadowV', -0.17558382917321494),
 ('Neighborhood_Mitchel', -0.0067861789954561024),
 ('Neighborhood_NAmes', -0.0),
 ('Neighborhood_NPkVill', -0.027796213663783695),
 ('Neighborhood_NWAmes', -0.0029463501680312377),
 ('Neighborhood_NoRidge', 0.07223592589047

In [70]:
# The target is a log sale price, so a coefficient b is converted to a percent
# impact on the sale price as (exp(b) - 1) * 100.
import math as ma
coefs_as_percent = [(ma.exp(beta) - 1) * 100 for beta in best_lasso.coef_]

In [72]:
# (column name, percent impact) pairs, in the column order of `features`.
coef_list = [(column, pct) for column, pct in zip(features.columns, coefs_as_percent)]

In [109]:
# Build a table of percent impacts keyed by variable name and save it to disk.
# `di` maps each column name to a list of its percent values; from_dict turns
# the names into columns, so the frame is transposed to get one row per
# variable (values in column 0) and then ordered from lowest to highest impact.
di = {}
for name, pct in coef_list:
    if name not in di:
        di[name] = []
    di[name].append(pct)

coef_df = pd.DataFrame.from_dict(di).T.sort_values(by=0)
coef_df.to_csv('coef_df.csv')


In [105]:
# Raise pandas' `display.min_rows` setting to 500 for the tables shown below.
pd.options.display.min_rows = 500
# pd.options.display.max_rows = 500

In [112]:
# Show the percent-impact table ordered by variable name (ascending index
# order) to make individual variables easy to look up.
coef_df.sort_index(axis=0, ascending=True)

Unnamed: 0,0
3SsnPorch,0.01742
BldgType_1Fam,4.972727
BldgType_TwnhsE,0.0
BsmtCond_Fa,0.0
BsmtCond_Gd,3.673399
BsmtCond_Po,-4.984372
BsmtCond_TA,3.29288
BsmtExposure_Av,0.0
BsmtExposure_Gd,5.233859
BsmtExposure_Mn,0.0


In [53]:
# Convert the log-scale target back to the original scale.
# It is derived from the 'LogSalePrice' column of `data` rather than from
# `target` itself, so re-running this cell cannot exponentiate the values a
# second time; np.exp works on the whole Series at once instead of calling
# math.exp element by element.
target = np.exp(data['LogSalePrice'])