In [22]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler


pd.set_option('display.max_columns', None)

In [16]:
data = pd.read_csv('./../../communal/Ames_Housing_Price_Data_cleaned_2.csv')

In [17]:
log_price = np.log(data['SalePrice'])

data.drop(['SalePrice','PID','lat','long','Prop_Addr', 'mean_LotFrontage', 'lot_bucket', 'GarageYrBlt'],axis = 1, inplace = True)


In [19]:
dummy = pd.get_dummies(data, drop_first = True)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(dummy,log_price)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [31]:
lasso = Lasso(max_iter = 2000000)
params = {'alpha' : np.linspace(1e-5,1e-3,1000)}
gs = GridSearchCV(lasso,param_grid=params, n_jobs = -1,cv = 5,verbose = 3)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


GridSearchCV(cv=5, estimator=Lasso(max_iter=2000000), n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-05, 1.09909910e-05, 1.19819820e-05, 1.29729730e-05,
       1.39639640e-05, 1.49549550e-05, 1.59459459e-05, 1.69369369e-05,
       1.79279279e-05, 1.89189189e-05, 1.99099099e-05, 2.09009009e-05,
       2.18918919e-05, 2.28828829e-05, 2.38738739e-05, 2.48648649e-05,
       2.58558559e-05, 2.6846...
       9.77207207e-04, 9.78198198e-04, 9.79189189e-04, 9.80180180e-04,
       9.81171171e-04, 9.82162162e-04, 9.83153153e-04, 9.84144144e-04,
       9.85135135e-04, 9.86126126e-04, 9.87117117e-04, 9.88108108e-04,
       9.89099099e-04, 9.90090090e-04, 9.91081081e-04, 9.92072072e-04,
       9.93063063e-04, 9.94054054e-04, 9.95045045e-04, 9.96036036e-04,
       9.97027027e-04, 9.98018018e-04, 9.99009009e-04, 1.00000000e-03])},
             verbose=3)

In [33]:
best_estimator = gs.best_estimator_
best_estimator.alpha

0.00021513513513513513

In [43]:
ordered = pd.Series(best_estimator.coef_, index = dummy.columns).sort_values(key = abs, ascending = False)

In [44]:
ordered

GrLivArea              0.957211
OverallQual            0.507176
OverallCond            0.354942
LotArea                0.273528
TotalBsmtSF            0.255739
                         ...   
Exterior1st_HdBoard   -0.000000
Exterior1st_ImStucc    0.000000
Exterior1st_Plywood    0.000000
Exterior1st_PreCast    0.000000
Exterior1st_Wd Sdng   -0.000000
Length: 266, dtype: float64

In [65]:
widdled_down = dummy[ordered.index[:25]]

X_train, X_test, y_train, y_test = train_test_split(widdled_down,log_price)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lasso = Lasso(max_iter = 2000000)
params = {'alpha' : np.linspace(1e-5,1e-3,1000)}
gs = GridSearchCV(lasso,param_grid=params, n_jobs = -1,cv = 5,verbose = 3)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


GridSearchCV(cv=5, estimator=Lasso(max_iter=2000000), n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-05, 1.09909910e-05, 1.19819820e-05, 1.29729730e-05,
       1.39639640e-05, 1.49549550e-05, 1.59459459e-05, 1.69369369e-05,
       1.79279279e-05, 1.89189189e-05, 1.99099099e-05, 2.09009009e-05,
       2.18918919e-05, 2.28828829e-05, 2.38738739e-05, 2.48648649e-05,
       2.58558559e-05, 2.6846...
       9.77207207e-04, 9.78198198e-04, 9.79189189e-04, 9.80180180e-04,
       9.81171171e-04, 9.82162162e-04, 9.83153153e-04, 9.84144144e-04,
       9.85135135e-04, 9.86126126e-04, 9.87117117e-04, 9.88108108e-04,
       9.89099099e-04, 9.90090090e-04, 9.91081081e-04, 9.92072072e-04,
       9.93063063e-04, 9.94054054e-04, 9.95045045e-04, 9.96036036e-04,
       9.97027027e-04, 9.98018018e-04, 9.99009009e-04, 1.00000000e-03])},
             verbose=3)

In [75]:
best_estimator = gs.best_estimator_

from sklearn.inspection import permutation_importance

X_train, X_test, y_train, y_test = train_test_split(widdled_down,log_price)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

r = permutation_importance(best_estimator,X_train, y_train, n_repeats = 30, random_state = 0)



In [82]:
skel = []
for i in r.importances_mean.argsort()[::-1]:
    skel.append([ordered.index[i],round(r.importances_mean[i],3),round(r.importances_std[i],3) ])
    
f = pd.DataFrame(skel)

In [83]:
f

Unnamed: 0,0,1,2
0,GrLivArea,0.262,0.008
1,OverallQual,0.154,0.005
2,YearBuilt,0.151,0.004
3,OverallCond,0.045,0.002
4,TotalBsmtSF,0.022,0.001
5,BsmtFinSF1,0.019,0.001
6,GarageCars,0.014,0.001
7,SaleCondition_Partial,0.011,0.001
8,SaleCondition_Normal,0.01,0.001
9,Neighborhood_Crawfor,0.01,0.001


In [84]:
best_estimator.score(X_test, y_test)

0.9226905193265396

In [87]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
parameters = {
    "n_estimators":[850, 900, 950, 1000],
    "max_depth":[4, 5, 6],
    "learning_rate":[0.005, 0.01, 0.025, 0.05, 0.075],
    "subsample":[0.4, 0.6, 0.7, 0.8, 0.9],
    "max_features":[7, 8]
}

grid_search = GridSearchCV(estimator = gbr, param_grid = parameters, n_jobs = -1, cv = 5, verbose = 3)

In [88]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.005, 0.01, 0.025, 0.05, 0.075],
                         'max_depth': [4, 5, 6], 'max_features': [7, 8],
                         'n_estimators': [850, 900, 950, 1000],
                         'subsample': [0.4, 0.6, 0.7, 0.8, 0.9]},
             verbose=3)

In [93]:
gbm_best = grid_search.best_estimator_
print(gbm_best.score(X_train, y_train))
print(gbm_best.score(X_test, y_test))

0.9679392901310736
0.9249684448289214
