In [5]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)

In [31]:
data = pd.read_csv('./../../communal/Ames_Housing_Price_Data_cleaned_2.csv')

In [32]:
data.head()
predict = data.SalePrice
log_predict = np.log(predict)
data.drop('SalePrice', axis = 1, inplace = True)

data.drop(['PID', 'lot_bucket', 'mean_LotFrontage', 'Prop_Addr', 'GarageYrBlt', 'lat', 'long'], axis = 1, inplace = True)

data.drop(['YearRemodAdd', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2',
           'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
           'TotRmsAbvGrd', 'GarageCars'], axis = 1, inplace = True)
data['Total_Porch'] = data['OpenPorchSF'] + data['EnclosedPorch'] + \
                      data['3SsnPorch'] + data['ScreenPorch']
data.drop(['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'],
           axis = 1, inplace = True)

In [33]:
dummy = pd.get_dummies(data, drop_first = True)

In [9]:
from collections import defaultdict
num_iters = 10

params = {'alpha' : np.linspace(1e-5,1e-3,10)}
lasso = Lasso(max_iter = 200000)
gs = GridSearchCV(lasso,param_grid=params, verbose = 3, cv = 3, n_jobs = -1)

d = defaultdict(list)
trainR2 = []
testR2 = []
best_params = []

mins = []
maxs =[]

for i in range(num_iters):
    print(i, end = '/r')
    np.random.seed(i)
    
    X_train, X_test, y_train, y_test = train_test_split(dummy,log_predict,test_size = .3)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    mins.append(scaler.data_min_)
    maxs.append(scaler.data_max_)
    
    gs.fit(X_train ,y_train)
    best_estimator = gs.best_estimator_
    trainR2.append(best_estimator.score(X_train, y_train))
    testR2.append(best_estimator.score(X_test,y_test))
    best_params.append(gs.best_params_)
    
    
    important_features = pd.Series(best_estimator.coef_, 
                                  index = dummy.columns).sort_values(key = abs,ascending = False)

    
    for i, e in enumerate(important_features.index):
        d[e].append(i)
    
    
 
    
    

0/rFitting 3 folds for each of 10 candidates, totalling 30 fits
1/rFitting 3 folds for each of 10 candidates, totalling 30 fits
2/rFitting 3 folds for each of 10 candidates, totalling 30 fits
3/rFitting 3 folds for each of 10 candidates, totalling 30 fits
4/rFitting 3 folds for each of 10 candidates, totalling 30 fits
5/rFitting 3 folds for each of 10 candidates, totalling 30 fits
6/rFitting 3 folds for each of 10 candidates, totalling 30 fits
7/rFitting 3 folds for each of 10 candidates, totalling 30 fits
8/rFitting 3 folds for each of 10 candidates, totalling 30 fits
9/rFitting 3 folds for each of 10 candidates, totalling 30 fits


In [29]:
top_f = ['GrLivArea', 'OverallQual', 'OverallCond','YearBuilt','TotalBsmtSF', 'GarageArea','Fireplaces',\
         'BsmtFullBath']

top_f  = ['GrLivArea', 'OverallQual', 'TotalBsmtSF', 'OverallCond', 'YearBuilt',
              'GarageArea', 'BsmtFullBath', 'CentralAir_Y', 'BldgType_Twnhs', 'BsmtExposure_Gd', 'KitchenQual_TA',
              'Condition1_Norm', 'Fireplaces', 'MSZoning_RL',
              'HeatingQC_TA']

top_20_list = ['GrLivArea', 'OverallQual', 'TotalBsmtSF', 'OverallCond', 'YearBuilt',
              'GarageArea', 'BsmtFullBath', 'Neighborhood_Crawfor', 'CentralAir_Y',
              'Neighborhood_Somerst', 'BldgType_Twnhs', 'BsmtExposure_Gd', 
              'Neighborhood_Edwards', 'KitchenQual_TA', 'Neighborhood_MeadowV',
              'Condition1_Norm', 'Neighborhood_NridgHt', 'Fireplaces', 'MSZoning_RL',
              'HeatingQC_TA']

def calc_vif(data):
    vif = {}
    for feature in data.columns:
        X = [f for f in data.columns if f != feature]
        X = data[X]
        y = data[feature]
        r2 = LinearRegression().fit(X,y).score(X,y)
        vif[feature] = 1/(1-r2)
    return pd.DataFrame.from_dict([vif]).T.sort_values(ascending = False, by = 0)

In [35]:
from sklearn.linear_model import LinearRegression
calc_vif(dummy[top_20_list]).rename(columns = {0 : 'VIF'})


Unnamed: 0,VIF
YearBuilt,3.018649
OverallQual,2.970474
GrLivArea,1.933121
TotalBsmtSF,1.828463
GarageArea,1.774874
MSZoning_RL,1.704128
KitchenQual_TA,1.669596
Neighborhood_Somerst,1.48914
Fireplaces,1.442851
OverallCond,1.428164


In [18]:

betas = []
for feature in top_f:
    obs = dummy.iloc[244]
    play = obs.copy()
    play.at[feature] = play[feature] + 1
    play = scaler.transform([play])
    obs = scaler.transform([obs])
    beta = np.exp(best_estimator.predict(play))[0]- np.exp(best_estimator.predict(obs))[0]
    betas.append(beta)

    
    
np_betas = np.array(betas)
np_betas

pd.Series(np_betas, index = top_f).to_frame()

Unnamed: 0,0
GrLivArea,28.428935
OverallQual,6279.141136
TotalBsmtSF,13.155105
OverallCond,4832.893645
YearBuilt,162.076026
GarageArea,12.763801
BsmtFullBath,4227.779022
CentralAir_Y,7033.993516
BldgType_Twnhs,-10096.478354
BsmtExposure_Gd,6232.155765


In [19]:
k = 10

for k in range(1000,1500):
    play = dummy.iloc[[k]][top_f]
    print(i)
    print(f'predict : {np.dot(play,np_betas)[0] }')
    print(f'actual  : {predict[k]}\n\n')

238
predict : 464442.3536450013
actual  : 172400


238
predict : 481791.17120182957
actual  : 224500


238
predict : 462308.16334011237
actual  : 157000


238
predict : 436718.5692591405
actual  : 140000


238
predict : 461624.73981555516
actual  : 183500


238
predict : 409111.2194089609
actual  : 80000


238
predict : 466471.8497510038
actual  : 175000


238
predict : 468702.6963960781
actual  : 181900


238
predict : 512689.9798713421
actual  : 315750


238
predict : 443506.33491993975
actual  : 145000


238
predict : 468028.1931307665
actual  : 197000


238
predict : 467126.5198561296
actual  : 84900


238
predict : 450034.43854568474
actual  : 158000


238
predict : 447831.3505695291
actual  : 156000


238
predict : 516304.89471704944
actual  : 305900


238
predict : 491322.5015535677
actual  : 257500


238
predict : 436979.3844469603
actual  : 116000


238
predict : 446123.44436979597
actual  : 143450


238
predict : 501757.8200550834
actual  : 277500


238
predict : 468974.52042

238
predict : 441987.04017732886
actual  : 138500


238
predict : 434304.4963305539
actual  : 147000


238
predict : 519544.08856338495
actual  : 354000


238
predict : 451642.0956776255
actual  : 135000


238
predict : 469646.19414682465
actual  : 160000


238
predict : 445812.3855610458
actual  : 127000


238
predict : 482321.3796419212
actual  : 219990


238
predict : 476237.4118642354
actual  : 235000


238
predict : 524533.4705147089
actual  : 377426


238
predict : 465643.3536107178
actual  : 160200


238
predict : 504506.32972183125
actual  : 306000


238
predict : 461471.71980262897
actual  : 190000


238
predict : 470502.02892548905
actual  : 214000


238
predict : 433343.0836752668
actual  : 134000


238
predict : 480096.4157594264
actual  : 170000


238
predict : 484731.24081274745
actual  : 190000


238
predict : 473058.0160983127
actual  : 179900


238
predict : 440461.8945062471
actual  : 112000


238
predict : 444197.47955656954
actual  : 140000


238
predict : 495233.62