In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('../communal/Ames_Housing_Price_Data_cleaned_2.csv')

In [3]:
data.head()
predict = data.SalePrice
log_predict = np.log(predict)
data.drop('SalePrice', axis = 1, inplace = True)

data.drop(['PID', 'lot_bucket', 'mean_LotFrontage', 'Prop_Addr', 'GarageYrBlt', 'lat', 'long'], axis = 1, inplace = True)

data.drop(['YearRemodAdd', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2',
           'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
           'TotRmsAbvGrd', 'GarageCars'], axis = 1, inplace = True)
data['Total_Porch'] = data['OpenPorchSF'] + data['EnclosedPorch'] + \
                      data['3SsnPorch'] + data['ScreenPorch']
data.drop(['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'],
           axis = 1, inplace = True)

In [4]:
dummy = pd.get_dummies(data, drop_first = True)

In [5]:
from collections import defaultdict
num_iters = 10

params = {'alpha' : np.linspace(1e-5,1e-3,10)}
lasso = Lasso(max_iter = 200000)
gs = GridSearchCV(lasso,param_grid=params, verbose = 3, cv = 3, n_jobs = -1)

d = defaultdict(list)
trainR2 = []
testR2 = []
best_params = []

mins = []
maxs =[]

for i in range(num_iters):
    print(i, end = '/r')
    np.random.seed(i)
    
    X_train, X_test, y_train, y_test = train_test_split(dummy,log_predict,test_size = .3)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    mins.append(scaler.data_min_)
    maxs.append(scaler.data_max_)
    
    gs.fit(X_train ,y_train)
    best_estimator = gs.best_estimator_
    trainR2.append(best_estimator.score(X_train, y_train))
    testR2.append(best_estimator.score(X_test,y_test))
    best_params.append(gs.best_params_)
    
    
    important_features = pd.Series(best_estimator.coef_, 
                                  index = dummy.columns).sort_values(key = abs,ascending = False)

    
    for i, e in enumerate(important_features.index):
        d[e].append(i)
    
    
 
    
    

0/rFitting 3 folds for each of 10 candidates, totalling 30 fits
1/rFitting 3 folds for each of 10 candidates, totalling 30 fits
2/rFitting 3 folds for each of 10 candidates, totalling 30 fits
3/rFitting 3 folds for each of 10 candidates, totalling 30 fits
4/rFitting 3 folds for each of 10 candidates, totalling 30 fits
5/rFitting 3 folds for each of 10 candidates, totalling 30 fits
6/rFitting 3 folds for each of 10 candidates, totalling 30 fits
7/rFitting 3 folds for each of 10 candidates, totalling 30 fits
8/rFitting 3 folds for each of 10 candidates, totalling 30 fits
9/rFitting 3 folds for each of 10 candidates, totalling 30 fits


In [6]:
top_f = ['GrLivArea', 'OverallQual', 'OverallCond','YearBuilt','TotalBsmtSF', 'GarageArea','Fireplaces',\
         'BsmtFullBath']

In [7]:

betas = []
for feature in top_f:
    obs = dummy.iloc[244]
    play = obs.copy()
    play.at[feature] = play[feature] + 1
    play = scaler.transform([play])
    obs = scaler.transform([obs])
    
#     beta = np.exp(best_estimator.predict(play))[0]- np.exp(best_estimator.predict(obs))[0]
    beta = np.log(best_estimator.predict(play)/best_estimator.predict(obs))[0]
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    
    
    betas.append(beta)

    
    
np_betas = np.array(betas)
np_betas

np_betas = pd.Series(np_betas, index = top_f).to_frame()

In [13]:
np_betas.apply(lambda x: np.exp(x))

Unnamed: 0,0
GrLivArea,1.000022
OverallQual,1.004831
OverallCond,1.003742
YearBuilt,1.000128
TotalBsmtSF,1.00001
GarageArea,1.00001
Fireplaces,1.001183
BsmtFullBath,1.003283


In [8]:
top_f

['GrLivArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'TotalBsmtSF',
 'GarageArea',
 'Fireplaces',
 'BsmtFullBath']

In [9]:
(best_estimator.predict(play)/best_estimator.predict(obs))[0]

1.0032826157273071

In [10]:
play.shape

(1, 239)

In [11]:
k = 10

for k in range(1000,1500):
    play = dummy.iloc[[k]][top_f]
    comparison = dummy.iloc[[k+1]][top_f]
    print(k)
#     print(f'predict : {np.dot(play,np_betas)[0] }')
    ##                 ^^^^^^^^^^^^^^ this misses beta_0 so can't be compared with the label directly
    
    
    # You need to do something like below,
    print(f'difference : {np.dot(comparison,np_betas)[0] - np.dot(play,np_betas)[0] }')
    #                                       ^^^^^^^^ 
    #             but with the coefficient multiplied by max-min and transformed with exp
    print(f'actual  : {np.log(predict[k+1]/predict[k])}\n\n')
    

1000
difference : [0.01355343]
actual  : 0.2640583489527163


1001
difference : [-0.01308825]
actual  : -0.3576299018340009


1002
difference : [-0.02339009]
actual  : -0.11460338273900374


1003
difference : [0.02062725]
actual  : 0.27057224488532067


1004
difference : [-0.04114535]
actual  : -0.8301880328207434


1005
difference : [0.03996872]
actual  : 0.7827593392496325


1006
difference : [0.00109825]
actual  : 0.038671111600562455


1007
difference : [0.03468484]
actual  : 0.551493675706524


1008
difference : [-0.04955452]
actual  : -0.778217018810026


1009
difference : [0.01932334]
actual  : 0.3064699863174141


1010
difference : [0.00220166]
actual  : -0.8417296354206869


1011
difference : [-0.01362747]
actual  : 0.6211209397096651


1012
difference : [-0.00395127]
actual  : -0.012739025777429714


1013
difference : [0.04832839]
actual  : 0.6734022439073207


1014
difference : [-0.01966262]
actual  : -0.17223853105306697


1015
difference : [-0.04274683]
actual  : -0.797429

1152
difference : [-0.01346887]
actual  : -0.25408835516363126


1153
difference : [0.04512864]
actual  : 0.6277702127861143


1154
difference : [-0.04323978]
actual  : -0.7173823714758015


1155
difference : [0.02386344]
actual  : 0.4587096226269767


1156
difference : [-0.03897539]
actual  : -0.5701650635522995


1157
difference : [0.03144696]
actual  : 0.5577425435537424


1158
difference : [-0.0258422]
actual  : -0.4780358009429998


1159
difference : [0.02171262]
actual  : 0.3496340861384756


1160
difference : [0.01226162]
actual  : 0.27466422178277633


1161
difference : [0.00282576]
actual  : -0.06468251998582925


1162
difference : [-0.02049607]
actual  : -0.3078534313155603


1163
difference : [0.06614875]
actual  : 1.0635209688568397


1164
difference : [-0.0639825]
actual  : -1.0986122886681098


1165
difference : [0.05305245]
actual  : 0.9619839595330539


1166
difference : [-0.04738294]
actual  : -0.7519124897861968


1167
difference : [-0.04023794]
actual  : -0.798507696

1310
difference : [0.01888484]
actual  : 0.3629054936893685


1311
difference : [-0.02669383]
actual  : -0.47290638890369696


1312
difference : [0.03710106]
actual  : 0.4264323809520953


1313
difference : [0.00427643]
actual  : 0.3485448178554822


1314
difference : [-0.01700197]
actual  : -0.32091882052781145


1315
difference : [-0.0034412]
actual  : -0.22319278087164604


1316
difference : [-0.03891797]
actual  : -0.5057105230992203


1317
difference : [0.05498673]
actual  : 0.8746180354735871


1318
difference : [-0.01306045]
actual  : -0.40333970879585096


1319
difference : [-0.01014247]
actual  : -0.09340117508840076


1320
difference : [0.01088153]
actual  : 0.04779066383634848


1321
difference : [0.01523078]
actual  : 0.4054651081081644


1322
difference : [-0.0435952]
actual  : -0.6306941659133954


1323
difference : [0.03372688]
actual  : 0.648315767263215


1324
difference : [-0.01439635]
actual  : -0.33997180274747735


1325
difference : [-0.00178136]
actual  : -0.01857

difference : [-0.01726777]
actual  : -0.5783079209475753


1460
difference : [0.06367917]
actual  : 1.256797943707547


1461
difference : [-0.05255198]
actual  : -0.7669454433218353


1462
difference : [-0.00636803]
actual  : -0.06261359272798676


1463
difference : [0.01301199]
actual  : 0.15634607039069404


1464
difference : [-0.02149027]
actual  : -0.38721166779881405


1465
difference : [-0.00173505]
actual  : 0.06381151274495377


1466
difference : [-0.00605014]
actual  : 0.05998270460171041


1467
difference : [0.]
actual  : 0.0


1468
difference : [0.0251442]
actual  : 0.1193039484970901


1469
difference : [-0.00798469]
actual  : 0.0788729800866586


1470
difference : [0.0333155]
actual  : 0.4110673636568342


1471
difference : [0.01214719]
actual  : 0.35387773931767114


1472
difference : [-0.06721292]
actual  : -1.1264470876615535


1473
difference : [0.02390334]
actual  : 0.35020242943311497


1474
difference : [-0.01562415]
actual  : -0.41689380393178715


1475
difference 