In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('../communal/Ames_Housing_Price_Data_cleaned_2.csv')

In [3]:
data.head()
predict = data.SalePrice
log_predict = np.log(predict)
data.drop('SalePrice', axis = 1, inplace = True)

data.drop(['PID', 'lot_bucket', 'mean_LotFrontage', 'Prop_Addr', 'GarageYrBlt', 'lat', 'long'], axis = 1, inplace = True)

data.drop(['YearRemodAdd', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2',
           'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
           'TotRmsAbvGrd', 'GarageCars'], axis = 1, inplace = True)
data['Total_Porch'] = data['OpenPorchSF'] + data['EnclosedPorch'] + \
                      data['3SsnPorch'] + data['ScreenPorch']
data.drop(['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'],
           axis = 1, inplace = True)

In [4]:
dummy = pd.get_dummies(data, drop_first = True)

In [5]:
from collections import defaultdict
num_iters = 10

params = {'alpha' : np.linspace(1e-5,1e-3,10)}
lasso = Lasso(max_iter = 200000)
gs = GridSearchCV(lasso,param_grid=params, verbose = 3, cv = 3, n_jobs = -1)

d = defaultdict(list)
trainR2 = []
testR2 = []
best_params = []

mins = []
maxs =[]

for i in range(num_iters):
    print(i, end = '/r')
    np.random.seed(i)
    
    X_train, X_test, y_train, y_test = train_test_split(dummy,log_predict,test_size = .3)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    mins.append(scaler.data_min_)
    maxs.append(scaler.data_max_)
    
    gs.fit(X_train ,y_train)
    best_estimator = gs.best_estimator_
    trainR2.append(best_estimator.score(X_train, y_train))
    testR2.append(best_estimator.score(X_test,y_test))
    best_params.append(gs.best_params_)
    
    
    important_features = pd.Series(best_estimator.coef_, 
                                  index = dummy.columns).sort_values(key = abs,ascending = False)

    
    for i, e in enumerate(important_features.index):
        d[e].append(i)
    
    
 
    
    

0/rFitting 3 folds for each of 10 candidates, totalling 30 fits
1/rFitting 3 folds for each of 10 candidates, totalling 30 fits
2/rFitting 3 folds for each of 10 candidates, totalling 30 fits
3/rFitting 3 folds for each of 10 candidates, totalling 30 fits
4/rFitting 3 folds for each of 10 candidates, totalling 30 fits
5/rFitting 3 folds for each of 10 candidates, totalling 30 fits
6/rFitting 3 folds for each of 10 candidates, totalling 30 fits
7/rFitting 3 folds for each of 10 candidates, totalling 30 fits
8/rFitting 3 folds for each of 10 candidates, totalling 30 fits
9/rFitting 3 folds for each of 10 candidates, totalling 30 fits


In [6]:
top_f = ['GrLivArea', 'OverallQual', 'OverallCond','YearBuilt','TotalBsmtSF', 'GarageArea','Fireplaces',\
         'BsmtFullBath']

In [13]:

betas = []
for feature in dummy.columns:
    obs = dummy.iloc[244]
    play = obs.copy()
    play.at[feature] = play[feature] + 1
    play = scaler.transform([play])
    obs = scaler.transform([obs])
    beta = np.exp(best_estimator.predict(play))[0]- np.exp(best_estimator.predict(obs))[0]
    betas.append(beta)

    
    
np_betas = np.array(betas)
np_betas

all_betas = pd.Series(np_betas, index = dummy.columns).to_frame()

pd.set_option("display.max_rows", None)

all_betas

Unnamed: 0,0
GrLivArea,28.428935
MSSubClass,0.0
LotFrontage,8.581792
LotArea,0.0
OverallQual,6279.141136
OverallCond,4832.893645
YearBuilt,162.076026
MasVnrArea,0.0
TotalBsmtSF,13.155105
BsmtFullBath,4227.779022


In [12]:
k = 10

for k in range(1000,1500):
    play = dummy.iloc[[k]][dummy.columns]
    print(i)
    print(f'predict : {np.dot(play,np_betas)[0] }')
    print(f'actual  : {predict[k]}\n\n')

238
predict : 25433.672333107912
actual  : 172400


238
predict : 40838.4107124834
actual  : 224500


238
predict : 17912.933474085992
actual  : 157000


238
predict : -6467.973467550124
actual  : 140000


238
predict : 27769.3464779972
actual  : 183500


238
predict : -48995.648850194935
actual  : 80000


238
predict : 34083.25271942059
actual  : 175000


238
predict : 23511.07875767909
actual  : 181900


238
predict : 75697.63987903995
actual  : 315750


238
predict : -2711.611676806642
actual  : 145000


238
predict : 39009.33212594745
actual  : 197000


238
predict : 879.0780758017208
actual  : 84900


238
predict : 6704.259925419057
actual  : 158000


238
predict : 14149.136255672609
actual  : 156000


238
predict : 73853.98282402381
actual  : 305900


238
predict : 64045.71667504712
actual  : 257500


238
predict : -17318.585288449132
actual  : 116000


238
predict : 657.2153580958256
actual  : 143450


238
predict : 72461.49034576281
actual  : 277500


238
predict : 35962.431989

actual  : 200500


238
predict : 22172.599288101424
actual  : 235000


238
predict : -6232.799568702816
actual  : 140000


238
predict : 37407.7999526927
actual  : 191500


238
predict : 27791.154021536873
actual  : 190000


238
predict : 69065.75410739292
actual  : 302000


238
predict : 13935.223046795873
actual  : 158000


238
predict : 21785.048723014537
actual  : 187000


238
predict : 7266.068505154282
actual  : 149900


238
predict : -18960.205702586216
actual  : 119164


238
predict : 15835.97966049955
actual  : 143000


238
predict : -12338.360477929178
actual  : 138000


238
predict : 35365.42434818775
actual  : 170000


238
predict : -34010.743453607196
actual  : 100000


238
predict : 10414.51967425883
actual  : 154000


238
predict : 51951.295953255845
actual  : 239000


238
predict : -53047.99994900543
actual  : 90500


238
predict : -4347.725637047552
actual  : 135000


238
predict : 8225.619113201683
actual  : 155000


238
predict : 47034.24978020694
actual  : 221000


