In [18]:
import sklearn
import lightgbm
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
housing_df = pd.read_csv('housing_dataset.csv')

## Removing Columns with lot of missing values

In [17]:
filtered_columns = [col for col in housing_df.columns if col.lower() not in ['id']]
housing_filtered = housing_df[filtered_columns]
cutoff = 0.4
missing_vals = housing_filtered.isnull().sum().sort_values()
missing_vals_reduced = missing_vals[missing_vals != 0]

columnswith_many_missing_values = missing_vals_reduced[missing_vals_reduced>cutoff*len(housing_df)].index.values
print (f"{columnswith_many_missing_values} have missing values for more {cutoff*100}% of the dataset")

['FireplaceQu' 'Fence' 'Alley' 'MiscFeature' 'PoolQC'] have missing values for more 40.0% of the dataset


In [21]:
filter_columns = columnswith_many_missing_values.tolist()

In [22]:
housing_filtered['log_sales'] = np.log(housing_filtered['SalePrice'])

In [23]:
housing_filtered_2 = housing_filtered[[col for col in housing_filtered.columns if col not in filter_columns ]]
final_housing_df = housing_filtered_2.dropna()

## Prepare train and test split

In [159]:
X = final_housing_df[[col for col in housing_filtered_2.columns if col not in ['SalePrice', 'log_sales']]]

In [160]:
Y = final_housing_df['SalePrice']

In [161]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [162]:
le = defaultdict(LabelEncoder)

In [163]:
X['SaleType'].dtype.name

'object'

In [164]:
X_encoded = X.apply(lambda x: le[x.name].fit_transform(x) if x.dtype.name not in ['int64', 'float64'] else x)

In [165]:
X_encoded

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,3,3,0,4,0,...,61,0,0,0,0,0,2,2008,8,4
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,0,5,2007,8,4
2,60,3,68.0,11250,1,0,3,0,4,0,...,42,0,0,0,0,0,9,2008,8,4
3,70,3,60.0,9550,1,0,3,0,0,0,...,35,272,0,0,0,0,2,2006,8,0
4,60,3,84.0,14260,1,0,3,0,2,0,...,84,0,0,0,0,0,12,2008,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,3,62.0,7917,1,3,3,0,4,0,...,40,0,0,0,0,0,8,2007,8,4
1456,20,3,85.0,13175,1,3,3,0,4,0,...,0,0,0,0,0,0,2,2010,8,4
1457,70,3,66.0,9042,1,3,3,0,4,0,...,60,0,0,0,0,2500,5,2010,8,4
1458,20,3,68.0,9717,1,3,3,0,4,0,...,0,112,0,0,0,0,4,2010,8,4


In [166]:
trainx, testx, trainy, testy = train_test_split(X_encoded,Y,test_size=0.3)

## XGBoost Model

In [167]:
import xgboost as xgb
from xgboost import XGBRegressor

In [168]:
def fetch_df_row(df, index):
    return pd.DataFrame(testx.loc[index].values.reshape(1,len(df.columns)), columns = df.columns)

In [169]:
trainx

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1265,160,1,35.0,3735,1,3,3,0,3,0,...,34,0,0,0,0,0,3,2006,8,4
444,60,3,70.0,8750,1,3,3,0,4,0,...,133,0,0,0,0,0,7,2008,8,4
343,120,3,63.0,8849,1,0,3,0,4,0,...,72,0,0,0,0,0,7,2008,8,4
1434,20,3,80.0,17400,1,3,2,0,4,1,...,41,0,0,0,0,0,5,2006,8,4
143,20,3,78.0,10335,1,0,3,0,4,0,...,29,0,0,0,0,0,6,2009,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,20,3,73.0,8990,1,0,3,0,4,0,...,33,0,0,0,0,0,4,2006,6,5
799,50,3,60.0,7200,1,3,3,0,0,0,...,0,264,0,0,0,0,6,2007,8,4
1015,60,3,70.0,8400,1,3,3,0,4,0,...,45,0,0,0,0,0,11,2009,8,4
34,120,3,60.0,7313,1,3,3,0,4,0,...,47,0,0,0,0,0,8,2007,8,4


In [170]:
train_data = xgb.DMatrix(trainx, label=trainy)

In [171]:
param = {}
param['nthread'] = 4
param['eval_metric'] = 'auc'

In [172]:
model = XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42, n_jobs=-1, **param)
model.fit(trainx, trainy)
# bst = xgb.train(param, train_data, num_round)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, eval_metric='auc',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.07, max_delta_step=0,
             max_depth=3, min_child_weight=1.5, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=-1,
             nthread=4, num_parallel_tree=1, random_state=42, reg_alpha=0.75,
             reg_lambda=0.45, scale_pos_weight=1, seed=42, subsample=0.6,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [125]:
!pip install eli5



In [173]:
import eli5

In [174]:
eli5.explain_weights_xgboost(model)

Weight,Feature
0.2685,GarageCars
0.1743,OverallQual
0.0726,BsmtQual
0.0701,FullBath
0.0532,PoolArea
0.0316,KitchenQual
0.0262,Fireplaces
0.0258,Condition2
0.0226,CentralAir
0.0195,GarageFinish


In [175]:
# model.predict(testx.iloc[0].values.reshape(1,74))
testx

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
468,20,3,98.0,11428,1,0,3,0,4,0,...,44,0,0,0,0,0,5,2007,8,4
966,50,3,130.0,9600,1,0,1,0,4,0,...,70,0,0,0,0,0,6,2009,8,4
594,20,3,88.0,7990,1,0,3,0,4,0,...,0,0,0,0,0,0,4,2008,8,4
400,120,3,38.0,14963,1,1,3,0,4,0,...,30,0,0,224,0,0,12,2008,8,4
1445,85,3,70.0,8400,1,3,3,0,4,0,...,0,252,0,0,0,0,5,2007,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1274,50,3,53.0,5362,1,3,3,0,0,0,...,0,81,0,0,0,0,11,2007,8,4
1113,20,3,66.0,8923,1,3,3,0,4,0,...,18,0,0,0,0,0,5,2007,8,4
937,60,3,75.0,9675,1,3,3,0,4,0,...,48,0,0,0,0,0,2,2009,8,4
1391,90,3,65.0,8944,1,3,3,0,4,0,...,152,0,0,0,0,0,4,2009,8,4


In [177]:
trainy.min()

35311

In [179]:
print (model.predict(fetch_df_row(testx, 966)), testy.loc[966])

[138456.55] 160000


In [180]:
print (model.predict(fetch_df_row(testx, 607)), testy.loc[607])

[170998.38] 225000


In [182]:
fetch_df_row(testx, 607)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20.0,3.0,78.0,7800.0,1.0,3.0,0.0,0.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2006.0,8.0,4.0


In [185]:
fetch_df_row(testx, 607)[testx.columns[[44,47,16,18]]]

Unnamed: 0,GrLivArea,FullBath,OverallCond,YearRemodAdd
0,2008.0,3.0,8.0,2002.0


In [184]:
eli5.xgboost.explain_prediction_xgboost(model, doc=fetch_df_row(testx, 607))

Contribution?,Feature
190151.435,<BIAS>
16718.22,x44
8145.129,x47
4971.554,x16
4037.422,x18
2134.679,x51
1649.143,x3
1277.057,x31
1209.506,x1
927.441,x21
