In [1]:
import sklearn
import lightgbm
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
housing_df = pd.read_csv('housing_dataset.csv')

## Removing Columns with lot of missing values

In [3]:
filtered_columns = [col for col in housing_df.columns if col.lower() not in ['id']]
housing_filtered = housing_df[filtered_columns]
cutoff = 0.4
missing_vals = housing_filtered.isnull().sum().sort_values()
missing_vals_reduced = missing_vals[missing_vals != 0]

columnswith_many_missing_values = missing_vals_reduced[missing_vals_reduced>cutoff*len(housing_df)].index.values
print (f"{columnswith_many_missing_values} have missing values for more {cutoff*100}% of the dataset")

['FireplaceQu' 'Fence' 'Alley' 'MiscFeature' 'PoolQC'] have missing values for more 40.0% of the dataset


In [4]:
filter_columns = columnswith_many_missing_values.tolist()

In [5]:
housing_filtered['log_sales'] = np.log(housing_filtered['SalePrice'])

In [6]:
housing_filtered_2 = housing_filtered[[col for col in housing_filtered.columns if col not in filter_columns ]]
final_housing_df = housing_filtered_2.dropna()

## Prepare train and test split

In [7]:
X = final_housing_df[[col for col in housing_filtered_2.columns if col not in ['SalePrice', 'log_sales']]]

In [8]:
Y = final_housing_df['SalePrice']

In [9]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [10]:
le = defaultdict(LabelEncoder)

In [11]:
X['SaleType'].dtype.name

'object'

In [12]:
X_encoded = X.apply(lambda x: le[x.name].fit_transform(x) if x.dtype.name not in ['int64', 'float64'] else x)

In [13]:
X_encoded

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,3,3,0,4,0,...,61,0,0,0,0,0,2,2008,8,4
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,0,5,2007,8,4
2,60,3,68.0,11250,1,0,3,0,4,0,...,42,0,0,0,0,0,9,2008,8,4
3,70,3,60.0,9550,1,0,3,0,0,0,...,35,272,0,0,0,0,2,2006,8,0
4,60,3,84.0,14260,1,0,3,0,2,0,...,84,0,0,0,0,0,12,2008,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,3,62.0,7917,1,3,3,0,4,0,...,40,0,0,0,0,0,8,2007,8,4
1456,20,3,85.0,13175,1,3,3,0,4,0,...,0,0,0,0,0,0,2,2010,8,4
1457,70,3,66.0,9042,1,3,3,0,4,0,...,60,0,0,0,0,2500,5,2010,8,4
1458,20,3,68.0,9717,1,3,3,0,4,0,...,0,112,0,0,0,0,4,2010,8,4


In [14]:
trainx, testx, trainy, testy = train_test_split(X_encoded,Y,test_size=0.3)

## XGBoost Model

In [15]:
import xgboost as xgb
from xgboost import XGBRegressor

In [16]:
def fetch_df_row(df, index):
    return pd.DataFrame(testx.loc[index].values.reshape(1,len(df.columns)), columns = df.columns)

In [17]:
trainx

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1127,20,3,182.0,14572,1,2,3,0,0,0,...,36,0,0,0,0,0,11,2007,8,3
1329,60,3,63.0,9084,1,0,3,0,4,0,...,28,0,0,0,0,0,6,2006,8,4
240,20,1,75.0,9000,1,3,3,0,4,0,...,168,0,0,0,0,0,4,2010,8,4
543,120,2,34.0,4058,1,3,3,0,4,0,...,40,0,0,0,0,0,6,2007,8,4
1239,20,3,64.0,9037,1,0,1,0,4,0,...,33,0,0,0,0,0,12,2007,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,20,3,78.0,10140,1,3,3,0,4,0,...,0,0,0,0,0,0,8,2009,8,4
585,20,3,88.0,11443,1,3,3,0,4,0,...,66,0,0,0,0,0,3,2006,6,5
677,30,3,52.0,9022,1,3,3,0,4,0,...,0,120,0,0,0,0,5,2009,8,4
1454,20,1,62.0,7500,1,3,3,0,4,0,...,113,0,0,0,0,0,10,2009,8,4


In [18]:
train_data = xgb.DMatrix(trainx, label=trainy)

In [19]:
param = {}
param['nthread'] = 4
param['eval_metric'] = 'auc'

In [20]:
model = XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42, n_jobs=-1, **param)
model.fit(trainx, trainy)
# bst = xgb.train(param, train_data, num_round)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, eval_metric='auc',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.07, max_delta_step=0,
             max_depth=3, min_child_weight=1.5, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=-1,
             nthread=4, num_parallel_tree=1, random_state=42, reg_alpha=0.75,
             reg_lambda=0.45, scale_pos_weight=1, seed=42, subsample=0.6,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [22]:
import eli5



In [23]:
eli5.explain_weights_xgboost(model)

Weight,Feature
0.1448,PoolArea
0.1274,GarageCars
0.1268,ExterQual
0.1248,OverallQual
0.0689,KitchenAbvGr
0.0667,KitchenQual
0.0666,Condition2
0.0512,FullBath
0.0305,CentralAir
0.0171,GrLivArea


In [24]:
# model.predict(testx.iloc[0].values.reshape(1,74))
testx

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
502,20,3,70.0,9170,1,3,3,0,0,0,...,0,184,0,0,0,400,4,2007,8,4
103,20,3,94.0,10402,1,0,3,0,0,0,...,36,0,0,0,0,0,5,2010,8,4
436,50,4,40.0,4400,1,3,3,0,4,0,...,0,0,0,0,0,0,10,2006,8,4
765,20,3,75.0,14587,1,0,3,0,4,0,...,174,0,0,0,0,0,8,2008,6,5
80,60,3,100.0,13000,1,3,3,0,0,0,...,72,0,0,252,0,0,6,2009,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1012,70,3,55.0,10592,1,3,3,0,4,0,...,0,112,0,53,0,0,8,2007,8,4
303,20,3,70.0,9800,1,3,3,0,0,0,...,0,0,0,0,0,0,7,2006,8,0
1202,50,4,50.0,6000,1,3,3,0,0,0,...,0,208,0,0,0,0,5,2009,8,4
1421,120,3,53.0,4043,1,3,3,0,4,0,...,55,0,0,165,0,0,7,2010,8,4


In [25]:
testy

502     140000
103     198900
436     116000
765     264132
80      193500
         ...  
1012    165000
303     149900
1202    117000
1421    127500
1113    134500
Name: SalePrice, Length: 329, dtype: int64

In [26]:
trainy.min()

35311

In [28]:
print (model.predict(fetch_df_row(testx, 502)), testy.loc[502])

[135484.03] 140000


In [30]:
print (model.predict(fetch_df_row(testx, 1012)), testy.loc[1012])

[160373.16] 165000


In [31]:
fetch_df_row(testx, 502)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20.0,3.0,70.0,9170.0,1.0,3.0,3.0,0.0,0.0,0.0,...,0.0,184.0,0.0,0.0,0.0,400.0,4.0,2007.0,8.0,4.0


In [40]:
fetch_df_row(testx, 502)[testx.columns[[16,31,32,21,19,2,15]]]

Unnamed: 0,OverallCond,BsmtFinType1,BsmtFinSF1,Exterior1st,RoofStyle,LotFrontage,OverallQual
0,7.0,0.0,698.0,7.0,3.0,70.0,5.0


In [38]:
eli5.xgboost.explain_prediction_xgboost(model, doc=fetch_df_row(testx, 502))

Contribution?,Feature
185773.904,<BIAS>
4802.507,x16
2577.705,x31
2424.198,x32
1568.736,x21
1211.912,x19
936.181,x39
908.104,x2
904.645,x45
690.496,x73


In [34]:
eli5.xgboost.explain_prediction_xgboost?

[0;31mSignature:[0m
[0meli5[0m[0;34m.[0m[0mxgboost[0m[0;34m.[0m[0mexplain_prediction_xgboost[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mxgb[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdoc[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvec[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtop[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtop_targets[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtarget_names[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtargets[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeature_names[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeature_re[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeature_filter[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvectorized[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0m