In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost.sklearn import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [3]:
df = pd.read_csv("bank.csv",sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [4]:
df.y.value_counts()

no     4000
yes     521
Name: y, dtype: int64

In [5]:
# Get the feature matrix (X) and the target vector (y)

y = df["y"].map({"no": 0, "yes": 1})
X = df.drop("y", axis=1)

In [6]:
X.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
dtype: object

In [7]:
num_features = ["age", 'balance',"campaign", "pdays", "previous"]

cat_features = ["job", "marital", "education","default", "housing", "loan",
                "contact", "month",  "poutcome"]

In [8]:
preprocessor = ColumnTransformer([("numerical", "passthrough", num_features), 
                                  ("categorical", OneHotEncoder(sparse=False, handle_unknown="ignore"),
                                   cat_features)])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.3, random_state=42)

In [10]:
preprocessor.fit_transform(X_train)

array([[3.700e+01, 1.467e+03, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [2.500e+01, 3.400e+01, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [5.300e+01, 1.278e+03, 3.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       ...,
       [3.500e+01, 0.000e+00, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [3.700e+01, 1.050e+02, 4.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [6.300e+01, 4.740e+02, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [11]:
# Get the list of categories generated by the process
ohe_categories = preprocessor.named_transformers_["categorical"].categories_
print(ohe_categories)

new_ohe_features = [f"{col}__{val}" for col, vals in zip(cat_features, ohe_categories) for val in vals]
print(new_ohe_features)

all_features = num_features + new_ohe_features

[array(['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
       'retired', 'self-employed', 'services', 'student', 'technician',
       'unemployed', 'unknown'], dtype=object), array(['divorced', 'married', 'single'], dtype=object), array(['primary', 'secondary', 'tertiary', 'unknown'], dtype=object), array(['no', 'yes'], dtype=object), array(['no', 'yes'], dtype=object), array(['no', 'yes'], dtype=object), array(['cellular', 'telephone', 'unknown'], dtype=object), array(['apr', 'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may',
       'nov', 'oct', 'sep'], dtype=object), array(['failure', 'other', 'success', 'unknown'], dtype=object)]
['job__admin.', 'job__blue-collar', 'job__entrepreneur', 'job__housemaid', 'job__management', 'job__retired', 'job__self-employed', 'job__services', 'job__student', 'job__technician', 'job__unemployed', 'job__unknown', 'marital__divorced', 'marital__married', 'marital__single', 'education__primary', 'education__secondary', 'educat

In [12]:
X_train = pd.DataFrame(preprocessor.transform(X_train), columns=all_features)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=all_features)

In [13]:
X_train.head()

Unnamed: 0,age,balance,campaign,pdays,previous,job__admin.,job__blue-collar,job__entrepreneur,job__housemaid,job__management,...,month__jun,month__mar,month__may,month__nov,month__oct,month__sep,poutcome__failure,poutcome__other,poutcome__success,poutcome__unknown
0,37.0,1467.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,25.0,34.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,53.0,1278.0,3.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,31.0,-6.0,2.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,43.0,3529.0,2.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
lr_model = LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42)
dt_model = DecisionTreeClassifier(class_weight="balanced",max_depth=10, min_samples_split= 0.05)
rf_model = RandomForestClassifier(class_weight="balanced",max_depth=10, min_samples_split= 0.01, n_jobs=-1)
lgb_model = LGBMClassifier(class_weight="balanced",max_depth=12, min_child_samples=60,n_estimators=50, n_jobs=-1)
xgb_model = XGBClassifier(class_weight="balanced",max_depth=10, min_samples_split= 0.05,verbosity=0)

In [15]:
lr_model.fit(X_train, y_train)
dt_model.fit(X_train,y_train)
rf_model.fit(X_train,y_train)
lgb_model.fit(X_train,y_train)
xgb_model.fit(X_train,y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBClassifier(base_score=0.5, booster='gbtree', class_weight='balanced',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=10,
              min_child_weight=1, min_samples_split=0.05, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)

In [16]:
y_pred_lr = lr_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_lgb = lgb_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

In [17]:
print(balanced_accuracy_score(y_test, y_pred_lr))
print(balanced_accuracy_score(y_test, y_pred_dt))
print(balanced_accuracy_score(y_test, y_pred_rf))
print(balanced_accuracy_score(y_test, y_pred_lgb))
print(balanced_accuracy_score(y_test, y_pred_xgb))

0.6474545784495827
0.6587859476077628
0.64838329170136
0.6650707743547044
0.5755887188027071


In [18]:
lr_model.coef_

array([[ 2.48158080e-03, -6.01523246e-06, -6.92148273e-02,
         6.10392157e-05,  9.81345421e-02, -5.82839199e-02,
        -2.38801663e-01,  2.56872666e-01, -1.61842243e-02,
         1.16030939e-01,  6.17016569e-01,  1.77816725e-02,
        -2.38892559e-01,  3.47845125e-01, -1.56216600e-01,
        -6.39807613e-01,  1.80321866e-01,  2.56448895e-01,
        -2.21724032e-01,  1.52957394e-01,  9.33351456e-02,
         2.11791354e-01,  1.24317504e-01, -2.41761746e-01,
        -1.61643100e-01,  3.49325357e-01,  1.72657817e-01,
         1.50244397e-02,  3.52482040e-01, -1.64799783e-01,
         3.79647674e-01,  3.97044036e-01, -5.89009453e-01,
        -1.44463555e-01, -1.95322611e-01,  7.11543824e-01,
        -1.18102211e-01, -1.22111977e+00, -4.53006464e-01,
         4.88241300e-02,  1.01067000e+00, -4.82569270e-01,
        -6.29061956e-01,  1.52120443e+00,  1.39085708e-01,
        -7.87105826e-01, -1.97489072e-01,  1.73220581e+00,
        -5.59928656e-01]])

In [19]:
#!pip install eli5
import eli5

eli5.show_weights(lr_model, feature_names=all_features)

Weight?,Feature
+1.732,poutcome__success
+1.521,month__oct
+1.011,month__mar
+0.712,month__dec
+0.617,job__retired
+0.397,contact__telephone
+0.380,contact__cellular
+0.352,loan__no
+0.349,default__yes
+0.348,job__student


In [20]:
eli5.show_weights(dt_model, feature_names=all_features)

Weight,Feature
0.2909,poutcome__success
0.1577,balance
0.1571,contact__unknown
0.0914,month__oct
0.0732,age
0.0421,pdays
0.0294,marital__married
0.0294,month__nov
0.0273,month__jun
0.0199,campaign


In [122]:
eli5.show_weights(rf_model, feature_names=all_features)

Weight,Feature
0.1098  ± 0.0858,balance
0.0987  ± 0.0905,age
0.0844  ± 0.1532,poutcome__success
0.0668  ± 0.1253,pdays
0.0566  ± 0.1188,contact__unknown
0.0471  ± 0.0519,campaign
0.0428  ± 0.1083,previous
0.0374  ± 0.0651,month__oct
0.0367  ± 0.0818,contact__cellular
0.0299  ± 0.1096,poutcome__unknown


In [123]:
eli5.show_weights(lgb_model, feature_names=all_features)

Weight,Feature
0.2340,balance
0.1869,age
0.1408,poutcome__success
0.0682,contact__unknown
0.0592,campaign
0.0389,pdays
0.0253,marital__married
0.0247,job__blue-collar
0.0185,month__may
0.0167,loan__no


In [124]:
eli5.show_weights(xgb_model, feature_names=all_features)

Weight,Feature
0.1774,poutcome__success
0.0652,month__oct
0.0429,contact__unknown
0.0365,job__unknown
0.0333,month__mar
0.0276,month__feb
0.0273,month__sep
0.0256,month__jun
0.0242,poutcome__other
0.0220,marital__divorced


In [125]:
i = 4
X_test.iloc[[i]]

Unnamed: 0,age,balance,campaign,pdays,previous,job__admin.,job__blue-collar,job__entrepreneur,job__housemaid,job__management,...,month__jun,month__mar,month__may,month__nov,month__oct,month__sep,poutcome__failure,poutcome__other,poutcome__success,poutcome__unknown
4,46.0,1375.0,3.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [126]:
y_test.iloc[i]

0

In [127]:
eli5.show_prediction(lr_model, X_test.iloc[i],
                     feature_names=all_features, show_feature_values=True)

Contribution?,Feature,Value
0.629,month__nov,1.0
0.56,poutcome__unknown,1.0
0.222,marital__married,1.0
0.208,campaign,3.0
0.162,default__no,1.0
0.008,balance,1375.0
0.0,pdays,-1.0
-0.015,housing__yes,1.0
-0.114,age,46.0
-0.116,job__management,1.0


In [128]:
eli5.show_prediction(dt_model, X_test.iloc[i],
                     feature_names=all_features, show_feature_values=True)

Contribution?,Feature,Value
0.5,<BIAS>,1.0
0.309,balance,1375.0
0.062,marital__married,1.0
0.045,poutcome__success,0.0
0.02,month__oct,0.0
0.019,age,46.0
0.012,month__jun,0.0
0.008,pdays,-1.0
-0.06,contact__unknown,0.0


In [130]:
from eli5.sklearn import PermutationImportance

In [131]:
perm_dt = PermutationImportance(dt_model, scoring="balanced_accuracy")

In [132]:
perm_dt.fit(X_test, y_test)
eli5.show_weights(perm_dt, feature_names=all_features)

Weight,Feature
0.0444  ± 0.0230,contact__unknown
0.0276  ± 0.0134,poutcome__success
0.0269  ± 0.0126,month__jun
0.0228  ± 0.0095,age
0.0192  ± 0.0297,balance
0.0168  ± 0.0074,pdays
0.0113  ± 0.0128,loan__no
0.0039  ± 0.0068,month__nov
0.0037  ± 0.0065,job__student
0.0033  ± 0.0053,month__oct


In [133]:
perm_rf = PermutationImportance(rf_model, scoring="balanced_accuracy")
perm_rf.fit(X_test, y_test)

eli5.show_weights(perm_rf, feature_names=all_features)

Weight,Feature
0.0238  ± 0.0204,contact__unknown
0.0159  ± 0.0153,age
0.0153  ± 0.0164,balance
0.0132  ± 0.0069,month__apr
0.0112  ± 0.0118,loan__no
0.0112  ± 0.0068,contact__cellular
0.0093  ± 0.0079,month__jul
0.0090  ± 0.0029,loan__yes
0.0090  ± 0.0120,housing__yes
0.0088  ± 0.0125,housing__no


In [134]:
perm_lgb = PermutationImportance(lgb_model, scoring="balanced_accuracy")
perm_lgb.fit(X_test, y_test)

eli5.show_weights(perm_lgb, feature_names=all_features)

Weight,Feature
0.0294  ± 0.0308,balance
0.0261  ± 0.0152,contact__unknown
0.0168  ± 0.0190,age
0.0157  ± 0.0077,poutcome__success
0.0123  ± 0.0135,housing__no
0.0109  ± 0.0114,month__jul
0.0105  ± 0.0204,campaign
0.0098  ± 0.0162,month__jun
0.0083  ± 0.0099,month__may
0.0082  ± 0.0109,loan__no
