In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, classification_report
from lightgbm import LGBMClassifier
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression, ClassificationTree, DecisionListClassifier

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
df = pd.read_csv('../bank.csv')

In [3]:

y = df["y"].map({"no": 0, "yes": 1})
X = df.drop("y", axis=1)

In [4]:
num_features = ["age", "campaign", "pdays", "previous"]

cat_features = ["job", "marital", "education","default", "housing", "loan",
                "contact", "month", "day_of_week", "poutcome"]

In [5]:
preprocessor = ColumnTransformer([("numerical", "passthrough", num_features), 
                                  ("categorical", OneHotEncoder(sparse=False, handle_unknown="ignore"),
                                   cat_features)])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.3, random_state=42)


In [7]:
preprocessor.fit(X_train)

preprocessor.transform(X_train)

array([[ 50.,   2., 999., ...,   0.,   1.,   0.],
       [ 51.,   5., 999., ...,   0.,   1.,   0.],
       [ 46.,   2., 999., ...,   0.,   1.,   0.],
       ...,
       [ 35.,   3., 999., ...,   0.,   1.,   0.],
       [ 32.,   4., 999., ...,   0.,   1.,   0.],
       [ 31.,   3., 999., ...,   0.,   1.,   0.]])

In [8]:
# Get the list of categories generated by the process
ohe_categories = preprocessor.named_transformers_["categorical"].categories_

# Create nice names for our one hot encoded features
new_ohe_features = [f"{col}__{val}" for col, vals in zip(cat_features, ohe_categories) for val in vals]

# Create a new list with all names of features
all_features = num_features + new_ohe_features

In [9]:
X_train = pd.DataFrame(preprocessor.transform(X_train), columns=all_features)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=all_features)

In [10]:
X_train.head()

Unnamed: 0,age,campaign,pdays,previous,job__admin.,job__blue-collar,job__entrepreneur,job__housemaid,job__management,job__retired,...,month__oct,month__sep,day_of_week__fri,day_of_week__mon,day_of_week__thu,day_of_week__tue,day_of_week__wed,poutcome__failure,poutcome__nonexistent,poutcome__success
0,50.0,2.0,999.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,51.0,5.0,999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,46.0,2.0,999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,46.0,1.0,999.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,25.0,5.0,999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [11]:
from interpret import show
from interpret.data import ClassHistogram

hist = ClassHistogram().explain_data(X_train, y_train, name = 'Train Data')
show(hist)

In [12]:

ebm = ExplainableBoostingClassifier(random_state=42)


In [13]:
ebm.fit(X_train, y_train)   #Works on dataframes and numpy arrays

ExplainableBoostingClassifier(binning_strategy='quantile', data_n_episodes=2000,
                              early_stopping_run_length=50,
                              early_stopping_tolerance=1e-05,
                              feature_names=['age', 'campaign', 'pdays',
                                             'previous', 'job__admin.',
                                             'job__blue-collar',
                                             'job__entrepreneur',
                                             'job__housemaid',
                                             'job__management', 'job__retired',
                                             'job__self-employed',
                                             'job__services', 'job__student',
                                             'job_...
                                             'categorical', 'categorical',
                                             'categorical', 'categorical',
                             

In [14]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)


In [15]:
ebm_local = ebm.explain_local(X_test[:4], y_test[:4], name='EBM')
show(ebm_local)

In [16]:
from interpret.perf import ROC

ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)

In [17]:
from interpret.glassbox import LogisticRegression, ClassificationTree 

from interpret.perf import ROC

# Logistic Regression
lr_model = LogisticRegression()

lr_model.fit(X_train, y_train)
# Random Forest
rf_model = ClassificationTree()

rf_model.fit(X_train, y_train)

#Light GBM
lgb_model = LGBMClassifier()

lgb_model.fit(X_train, y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [18]:
lgbm_perf = ROC(lgb_model.predict_proba).explain_perf(X_test, y_test, name='LGBM')
show(lgbm_perf)

In [19]:
lr_perf = ROC(lr_model.predict_proba).explain_perf(X_test, y_test, name='Logistic Regression')
tree_perf = ROC(rf_model.predict_proba).explain_perf(X_test, y_test, name='Classification Tree')
lgbm_perf = ROC(lgb_model.predict_proba).explain_perf(X_test, y_test, name='Light GBM')
ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name='EBM')


show(lr_perf)
show(tree_perf)
show(ebm_perf)
show(lgbm_perf)


In [21]:
lr_global = lr_model.explain_global(name='LR')
tree_global = rf_model.explain_global(name='Tree')

show(lr_global)
show(tree_global)
show(ebm_global)

In [23]:
lr_global = lr_model.explain_global(name='LR')
tree_global = rf_model.explain_global(name='Tree')
show([hist, lr_global, lr_perf, tree_global, tree_perf,ebm_global,ebm_local,ebm_perf], share_tables=True)
