In [68]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
import joblib
from sklearn.metrics import accuracy_score

Uploading data


In [69]:
# Train data
x_train = np.load("data/processed/x_train.npy")
# Train labels
x_test = np.load("data/processed/x_test.npy")
# Test data
y_train = np.load("data/processed/y_train.npy")
# Test labels
y_test = np.load("data/processed/y_test.npy")

Collecting feature names

In [70]:
feature_name = {
    "feature_0": "Current Loan Amount",
    "feature_1": "Term", 
    "feature_2": "Credit Score", 
    "feature_3": "Annual Income", 
    "feature_4": "Years in current job", 
    "feature_5": "Home Ownership", 
    "feature_6": "Purpose", 
    "feature_7": "Monthly Debt", 
    "feature_8": "Years of Credit History", 
    "feature_9": "Number of Open Accounts", 
    "feature_10": "Number of Credit Problems", 
    "feature_11": "Current Credit Balance", 
    "feature_12": "Maximum Open Credit", 
    "feature_13": "Bankruptcies", 
    "feature_14": "Tax Liens"
    }

Instanciating EBM model 

In [71]:
from interpret.glassbox import ExplainableBoostingClassifier

Transforming numpy array into dataframe for extracting name columns

In [72]:
x_train = pd.DataFrame(x_train, columns=feature_name.values())
x_train

Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,86592.0,1.0,657.0,915648.0,8.0,2.0,3.0,1458.25,15.1,2.0,0.0,88274.0,118998.0,0.0,0.0
1,99999999.0,1.0,748.0,1065577.0,4.0,3.0,14.0,4102.48,23.0,14.0,0.0,113107.0,711722.0,0.0,0.0
2,376200.0,0.0,624.0,3110547.0,3.0,1.0,3.0,19269.42,22.3,14.0,0.0,366149.0,575278.0,0.0,0.0
3,196372.0,1.0,750.0,1747069.0,1.0,1.0,7.0,23148.65,21.6,6.0,0.0,138909.0,245564.0,0.0,0.0
4,220000.0,1.0,724.0,370766.0,2.0,1.0,3.0,13426.73,16.3,9.0,0.0,230793.0,543158.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67153,553256.0,1.0,738.0,2006780.0,2.0,1.0,9.0,13729.78,22.2,12.0,0.0,43776.0,7239606.0,0.0,0.0
67154,435468.0,1.0,717.0,1617166.0,8.0,1.0,3.0,18597.39,20.9,12.0,0.0,254505.0,389796.0,0.0,0.0
67155,219472.0,1.0,708.0,336661.0,2.0,1.0,3.0,20612.91,9.4,17.0,0.0,207993.0,1007644.0,0.0,0.0
67156,330924.0,1.0,749.0,1318486.0,1.0,3.0,3.0,13843.97,17.4,11.0,0.0,0.0,0.0,0.0,0.0


Training EBM model
Explainable Boosting Machine (EBM) is a tree-based, cyclic gradient boosting Generalized Additive Model with automatic interaction detection. 
EBMs are often as accurate as state-of-the-art blackbox models while remaining completely interpretable.

In [73]:
ebm = ExplainableBoostingClassifier(feature_names=list(feature_name.values()))

In [75]:
ebm.fit(x_train, y_train)

Get EBM predictions

In [76]:
y_pred_ebm = ebm.predict(x_test)

Calculate accuracy

In [77]:
accuracy_ebm = accuracy_score(y_test, y_pred_ebm)
print(f"EBM Accuracy: {accuracy_ebm}")

EBM Accuracy: 0.7971053336906996


Show global explanation

In [78]:
show(ebm.explain_global())

AUC provides an aggregated measurement of performance across all possible classification thresholds
0.7 is a good result for our model 

In [79]:

auc = roc_auc_score(y_test, ebm.predict_proba(x_test)[:, 1]) 
print("AUC: {:.3f}".format(auc))

AUC: 0.760


Uploading BB model

In [44]:
BB=joblib.load("xgb.joblib")


Trying to unpickle estimator StandardScaler from version 1.4.0 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator Pipeline from version 1.4.0 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



Making predictions with BB model and printing the accuracy

In [45]:
y_pred = BB.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.797373358348968


Looking at misclassified instances by the BB model with low and high confidence

In [46]:
# 1. Misclassified Instances
misclassified_indices = y_test != y_pred
misclassified_instances = pd.DataFrame(x_test[misclassified_indices][:2], columns=[f'feature_{i}' for i in range(x_test.shape[1])])

# 2. Low Confidence Predictions
proba = BB.predict_proba(x_test)
low_confidence_indices = (proba.max(axis=1) < 0.7) & misclassified_indices
low_confidence_instances = pd.DataFrame(x_test[low_confidence_indices][:2], columns=[f'feature_{i}' for i in range(x_test.shape[1])])

# 3. High Confidence Predictions
high_confidence_indices = (proba.max(axis=1) > 0.9) & (~misclassified_indices)
high_confidence_instances = pd.DataFrame(x_test[high_confidence_indices][:2], columns=[f'feature_{i}' for i in range(x_test.shape[1])])

# 4. Randomly Sample Remaining Instances
correctly_classified_indices = ~misclassified_indices
correctly_classified_instances = pd.DataFrame(x_test[correctly_classified_indices][:4], columns=[f'feature_{i}' for i in range(x_test.shape[1])])
 
# 5. Combine and Inspect
selected_instances = pd.concat([
    misclassified_instances,
    low_confidence_instances,
    high_confidence_instances,
    correctly_classified_instances
])

selected_instances

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14
0,345422.0,1.0,741.0,3914589.0,0.0,1.0,3.0,7830.85,14.2,7.0,0.0,307097.0,494560.0,0.0,0.0
1,446996.0,0.0,740.0,1351147.0,1.0,1.0,3.0,25671.85,30.7,18.0,0.0,224485.0,1243682.0,0.0,0.0
0,215270.0,1.0,720.0,658958.0,1.0,3.0,3.0,7577.96,33.7,7.0,0.0,182438.0,318120.0,0.0,0.0
1,671946.0,0.0,706.0,2319216.0,7.0,3.0,3.0,38760.38,17.3,13.0,0.0,880593.0,1234442.0,0.0,0.0
0,99999999.0,1.0,750.0,1616292.0,2.0,1.0,5.0,15220.14,20.9,6.0,0.0,80142.0,187110.0,0.0,0.0
1,266684.0,1.0,748.0,2495194.0,1.0,1.0,5.0,15470.18,19.2,14.0,0.0,276184.0,1133990.0,0.0,0.0
0,67232.0,1.0,699.0,290301.0,10.0,3.0,3.0,1858.01,13.1,3.0,0.0,46607.0,60500.0,0.0,0.0
1,527164.0,1.0,742.0,1697783.0,3.0,1.0,5.0,6168.54,20.9,3.0,0.0,237842.0,483142.0,0.0,0.0
2,649704.0,1.0,701.0,2057396.0,1.0,3.0,3.0,27089.06,13.5,23.0,0.0,471960.0,1339426.0,0.0,0.0
3,99999999.0,1.0,750.0,1616292.0,2.0,1.0,5.0,15220.14,20.9,6.0,0.0,80142.0,187110.0,0.0,0.0


In [50]:
selected_instances.rename(columns=feature_name, inplace=True)
selected_instances.set_index(np.arange(selected_instances.shape[0]), inplace=True)

Local explanations taking into account the 10 instances 

In [80]:

for idx, instance in selected_instances.iterrows():
    print(f"Instance {idx}:")
    
   
    instance_df = pd.DataFrame([instance], columns=selected_instances.columns)
    
    exp = ebm.explain_local(instance_df)
    
    
    show(exp)
    
   
    




Instance 0:


Instance 1:


Instance 2:


Instance 3:


Instance 4:


Instance 5:


Instance 6:


Instance 7:


Instance 8:


Instance 9:
