In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import metrics

# import lime
# import shap
# import xgboost as xgb

from utils.xailib.explainers.lore_explainer import LoreTabularExplainer

from utils.xailib.models.sklearn_classifier_wrapper import sklearn_classifier_wrapper
from utils.xailib.data_loaders.dataframe_loader import prepare_dataframe

## Data import

In [2]:
# Train data
x_train = np.load("data/processed/x_train.npy")
# Train labels
x_test = np.load("data/processed/x_test.npy")
# Test data
y_train = np.load("data/processed/y_train.npy")
# Test labels
y_test = np.load("data/processed/y_test.npy")

In [16]:
feature_name = {
    "feature_0": "Current Loan Amount",
    "feature_1": "Term", 
    "feature_2": "Credit Score", 
    "feature_3": "Annual Income", 
    "feature_4": "Years in current job", 
    "feature_5": "Home Ownership", 
    "feature_6": "Purpose", 
    "feature_7": "Monthly Debt", 
    "feature_8": "Years of Credit History", 
    "feature_9": "Number of Open Accounts", 
    "feature_10": "Number of Credit Problems", 
    "feature_11": "Current Credit Balance", 
    "feature_12": "Maximum Open Credit", 
    "feature_13": "Bankruptcies", 
    "feature_14": "Tax Liens"
    }

In [4]:
target = "Loan Status"

In [37]:
df = pd.read_csv("data/processed/df_test.csv")

In [38]:
df

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,Charged Off,329274.0,Short Term,746.0,796252.0,4 years,Home Mortgage,Other,12540.76,12.4,8.0,0.0,41686.0,294360.0,0.0,0.0
1,Fully Paid,149666.0,Short Term,744.0,524552.0,5 years,Rent,major_purchase,695.02,12.6,4.0,0.0,39026.0,97592.0,0.0,0.0
2,Fully Paid,285890.0,Short Term,737.0,839306.0,4 years,Rent,Debt Consolidation,16366.60,13.9,17.0,1.0,62624.0,366234.0,1.0,0.0
3,Charged Off,257906.0,Short Term,6980.0,1577703.0,10+ years,Rent,Debt Consolidation,28792.98,15.4,11.0,0.0,263568.0,364188.0,0.0,0.0
4,Fully Paid,108944.0,Short Term,707.0,846716.0,6 years,Rent,Debt Consolidation,11007.27,16.3,8.0,0.0,161538.0,206910.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22381,Fully Paid,210584.0,Short Term,719.0,783389.0,1 year,Home Mortgage,Other,3727.61,17.4,6.0,0.0,456.0,259160.0,0.0,0.0
22382,Fully Paid,99999999.0,Short Term,732.0,1289416.0,1 year,Rent,Debt Consolidation,13109.05,9.4,22.0,0.0,153045.0,509234.0,0.0,0.0
22383,Fully Paid,103136.0,Short Term,742.0,1150545.0,6 years,Rent,Debt Consolidation,7315.57,18.8,12.0,1.0,109554.0,537548.0,1.0,0.0
22384,Fully Paid,530332.0,Short Term,746.0,1717524.0,9 years,Rent,Debt Consolidation,9890.07,15.0,8.0,0.0,404225.0,738254.0,0.0,0.0


In [58]:
df_processed = pd.DataFrame(x_test, columns=df.columns[1:])

In [59]:
df_processed

Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,67232.0,1.0,699.0,290301.0,10.0,3.0,3.0,1858.01,13.1,3.0,0.0,46607.0,60500.0,0.0,0.0
1,527164.0,1.0,742.0,1697783.0,3.0,1.0,5.0,6168.54,20.9,3.0,0.0,237842.0,483142.0,0.0,0.0
2,649704.0,1.0,701.0,2057396.0,1.0,3.0,3.0,27089.06,13.5,23.0,0.0,471960.0,1339426.0,0.0,0.0
3,99999999.0,1.0,750.0,1616292.0,2.0,1.0,5.0,15220.14,20.9,6.0,0.0,80142.0,187110.0,0.0,0.0
4,447172.0,0.0,720.0,1800972.0,7.0,1.0,7.0,10265.51,14.0,12.0,0.0,137332.0,638660.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22381,152416.0,1.0,747.0,637241.0,5.0,1.0,3.0,12521.76,19.8,9.0,1.0,120118.0,221122.0,1.0,0.0
22382,110330.0,1.0,724.0,952755.0,3.0,1.0,3.0,7534.83,39.2,13.0,0.0,229596.0,436524.0,0.0,0.0
22383,267146.0,0.0,668.0,1050377.0,1.0,3.0,3.0,8289.13,16.0,8.0,0.0,100415.0,141130.0,0.0,0.0
22384,261668.0,1.0,735.0,941640.0,1.0,1.0,3.0,18518.92,16.5,8.0,0.0,583110.0,1000252.0,0.0,0.0


## Model import

In [7]:
BB = joblib.load("xgb.joblib")

In [8]:
# Make predictions on the test set
y_pred = BB.predict(x_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.797373358348968


## Instance selection

In [9]:
df_output = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(df_output)

In [19]:
# 1. Misclassified Instances
misclassified_indices = y_test != y_pred
misclassified_instances = pd.DataFrame(x_test[misclassified_indices][:2], columns=[f'feature_{i}' for i in range(x_test.shape[1])])

# 2. Low Confidence Predictions
proba = BB.predict_proba(x_test)
low_confidence_indices = (proba.max(axis=1) < 0.7) & misclassified_indices
low_confidence_instances = pd.DataFrame(x_test[low_confidence_indices][:2], columns=[f'feature_{i}' for i in range(x_test.shape[1])])

# 3. High Confidence Predictions
high_confidence_indices = (proba.max(axis=1) > 0.9) & (~misclassified_indices)
high_confidence_instances = pd.DataFrame(x_test[high_confidence_indices][:2], columns=[f'feature_{i}' for i in range(x_test.shape[1])])

# 4. Randomly Sample Remaining Instances
correctly_classified_indices = ~misclassified_indices
correctly_classified_instances = pd.DataFrame(x_test[correctly_classified_indices][:4], columns=[f'feature_{i}' for i in range(x_test.shape[1])])

# 5. Combine and Inspect
selected_instances = pd.concat([
    misclassified_instances,
    low_confidence_instances,
    high_confidence_instances,
    correctly_classified_instances
])

In [23]:
selected_instances.rename(columns=feature_name, inplace=True)
selected_instances.set_index(np.arange(selected_instances.shape[0]), inplace=True)

In [24]:
selected_instances

Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,345422.0,1.0,741.0,3914589.0,0.0,1.0,3.0,7830.85,14.2,7.0,0.0,307097.0,494560.0,0.0,0.0
1,446996.0,0.0,740.0,1351147.0,1.0,1.0,3.0,25671.85,30.7,18.0,0.0,224485.0,1243682.0,0.0,0.0
2,215270.0,1.0,720.0,658958.0,1.0,3.0,3.0,7577.96,33.7,7.0,0.0,182438.0,318120.0,0.0,0.0
3,671946.0,0.0,706.0,2319216.0,7.0,3.0,3.0,38760.38,17.3,13.0,0.0,880593.0,1234442.0,0.0,0.0
4,99999999.0,1.0,750.0,1616292.0,2.0,1.0,5.0,15220.14,20.9,6.0,0.0,80142.0,187110.0,0.0,0.0
5,266684.0,1.0,748.0,2495194.0,1.0,1.0,5.0,15470.18,19.2,14.0,0.0,276184.0,1133990.0,0.0,0.0
6,67232.0,1.0,699.0,290301.0,10.0,3.0,3.0,1858.01,13.1,3.0,0.0,46607.0,60500.0,0.0,0.0
7,527164.0,1.0,742.0,1697783.0,3.0,1.0,5.0,6168.54,20.9,3.0,0.0,237842.0,483142.0,0.0,0.0
8,649704.0,1.0,701.0,2057396.0,1.0,3.0,3.0,27089.06,13.5,23.0,0.0,471960.0,1339426.0,0.0,0.0
9,99999999.0,1.0,750.0,1616292.0,2.0,1.0,5.0,15220.14,20.9,6.0,0.0,80142.0,187110.0,0.0,0.0


# Lore

Lore is an algorithm developed internally to the KDDLab. To use it we will use the xai lib that we imported at the beginning. Note that shap and lime are wrappend in the xai lib, so if you want you can replicate the previous part using the xai lib.

We then use the `sklearn_classifier_wrapper` from xai lib to wrap our black box and make it ready to explain

In [25]:
bbox = sklearn_classifier_wrapper(BB)

Now e could also use the imlpementations of SHAP and Lime that are present in the XAI-lib

In [26]:
explainer = LoreTabularExplainer(bbox)

In [27]:
inst = selected_instances.values[0]

Lore uses various type of neighborhood generation to provide a local explanation based on factuals and counterfactuals

In [39]:
dfz, feature_names, class_values, numeric_columns, rdf, real_feature_names, features_map = prepare_dataframe(df, target)

In [55]:
explainer = LoreTabularExplainer(bbox)
config = {'neigh_type':'rndgen', 'size':1000, 'ocr':0.1, 'ngen':10}
explainer.fit(df, target, config)
exp = explainer.explain(inst)
print(exp)

IndexError: index 27 is out of bounds for axis 0 with size 15

In [None]:
print('Instance ', inst)
print('True class ', Y_train.iloc[147])
print('Predicted class ', bb.predict(inst.reshape(1, -1)))

Instance  [    58      2 111169      9      2     14      5      1      1      0
      0     40     41]
True class  0
Predicted class  [0]


In [None]:
exp.plotRules()