# Wyjaśnialne uczenie maszynowe – praca domowa 4

### Katarzyna Koprowska

In [None]:
import pandas as pd
import numpy as np
import pickle

In [None]:
import warnings
# Silence every warning for the rest of the session; note that this also
# hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
import matplotlib.pyplot as plt

## Wczytanie danych

Wykorzystanym zbiorem danych jest Home Equity (HMEQ), zawierający informacje o 5960 klientach banku, którzy otrzymali kredyty hipoteczne.

Na podstawie zbioru próbowałam przewidzieć prawdopodobieństwo **defaultu**, czyli faktu, że klient będzie zalegał z płatnościami – określa to binarna zmienna **BAD** (1 oznacza default). Pozostałe 12 zmiennych opisuje m.in. historię kredytową aplikującego, historię zawodową oraz charakterystyki obecnej pożyczki. 

Więcej informacji na temat danych można znaleźć pod linkiem https://www.kaggle.com/ajay1735/hmeq-data 

In [None]:
# Load the HMEQ data, skipping malformed rows instead of aborting on them.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines` ("warn" matches the old skip-and-report
# behaviour). Older pandas rejects the new keyword with a TypeError, so the
# legacy spelling is kept as a fallback.
try:
    hmeq = pd.read_csv("hmeq.csv", on_bad_lines="warn")
except TypeError:
    hmeq = pd.read_csv("hmeq.csv", error_bad_lines=False)

In [None]:
# Human-readable description of every column of the HMEQ data set.
# Keys must match the column names of the CSV exactly: the occupation column
# is called "JOB" (see the dummy encoding below), not "JOBS".
hmeq_info = {
    "BAD": "1 = client defaulted on loan, 0 = loan repaid",
    "LOAN": "Amount of the loan request",
    "MORTDUE": "Amount due on existing mortgage",
    "VALUE": "Value of current property",
    "REASON": "DebtCon = debt consolidation, HomeImp = home improvement",
    "JOB": "occupational categories",
    "YOJ": "Years at present job",
    "DEROG": "Number of major derogatory reports",
    "DELINQ": "Number of delinquent credit lines",
    "CLAGE": "Age of oldest trade line in months",
    "NINQ": "Number of recent credit lines",
    "CLNO": "Number of credit lines",
    "DEBTINC": "Debt-to-income ratio",
}

## Przekształcenie danych nienumerycznych na *dummy variables*

In [None]:
from pandas.api.types import is_numeric_dtype
# Map every column name to whether pandas considers its dtype numeric;
# the False entries are the columns that need dummy encoding.
{name: is_numeric_dtype(values) for name, values in hmeq.items()}

In [None]:
# Distinct values found in REASON (NaN is included if the column has missing entries).
set(hmeq['REASON'])

In [None]:
# Distinct values found in JOB (NaN is included if the column has missing entries).
set(hmeq['JOB'])

In [None]:
# One-hot encode both categorical columns. `dummy_na=True` adds an explicit
# "<column>_nan" indicator, so a missing category becomes a feature of its own.
# The encoded columns are appended in order (REASON first, then JOB) and the
# original text columns are removed.
categorical_columns = ['REASON', 'JOB']
encoded_parts = [
    pd.get_dummies(hmeq[name], prefix=name, dummy_na=True)
    for name in categorical_columns
]
hmeq = pd.concat([hmeq.drop(categorical_columns, axis=1), *encoded_parts], axis=1)

## Braki danych

In [None]:
# Number of missing values in each column.
hmeq.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
hmeq_nonan = hmeq.dropna()

In [None]:
# Separate the target from the features by column name. Selecting by name
# (rather than "everything after the first column") keeps BAD out of the
# feature matrix even if the column order of the input file changes.
y = hmeq_nonan["BAD"]
X = hmeq_nonan.drop(columns="BAD")

In [None]:
from sklearn.model_selection import train_test_split
# Two-stage split with a fixed seed: 65% of the rows go to training, the
# remaining 35% are then divided 40/60 into validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.35, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.6, random_state=42
)

In [None]:
# Give every split a fresh 0..n-1 index so that row labels and row positions
# coincide; later cells rely on this when mixing label and positional access.
for split in (X_train, y_train, X_val, y_val, X_test, y_test):
    split.reset_index(drop=True, inplace=True)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Names of the train/test metrics. The cells visible below build their
# `results` dict from their own literal list and do not read this variable.
metrics = ["accuracy_train", "accuracy_test", "roc_auc_train", "roc_auc_test"]

## Model – las losowy

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the previously trained random forest from disk. The context manager
# closes the file handle, which the bare `open(...)` call never did.
# NOTE: unpickling can execute arbitrary code — only load model files that
# come from a trusted source.
with open("final_nonan_rf.p", "rb") as model_file:
    rf_final1 = pickle.load(model_file)

## Sprawdzenie na zbiorze testowym

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Test-set metrics of the random forest, stored as metric -> model -> value.
# Accuracy uses the hard predictions, ROC AUC the probability of class 1.
rf_test_labels = rf_final1.predict(X_test)
rf_test_scores = rf_final1.predict_proba(X_test)[:, 1]
results = {
    "accuracy_test": {"RandomForest": accuracy_score(y_test, rf_test_labels)},
    "roc_auc_test": {"RandomForest": roc_auc_score(y_test, rf_test_scores)},
}

In [None]:
# Turn the nested dict into a table: one column per metric, one row per model.
results = pd.DataFrame(results)

In [None]:
# Display the test-set metrics table.
results

## Wyjaśnianie

### [2. for some selected observation from this dataset, calculate the model predictions for model (1)]

In [None]:
# Pick one test observation (by position) to explain. Selecting with a list,
# `iloc[[ind]]`, returns a one-row DataFrame directly and keeps each column's
# own dtype; building it from a transposed Series would collapse all columns
# to a single common dtype.
ind = 6
obs = X_test.iloc[[ind]]

In [None]:
# True BAD label of the selected observation, looked up by the row's index label.
y_test[obs.index].values

In [None]:
# Class probabilities predicted by the random forest for the selected observation.
rf_final1.predict_proba(obs)

### [3. for an observation selected in (2), calculate the decomposition of model prediction using Ceteris paribus / ICE profiles (packages for R: DALEX, ALEPlot, ingredients, packages for python: pyCeterisParibus)]

In [None]:
import lime
from lime import lime_tabular

In [None]:
# Two LIME explainers fitted on the training features, sharing feature and
# class names.
# NOTE(review): `discretize_continuous` appears to default to True in LIME, so
# both explainers look identically configured — confirm whether `explainer`
# was meant to use `discretize_continuous=False`.
lime_options = dict(
    feature_names=list(X_train.columns),
    class_names=["GOOD", "BAD"],
)
explainer = lime_tabular.LimeTabularExplainer(X_train.values, **lime_options)
explainer_discretize = lime_tabular.LimeTabularExplainer(
    X_train.values, discretize_continuous=True, **lime_options
)

In [None]:
# LIME explanation of the observation selected above. The row must come from
# X_test — the same observation whose prediction was computed earlier and
# whose Ceteris Paribus profile is drawn below — not from row `ind` of the
# training set, which is an unrelated client.
exp = explainer.explain_instance(X_test.values[ind], rf_final1.predict_proba)
exp.show_in_notebook(show_table=True, show_all=False)

Przykładowe wyjaśnienie metodą **Ceteris Paribus** (najważniejsze zmienne wg **LIME**).

In [None]:
import ceteris_paribus

In [None]:
from ceteris_paribus.explainer import explain
from ceteris_paribus.plots.plots import plot_notebook
from ceteris_paribus.profiles import individual_variable_profile
from ceteris_paribus.plots.plots import plot
from ceteris_paribus.explainer import explain

In [None]:
# Wrap the random forest, the feature names and the training data in a
# ceteris_paribus explainer object used for the profiles below.
# NOTE(review): no `predict_function` is passed; the library presumably falls
# back to `rf_final1.predict` (hard 0/1 labels rather than probabilities) —
# confirm, and consider passing the class-1 column of `predict_proba` instead.
cp_explainer = explain(rf_final1, X_test.columns, X_train, y_train)

In [None]:
# Ceteris Paribus profile of the selected test observation, plotted only for
# a hand-picked subset of variables.
profiled_variables = ["DEBTINC", "DEROG", "CLAGE", "DELINQ", "REASON_nan"]
cp = individual_variable_profile(
    cp_explainer,
    X_test.iloc[ind],
    y_test.iloc[ind],
)
plot_notebook(cp, selected_variables=profiled_variables, print_observations=False)

### [4. find two observations in the data set, such that they have different CP profiles (e.g. model response is growing with age for one observations and lowering with age for another). Note that you need to have model with interactions to have such differences]

In [None]:
# Draw five random test observations from each class (with replacement).
# The seed is reset before each draw, so both draws start from the same
# random state. `obs_indexes` lists the five repaid rows, then the five defaults.
repaid_rows = y_test[y_test == 0].index
default_rows = y_test[y_test == 1].index

np.random.seed(45)
ind_good = np.random.randint(len(repaid_rows), size=5)
np.random.seed(45)
ind_bad = np.random.randint(len(default_rows), size=5)

obs_indexes = repaid_rows[ind_good].tolist() + default_rows[ind_bad].tolist()

In [None]:
# Show the sampled candidates, then replace them with a hard-coded pair of
# test-set rows that the following cells explain.
# NOTE(review): presumably 457 and 24 were picked by inspecting the candidates' profiles — confirm.
print(obs_indexes)
obs_indexes = [457, 24]

### LIME i Ceteris Paribus dla lasów losowych

In [None]:
# For every selected test observation show the random forest's LIME
# explanation, followed by its Ceteris Paribus profiles over all features.
test_rows = X_test.values
every_feature = list(X_test.columns)
for i in obs_indexes:
    exp = explainer.explain_instance(test_rows[i], rf_final1.predict_proba)
    exp.show_in_notebook(show_table=True, show_all=False)
    cp = individual_variable_profile(cp_explainer, X_test.iloc[i], y_test.iloc[i])
    plot_notebook(cp, selected_variables=every_feature, print_observations=False)

### [5. train a second model (of any class, neural nets, linear, other boosting) and find an observation for which CP profiles are different between the models]

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# AdaBoost over depth-7 decision trees. The weak learner is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` (scikit-learn 1.2; the old name was removed in 1.4), while it has
# stayed the first positional parameter throughout — this works on both.
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=7),
    n_estimators=150,
    random_state=42,
)

In [None]:
# Fit AdaBoost on the training split.
adaboost.fit(X_train, y_train)

In [None]:
# Test-set accuracy and ROC AUC for both models, side by side.
# Accuracy uses the hard predictions, ROC AUC the probability of class 1.
fitted_models = {"RandomForest": rf_final1, "AdaBoost": adaboost}
results = {
    "accuracy_test": {
        label: accuracy_score(y_test, model.predict(X_test))
        for label, model in fitted_models.items()
    },
    "roc_auc_test": {
        label: roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        for label, model in fitted_models.items()
    },
}
pd.DataFrame(results)

In [None]:
# Display the sampled positions; when the cells run top to bottom these are
# still the draws made with seed 45.
ind_good, ind_bad

In [None]:
# Wrap the AdaBoost model, the feature names and the training data in a
# ceteris_paribus explainer object used for the profiles below.
# NOTE(review): no `predict_function` is passed; the library presumably falls
# back to `adaboost.predict` (hard 0/1 labels rather than probabilities) —
# confirm, and consider passing the class-1 column of `predict_proba` instead.
cp_explainer_adaboost = explain(adaboost, X_test.columns, X_train, y_train)

In [None]:
# Same sampling scheme as before, now with seed 42: five repaid and five
# default test rows, drawn with replacement from the same random state.
repaid_rows = y_test[y_test == 0].index
default_rows = y_test[y_test == 1].index

np.random.seed(42)
ind_good = np.random.randint(len(repaid_rows), size=5)
np.random.seed(42)
ind_bad = np.random.randint(len(default_rows), size=5)

candidates = repaid_rows[ind_good].tolist() + default_rows[ind_bad].tolist()
# Positions 5 and 8 of the combined list are the first and fourth default draws.
obs_indexes = [candidates[5], candidates[8]]
obs_indexes

### LIME i Ceteris Paribus dla AdaBoost

In [None]:
# For every selected test observation show AdaBoost's LIME explanation,
# followed by its Ceteris Paribus profiles over all features.
test_rows = X_test.values
every_feature = list(X_test.columns)
for i in obs_indexes:
    exp = explainer.explain_instance(test_rows[i], adaboost.predict_proba)
    exp.show_in_notebook(show_table=True, show_all=False)
    cp = individual_variable_profile(cp_explainer_adaboost, X_test.iloc[i], y_test.iloc[i])
    plot_notebook(cp, selected_variables=every_feature, print_observations=False)