# Wyjaśnialne uczenie maszynowe – praca domowa 5

### Katarzyna Koprowska

In [None]:
import pandas as pd
import numpy as np
import pickle

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import matplotlib.pyplot as plt

## Wczytanie danych

In [None]:
# Read hmeq.csv from the working directory; rows that cannot be parsed are skipped
# instead of raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"` and removed in 2.0 - confirm the pandas version in use.
hmeq = pd.read_csv("hmeq.csv", error_bad_lines=False)

In [None]:
# Short human-readable description for each column of the data set.
# Keys are meant to match the column names of `hmeq`; the occupation column is
# accessed as hmeq['JOB'] elsewhere in this notebook, so the key is "JOB"
# (it was previously misspelled "JOBS" and could not be looked up by column name).
hmeq_info = {
    'BAD': 'client defaulted on loan 0 = loan repaid',
    "LOAN": "Amount of the loan request",
    "MORTDUE": "Amount due on existing mortgage",
    "VALUE": "Value of current property",
    "REASON": "DebtCon debt consolidation HomeImp = home improvement",
    "JOB": "occupational categories",
    "YOJ": "Years at present job",
    "DEROG": "Number of major derogatory reports",
    "DELINQ": "Number of delinquent credit lines",
    "CLAGE": "Age of oldest trade line in months",
    "NINQ": "Number of recent credit lines",
    "CLNO": "Number of credit lines",
    "DEBTINC": "Debt-to-income ratio",
}

## Przekształcenie danych nienumerycznych na *dummy variables*

In [None]:
from pandas.api.types import is_numeric_dtype
# Map each column name to True/False depending on whether its dtype is numeric;
# the False entries are the columns that need encoding.
{name: is_numeric_dtype(values) for name, values in hmeq.items()}

In [None]:
# Distinct values stored in the REASON column (missing values included).
set(hmeq.loc[:, 'REASON'])

In [None]:
# Distinct values stored in the JOB column (missing values included).
set(hmeq.loc[:, 'JOB'])

In [None]:
# One-hot encode the two non-numeric columns. dummy_na=True adds an extra
# "<column>_nan" indicator, so a missing category becomes its own flag.
categorical_columns = ['REASON', 'JOB']
for column in categorical_columns:
    indicators = pd.get_dummies(hmeq[column], prefix=column, dummy_na=True)
    hmeq = pd.concat([hmeq, indicators], axis=1)

# The original text columns are no longer needed once the indicators exist.
hmeq = hmeq.drop(categorical_columns, axis=1)

## Braki danych

In [None]:
# Number of missing values per column.
hmeq.isnull().sum()

In [None]:
# Keep only the rows in which no column is missing.
hmeq_nonan = hmeq.dropna(axis=0, how="any")

In [None]:
# Target: the BAD column.
y = hmeq_nonan.loc[:, "BAD"]
# Features: every other column, in their original order. Selecting by name
# (instead of "everything after the first column") keeps the target out of X
# even if the column order of the input file changes.
X = hmeq_nonan.drop(columns="BAD")

In [None]:
from sklearn.model_selection import train_test_split
# First split: 65 % of the rows for training, 35 % held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.35, random_state=42
)
# Second split: the held-out part is divided 40 % validation / 60 % test.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.6, random_state=42
)

In [None]:
# Replace the row labels of every split with a fresh 0..n-1 index;
# the old labels are dropped rather than kept as a column.
X_train, X_val, X_test = (
    part.reset_index(drop=True) for part in (X_train, X_val, X_test)
)
y_train, y_val, y_test = (
    part.reset_index(drop=True) for part in (y_train, y_val, y_test)
)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Metric names in the form "<score>_<split>", score-major order.
metrics = [
    f"{score}_{split}"
    for score in ("accuracy", "roc_auc")
    for split in ("train", "test")
]

### [2. For the selected data set, train at least one tree-based ensemble model (random forest, gbm, catboost or any other boosting)]

## Model – las losowy

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the stored model (presumably a fitted RandomForestClassifier - it is used
# below through predict / predict_proba / feature_importances_).
# The context manager closes the file as soon as unpickling is done; the bare
# open() call used before never closed the handle explicitly.
# SECURITY: pickle.load can execute arbitrary code - only load files you trust.
with open("final_nonan_rf.p", "rb") as model_file:
    rf_final1 = pickle.load(model_file)

## Sprawdzenie na zbiorze testowym

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Test-set scores of the loaded forest, stored as {metric: {model name: value}}.
rf_test_labels = rf_final1.predict(X_test)
rf_test_scores = rf_final1.predict_proba(X_test)[:, 1]  # probability of the second class
results = {
    "accuracy_test": {"RandomForest": accuracy_score(y_test, rf_test_labels)},
    "roc_auc_test": {"RandomForest": roc_auc_score(y_test, rf_test_scores)},
}

In [None]:
# Turn the nested dict into a table: metrics become columns, model names become rows.
# NOTE(review): this rebinds `results` from a dict to a DataFrame.
results = pd.DataFrame(results)

In [None]:
# Display the score table.
results

## Wyjaśnianie

### [3. Calculate permutational variable importance for the selected model]

In [None]:
# NOTE(review): in the cells visible here only `permutation_importance` is used from
# this module; consider importing it by name instead of using a wildcard import.
from sklearn.inspection import *

### Las losowy

In [None]:
# Permutation importance of the loaded forest, measured on the training data
# with 15 shuffles per feature.
result_rf1 = permutation_importance(rf_final1, X_train, y_train, n_repeats=15,
                                    random_state=42)
perm_sorted_idx = result_rf1.importances_mean.argsort()  # ascending by mean importance

# Importances reported by the model itself, sorted ascending for the bar chart.
tree_importance_sorted_idx = np.argsort(rf_final1.feature_importances_)
tree_indices = np.arange(0, len(rf_final1.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
ax1.barh(tree_indices,
         rf_final1.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Fix the tick positions first and label them afterwards: tick labels are only
# well-defined once the tick locations are fixed.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(rf_final1.feature_importances_)))
ax1.set_title("Random forest: built-in feature importance")
ax2.boxplot(result_rf1.importances[perm_sorted_idx].T, vert=False,
            labels=X_train.columns[perm_sorted_idx])
ax2.set_title("Random forest: permutation importance (training set)")
fig.tight_layout()
fig.savefig("rf1_perm.png")
plt.show()

In [None]:
# The 8 features with the highest mean permutation importance for the random forest.
# The ordering is recomputed from result_rf1 instead of being read from
# perm_sorted_idx, because that name is overwritten by later cells for other
# models and would give a different selection if this cell were re-run afterwards.
new_columns_rf = X_train.columns[result_rf1.importances_mean.argsort()][-8:]
new_columns_rf

### [4. Train three or more candidate models (different variables, different transformations, different model structures) and compare ranking of important features between these models. Are they similar or different?]

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

### AdaBoost

In [None]:
# AdaBoost over depth-7 decision trees, at most 150 boosting rounds, fixed seed.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`
# - confirm the installed version.
weak_learner = DecisionTreeClassifier(max_depth=7)
adaboost = AdaBoostClassifier(
    base_estimator=weak_learner,
    n_estimators=150,
    random_state=42,
)

In [None]:
# Train the boosted ensemble on all training features.
adaboost.fit(X_train, y_train)

In [None]:
# Test-set accuracy and ROC AUC of both full-feature models, one row per model.
results = {"accuracy_test": {}, "roc_auc_test": {}}
for label, model in (("RandomForest", rf_final1), ("AdaBoost", adaboost)):
    results["accuracy_test"][label] = accuracy_score(y_test, model.predict(X_test))
    # ROC AUC is computed from the predicted probability of the second class.
    results["roc_auc_test"][label] = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
pd.DataFrame(results)

In [None]:
# Permutation importance of the AdaBoost model, measured on the training data
# with 15 shuffles per feature.
result_ab1 = permutation_importance(adaboost, X_train, y_train, n_repeats=15,
                                    random_state=42)
perm_sorted_idx = result_ab1.importances_mean.argsort()  # ascending by mean importance

# Importances reported by the model itself, sorted ascending for the bar chart.
tree_importance_sorted_idx = np.argsort(adaboost.feature_importances_)
tree_indices = np.arange(0, len(adaboost.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
ax1.barh(tree_indices,
         adaboost.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Fix the tick positions first and label them afterwards: tick labels are only
# well-defined once the tick locations are fixed.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(adaboost.feature_importances_)))
ax1.set_title("AdaBoost: built-in feature importance")
ax2.boxplot(result_ab1.importances[perm_sorted_idx].T, vert=False,
            labels=X_train.columns[perm_sorted_idx])
ax2.set_title("AdaBoost: permutation importance (training set)")
fig.tight_layout()
fig.savefig("ab1_perm.png")
plt.show()

In [None]:
# The 8 features with the highest mean permutation importance for AdaBoost.
# The ordering is recomputed from result_ab1 instead of being read from
# perm_sorted_idx, because that name is overwritten by other cells for other
# models and would give a different selection if this cell were run out of order.
new_columns_ab = X_train.columns[result_ab1.importances_mean.argsort()][-8:]
new_columns_ab

In [None]:
# Random forest (max depth 11, fixed seed) trained only on the columns selected
# for the forest in new_columns_rf.
rf_final2 = RandomForestClassifier(max_depth=11, random_state=42)
rf_final2 = rf_final2.fit(X_train.loc[:, new_columns_rf], y_train)

In [None]:
# Same AdaBoost configuration as the full model, trained only on the columns
# selected for AdaBoost in new_columns_ab.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`
# - confirm the installed version.
adaboost2 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=7),
    n_estimators=150,
    random_state=42,
)
adaboost2 = adaboost2.fit(X_train.loc[:, new_columns_ab], y_train)

In [None]:
# Test-set accuracy and ROC AUC of the two reduced-feature models; each model is
# scored on its own column subset.
results = {"accuracy_test": {}, "roc_auc_test": {}}
for label, model, columns in (
    ("RandomForest", rf_final2, new_columns_rf),
    ("AdaBoost", adaboost2, new_columns_ab),
):
    X_test_subset = X_test.loc[:, columns]
    results["accuracy_test"][label] = accuracy_score(y_test, model.predict(X_test_subset))
    # ROC AUC is computed from the predicted probability of the second class.
    results["roc_auc_test"][label] = roc_auc_score(
        y_test, model.predict_proba(X_test_subset)[:, 1]
    )
pd.DataFrame(results)

### Las losowy z ograniczoną liczbą zmiennych

In [None]:
# Permutation importance of the reduced random forest, measured on the training
# data (restricted to its own columns) with 15 shuffles per feature.
result_rf2 = permutation_importance(rf_final2, X_train.loc[:, new_columns_rf], y_train, n_repeats=15,
                                    random_state=42)
perm_sorted_idx = result_rf2.importances_mean.argsort()  # ascending by mean importance

# Impurity-based importances of the forest, sorted ascending for the bar chart.
tree_importance_sorted_idx = np.argsort(rf_final2.feature_importances_)
tree_indices = np.arange(0, len(rf_final2.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
ax1.barh(tree_indices,
         rf_final2.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Fix the tick positions first and label them afterwards: tick labels are only
# well-defined once the tick locations are fixed.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(new_columns_rf[tree_importance_sorted_idx])
ax1.set_ylim((0, len(rf_final2.feature_importances_)))
ax1.set_title("Reduced random forest: impurity-based importance")
ax2.boxplot(result_rf2.importances[perm_sorted_idx].T, vert=False,
            labels=new_columns_rf[perm_sorted_idx])
ax2.set_title("Reduced random forest: permutation importance (training set)")
fig.tight_layout()
fig.savefig("rf2_perm.png")
plt.show()

### AdaBoost z ograniczoną liczbą zmiennych

In [None]:
# Permutation importance of the reduced AdaBoost model, measured on the training
# data (restricted to its own columns) with 15 shuffles per feature.
result_ab2 = permutation_importance(adaboost2, X_train.loc[:, new_columns_ab], y_train, n_repeats=15,
                                    random_state=42)
perm_sorted_idx = result_ab2.importances_mean.argsort()  # ascending by mean importance

# Importances reported by the model itself, sorted ascending for the bar chart.
tree_importance_sorted_idx = np.argsort(adaboost2.feature_importances_)
tree_indices = np.arange(0, len(adaboost2.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
ax1.barh(tree_indices,
         adaboost2.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Fix the tick positions first and label them afterwards: tick labels are only
# well-defined once the tick locations are fixed.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(new_columns_ab[tree_importance_sorted_idx])
ax1.set_ylim((0, len(adaboost2.feature_importances_)))
ax1.set_title("Reduced AdaBoost: built-in feature importance")
ax2.boxplot(result_ab2.importances[perm_sorted_idx].T, vert=False,
            labels=new_columns_ab[perm_sorted_idx])
ax2.set_title("Reduced AdaBoost: permutation importance (training set)")
fig.tight_layout()
fig.savefig("ab2_perm.png")
plt.show()

In [None]:
import seaborn as sns

# Pairwise correlation of the training features, drawn as a lower-triangle heatmap.
sns.set(style="white")
corr = X_train.corr()
# Boolean mask that hides the upper triangle and the diagonal. The builtin `bool`
# replaces the `np.bool` alias, which was deprecated in NumPy 1.20 and removed
# in NumPy 1.24; `bool` behaves identically on every NumPy version.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw on the axes created above explicitly instead of relying on the current axes.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.6, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

f.savefig("corr_heatmap.png")