# Wyjaśnialne uczenie maszynowe – praca domowa 6

### Katarzyna Koprowska

In [None]:
import pandas as pd
import numpy as np
import pickle

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-conversion warnings
# raised by the libraries used below; consider a narrower filter.
warnings.filterwarnings("ignore")

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw COMPAS table. The path is relative to the working directory,
# so a checkout of iml2020-team1 must sit next to this notebook's folder.
compas_data = pd.read_csv("../iml2020-team1/compas/data/data.csv")

In [None]:
# Raw column names kept for modelling, in the order they appear in the
# feature matrix. person_id is kept only for aligning with the target table.
columns_to_select = [
    'person_id',
    'p_current_age',
    'p_age_first_offense',
    'p_charge',
    'p_jail30',
    'p_prison',
    'p_probation',
    'race',
    'sex',
    'offenses_within_30',
    'p_felony_count_person',
    'p_misdem_count_person',
    'p_charge_violent',
    'p_juv_fel_count',
    'p_felprop_violarrest',
    'p_murder_arrest',
    'p_felassault_arrest',
    'p_misdemassault_arrest',
    'p_famviol_arrest',
    'p_sex_arrest',
    'p_weapons_arrest',
    'p_n_on_probation',
    'p_current_on_probation',
    'p_prob_revoke',
    'p_arrest',
    'p_prison30',
    'scale_set',
    'marital_status',
    'custody_status',
]

In [None]:
# Table of variable renames; the next cell reads its "current_name" and
# "name" columns to build the old-name -> new-name mapping.
new_names = pd.read_csv("../iml2020-team1/compas/data/new_variable_names.csv")

In [None]:
# Turn the rename table into a plain dict: raw column name -> readable name.
new_names = dict(zip(new_names["current_name"], new_names["name"]))
# The identifier column keeps its name.
new_names["person_id"] = "person_id"

In [None]:
# Discard the "Unnamed: 0" column that came in with the CSV.
compas_data = compas_data.drop(columns=["Unnamed: 0"])

In [None]:
# Keep only the columns chosen for modelling, in the listed order.
compas_data = compas_data[columns_to_select]

In [None]:
# Replace every raw column name with its readable counterpart; a name missing
# from the mapping raises KeyError.
compas_data.columns = list(map(new_names.__getitem__, compas_data.columns))

In [None]:
# Feature table ordered by person_id so it can be matched row by row with the targets.
X = compas_data.sort_values(by="person_id")

In [None]:
# Target table; the next cell reads its "person_id", "recid" and
# "recid_violent" columns.
recid = pd.read_csv("../iml2020-team1/compas/data/recidivism.csv")

In [None]:
# One target frame per outcome, each ordered by person_id like the feature table.
y_rec = recid[["person_id", "recid"]].sort_values(by="person_id")
y_rec_viol = recid[["person_id", "recid_violent"]].sort_values(by="person_id")

In [None]:
# Sanity check: features and targets must list the same person_id at every
# position, because the ids are dropped afterwards and rows are matched purely
# by order. `==` between two Series requires identically-labelled indexes and
# raises otherwise, so compare the underlying arrays positionally instead.
(
    np.array_equal(X["person_id"].to_numpy(), y_rec["person_id"].to_numpy()),
    np.array_equal(X["person_id"].to_numpy(), y_rec_viol["person_id"].to_numpy()),
)

In [None]:
# The identifier is not a feature or a target; remove it from all three frames in place.
for frame in (X, y_rec, y_rec_viol):
    frame.drop(columns="person_id", inplace=True)

In [None]:
# Categorical columns that are one-hot encoded before modelling.
non_numeric_cols = [
    "race",
    "sex",
    "scale_set",
    "marital_status",
    "custody_status",
]

In [None]:
# One-hot encode each categorical column (with an extra NaN indicator) and
# swap the originals for the indicator columns in a single concat.
dummy_frames = [
    pd.get_dummies(X[column], prefix=column, dummy_na=True)
    for column in non_numeric_cols
]
X = pd.concat([X.drop(columns=non_numeric_cols)] + dummy_frames, axis=1)

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_rec,
    test_size=0.2,
    random_state=10,
)
#X_train, X_test, y_rec_viol_train, y_rec_viol_test = train_test_split(X, y_rec_viol, test_size=0.2, random_state=10)

In [None]:
# Renumber both feature splits from 0, discarding the shuffled original index.
X_train, X_test = (
    split.reset_index(drop=True) for split in (X_train, X_test)
)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees for the recidivism target: 12 shallow trees with
# row subsampling and L1/L2 regularisation.
xgb = XGBClassifier(
    scale_pos_weight=1,
    n_estimators=12,
    max_depth=3,
    learning_rate=0.3,
    colsample_bytree=1,
    subsample=0.8,
    min_child_weight=3,
    reg_alpha=1,
    reg_lambda=1,
)
#xgb_viol = XGBClassifier(scale_pos_weight=1, n_estimators=19,max_depth = 3, learning_rate = 0.3, colsample_bytree=0.8,
#                    subsample = 0.8, min_child_weight = 10, reg_alpha = 1, reg_lambda=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Shallow random forest as the second model. The fixed seed makes the forest
# and the cross-validation scores shown below reproducible.
rf = RandomForestClassifier(max_depth=3, random_state=10)
# 5-fold ROC AUC on the full data. y_rec is a single-column DataFrame, while
# scikit-learn expects a 1-D target, so flatten it first.
cross_val_score(rf, X, np.ravel(y_rec), scoring="roc_auc", cv=5)

In [None]:
# Fit on the training split. np.ravel hands scikit-learn the 1-D target it
# expects, whether y_train is still a single-column DataFrame or already a
# flat NumPy vector.
rf.fit(X_train, np.ravel(y_train))

In [None]:
# Reduce the single-column target frames to flat NumPy vectors.
y_train = y_train.iloc[:, 0].to_numpy()
y_test = y_test.iloc[:, 0].to_numpy()

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validated ROC AUC of the XGBoost model on the full data.
scores = cross_val_score(
    xgb,
    X,
    y_rec,
    scoring="roc_auc",
    cv=5,
)
#scores_viol = cross_val_score(xgb_viol, X, y_rec_viol, scoring="roc_auc", cv=5)

In [None]:
# Per-fold scores followed by their average.
scores, scores.mean()

In [None]:
#scores_viol, np.mean(scores_viol)

In [None]:
# Train the final XGBoost model on the training split; the explanations
# further down use this fitted model.
xgb.fit(X_train, y_train)
#xgb_viol.fit(X_train, y_rec_viol_train)

### TODO:

1. For the selected data set, train at least one tree-based ensemble model (random forest, gbm, catboost or any other boosting)

2. For selected variables from the model in (1), calculate Partial Dependence Profiles and Accumulated Local Dependence Profiles

3. Train a second model with a different structure (neural nets, linear, other boosting) and find a variable that has different behaviour between models

4. Comment on the results for points (2) and (3)

In [None]:
from sklearn.inspection import *

In [None]:
# Permutation importance of the XGBoost model on the training data
# (15 shuffles per feature, fixed seed).
result_xgb = permutation_importance(xgb, X_train, y_train, n_repeats=15,
                                random_state=42)
perm_sorted_idx = result_xgb.importances_mean.argsort()

# Built-in importances, sorted ascending so the largest bar ends up on top.
tree_importance_sorted_idx = np.argsort(xgb.feature_importances_)
tree_indices = np.arange(0, len(xgb.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
# Left panel: the model's own feature importances.
ax1.barh(tree_indices,
         xgb.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Fix the tick positions first, then label them. Assigning labels before the
# ticks exist attaches them to the default ticks instead of the bars.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(xgb.feature_importances_)))
# Right panel: spread of the permutation importances over the repeats.
ax2.boxplot(result_xgb.importances[perm_sorted_idx].T, vert=False,
            labels=X_train.columns[perm_sorted_idx])
fig.tight_layout()
fig.savefig("xgb_compas_perm.png")
plt.show()

In [None]:
# Permutation importance of the random forest on the training data
# (15 shuffles per feature, fixed seed).
result_rf = permutation_importance(rf, X_train, y_train, n_repeats=15,
                                random_state=42)
perm_sorted_idx = result_rf.importances_mean.argsort()

# Built-in importances, sorted ascending so the largest bar ends up on top.
tree_importance_sorted_idx = np.argsort(rf.feature_importances_)
tree_indices = np.arange(0, len(rf.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
# Left panel: the model's own feature importances.
ax1.barh(tree_indices,
         rf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Fix the tick positions first, then label them. Assigning labels before the
# ticks exist attaches them to the default ticks instead of the bars.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(rf.feature_importances_)))
# Right panel: spread of the permutation importances over the repeats.
ax2.boxplot(result_rf.importances[perm_sorted_idx].T, vert=False,
            labels=X_train.columns[perm_sorted_idx])
fig.tight_layout()
fig.savefig("rf_compas_perm.png")
plt.show()

In [None]:
import seaborn as sns

sns.set(style="white")
# Pairwise correlations between all (encoded) features.
corr = X.corr()
# Upper-triangle mask for the heatmap call kept (commented out) below.
# Uses the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
#cmap = sns.diverging_palette(220, 10, as_cmap=True)
#sns.heatmap(corr, mask = mask, cmap=cmap, vmax=.6, center=0,
#            square=True, linewidths=.5, cbar_kws={"shrink": .5})

#f.savefig("corr_compas_heatmap.png")
corr

In [None]:
# Gain-based importances from the underlying booster, ordered from the
# smallest gain to the largest.
dict(
    sorted(
        xgb.get_booster().get_score(importance_type="gain").items(),
        key=lambda feature_gain: feature_gain[1],
    )
)

In [None]:
import dalex

In [None]:
# dalex explainers for both fitted models, built on the training data.
# The predict function returns the probability of the positive class (second
# predict_proba column) instead of hard 0/1 labels from predict(), so the
# dependence profiles describe a continuous response rather than averaged
# class votes.
exp = dalex.Explainer(
    xgb, X_train, y_train,
    predict_function=lambda model, x: model.predict_proba(pd.DataFrame(x))[:, 1])
exp2 = dalex.Explainer(
    rf, X_train, y_train,
    predict_function=lambda model, x: model.predict_proba(pd.DataFrame(x))[:, 1])

In [None]:
# Partial dependence profiles for the two age variables, one per model,
# each computed with N=300.
age_variables = ['current_age', 'age_at_first_offence']

pdp1 = exp.model_profile(N=300, variables=age_variables, type='partial')
pdp1.result["_label_"] = 'XGBoost'

pdp2 = exp2.model_profile(N=300, variables=age_variables, type='partial')
pdp2.result["_label_"] = 'RandomForest'

# Draw both models' profiles in one figure.
pdp1.plot(pdp2)


In [None]:
# Accumulated profiles for the same two age variables, one per model.
age_variables = ['current_age', 'age_at_first_offence']

pdp1 = exp.model_profile(N=300, variables=age_variables, type='accumulated')
pdp1.result["_label_"] = 'XGBoost'

pdp2 = exp2.model_profile(N=300, variables=age_variables, type='accumulated')
pdp2.result["_label_"] = 'RandomForest'

# Draw both models' profiles in one figure.
pdp1.plot(pdp2)

In [None]:
# Partial dependence profiles for the two count variables, one per model.
count_variables = ['number_of_misdemeanours', 'charges_count']

pdp1 = exp.model_profile(N=300, variables=count_variables, type='partial')
pdp1.result["_label_"] = 'XGBoost'

pdp2 = exp2.model_profile(N=300, variables=count_variables, type='partial')
pdp2.result["_label_"] = 'RandomForest'

# Draw both models' profiles in one figure.
pdp1.plot(pdp2)

In [None]:
# Accumulated profiles for the two count variables, one per model.
count_variables = ['number_of_misdemeanours', 'charges_count']

pdp1 = exp.model_profile(N=300, variables=count_variables, type='accumulated')
pdp1.result["_label_"] = 'XGBoost'

pdp2 = exp2.model_profile(N=300, variables=count_variables, type='accumulated')
pdp2.result["_label_"] = 'RandomForest'

# Draw both models' profiles in one figure.
pdp1.plot(pdp2)

In [None]:
# Partial dependence profiles for every one-hot race indicator, one per model.
race = [
    'race_African-American',
    'race_Asian',
    'race_Caucasian',
    'race_Hispanic',
    'race_Native American',
    'race_Other',
]

pdp1 = exp.model_profile(N=300, variables=race, type='partial')
pdp1.result["_label_"] = 'XGBoost'

pdp2 = exp2.model_profile(N=300, variables=race, type='partial')
pdp2.result["_label_"] = 'RandomForest'

# Draw both models' profiles in one figure.
pdp1.plot(pdp2)

In [None]:
# Accumulated profiles for three race indicators, one figure per indicator.
races = ['race_African-American',
         'race_Caucasian',
         'race_Hispanic']
# The loop variable is deliberately not called `race`: that name holds the
# list of race indicator columns in the partial-dependence cell, and reusing
# it here would silently replace that list with a single string.
for race_column in races:
    pdp1 = exp.model_profile(
            N=300,
            variables=race_column,
            type='accumulated')
    pdp1.result["_label_"] = 'XGBoost'
    pdp2 = exp2.model_profile(
            N=300,
            variables=race_column,
            type='accumulated')
    pdp2.result["_label_"] = 'RandomForest'
    # Draw both models' profiles for this indicator together.
    pdp1.plot(pdp2)

In [None]:
# Partial dependence profiles for both one-hot sex indicators, one per model.
sex = ['sex_Male', 'sex_Female']

pdp1 = exp.model_profile(N=300, variables=sex, type='partial')
pdp1.result["_label_"] = 'XGBoost'

pdp2 = exp2.model_profile(N=300, variables=sex, type='partial')
pdp2.result["_label_"] = 'RandomForest'

# Draw both models' profiles in one figure.
pdp1.plot(pdp2)

In [None]:
# Accumulated profile for the sex_Male indicator, one per model.
sex = ['sex_Male']

pdp1 = exp.model_profile(N=300, variables=sex, type='accumulated')
pdp1.result["_label_"] = 'XGBoost'

pdp2 = exp2.model_profile(N=300, variables=sex, type='accumulated')
pdp2.result["_label_"] = 'RandomForest'

# Draw both models' profiles in one figure.
pdp1.plot(pdp2)

In [None]:
# Accumulated profile for the sex_Female indicator, one per model.
sex = ['sex_Female']

pdp1 = exp.model_profile(N=300, variables=sex, type='accumulated')
pdp1.result["_label_"] = 'XGBoost'

pdp2 = exp2.model_profile(N=300, variables=sex, type='accumulated')
pdp2.result["_label_"] = 'RandomForest'

# Draw both models' profiles in one figure.
pdp1.plot(pdp2)