# Brain Stroke Dataset - Analysis, Part V

**Author:** Jakub Bednarz

**Previous parts**: [Part I](https://github.com/mim-uw/eXplainableMachineLearning-2023/blob/main/Homeworks/HW1/JakubBednarz/Analysis.ipynb), [Part II](https://htmlpreview.github.io/?https://github.com/mim-uw/eXplainableMachineLearning-2023/blob/main/Homeworks/HW2/JakubBednarz/Analysis%2C%20Part%20II.html), [Part III](https://htmlpreview.github.io/?https://github.com/mim-uw/eXplainableMachineLearning-2023/blob/main/Homeworks/HW3/JakubBednarz/Analysis%2C%20Part%20III.html), [Part IV](https://htmlpreview.github.io/?https://github.com/mim-uw/eXplainableMachineLearning-2023/blob/main/Homeworks/HW4/JakubBednarz/Analysis%2C%20Part%20IV.html)

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from typing import Union
import plotly
import plotly.express as px
import plotly.offline as py
import plotly.io as pio
import plotly.graph_objects as go
from scipy.special import logit
from copy import deepcopy
import warnings

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.svm import LinearSVC

import optuna
from optuna.trial import Trial, FrozenTrial
from optuna.integration.sklearn import OptunaSearchCV

import shap
import shap.maskers
import dalex
import lime

warnings.filterwarnings(action="ignore")
pio.renderers.default = "notebook"
plotly.offline.init_notebook_mode(connected=True)

In [2]:
# Load the raw dataset and relabel the two integer-coded indicator columns
# as "No"/"Yes" strings (the integer is used as an index into that pair).
df = pd.read_csv("brain_stroke.csv")
for flag_col in ["hypertension", "heart_disease"]:
    df[flag_col] = df[flag_col].map(lambda flag: ("No", "Yes")[flag])

In [3]:
# for name, values in df.select_dtypes("object").items():
#     print(name, values.unique())

In [4]:
class ColumnTransformerEx(ColumnTransformer):
    """ColumnTransformer that hands back a labelled DataFrame for DataFrame input.

    When ``X`` is a ``pandas.DataFrame`` the transformed values are wrapped in
    a DataFrame whose columns are ``get_feature_names_out()`` and whose index
    is ``X.index``.  Any other input type is passed through to the parent
    implementation untouched.
    """

    def _to_frame_like(self, transformed, X):
        """Wrap ``transformed`` in a DataFrame if ``X`` is one, else return it as is."""
        if not isinstance(X, pd.DataFrame):
            return transformed
        # The parent may return a scipy sparse matrix (see ``sparse_threshold``);
        # np.asarray on that yields a 0-d object array, so densify it first.
        if hasattr(transformed, "toarray"):
            transformed = transformed.toarray()
        return pd.DataFrame(
            data=np.asarray(transformed),
            columns=self.get_feature_names_out(),
            index=X.index,
        )

    def transform(self, X):
        """Transform ``X``; DataFrame in gives DataFrame out."""
        return self._to_frame_like(super().transform(X), X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (and ``y``) and transform it; DataFrame in gives DataFrame out."""
        return self._to_frame_like(super().fit_transform(X, y), X)

# Columns that hold categorical values and need encoding.
cat_features = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
    "hypertension",
    "heart_disease",
]


def _per_column_encoder(make_encoder):
    """Build an unfitted ColumnTransformerEx with a fresh encoder per categorical column."""
    return ColumnTransformerEx(
        transformers=[(name, make_encoder(), [name]) for name in cat_features],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )


def _one_hot():
    """Create the one-hot encoder used throughout (binary columns collapse to one)."""
    return OneHotEncoder(drop="if_binary")


one_tf = _per_column_encoder(_one_hot)
ord_tf = _per_column_encoder(OrdinalEncoder)

# Encoded copies of the full frame (the remaining columns are passed through).
one_df = one_tf.fit_transform(df)
ord_df = ord_tf.fit_transform(df)

all_features = list(ord_df.columns)

# Positions of the categorical columns inside the ordinal-encoded frame.
cat_idxes = [all_features.index(name) for name in cat_features]

# Column position -> category labels, in the order OrdinalEncoder assigns codes.
cat_names = {
    position: OrdinalEncoder().fit(df[[name]]).categories_[0]
    for position, name in zip(cat_idxes, cat_features)
}

# A separate, still unfitted one-hot transformer.
X_tf = _per_column_encoder(_one_hot)

In [5]:
def split(df):
    """Split ``df`` into stratified train/test parts on the ``stroke`` column.

    Returns a ``(train_X, test_X, train_y, test_y)`` tuple; 20% of the rows
    go to the test part and the shuffle is seeded with 42.
    """
    target = df["stroke"]
    features = df.drop(columns=["stroke"])
    parts = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target,
    )
    return tuple(parts)

# Split the raw frame, then fit the one-hot transformer on the training rows only.
train_X, test_X, train_y, test_y = split(df)
X_tf = X_tf.fit(train_X)
# one_df / ord_df were encoded on the full frame before being split here.
# NOTE(review): split() uses a fixed seed and stratifies on the same labels, so
# these presumably select the same rows as the raw split above — confirm.
train_X_one, test_X_one, train_y_one, test_y_one = split(one_df)
train_X_ord, test_X_ord, train_y_ord, test_y_ord = split(ord_df)

In [6]:
# One-hot transformer addressed by column position, fitted on the
# ordinal-encoded features as a plain NumPy array (label column removed).
conv_tf = ColumnTransformer(
    transformers=[
        (name, OneHotEncoder(drop="if_binary"), [position])
        for position, name in zip(cat_idxes, cat_features)
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
).fit(ord_df.drop(columns=["stroke"]).to_numpy())

In [7]:
class XGBClassifier_Optuna:
    """Tune an XGBClassifier with Optuna and hand back the refitted best model."""

    def fit(self, X, y):
        """Search hyper-parameters on ``(X, y)`` and return a fitted XGBClassifier.

        Each trial is scored by the minimum cross-validated ROC AUC, with
        balanced sample weights passed to every fit.  Note that the return
        value is the fitted XGBClassifier, not this wrapper object.
        """

        def build(trial: Union[Trial, FrozenTrial]):
            # The suggestions are requested in a fixed order on purpose: the
            # seeded sampler is consulted once per call.
            depth = int(trial.suggest_float("max_depth", 3, 18, step=1))
            gamma = trial.suggest_float("gamma", 1, 9)
            alpha = int(trial.suggest_float("reg_alpha", 40, 180, step=1))
            lam = trial.suggest_float("reg_lambda", 0, 1)
            col_frac = trial.suggest_float("colsample_bytree", 0.5, 1)
            child_wt = int(trial.suggest_float("min_child_weight", 0, 10, step=1))
            return XGBClassifier(
                max_depth=depth,
                gamma=gamma,
                reg_alpha=alpha,
                reg_lambda=lam,
                colsample_bytree=col_frac,
                min_child_weight=child_wt,
                n_estimators=180,
                seed=42,
            )

        # Balanced per-sample weights, forwarded to every fit call.
        fit_params = {
            "sample_weight": compute_sample_weight(class_weight="balanced", y=y),
        }

        def worst_fold_auc(trial: optuna.Trial):
            fold_scores = cross_val_score(
                build(trial), X, y, scoring="roc_auc", fit_params=fit_params
            )
            return np.min(fold_scores)

        optuna.logging.set_verbosity(optuna.logging.WARN)
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=99),
        )
        study.optimize(worst_fold_auc, n_trials=16)

        # Rebuild the winning configuration and train it on all supplied data.
        best = build(study.best_trial)
        best.fit(X, y, **fit_params)
        return best

# Tune and fit XGBoost on the one-hot encoded training split; fit() returns
# the fitted XGBClassifier itself rather than the wrapper.
xgb_model = XGBClassifier_Optuna().fit(train_X_one, train_y_one)

In [11]:
# Test-split ROC AUC for the tuned XGBoost model.
# NOTE(review): this scores the output of predict(), presumably hard class labels,
# whereas the "roc_auc" scorer used during tuning ranks by predicted scores;
# consider predict_proba(test_X_one)[:, 1] here — confirm before comparing numbers.
roc_auc_score(test_y_one, xgb_model.predict(test_X_one))

0.7905807814149947

In [9]:
# Step 1: let LogisticRegressionCV choose the regularisation strength by ROC AUC.
lr_model_cv = LogisticRegressionCV(
    class_weight="balanced",
    scoring="roc_auc",
    solver="liblinear",
).fit(train_X_one, train_y_one)

# Step 2: refit a plain LogisticRegression using the selected C.
lr_model = LogisticRegression(
    C=lr_model_cv.C_[0],
    class_weight="balanced",
    solver="liblinear",
).fit(train_X_one, train_y_one)

In [12]:
# Test-split ROC AUC for the logistic regression model.
# NOTE(review): this scores the output of predict(), presumably hard class labels,
# rather than predicted probabilities; consider predict_proba(test_X_one)[:, 1] — confirm.
roc_auc_score(test_y_one, lr_model.predict(test_X_one))

0.7943400211193241

In [17]:
class SVC_Optuna:
    """Tune a LinearSVC with Optuna and hand back the refitted best model."""

    def fit(self, X, y):
        """Search hyper-parameters on ``(X, y)`` and return a fitted LinearSVC.

        Each trial is scored by the minimum cross-validated ROC AUC, with
        balanced sample weights passed to every fit.  Note that the return
        value is the fitted LinearSVC, not this wrapper object.
        """

        def model_fn(trial: Union[Trial, FrozenTrial]):
            return LinearSVC(
                penalty=trial.suggest_categorical("penalty", ["l1", "l2"]),
                dual=False,
                # suggest_loguniform is deprecated; suggest_float(log=True)
                # samples from the same log-uniform range.
                C=trial.suggest_float("C", 1e-4, 1e4, log=True),
                random_state=42,
            )

        # Balanced per-sample weights, forwarded to every fit call.
        sample_wt = compute_sample_weight(
            class_weight="balanced",
            y=y,
        )

        fit_params = dict(
            sample_weight=sample_wt,
        )

        def objective(trial: optuna.Trial):
            model = model_fn(trial)
            # Optimise the worst fold rather than the mean.
            return np.min(cross_val_score(model, X, y, scoring="roc_auc", fit_params=fit_params))

        optuna.logging.set_verbosity(optuna.logging.WARN)
        sampler = optuna.samplers.TPESampler(seed=99)
        study = optuna.create_study(direction="maximize", sampler=sampler)
        study.optimize(objective, n_trials=16)

        # Rebuild the winning configuration and train it on all supplied data.
        model = model_fn(study.best_trial)
        model.fit(X, y, **fit_params)
        return model

# Tune and fit the linear SVM on the one-hot encoded training split; fit()
# returns the fitted LinearSVC itself rather than the wrapper.
svc_model = SVC_Optuna().fit(train_X_one, train_y_one)

In [18]:
# Test-split ROC AUC for the linear SVM.
# NOTE(review): this scores the output of predict(), presumably hard class labels,
# rather than a continuous score; consider decision_function(test_X_one) — confirm.
roc_auc_score(test_y_one, svc_model.predict(test_X_one))

0.7922280887011615

## Introduction

The goal for today is to become familiar with *Permutation-based Variable Importance* (PVI).

### Calculate PVI for the selected model

In [19]:
from sklearn.inspection import permutation_importance

# Permutation importance of the XGBoost model on the test split.  The shuffles
# are seeded so the numbers (and the plots built from them) are reproducible,
# in line with the fixed seeds used everywhere else in this notebook.
res = permutation_importance(
    xgb_model,
    test_X_one,
    test_y_one,
    n_repeats=10,
    random_state=42,
)

# Transposed so that each column holds the repeats for one feature.
imp_df = pd.DataFrame(
    data=res["importances"].T,
    columns=test_X_one.columns,
)

In [20]:
# One horizontal box per feature, drawn over the permutation repeats.
fig = px.box(imp_df.melt(), x="value", y="variable")
fig

In [21]:
# Same box plot, with the model's own feature_importances_ overlaid as red
# markers for comparison.
fig = px.box(imp_df.melt(), x="value", y="variable")
fig.add_trace(
    go.Scatter(
        x=xgb_model.feature_importances_,
        y=test_X_one.columns,
        mode="markers",
        marker={"color": "red"},
    )
)
fig

In [22]:
def pvi_for(model, *, n_repeats=10, scoring=None, random_state=42):
    """Box-plot the permutation-based variable importance of ``model``.

    Importance is measured on the notebook-level test split
    (``test_X_one`` / ``test_y_one``).

    Args:
        model: Fitted estimator to inspect.
        n_repeats: Number of shuffles per feature.
        scoring: Scorer forwarded to ``permutation_importance``; ``None``
            keeps its default.
        random_state: Seed for the shuffles, so repeated calls give the same
            plot.

    Returns:
        A plotly figure with one box per feature, ordered by ascending mean
        importance.  Features whose importance is zero in every repeat are
        left out.
    """
    imp = permutation_importance(
        model,
        test_X_one,
        test_y_one,
        n_repeats=n_repeats,
        scoring=scoring,
        random_state=random_state,
    )
    # Transposed so that each column holds the repeats for one feature.
    imp_df = pd.DataFrame(
        data=imp["importances"].T,
        columns=test_X_one.columns,
    )

    # Keep only features that changed the score in at least one repeat.
    informative = np.count_nonzero(imp_df.to_numpy(), axis=0) > 0
    imp_df = imp_df.iloc[:, informative]

    # Order columns by mean importance so the plot reads as a ranking.
    order = np.argsort(imp_df.mean(axis=0).to_numpy())
    imp_df = imp_df.iloc[:, order]

    return px.box(
        pd.melt(imp_df),
        y="variable",
        x="value",
    )

In [23]:
# Permutation importance plot for the XGBoost model.
pvi_for(xgb_model)

In [24]:
# Permutation importance plot for the logistic regression model.
pvi_for(lr_model)

In [25]:
# Permutation importance plot for the linear SVM.
pvi_for(svc_model)