In [7]:
from typing import Dict, List
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from src.data import load_dataset
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
import scienceplots

# Single seed shared by the train/test split and every estimator that accepts one.
RANDOM_STATE = 42

# Load features and target, then hold out 20% of the rows as the test split.
X, y = load_dataset()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Candidate classifiers keyed by display name. Only the MLP is wrapped in a
# pipeline that min-max scales its inputs before fitting.
models: Dict[str, BaseEstimator] = dict(
    GaussianNB=GaussianNB(),
    RandomForest=RandomForestClassifier(random_state=RANDOM_STATE),
    GradientBoosting=GradientBoostingClassifier(random_state=RANDOM_STATE),
    SVC=SVC(random_state=RANDOM_STATE, kernel='rbf'),
    MLP=Pipeline(
        [
            ('scaler', MinMaxScaler()),
            ("MLP", MLPClassifier(max_iter=300, random_state=RANDOM_STATE)),
        ]
    ),
    KNN=KNeighborsClassifier(),
)

# NOTE(review): `features` is not referenced by the cells visible here --
# confirm whether it is still needed.
features = [
    'age',
    'education_num',
    'capital-gain',
    'hours-per-week',
    'capital-loss',
]

# Columns stored with a boolean dtype are the ones treated as categorical.
categorical_features = [column for column in X.columns if X[column].dtype == 'bool']

# Plot styles presumably registered by the `scienceplots` import -- TODO confirm.
plt.style.use(['science', 'ieee'])

In [8]:
# Fit every candidate classifier on the training split. Each estimator stored
# in `models` is fitted in place, so later cells can reuse the same dict.
fit_progress = tqdm(models.values())
for model in fit_progress:
    model.fit(X_train, y_train)

100%|██████████| 6/6 [01:44<00:00, 17.47s/it]


In [None]:
def explain(fitted_estimators: Dict[str, BaseEstimator], X_test: pd.DataFrame, feature: str):
    """Plot centered PDP and ICE curves of one feature for every fitted estimator.

    Each estimator gets its own panel on a two-column grid, titled with the
    estimator's name. The finished figure is written to
    ``../plots/x_test_ice_{feature}.pdf``.

    Parameters
    ----------
    fitted_estimators : Dict[str, BaseEstimator]
        Mapping from display name to an already fitted estimator.
    X_test : pd.DataFrame
        Samples on which the partial dependence is evaluated.
    feature : str
        Column of ``X_test`` to explain; also embedded in the output file name.

    Returns
    -------
    None
        The figure is saved to disk as a side effect.
    """
    n_cols = 2
    min_rows = 3  # keeps the 2x3 layout used for up to six estimators
    # Grow the grid when more than six estimators are passed instead of
    # running past the end of the axes array.
    n_rows = max(min_rows, (len(fitted_estimators) + n_cols - 1) // n_cols)

    fig, axes = plt.subplots(
        ncols=n_cols,
        nrows=n_rows,
        # 8 inches of height for three rows, scaled proportionally beyond that.
        figsize=(6, 8 * n_rows / min_rows),
        sharey=True,
        constrained_layout=True,
    )
    axes = axes.flatten()

    # Loop-invariant plotting options, built once.
    features_info = {
        "features": [feature],
        "kind": "both",  # average (PDP) curve plus individual (ICE) curves
        "centered": True,
    }
    pd_line_kw = {"color": "blue", "label": "PDP"}
    ice_lines_kw = {"alpha": 0.05, "color": "dimgrey"}

    # Wrapping the items view (which has a length) lets tqdm show a real
    # progress bar; wrapping `enumerate` would hide the total.
    progress = tqdm(fitted_estimators.items(), desc=feature)
    for index, (estimator_name, estimator) in enumerate(progress):
        pdp_display = PartialDependenceDisplay.from_estimator(
            estimator,
            X_test,
            **features_info,
            ax=axes[index],
            pd_line_kw=pd_line_kw,
            ice_lines_kw=ice_lines_kw,
        )

        # Title the axes the display actually drew on instead of relying on
        # pyplot's implicit "current axes".
        pdp_display.axes_.ravel()[0].set_title(estimator_name)

    # Hide grid cells that did not receive an estimator.
    for unused_ax in axes[len(fitted_estimators):]:
        unused_ax.set_axis_off()

    fig.savefig(f'../plots/x_test_ice_{feature}.pdf', dpi=300)

In [None]:
# Produce one PDP/ICE figure per non-boolean column. The held-out split is
# passed because `explain` takes it as `X_test` and saves the result as
# `x_test_ice_<feature>.pdf`; passing the training split would mislabel the
# plots.
for feature in X.columns:
    if feature not in categorical_features:
        explain(models, X_test, feature)

3it [00:50, 16.53s/it]