In [None]:
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import pandas as pd
import matplotlib.pyplot as plt
import shap
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

: 

In [None]:
# -------------------------
# Setup MLflow
# -------------------------
# NOTE(review): absolute, machine-specific tracking path — consider moving it
# to a config cell or environment variable so the notebook runs elsewhere.
mlflow.set_tracking_uri("file:///G:/My Drive/github/msc-ml-cw/q1/mlruns")
experiment_name = "bank_marketing"
experiment = mlflow.get_experiment_by_name(experiment_name)
# get_experiment_by_name returns None for an unknown experiment; fail with a
# clear message instead of an AttributeError on the next line.
if experiment is None:
    raise ValueError(
        f"MLflow experiment {experiment_name!r} not found at "
        f"{mlflow.get_tracking_uri()!r}"
    )
experiment_id = experiment.experiment_id

# One row per run; metric columns are named "metrics.<name>".
runs = mlflow.search_runs(experiment_ids=[experiment_id])
# The column is absent when no run logged val_auc (or there are no runs at all).
if "metrics.val_auc" not in runs.columns:
    raise KeyError(
        f"No run in experiment {experiment_name!r} logged the 'val_auc' metric"
    )
# Highest validation AUC first; runs without the metric (NaN) sort last.
runs = runs.sort_values("metrics.val_auc", ascending=False)


In [13]:
# Client used by later cells to talk to the tracking store.
client = MlflowClient()
# `runs` is sorted by validation AUC (descending), so the first row is the winner.
best_run_id = runs["run_id"].iloc[0]


In [14]:
# -------------------------
# Load the best pipeline model
# -------------------------
# MlflowClient.download_artifacts is deprecated in MLflow 2.x; the
# mlflow.artifacts helper is its replacement and resolves the run against the
# tracking URI configured above.
local_path = mlflow.artifacts.download_artifacts(
    run_id=best_run_id,
    artifact_path="model",
)
# The logged artifact is the full sklearn pipeline (preprocessing + classifier).
pipeline = mlflow.sklearn.load_model(local_path)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
# -------------------------
# Load original dataset
# -------------------------
# NOTE(review): the loaded pipeline rejects this raw frame with
# "columns are missing: {'pdays_bucket', 'contact_last', 'campaign_intensity'}".
# Presumably those columns were engineered before training — the same
# feature-engineering step must be applied to X here. TODO confirm and add it.
TARGET = "y"
df = pd.read_csv("../data/bank-additional-full.csv", sep=";")
# Features: every column except the target.
X = df.drop(columns=TARGET)
# Binary target: 1 for "yes", 0 otherwise.
y = df[TARGET].eq("yes").astype(int)

In [18]:
# Permutation importance of the raw input columns, measured through the full
# pipeline (preprocessing + classifier). `permutation_importance` is already
# imported in the imports cell at the top of the notebook.
# NOTE(review): this is computed on the whole dataset, which presumably
# includes the training rows — prefer a held-out split. TODO confirm.
perm_result = permutation_importance(
    pipeline,  # full pipeline: preprocessing + classifier
    X,
    y,
    scoring="roc_auc",  # same metric family used to rank the runs (val_auc)
    n_repeats=10,
    random_state=42,
    n_jobs=-1
)

ValueError: columns are missing: {'pdays_bucket', 'contact_last', 'campaign_intensity'}

In [17]:
# -------------------------
# Run the raw features through the pipeline's preprocessing step
# -------------------------
# Look the 'pre' step up by name, then apply it to the raw frame.
preprocessor = pipeline["pre"]
X_preprocessed = preprocessor.transform(X)

ValueError: columns are missing: {'pdays_bucket', 'contact_last', 'campaign_intensity'}

In [None]:
# -------------------------
# 1. Feature Importance (tree-based models)
# -------------------------
# `model` is the fitted estimator at the end of the pipeline.
# NOTE(review): assumes the last pipeline step is the classifier — TODO confirm.
model = pipeline.steps[-1][1]

if hasattr(model, "feature_importances_"):
    importances = model.feature_importances_

    # The classifier sees the *preprocessed* features, so label importances
    # with the preprocessor's output names rather than the raw X columns.
    try:
        importance_labels = list(pipeline.named_steps['pre'].get_feature_names_out())
    except Exception:
        importance_labels = []
    # Fall back to positional names when names are unavailable or do not line
    # up with the importances (e.g. extra steps between 'pre' and the model).
    if len(importance_labels) != len(importances):
        importance_labels = [f"f{i}" for i in range(len(importances))]

    fi = pd.Series(importances, index=importance_labels).sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    fi.head(20).plot(kind="barh", ax=ax)
    ax.set_title("Top 20 Feature Importances")
    ax.set_xlabel("Importance")
    ax.invert_yaxis()  # most important feature at the top
    plt.show()

In [None]:
# -------------------------
# Transform features through pipeline preprocessing
# -------------------------
preprocessor = pipeline.named_steps['pre']
X_preprocessed = preprocessor.transform(X)

# Get feature names after preprocessing (works if using ColumnTransformer)
try:
    feature_names = preprocessor.get_feature_names_out()
except Exception as exc:
    # Best-effort fallback: number the columns. Catch Exception rather than
    # using a bare except so KeyboardInterrupt/SystemExit still propagate, and
    # report why the real names were unavailable.
    print(f"get_feature_names_out() failed ({exc!r}); using positional feature names")
    feature_names = [f"f{i}" for i in range(X_preprocessed.shape[1])]


ValueError: columns are missing: {'pdays_bucket', 'contact_last', 'campaign_intensity'}

In [None]:
# -------------------------
# 3. SHAP Values (for tree-based models)
# -------------------------
# `model` is the fitted estimator at the end of the pipeline.
# NOTE(review): assumes the last pipeline step is the classifier — TODO confirm.
model = pipeline.steps[-1][1]

# TreeExplainer only supports tree ensembles. `predict_proba` is also present
# on linear/neural models, so gate on `feature_importances_`, the same check
# the feature-importance cell uses for "tree-based".
if hasattr(model, "feature_importances_"):
    # The classifier was fitted on preprocessed features, not the raw frame.
    # Densify a sparse preprocessing output so SHAP and the plot accept it.
    X_shap = X_preprocessed.toarray() if hasattr(X_preprocessed, "toarray") else X_preprocessed

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_shap)

    # The return shape depends on the SHAP version and model type: a list with
    # one array per class, a 3-D array (samples, features, classes), or a
    # single 2-D array. Pick the positive-class values in each case.
    if isinstance(shap_values, list):
        shap_values_pos = shap_values[1]
    elif getattr(shap_values, "ndim", 2) == 3:
        shap_values_pos = shap_values[..., 1]
    else:
        shap_values_pos = shap_values

    # Summary plot
    shap.summary_plot(shap_values_pos, X_shap, feature_names=list(feature_names))

In [None]:

# -------------------------
# 4. Partial Dependence Plots (for top 3 features)
# -------------------------
# `model` is the fitted estimator at the end of the pipeline.
# NOTE(review): assumes the last pipeline step is the classifier — TODO confirm.
model = pipeline.steps[-1][1]

# The classifier was fitted on preprocessed features, so partial dependence
# must be computed on that matrix (densified if sparse), not on the raw frame.
X_pdp = X_preprocessed.toarray() if hasattr(X_preprocessed, "toarray") else X_preprocessed
pdp_feature_names = [str(name) for name in feature_names]

# Prefer the top-ranked features from the importance cell when they name
# columns of the preprocessed matrix; otherwise use its first three columns.
known_names = set(pdp_feature_names)
ranked_names = [str(name) for name in fi.index] if 'fi' in globals() else []
top_features = [name for name in ranked_names if name in known_names][:3] or pdp_feature_names[:3]

PartialDependenceDisplay.from_estimator(
    model,
    X_pdp,
    features=top_features,
    feature_names=pdp_feature_names,
    kind="average",
    grid_resolution=50,
)
plt.show()