In [12]:
import sys
import mlflow
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from pathlib import Path
from tempfile import TemporaryDirectory
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from xgboost import XGBClassifier

sys.path.append('..')
from src.mlflow_utils import configure_mlflow, find_latest_run_id_by_experiment_and_stage, get_targets, get_data, load_config

In [13]:
# Configuration returned by load_config(), loaded once for the whole notebook.
# The entry-point cell reads the feature-selection experiment/run names from it
# and passes the full mapping to feature_selection_pipeline().
CONFIG = load_config()

In [16]:
def log_feature_selection_artifacts(features: list[str], datasets: dict[str, pd.DataFrame], artifacts, split_dirs) -> None:
    """Log the selected feature list and the reduced datasets to MLflow.

    Args:
        features: Names of the selected features (any sequence ``pd.Series``
            accepts); written as ``selected_features.json``.
        datasets: Mapping of dataset name to frame. Each frame is written as
            ``<name>.parquet``.
        artifacts: Mapping whose ``"selected"`` entry is the artifact directory
            everything is logged under.
        split_dirs: Artifact sub-directories, paired with ``datasets`` in
            iteration order (first dataset -> first directory, ...).

    Raises:
        ValueError: If there are fewer split directories than datasets, which
            would otherwise silently skip logging the trailing datasets.
    """
    # Resolved once and reused; also avoids nesting double quotes inside a
    # double-quoted f-string, which only parses on Python 3.12+.
    selected_dir = artifacts["selected"]

    split_dirs = list(split_dirs)
    if len(split_dirs) < len(datasets):
        raise ValueError(
            f"Expected at least {len(datasets)} split directories, got {len(split_dirs)}"
        )

    with TemporaryDirectory() as tmp_dir:
        tmp_root = Path(tmp_dir)

        # Save feature list
        feature_path = tmp_root / "selected_features.json"
        pd.Series(features).to_json(feature_path)
        mlflow.log_artifact(feature_path, selected_dir)

        # Save datasets; files must be logged before the temp dir is removed.
        for (name, data), split_dir in zip(datasets.items(), split_dirs):
            data_path = tmp_root / f"{name}.parquet"
            data.to_parquet(data_path)
            mlflow.log_artifact(data_path, f"{selected_dir}/{split_dir}")

In [17]:
def mutual_info_feature_selection(X_train, y_train, k=25, random_state=42):
    """Select the top-``k`` features by mutual information with the target.

    Args:
        X_train: Training feature frame; its ``columns`` supply the feature names.
        y_train: Training labels (Series, single-column DataFrame or array);
            flattened to 1-D before fitting.
        k: Number of features to keep. Capped at the number of columns in
            ``X_train`` so narrow frames do not trip the selector.
        random_state: Seed forwarded to ``mutual_info_classif`` so the ranking
            is repeatable between runs.

    Returns:
        Tuple ``(selected_features_mi, mi_mask)``: the selected column labels
        and the boolean support mask aligned with ``X_train.columns``.

    Side effects:
        Logs the MLflow param ``num_features_mi``.
    """
    from functools import partial

    n_keep = min(k, X_train.shape[1])
    score_func = partial(mutual_info_classif, random_state=random_state)

    # Only the support mask is needed, so fit() is enough; the transformed
    # matrix was never used.
    selector_mi = SelectKBest(score_func, k=n_keep)
    selector_mi.fit(X_train, np.asarray(y_train).ravel())

    mi_mask = selector_mi.get_support()
    selected_features_mi = X_train.columns[mi_mask]
    mlflow.log_param("num_features_mi", len(selected_features_mi))
    return selected_features_mi, mi_mask

In [18]:
def calculate_scale_pos_weight(y_train) -> float:
    """Return the negative-to-positive sample ratio for ``scale_pos_weight``.

    Labels are treated as 0/1 indicators: their sum is taken as the number of
    positive samples.

    Args:
        y_train: Training labels as a Series, single-column DataFrame or
            array-like; flattened to 1-D before counting.

    Returns:
        ``(n_samples - n_positive) / n_positive`` as a plain Python float.

    Raises:
        ValueError: If ``y_train`` is empty or contains no positive samples,
            since the ratio is undefined in that case.
    """
    # Flatten first so a (n, 1) DataFrame yields a scalar rather than a
    # one-element array.
    labels = np.asarray(y_train).ravel()
    n_total = labels.size
    n_positive = float(labels.sum())

    if n_total == 0 or n_positive <= 0:
        raise ValueError(
            "y_train must contain at least one positive sample to compute scale_pos_weight"
        )

    return (n_total - n_positive) / n_positive

In [19]:
def rfe_feature_selection(X_train, y_train):
    """Select 25 features with recursive feature elimination over XGBoost.

    The estimator is an ``XGBClassifier`` weighted by
    ``calculate_scale_pos_weight(y_train)``; RFE drops 10 features per
    iteration until 25 remain.

    Returns:
        Tuple ``(selected columns, support mask)`` where the mask is aligned
        with ``X_train.columns``.

    Side effects:
        Logs the MLflow param ``num_features_rfe``.
    """
    xgb_params = {
        "scale_pos_weight": calculate_scale_pos_weight(y_train),
        "subsample": 0.8,
        "random_state": 42,
        "device": 'cuda',
    }
    eliminator = RFE(XGBClassifier(**xgb_params), n_features_to_select=25, step=10)
    eliminator.fit(X_train, y_train)

    keep_mask = eliminator.support_
    kept_columns = X_train.columns[keep_mask]
    mlflow.log_param("num_features_rfe", len(kept_columns))
    return kept_columns, keep_mask

In [20]:
def shap_feature_selection(X_train, y_train):
    """Rank features by mean absolute SHAP value and keep the 20 highest.

    An ``XGBClassifier`` weighted by ``calculate_scale_pos_weight(y_train)``
    is fitted on the training data and explained with ``shap.TreeExplainer``.

    Returns:
        Tuple ``(selected columns, shap_values)``. The columns are ordered
        from lowest to highest importance among those kept; ``shap_values``
        is the raw explainer output for ``X_train``.

    Side effects:
        Logs the MLflow param ``num_features_shap``.
    """
    booster = XGBClassifier(
        scale_pos_weight=calculate_scale_pos_weight(y_train),
        device='cuda',
    ).fit(X_train, y_train)

    shap_values = shap.TreeExplainer(booster).shap_values(X_train)

    # Average |SHAP| over rows -> one importance score per column.
    mean_abs_shap = np.mean(np.abs(shap_values), axis=0)
    ranked = np.argsort(mean_abs_shap)  # ascending importance
    top_columns = X_train.columns[ranked[-20:]]

    mlflow.log_param("num_features_shap", len(top_columns))
    return top_columns, shap_values

In [21]:
def plot_shap_summary(shap_values, X_train):
    """Render the SHAP summary plot and log it to MLflow as ``shap_summary.png``.

    Args:
        shap_values: SHAP values to plot, passed straight to ``shap.summary_plot``.
        X_train: Feature frame the values were computed for; its columns are
            used as the feature names.
    """
    column_labels = X_train.columns
    shap.summary_plot(shap_values, X_train, feature_names=column_labels, show=False)

    with TemporaryDirectory() as scratch_dir:
        figure_file = Path(scratch_dir).joinpath("shap_summary.png")
        plt.savefig(figure_file)
        plt.close()
        # Log while the temporary directory still exists.
        mlflow.log_artifact(figure_file)

In [22]:
def select_and_log_features(mi_mask, rfe_mask, selected_features_shap, X_train, X_val, X_test, artifacts, split_dirs):
    """Combine the three selectors by majority vote and log the outcome.

    A feature is kept when at least two of MI, RFE and SHAP selected it. The
    per-method counts and the final count are logged as MLflow metrics, the
    reduced train/val/test frames are logged through
    ``log_feature_selection_artifacts``, and the vote matrix is logged as
    ``reports/selection_matrix.parquet``.

    Args:
        mi_mask: Boolean mask over ``X_train.columns`` from mutual information.
        rfe_mask: Boolean mask over ``X_train.columns`` from RFE.
        selected_features_shap: Feature names chosen by SHAP.
        X_train, X_val, X_test: Frames to reduce to the selected columns.
        artifacts: Artifact-path mapping forwarded to the logging helper.
        split_dirs: Per-split artifact sub-directories forwarded to the helper.
    """
    all_columns = X_train.columns
    mi_columns = all_columns[mi_mask]
    rfe_columns = all_columns[rfe_mask]

    # One row per column of X_train, one boolean vote per method.
    # NOTE(review): no explicit index is set, so rows map to features only by
    # their position in X_train.columns.
    shap_votes = [name in selected_features_shap for name in all_columns]
    selection_matrix = pd.DataFrame({'MI': mi_mask, 'RFE': rfe_mask, 'SHAP': shap_votes})

    # Majority vote: keep features chosen by two or more methods.
    vote_counts = selection_matrix.sum(axis=1)
    selected_features = all_columns[vote_counts >= 2]

    selection_metrics = {
        "mi_features_selected": len(mi_columns),
        "rfe_features_selected": len(rfe_columns),
        "shap_features_selected": len(selected_features_shap),
        "final_features_selected": len(selected_features),
    }
    mlflow.log_metrics(selection_metrics)

    # Reduce every split to the voted columns and log them.
    splits = {"X_train": X_train, "X_val": X_val, "X_test": X_test}
    selected_data = {split_name: frame[selected_features] for split_name, frame in splits.items()}
    log_feature_selection_artifacts(selected_features, selected_data, artifacts, split_dirs)

    with TemporaryDirectory() as tmp_dir:
        matrix_path = Path(tmp_dir) / "selection_matrix.parquet"
        selection_matrix.to_parquet(matrix_path)
        mlflow.log_artifact(matrix_path, "reports")

In [23]:
def feature_selection_pipeline(config):
    """Run the MI / RFE / SHAP feature selection stage and log it to MLflow.

    Steps, in order:
      1. Tag the current MLflow run as the ``feature_selection`` stage.
      2. Look up the latest preprocessing and feature-engineering runs and
         record their ids as params for lineage.
      3. Load the engineered features and the targets from those runs.
      4. Run the three selectors, log the SHAP summary plot, then vote and log
         the selected features and reduced datasets.
      5. Log ``config`` itself as ``feature_selection_config.json``.

    Logging goes through the MLflow fluent API; the entry-point cell calls this
    function inside ``mlflow.start_run()``.

    Args:
        config: Mapping providing ``experiment_names``, ``run_names``,
            ``dataset`` (including ``split_dirs``) and ``artifacts["data"]``.
    """
    mlflow.set_tags({
        "stage": "feature_selection",
        "model_type": "ensemble_selector"
    })

    preprocessing_experiment_name = config["experiment_names"]["preprocessing"]
    fe_experiment_name = config["experiment_names"]["feature_engineering"]

    # Find parent runs
    preprocessing_run_id = find_latest_run_id_by_experiment_and_stage(
        preprocessing_experiment_name, config["run_names"]["preprocessing"]
    )
    fe_run_id = find_latest_run_id_by_experiment_and_stage(
        fe_experiment_name, config["run_names"]["feature_engineering"]
    )

    # Log lineage
    mlflow.log_params({
        "preprocessing_run_id": preprocessing_run_id,
        "feature_engineering_run_id": fe_run_id
    })

    # Load data from MLflow
    engineered_data = get_data(fe_run_id, config["dataset"], config["artifacts"]["data"]["engineered"])
    targets = get_targets(preprocessing_run_id, config["dataset"])

    # Dataset preparation. Only the training targets are needed here; the
    # validation and test frames are just reduced to the selected columns.
    X_train, y_train = engineered_data["X_train"], targets["y_train"]
    X_val = engineered_data["X_val"]
    X_test = engineered_data["X_test"]

    # Log dataset characteristics. The mean is coerced to a plain float so a
    # single-column DataFrame target is logged as a scalar metric.
    mlflow.log_metrics({
        "original_features": X_train.shape[1],
        "class_ratio": float(np.asarray(y_train, dtype=float).mean())
    })

    _, mi_mask = mutual_info_feature_selection(X_train, y_train)
    _, rfe_mask = rfe_feature_selection(X_train, y_train)
    selected_features_shap, shap_values = shap_feature_selection(X_train, y_train)
    plot_shap_summary(shap_values, X_train)

    artifacts = config["artifacts"]["data"]
    split_dirs = config["dataset"]["split_dirs"]
    select_and_log_features(mi_mask, rfe_mask, selected_features_shap, X_train, X_val, X_test, artifacts, split_dirs)

    # Log configuration
    mlflow.log_dict(config, "feature_selection_config.json")

In [24]:
if __name__ == "__main__":
    # Resolve this stage's MLflow experiment and run names from the config.
    stage = "feature_selection"
    experiment_name, run_name = CONFIG["experiment_names"][stage], CONFIG["run_names"][stage]

    configure_mlflow(experiment_name)
    with mlflow.start_run(run_name=run_name):
        feature_selection_pipeline(CONFIG)
        # Read the id while the run is still active.
        run_id = mlflow.active_run().info.run_id
        print("Feature selection pipeline completed. Run ID:", run_id)

Feature selection pipeline completed. Run ID: 9f003c5e0c5943399342b82f8e2576d2
