In [82]:
import os
import posixpath
import sys
from functools import partial
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import shap
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from xgboost import XGBClassifier

sys.path.append('..')
from dotenv import load_dotenv
from src.mlflow_init import configure_mlflow

load_dotenv()

True

In [83]:
# Central settings for the feature-selection notebook; the whole dict is passed
# to feature_selection_pipeline and logged as "feature_selection_config.json".
CONFIG = {
    # NOTE(review): in the cells shown here these four values are not read by
    # the selection functions, which use their own 25 / 25 / 20 / 2 instead
    # (k_best=30 disagrees with the 25 used for mutual information) — confirm
    # which values are intended.
    "feature_selection": {
        "k_best": 30,
        "rfe_features": 25,
        "shap_features": 20,
        "voting_threshold": 2
    },
    "artifacts": {
        # Artifact sub-directory used by log_feature_selection_artifacts.
        "feature_selected_dir": "selected_features"
    },
    "data_params": {
        # get_data lists artifacts under <artifact_dir>/<split_dir>/<split_type>
        # for every combination of split_dirs and split_types.
        "split_dirs": ["training", "validation", "testing"],
        "split_types": ["Features", "Target"],
        # NOTE(review): split_names is not referenced by the code visible in
        # this notebook; the pipeline indexes these names as string literals.
        "split_names": ["encoded_X_train", "encoded_X_val", "encoded_X_test"],
        # get_targets pairs these positionally with split_dirs.
        "target_split_names": ["y_train", "y_val", "y_test"]
    }
}

In [84]:
def find_latest_run_id_by_experiment_and_stage(experiment_name: str, stage: str) -> Optional[str]:
    """Return the run ID of the most recent finished run tagged with ``stage``.

    Args:
        experiment_name: Name of the MLflow experiment to search.
        stage: Value the run's ``stage`` tag must have.

    Returns:
        The ID of the newest active run in the experiment whose status is
        ``FINISHED`` and whose ``stage`` tag equals ``stage``, or ``None`` when
        the experiment does not exist or no run matches.
    """
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        # An unknown experiment name yields None rather than an exception;
        # treat it like "no matching run" instead of failing on
        # ``None.experiment_id``.
        return None

    runs = MlflowClient().search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string=f"tags.stage='{stage}' AND attributes.status='FINISHED'",
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        # Newest first, so the single returned run is the latest one.
        order_by=["attribute.start_time DESC"],
    )
    return runs[0].info.run_id if runs else None

In [85]:
def get_data(
    run_id: str,
    data_params: Dict[str, List[str]],
    artifact_dir: str,
) -> Dict[str, pd.DataFrame]:
    """Download and load every ``.parquet`` artifact of a run's data splits.

    For each combination of ``data_params["split_dirs"]`` and
    ``data_params["split_types"]`` the artifacts under
    ``<artifact_dir>/<split_dir>/<split_type>`` are listed, and each file whose
    path ends in ``.parquet`` is downloaded and read with pandas.

    Args:
        run_id: MLflow run that owns the artifacts.
        data_params: Mapping providing the ``"split_dirs"`` and
            ``"split_types"`` lists.
        artifact_dir: Root artifact directory inside the run.

    Returns:
        Dict keyed by the file's base name up to its first ``.`` (for example
        ``"encoded_X_train"``), mapping to the loaded DataFrame. A later file
        with the same base name replaces an earlier one.
    """
    client = MlflowClient()
    data = {}

    for split_dir in data_params["split_dirs"]:
        for split_type in data_params["split_types"]:
            # Join with "/" on every platform: this function already treats
            # artifact paths as "/"-separated below, and os.path.join would
            # insert backslashes on Windows.
            artifact_root = posixpath.join(artifact_dir, split_dir, split_type)
            for artifact in client.list_artifacts(run_id, artifact_root):
                if not artifact.path.endswith(".parquet"):
                    continue
                local_path = client.download_artifacts(run_id, artifact.path)
                split = artifact.path.split("/")[-1].split(".")[0]
                data[split] = pd.read_parquet(local_path)

    return data
    
def get_targets(preprocessing_run_id: str, config) -> Dict[str, pd.Series]:
    """Load the target of every split from a preprocessing run.

    Args:
        preprocessing_run_id: MLflow run holding the processed data artifacts.
        config: Mapping providing ``"split_dirs"`` and ``"target_split_names"``;
            the two lists are paired positionally.

    Returns:
        Dict keyed by target name (for example ``"y_train"``) whose values are
        the contents of ``data/processed/<split>/Target/<name>.parquet``.
    """
    client = MlflowClient()
    targets = {}
    for split, target_split in zip(config["split_dirs"], config["target_split_names"]):
        local_path = client.download_artifacts(
            preprocessing_run_id,
            f"data/processed/{split}/Target/{target_split}.parquet",
        )
        # Squeeze only the column axis: a bare squeeze() would collapse a
        # one-row file to a scalar instead of the Series this function promises.
        targets[target_split] = pd.read_parquet(local_path).squeeze("columns")
    return targets

In [86]:
def log_feature_selection_artifacts(features: List[str], datasets: Dict[str, pd.DataFrame]) -> None:
    """Persist the chosen feature names and the reduced datasets via MLflow.

    ``features`` is written as ``selected_features.json`` and every entry of
    ``datasets`` as ``<name>.parquet``; each file is logged with
    ``mlflow.log_artifact`` under ``CONFIG["artifacts"]["feature_selected_dir"]``.
    """
    artifact_subdir = CONFIG["artifacts"]["feature_selected_dir"]

    with TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)

        # Feature names first
        json_file = scratch_dir / "selected_features.json"
        pd.Series(features).to_json(json_file)
        mlflow.log_artifact(json_file, artifact_subdir)

        # Then one parquet file per dataset
        for dataset_name, frame in datasets.items():
            parquet_file = scratch_dir / f"{dataset_name}.parquet"
            frame.to_parquet(parquet_file)
            mlflow.log_artifact(parquet_file, artifact_subdir)

In [95]:
def mutual_info_feature_selection(X_train, y_train, k: int = 25, random_state: int = 42):
    """Select the ``k`` features with the highest mutual information with the target.

    Args:
        X_train: Training features as a DataFrame (its ``columns`` are indexed
            with the resulting mask).
        y_train: Training target; flattened with ``.values.ravel()``.
        k: Number of features to keep, capped at the number of columns.
            NOTE(review): CONFIG["feature_selection"]["k_best"] is 30 while this
            default keeps the previously hard-coded 25 — confirm which is intended.
        random_state: Seed forwarded to ``mutual_info_classif`` so repeated
            runs pick the same features; pass ``None`` for unseeded behaviour.

    Returns:
        Tuple ``(selected_features_mi, mi_mask)``: the selected column labels
        and the boolean support mask over ``X_train.columns``.

    Side effects:
        Logs the MLflow param ``num_features_mi``.
    """
    # Never ask for more features than exist.
    n_keep = min(k, X_train.shape[1])
    # mutual_info_classif is randomised; pin its seed for reproducible selection.
    score_func = partial(mutual_info_classif, random_state=random_state)
    selector_mi = SelectKBest(score_func, k=n_keep)
    # Only the support mask is needed, so fit without materialising the
    # transformed matrix.
    selector_mi.fit(X_train, y_train.values.ravel())
    mi_mask = selector_mi.get_support()
    selected_features_mi = X_train.columns[mi_mask]
    mlflow.log_param("num_features_mi", len(selected_features_mi))
    return selected_features_mi, mi_mask

In [103]:
def calculate_scale_pos_weight(y_train) -> float:
    """Return the negative-to-positive ratio of the labels.

    The labels are summed to count positives, so they are expected to be
    encoded as 0/1 (or booleans). The result equals ``(1 - p) / p`` for a
    positive fraction ``p`` but is computed from counts to avoid the extra
    rounding of the intermediate fraction.

    Args:
        y_train: Labels as a Series, single-column DataFrame or array-like.

    Returns:
        ``(n - positives) / positives`` as a Python float.

    Raises:
        ValueError: If ``y_train`` is empty or contains no positive labels, in
            which case the ratio is undefined.
    """
    labels = np.asarray(y_train).ravel()
    positives = float(labels.sum())
    if labels.size == 0 or positives <= 0:
        raise ValueError(
            "scale_pos_weight is undefined: y_train is empty or has no positive samples"
        )
    negatives = labels.size - positives
    return negatives / positives

In [104]:
def rfe_feature_selection(X_train, y_train, n_features_to_select: int = 25, step: int = 10, device: str = 'cuda'):
    """Select features by recursive feature elimination around an XGBoost classifier.

    Args:
        X_train: Training features as a DataFrame (its ``columns`` are indexed
            with the resulting mask).
        y_train: Training target; also used to derive ``scale_pos_weight``.
        n_features_to_select: Number of features RFE should keep.
        step: ``step`` argument passed to RFE.
        device: Device passed to ``XGBClassifier``; the default keeps the
            previously hard-coded ``'cuda'``, pass ``'cpu'`` on machines
            without a GPU.

    Returns:
        Tuple ``(selected_features_rfe, rfe_mask)``: the selected column labels
        and the boolean support mask over ``X_train.columns``.

    Side effects:
        Logs the MLflow param ``num_features_rfe``.
    """
    scale_pos_weight = calculate_scale_pos_weight(y_train)
    estimator = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        subsample=0.8,
        random_state=42,
        device=device,
    )
    selector_rfe = RFE(estimator, n_features_to_select=n_features_to_select, step=step)
    selector_rfe.fit(X_train, y_train)
    rfe_mask = selector_rfe.support_
    selected_features_rfe = X_train.columns[rfe_mask]
    mlflow.log_param("num_features_rfe", len(selected_features_rfe))
    return selected_features_rfe, rfe_mask

In [105]:
def shap_feature_selection(X_train, y_train, top_k: int = 20, device: str = 'cuda'):
    """Select the ``top_k`` features with the largest mean absolute SHAP value.

    An ``XGBClassifier`` is fitted on the training data, SHAP values are
    computed for ``X_train`` with ``shap.TreeExplainer``, and features are
    ranked by the mean of ``|shap_values|`` over axis 0.

    Args:
        X_train: Training features as a DataFrame.
        y_train: Training target; also used to derive ``scale_pos_weight``.
        top_k: Number of features to keep, capped at the number of ranked
            features; values <= 0 select nothing.
        device: Device passed to ``XGBClassifier``; the default keeps the
            previously hard-coded ``'cuda'``.

    Returns:
        Tuple ``(selected_features_shap, shap_values)``: the selected column
        labels, ordered from lowest to highest importance among the kept ones,
        and the raw SHAP values returned by the explainer.

    Side effects:
        Logs the MLflow param ``num_features_shap``.
    """
    scale_pos_weight = calculate_scale_pos_weight(y_train)
    model_shap = XGBClassifier(scale_pos_weight=scale_pos_weight, device=device).fit(X_train, y_train)
    explainer = shap.TreeExplainer(model_shap)
    shap_values = explainer.shap_values(X_train)
    # assumes shap_values is (n_samples, n_features) so that axis 0 averages
    # over samples — TODO confirm for the installed shap version
    shap_importances = np.abs(shap_values).mean(axis=0)
    ranked = np.argsort(shap_importances)
    # Slice from an explicit start index: ranked[-0:] would return everything.
    n_keep = max(0, min(top_k, ranked.size))
    feature_idx = ranked[ranked.size - n_keep:]
    selected_features_shap = X_train.columns[feature_idx]
    mlflow.log_param("num_features_shap", len(selected_features_shap))
    return selected_features_shap, shap_values

In [106]:
def plot_shap_summary(shap_values, X_train):
    """Draw the SHAP summary plot for ``X_train`` and log it as ``shap_summary.png``."""
    column_names = X_train.columns
    # show=False keeps the figure open so it can be written to disk below.
    shap.summary_plot(shap_values, X_train, feature_names=column_names, show=False)

    with TemporaryDirectory() as scratch:
        png_file = Path(scratch).joinpath("shap_summary.png")
        plt.savefig(png_file)
        plt.close()
        mlflow.log_artifact(png_file)

In [112]:
def select_and_log_features(mi_mask, rfe_mask, selected_features_shap, X_train, X_val, X_test, voting_threshold: int = 2):
    """Combine the three selectors by vote and log the outcome to MLflow.

    A feature is kept when at least ``voting_threshold`` of the three methods
    (mutual information, RFE, SHAP) selected it.

    Args:
        mi_mask: Boolean mask over ``X_train.columns`` from mutual information.
        rfe_mask: Boolean mask over ``X_train.columns`` from RFE.
        selected_features_shap: Column labels chosen by the SHAP ranking.
        X_train: Training features; defines the column order of the vote.
        X_val: Validation features, reduced to the selected columns.
        X_test: Test features, reduced to the selected columns.
        voting_threshold: Minimum number of methods that must agree. The
            default keeps the previously hard-coded 2, which matches
            CONFIG["feature_selection"]["voting_threshold"].

    Side effects:
        Logs the per-method and final feature counts as metrics, the selected
        feature list and reduced datasets via
        ``log_feature_selection_artifacts``, and the vote matrix as
        ``reports/selection_matrix.parquet``.
    """
    columns = X_train.columns
    selected_features_mi = columns[mi_mask]
    selected_features_rfe = columns[rfe_mask]

    # One row per column of X_train, in column order.
    # NOTE(review): the feature names themselves are not stored in this
    # matrix, so the logged report can only be read positionally.
    selection_matrix = pd.DataFrame({
        'MI': mi_mask,
        'RFE': rfe_mask,
        'SHAP': columns.isin(selected_features_shap),
    })

    votes = selection_matrix.sum(axis=1)
    # Plain boolean array so the Index is filtered positionally.
    final_selection = (votes >= voting_threshold).to_numpy()
    selected_features = columns[final_selection]

    # Log selection metrics
    mlflow.log_metrics({
        "mi_features_selected": len(selected_features_mi),
        "rfe_features_selected": len(selected_features_rfe),
        "shap_features_selected": len(selected_features_shap),
        "final_features_selected": len(selected_features)
    })

    # Save and log final features
    selected_data = {
        "X_train_selected": X_train[selected_features],
        "X_val_selected": X_val[selected_features],
        "X_test_selected": X_test[selected_features]
    }
    log_feature_selection_artifacts(selected_features, selected_data)

    with TemporaryDirectory() as tmp_dir:
        # Log feature matrix
        matrix_path = Path(tmp_dir) / "selection_matrix.parquet"
        selection_matrix.to_parquet(matrix_path)
        mlflow.log_artifact(matrix_path, "reports")

In [113]:
def feature_selection_pipeline(config):
    """Run the full feature-selection stage inside the current MLflow run.

    Steps: tag the run, locate the latest finished preprocessing and
    feature-engineering runs, load their data, run the three selectors
    (mutual information, RFE, SHAP), log the SHAP summary plot, vote on the
    final feature set, and log ``config`` as ``feature_selection_config.json``.

    Args:
        config: Settings mapping; ``config["data_params"]`` is used for
            loading and the whole mapping is logged at the end.

    Raises:
        RuntimeError: If no finished upstream run is found for the
            preprocessing or feature-engineering stage.
    """
    mlflow.set_tags({
        "stage": "feature_selection",
        "model_type": "ensemble_selector"
    })

    # Find parent runs
    preprocessing_run_id = find_latest_run_id_by_experiment_and_stage("CreditCardFraudPreprocessing", "preprocessing")
    fe_run_id = find_latest_run_id_by_experiment_and_stage("CreditCardFraudAutoencoder", "feature_engineering")

    # Fail fast with a clear message rather than passing None as a run ID to
    # the artifact downloads below.
    missing_stages = [
        stage_name
        for stage_name, found_run_id in (
            ("preprocessing", preprocessing_run_id),
            ("feature_engineering", fe_run_id),
        )
        if found_run_id is None
    ]
    if missing_stages:
        raise RuntimeError(
            "No finished upstream run found for stage(s): " + ", ".join(missing_stages)
        )

    # Log lineage
    mlflow.log_params({
        "preprocessing_run_id": preprocessing_run_id,
        "feature_engineering_run_id": fe_run_id
    })

    # Load data from MLflow
    engineered_data = get_data(fe_run_id, config["data_params"], "engineered_features")
    targets = get_targets(preprocessing_run_id, config["data_params"])

    # Dataset preparation (only the training target is needed for selection)
    X_train, y_train = engineered_data["encoded_X_train"], targets["y_train"]
    X_val = engineered_data["encoded_X_val"]
    X_test = engineered_data["encoded_X_test"]

    # Log dataset characteristics
    mlflow.log_metrics({
        "original_features": X_train.shape[1],
        "class_ratio": y_train.mean()
    })

    _, mi_mask = mutual_info_feature_selection(X_train, y_train)
    _, rfe_mask = rfe_feature_selection(X_train, y_train)
    selected_features_shap, shap_values = shap_feature_selection(X_train, y_train)
    plot_shap_summary(shap_values, X_train)
    select_and_log_features(mi_mask, rfe_mask, selected_features_shap, X_train, X_val, X_test)

    # Log model configuration
    mlflow.log_dict(config, "feature_selection_config.json")

In [114]:
if __name__ == "__main__":
    # Point MLflow at the feature-selection experiment, then run the whole
    # stage inside a single run so every log call lands in it.
    configure_mlflow("CreditCardFraudFeatureSelection")
    with mlflow.start_run(run_name="feature_selection") as current_run:
        feature_selection_pipeline(CONFIG)
        finished_run_id = current_run.info.run_id
        print("Feature selection pipeline completed. Run ID:", finished_run_id)

Feature selection pipeline completed. Run ID: 1dcdf9df9187406da0a573756a0bcb36
