In [1]:
import os
import pandas as pd
import numpy as np
import mlflow
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from xgboost import XGBClassifier
import shap
import matplotlib.pyplot as plt

In [2]:
# Location of the MLflow tracking store. The MLFLOW_TRACKING_URI environment
# variable, when set, takes precedence so the notebook can run on machines
# other than the one it was authored on; otherwise the original local store
# is used unchanged.
tracking_uri = os.environ.get(
    "MLFLOW_TRACKING_URI",
    "file:///home/nick/Documents/code/fraud-detection-mlops/mlflow/mlruns",
)
mlflow.set_tracking_uri(tracking_uri)

# All runs started by this notebook are grouped under this experiment.
experiment_name = "CreditCardFraudFeatureSelection"
mlflow.set_experiment(experiment_name)

2025/02/06 00:14:43 INFO mlflow.tracking.fluent: Experiment with name 'CreditCardFraudFeatureSelection' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/nick/Documents/code/fraud-detection-mlops/mlflow/mlruns/839867756831717012', creation_time=1738754083254, experiment_id='839867756831717012', last_update_time=1738754083254, lifecycle_stage='active', name='CreditCardFraudFeatureSelection', tags={}>

In [3]:
# Data locations, relative to the notebook's working directory.
_DATA_ROOT = '../data'
processed_data_dir = f'{_DATA_ROOT}/processed'          # label files (y_*.csv) are read from here
engineered_data_dir = f'{_DATA_ROOT}/engineered'        # enriched feature files (X_*_enriched.csv) are read from here
feature_selected_data_dir = f'{_DATA_ROOT}/selected'    # outputs of the feature-selection run are written here

In [5]:
# Feature-selection run.
#
# Three independent selectors (mutual information, recursive feature
# elimination with XGBoost, and mean |SHAP| ranking) each nominate a subset of
# the columns of X_train; a column is kept when at least MIN_VOTES selectors
# nominate it. The reduced train/val/test frames and the list of kept columns
# are written to feature_selected_data_dir and logged as MLflow artifacts.

# Selection hyper-parameters, named so the voting logic below reads clearly.
N_FEATURES_MI = 25     # columns kept by mutual information
N_FEATURES_RFE = 25    # columns kept by recursive feature elimination
N_FEATURES_SHAP = 20   # top columns kept by mean absolute SHAP value
MIN_VOTES = 2          # a column must be chosen by at least this many methods
SEED = 42              # shared seed for the stochastic steps

# The SHAP plot and every CSV below are written into this directory; create it
# up front so a fresh checkout does not fail part-way through the run.
os.makedirs(feature_selected_data_dir, exist_ok=True)

with mlflow.start_run():
    # Load autoencoder-enriched data
    X_train = pd.read_csv(os.path.join(engineered_data_dir, 'X_train_enriched.csv'))
    y_train = pd.read_csv(os.path.join(processed_data_dir, 'y_train.csv'))
    X_val = pd.read_csv(os.path.join(engineered_data_dir, 'X_val_enriched.csv'))
    y_val = pd.read_csv(os.path.join(processed_data_dir, 'y_val.csv'))
    X_test = pd.read_csv(os.path.join(engineered_data_dir, 'X_test_enriched.csv'))

    # y_train is read as a DataFrame; every estimator below gets the same
    # flat 1-D label array instead of a column vector.
    y_train_labels = y_train.values.ravel()

    # Log basic information about the dataset
    mlflow.log_param("num_rows_train", X_train.shape[0])
    mlflow.log_param("num_columns_train", X_train.shape[1])
    mlflow.log_param("num_rows_val", X_val.shape[0])
    mlflow.log_param("num_columns_val", X_val.shape[1])

    # Mutual Information Feature Selection
    # mutual_info_classif is stochastic unless random_state is fixed, so the
    # scorer is wrapped to pin the seed and make the selection repeatable.
    selector_mi = SelectKBest(
        lambda X, y: mutual_info_classif(X, y, random_state=SEED),
        k=N_FEATURES_MI,
    )
    X_train_mi = selector_mi.fit_transform(X_train, y_train_labels)
    X_val_mi = selector_mi.transform(X_val)
    mi_mask = selector_mi.get_support()
    selected_features_mi = X_train.columns[mi_mask]
    mlflow.log_param("num_features_mi", len(selected_features_mi))

    # RFE Feature Selection
    # Class-imbalance weight: (share of negatives) / (share of positives).
    fraud_ratio = float(y_train_labels.mean())
    if not fraud_ratio > 0:
        # Covers both "no positive labels" and an empty label file (NaN mean);
        # either would make the weight below a division by zero.
        raise ValueError(
            "y_train contains no positive labels; cannot compute scale_pos_weight"
        )
    scale_pos_weight = (1 - fraud_ratio) / fraud_ratio
    estimator = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        subsample=0.8,
        random_state=SEED,
        device='cuda'
    )
    # step=10: ten columns are dropped per elimination round.
    selector_rfe = RFE(estimator, n_features_to_select=N_FEATURES_RFE, step=10)
    selector_rfe.fit(X_train, y_train_labels)
    rfe_mask = selector_rfe.support_
    selected_features_rfe = X_train.columns[rfe_mask]
    mlflow.log_param("num_features_rfe", len(selected_features_rfe))

    # SHAP Feature Selection
    model_shap = XGBClassifier(scale_pos_weight=scale_pos_weight, device='cuda').fit(X_train, y_train_labels)
    explainer = shap.TreeExplainer(model_shap)
    shap_values = explainer.shap_values(X_train)
    # Rank columns by mean absolute SHAP value over the training rows and keep
    # the N_FEATURES_SHAP largest (argsort is ascending, hence the tail slice).
    shap_importances = np.abs(shap_values).mean(axis=0)
    feature_idx = np.argsort(shap_importances)[-N_FEATURES_SHAP:]
    selected_features_shap = X_train.columns[feature_idx]
    mlflow.log_param("num_features_shap", len(selected_features_shap))

    # Plot SHAP summary
    shap.summary_plot(shap_values, X_train, feature_names=X_train.columns, show=False)
    shap_summary_path = os.path.join(feature_selected_data_dir, 'shap_summary.png')
    # bbox_inches='tight' so labels are not cut off at the figure edge.
    plt.savefig(shap_summary_path, bbox_inches='tight')
    plt.close()
    mlflow.log_artifact(shap_summary_path)

    # Create voting system (features selected by >= MIN_VOTES methods)
    # One boolean column per method, one row per column of X_train.
    selection_matrix = pd.DataFrame({
        'MI': mi_mask,
        'RFE': rfe_mask,
        'SHAP': X_train.columns.isin(selected_features_shap),
    })
    final_selection = selection_matrix.sum(axis=1) >= MIN_VOTES
    selected_features = X_train.columns[final_selection.to_numpy()]
    mlflow.log_param("num_final_features", len(selected_features))

    # Filter datasets
    X_train_selected = X_train[selected_features]
    X_val_selected = X_val[selected_features]
    X_test_selected = X_test[selected_features]

    # Save selected features and datasets
    selected_features.to_series().to_csv(os.path.join(feature_selected_data_dir, 'selected_features.csv'), index=False)
    X_train_selected.to_csv(os.path.join(feature_selected_data_dir, 'X_train_selected.csv'), index=False)
    X_val_selected.to_csv(os.path.join(feature_selected_data_dir, 'X_val_selected.csv'), index=False)
    X_test_selected.to_csv(os.path.join(feature_selected_data_dir, 'X_test_selected.csv'), index=False)

    # Log artifacts
    mlflow.log_artifact(os.path.join(feature_selected_data_dir, 'selected_features.csv'))
    mlflow.log_artifact(os.path.join(feature_selected_data_dir, 'X_train_selected.csv'))
    mlflow.log_artifact(os.path.join(feature_selected_data_dir, 'X_val_selected.csv'))
    mlflow.log_artifact(os.path.join(feature_selected_data_dir, 'X_test_selected.csv'))

print("MLflow run completed.")

MLflow run completed.
