In [1]:
import sys
import mlflow
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List, Dict, Tuple
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from xgboost import XGBClassifier
from dataclasses import dataclass

sys.path.append('..')
from src.mlflow_utils import configure_mlflow, find_latest_run_id_by_experiment_and_stage, get_targets, get_data, load_config

In [2]:
# Project configuration returned by load_config(); the entry-point cell reads
# experiment/run names from it and passes it to feature_selection_pipeline().
CONFIG = load_config()

In [3]:
@dataclass
class FeatureSelectionResult:
    """Outcome of a single feature selection method.

    Instances are created by each selector's ``fit`` and by
    ``FeatureVoter.vote`` in this notebook.
    """
    # Index of the features the method kept (column labels of X for the
    # individual selectors).
    selected_features: pd.Index
    # Mask aligned with the candidate feature columns; truthy entries mark
    # the features that were kept.
    selection_mask: np.ndarray
    # Short identifier of the method; this notebook uses "mi", "rfe", "shap"
    # and "ensemble".
    method_name: str

In [4]:
class BaseSelector:
    """Common scaffolding shared by the feature selection strategies.

    Subclasses override ``fit`` and store their outcome in ``self.result``;
    ``log_metrics`` then reports that outcome to MLflow.
    """
    def __init__(self, config: Dict):
        # Nothing has been selected yet; a subclass ``fit`` fills this in.
        self.result: FeatureSelectionResult = None
        self.config = config

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        """Run the selection on features ``X`` and target ``y``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def log_metrics(self) -> None:
        """Log how many features the method kept as an MLflow param.

        The param is named ``<method_name>_num_features``.
        """
        outcome = self.result
        param_name = f"{outcome.method_name}_num_features"
        kept_count = len(outcome.selected_features)
        mlflow.log_param(param_name, kept_count)

In [5]:
class MutualInfoSelector(BaseSelector):
    """Mutual Information feature selector.

    Keeps the ``k`` features with the highest mutual information with the
    target, where ``k`` is read from ``config["feature_selection"]["mi"]["k"]``.
    """
    def __init__(self, config: Dict):
        super().__init__(config)
        mi_config = config["feature_selection"]["mi"]
        self.k = mi_config["k"]
        # Optional seed. mutual_info_classif is a randomized estimator, so
        # without a seed two runs can pick different features. A missing key
        # keeps the previous (unseeded) behaviour.
        self.random_state = mi_config.get("random_state")

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        """Score every column of ``X`` against ``y`` and keep the top ``k``.

        Args:
            X: Candidate features, one column per feature.
            y: Target; flattened to 1-D before scoring.
        """
        seed = self.random_state

        def score_func(features, target):
            # SelectKBest calls score_func(X, y); forward the seed as well.
            return mutual_info_classif(features, target, random_state=seed)

        selector = SelectKBest(score_func, k=self.k)
        selector.fit(X, y.values.ravel())

        # Compute the support mask once and reuse it for both fields.
        support = selector.get_support()
        self.result = FeatureSelectionResult(
            selected_features=X.columns[support],
            selection_mask=support,
            method_name="mi"
        )

In [6]:
class RFESelector(BaseSelector):
    """Recursive Feature Elimination selector.

    Wraps an XGBoost classifier in scikit-learn's ``RFE``. Settings are read
    from ``config["feature_selection"]["rfe"]`` (``subsample``,
    ``random_state``, ``n_features``, ``step``).
    """
    def __init__(self, config: Dict):
        super().__init__(config)
        self.params = config["feature_selection"]["rfe"]

    def _create_estimator(self, y: pd.DataFrame) -> XGBClassifier:
        """Build the class-weighted XGBoost estimator used by RFE.

        Args:
            y: Target values; the mean is taken as the positive-class ratio.

        Raises:
            ValueError: if the target contains no positive samples, since the
                class weight would require a division by zero.
        """
        # ``y.mean()`` on a DataFrame yields a Series, not a number; reduce the
        # raw values instead so scale_pos_weight receives a plain float.
        fraud_ratio = float(np.asarray(y).mean())
        if fraud_ratio <= 0:
            raise ValueError("Cannot compute scale_pos_weight: target has no positive samples")
        return XGBClassifier(
            scale_pos_weight=(1 - fraud_ratio) / fraud_ratio,
            subsample=self.params["subsample"],
            random_state=self.params["random_state"],
            device='cuda'
        )

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        """Run RFE on ``X``/``y`` and store the surviving features.

        Args:
            X: Candidate features, one column per feature.
            y: Target; flattened to 1-D before fitting, matching how
                ``MutualInfoSelector`` passes it.
        """
        estimator = self._create_estimator(y)
        selector = RFE(
            estimator=estimator,
            n_features_to_select=self.params["n_features"],
            step=self.params["step"]
        )
        selector.fit(X, np.asarray(y).ravel())

        self.result = FeatureSelectionResult(
            selected_features=X.columns[selector.support_],
            selection_mask=selector.support_,
            method_name="rfe"
        )

In [7]:
class ShapSelector(BaseSelector):
    """SHAP-based feature selector.

    Fits a class-weighted XGBoost classifier, ranks features by mean absolute
    SHAP value and keeps the top ``n_features`` configured under
    ``config["feature_selection"]["shap"]``.
    """
    def __init__(self, config: Dict):
        super().__init__(config)
        self.params = config["feature_selection"]["shap"]

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        """Rank the columns of ``X`` by SHAP importance and keep the best.

        Args:
            X: Candidate features, one column per feature.
            y: Target; flattened to 1-D before fitting.

        Raises:
            ValueError: if the target contains no positive samples, since the
                class weight would require a division by zero.
        """
        labels = np.asarray(y).ravel()
        # Reduce to a plain float: ``y.mean()`` on a DataFrame is a Series,
        # which is not a valid scale_pos_weight.
        positive_rate = float(labels.mean())
        if positive_rate <= 0:
            raise ValueError("Cannot compute scale_pos_weight: target has no positive samples")

        model = XGBClassifier(
            scale_pos_weight=(1 - positive_rate) / positive_rate,
            device='cuda'
        ).fit(X, labels)

        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        importances = np.abs(shap_values).mean(axis=0)

        # Clamp the requested count. A plain ``[-n:]`` slice returns *every*
        # feature when n == 0 and misbehaves for negative n.
        n_top = max(0, min(int(self.params["n_features"]), importances.shape[0]))
        ranking = np.argsort(importances)
        # Ascending importance order, as before: the last entry is the best.
        top_idx = ranking[ranking.size - n_top:]

        # Build the mask by position so duplicate column labels cannot mark
        # more columns than were actually selected.
        selection_mask = np.zeros(X.shape[1], dtype=bool)
        selection_mask[top_idx] = True

        self.result = FeatureSelectionResult(
            selected_features=X.columns[top_idx],
            selection_mask=selection_mask,
            method_name="shap"
        )

    def plot_summary(self, shap_values: np.ndarray, features: pd.DataFrame) -> None:
        """Render a SHAP summary plot and log it as an MLflow artifact.

        Args:
            shap_values: SHAP values to plot.
            features: Feature frame the SHAP values were computed for.
        """
        plt.figure()
        shap.summary_plot(shap_values, features, show=False)
        with TemporaryDirectory() as tmp_dir:
            path = Path(tmp_dir) / "shap_summary.png"
            plt.savefig(path, bbox_inches='tight')
            plt.close()
            # Log while the temporary directory still exists.
            mlflow.log_artifact(path)

In [8]:
class FeatureVoter:
    """Combines several selection results by majority-style voting.

    A feature is kept when at least ``min_votes`` of the collected results
    selected it; the threshold is read from
    ``config["feature_selection"]["voting"]["min_votes"]``.
    """
    def __init__(self, config: Dict):
        voting_config = config["feature_selection"]["voting"]
        self.results: List[FeatureSelectionResult] = []
        self.min_votes = voting_config["min_votes"]

    def add_result(self, result: FeatureSelectionResult) -> None:
        """Register one method's result as a ballot for the vote."""
        self.results.append(result)

    def vote(self) -> FeatureSelectionResult:
        """Tally the collected masks and return the ensemble selection.

        NOTE: the vote table is built from the masks alone, so when the masks
        are plain arrays its index is the default positional one and
        ``selected_features`` holds integer positions rather than column names.
        """
        masks_by_method = {}
        for ballot in self.results:
            masks_by_method[ballot.method_name] = ballot.selection_mask

        # One row per candidate feature, one column per method.
        vote_table = pd.DataFrame(masks_by_method)
        tally = vote_table.sum(axis=1)
        keep = tally >= self.min_votes

        return FeatureSelectionResult(
            selected_features=vote_table.index[keep],
            selection_mask=keep.values,
            method_name="ensemble"
        )

In [9]:
class FeatureSelectionArtifacts:
    """Writes feature selection outputs to MLflow as parquet artifacts."""
    def __init__(self, config: Dict):
        self.config = config

    def log_features(self, features: pd.Index) -> None:
        """Store the selected features as ``selected_features.parquet``.

        The file is logged under the artifact directory configured at
        ``config["artifacts"]["data"]["selected"]``.
        """
        with TemporaryDirectory() as scratch:
            target = Path(scratch) / "selected_features.parquet"
            feature_frame = pd.Series(features).to_frame()
            feature_frame.to_parquet(target)
            mlflow.log_artifact(target, self.config["artifacts"]["data"]["selected"])

    def log_datasets(self, datasets: Dict[str, pd.DataFrame]) -> None:
        """Store each dataset as ``<split>.parquet`` in its split directory.

        The n-th entry of ``datasets`` is logged under the n-th entry of
        ``config["dataset"]["split_dirs"]``, so iteration order matters.
        """
        with TemporaryDirectory() as scratch:
            scratch_dir = Path(scratch)
            position = 0
            for split_name, frame in datasets.items():
                target = scratch_dir / f"{split_name}.parquet"
                frame.to_parquet(target)
                base_dir = self.config['artifacts']['data']['selected']
                split_dir = self.config['dataset']['split_dirs'][position]
                mlflow.log_artifact(target, f"{base_dir}/{split_dir}")
                position += 1

In [10]:
class FeatureSelector:
    """Main feature selection pipeline.

    Loads the engineered data, runs the individual selectors, combines them
    with ``FeatureVoter`` and builds the reduced datasets.
    """
    def __init__(self, config: Dict):
        self.config = config
        # Populated by load_data().
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.X_test = None
        self.artifacts = FeatureSelectionArtifacts(config)

    def load_data(self) -> None:
        """Load data from previous pipeline stages.

        Features come from the latest feature-engineering run and targets from
        the latest preprocessing run, as resolved by
        ``find_latest_run_id_by_experiment_and_stage``.
        """
        preprocessing_run_id = find_latest_run_id_by_experiment_and_stage(
            self.config["experiment_names"]["preprocessing"],
            self.config["run_names"]["preprocessing"]
        )
        fe_run_id = find_latest_run_id_by_experiment_and_stage(
            self.config["experiment_names"]["feature_engineering"],
            self.config["run_names"]["feature_engineering"]
        )

        engineered_data = get_data(fe_run_id, self.config["dataset"], self.config["artifacts"]["data"]["engineered"])
        targets = get_targets(preprocessing_run_id, self.config["dataset"], "processed")

        self.X_train = engineered_data["X_train_enriched"]
        self.y_train = targets["y_train"]
        self.X_val = engineered_data["X_val_enriched"]
        self.X_test = engineered_data["X_test_enriched"]

    def validate_data(self) -> None:
        """Validate input data quality.

        Raises:
            ValueError: if the training features or targets are empty, or if
                their row counts differ.
        """
        if self.X_train.empty or self.y_train.empty:
            raise ValueError("Empty training data received")
        if self.X_train.shape[0] != self.y_train.shape[0]:
            raise ValueError("Feature/target row count mismatch")

    def run_selection(self) -> Tuple[Dict[str, FeatureSelectionResult], FeatureSelectionResult]:
        """Execute all feature selection methods and vote on the outcome.

        A selector that fails is skipped and its error is logged as the MLflow
        param ``<SelectorClass>_error``; the remaining selectors still vote.

        Returns:
            A tuple of (results keyed by method name, ensemble result).

        Raises:
            RuntimeError: if every selector failed, because there is then
                nothing to vote on.
        """
        selectors = [
            MutualInfoSelector(self.config),
            RFESelector(self.config),
            ShapSelector(self.config)
        ]

        individual_results = {}
        for selector in selectors:
            try:
                selector.fit(self.X_train, self.y_train)
                selector.log_metrics()
                individual_results[selector.result.method_name] = selector.result
            except Exception as e:
                # Best effort: one failing method must not stop the others.
                mlflow.log_param(f"{selector.__class__.__name__}_error", str(e))
                continue

        # Without this guard the voter would return an empty selection and the
        # run would be reported as completed with zero features.
        if not individual_results:
            raise RuntimeError(
                "All feature selection methods failed; see the logged '*_error' params"
            )

        voter = FeatureVoter(self.config)
        for result in individual_results.values():
            voter.add_result(result)

        final_result = voter.vote()
        return individual_results, final_result

    def log_selection_metrics(self, results: Dict[str, FeatureSelectionResult], final_result: FeatureSelectionResult) -> None:
        """Log feature counts (original, per method, final) as MLflow metrics."""
        mlflow.log_metrics({
            "original_features": self.X_train.shape[1],
            "final_features_selected": len(final_result.selected_features),
            **{f"{k}_features": len(v.selected_features) for k, v in results.items()}
        })

    def create_selected_datasets(self, final_features: pd.Index) -> Dict[str, pd.DataFrame]:
        """Create datasets with selected features.

        ``final_features`` is looked up against columns relabelled with their
        integer position in ``X_train`` (``FeatureVoter.vote`` returns such
        positions), so the returned frames carry integer column labels.

        Temporary hack to fix column names; don't do this in production.

        The relabelling happens on copies: the frames stored on this instance
        keep their original column names, so the method no longer alters the
        loaded data as a side effect.
        """
        position_of = {name: pos for pos, name in enumerate(self.X_train.columns)}

        return {
            "X_train": self.X_train.rename(columns=position_of)[final_features],
            "X_val": self.X_val.rename(columns=position_of)[final_features],
            "X_test": self.X_test.rename(columns=position_of)[final_features]
        }

In [11]:
def feature_selection_pipeline(config: Dict) -> None:
    """Run the feature selection workflow end to end.

    Steps, in order: load and validate the data, run the selectors, log the
    selection metrics and selected features, then build and log the reduced
    datasets.
    """
    pipeline = FeatureSelector(config)

    # Inputs must be present and consistent before any selector runs.
    pipeline.load_data()
    pipeline.validate_data()

    per_method, ensemble = pipeline.run_selection()

    # Record what was selected before the datasets are materialized.
    pipeline.log_selection_metrics(per_method, ensemble)
    chosen = ensemble.selected_features
    pipeline.artifacts.log_features(chosen)

    reduced = pipeline.create_selected_datasets(chosen)
    pipeline.artifacts.log_datasets(reduced)

In [12]:
if __name__ == "__main__":
    # Entry point: run the feature selection pipeline inside one MLflow run.
    experiment_name = CONFIG["experiment_names"]["feature_selection"]
    run_name = CONFIG["run_names"]["feature_selection"]

    configure_mlflow(experiment_name)

    # Error handling sits INSIDE the run context. Once the ``with`` block has
    # exited the run is already ended, and a later log_param/set_tag would
    # implicitly start a new, unrelated run and record the failure there.
    with mlflow.start_run(run_name=run_name):
        try:
            mlflow.set_tags({
                "stage": "feature_selection",
                "model_type": "ensemble_selector"
            })

            # Log lineage and configuration
            mlflow.log_dict(CONFIG, "feature_selection_config.yaml")
            mlflow.log_params({
                "preprocessing_run_id": find_latest_run_id_by_experiment_and_stage(
                    CONFIG["experiment_names"]["preprocessing"],
                    CONFIG["run_names"]["preprocessing"]
                ),
                "feature_engineering_run_id": find_latest_run_id_by_experiment_and_stage(
                    CONFIG["experiment_names"]["feature_engineering"],
                    CONFIG["run_names"]["feature_engineering"]
                )
            })

            feature_selection_pipeline(CONFIG)
            mlflow.set_tag("status", "completed")
            print(f"Feature selection completed. Run ID: {mlflow.active_run().info.run_id}")

        except Exception as e:
            # Still inside the active run, so the failure is attached to it.
            mlflow.log_param("error", str(e))
            mlflow.set_tag("status", "failed")
            # Re-raise; the run context manager ends the run on the way out.
            raise

Feature selection completed. Run ID: 6c32beccba7345b2997b8f5e3a9ed2dd
