In [1]:
import sys
import pandas as pd
import optuna
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns
from tempfile import TemporaryDirectory
from pathlib import Path
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import cross_val_score

sys.path.append('..')
from src.mlflow_utils import configure_mlflow, find_latest_run_id_by_experiment_and_stage, get_targets, get_data, load_config

In [2]:
# Pipeline configuration shared by the cells below; this notebook reads its
# "experiment_names", "run_names", "dataset" and "artifacts" entries.
CONFIG = load_config()

In [3]:
def log_lineage_info(config):
    """Record which upstream runs this run is derived from.

    For each of the feature-selection, preprocessing and resampling stages,
    the latest run is looked up from that stage's entries in
    ``config["experiment_names"]`` and ``config["run_names"]``. The three run
    IDs are then logged as MLflow params named ``<stage>_run_id``.

    Args:
        config: Mapping with ``"experiment_names"`` and ``"run_names"``
            sub-mappings keyed by stage name.

    Returns:
        tuple: ``(fs_run_id, preprocessing_run_id, resampling_run_id)``.
    """
    stages = ("feature_selection", "preprocessing", "resampling")
    experiments = config["experiment_names"]
    run_names = config["run_names"]

    # One lookup per stage, in the fixed order above.
    latest = {
        stage: find_latest_run_id_by_experiment_and_stage(
            experiments[stage], run_names[stage]
        )
        for stage in stages
    }

    mlflow.log_params({f"{stage}_run_id": run_id for stage, run_id in latest.items()})

    return tuple(latest[stage] for stage in stages)

In [4]:
def train_final_model(best_params, X, y, random_state=42, device='cuda'):
    """Fit the final XGBoost classifier and log it to MLflow as "best_model".

    Args:
        best_params: Hyperparameters to pass to ``XGBClassifier`` (for
            example ``study.best_params``).
        X: Training features.
        y: Training labels.
        random_state: Seed used when ``best_params`` does not set one.
        device: XGBoost device used when ``best_params`` does not set one.
            Defaults to ``'cuda'``; pass ``'cpu'`` on machines without a GPU.

    Returns:
        The fitted ``XGBClassifier``.
    """
    # Build a single keyword dict: defaults first, tuned values second. A key
    # present in both is taken from best_params instead of raising
    # "got multiple values for keyword argument".
    model_params = {"random_state": random_state, "device": device, **best_params}
    model = XGBClassifier(**model_params)
    model.fit(X, y)
    mlflow.xgboost.log_model(model, "best_model")
    return model

In [5]:
def log_feature_importance(model):
    """Plot the model's feature importances and log the figure to MLflow.

    The plot is drawn with ``xgboost.plot_importance`` on a 10x8 figure and
    stored as the artifact ``feature_importance.png``.

    Args:
        model: Fitted model accepted by ``plot_importance``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        plot_importance(model, ax=ax)
        mlflow.log_figure(fig, "feature_importance.png")
    finally:
        # Close this specific figure, even when plotting or logging fails,
        # so a failed call does not leave an open figure behind.
        plt.close(fig)


In [6]:
def evaluate_model(model, X_test, y_test):
    """Score the model on the test split and log the headline metrics.

    Logs ``test_roc_auc`` plus the precision, recall and F1 of the class
    reported under the key ``'1'`` in the classification report.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        tuple: ``(y_pred, y_proba, clf_report, fpr, tpr, roc_auc)`` where
        ``y_proba`` is column 1 of ``predict_proba`` and ``clf_report`` is the
        dict form of the classification report.
    """
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    report = classification_report(y_test, predicted_labels, output_dict=True)
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, positive_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    # Map each logged metric name to its key in the report's '1' entry.
    positive_class = report['1']
    test_metrics = {"test_roc_auc": area_under_curve}
    for metric_name, report_key in (
        ("test_precision", 'precision'),
        ("test_recall", 'recall'),
        ("test_f1", 'f1-score'),
    ):
        test_metrics[metric_name] = positive_class[report_key]
    mlflow.log_metrics(test_metrics)

    return (
        predicted_labels,
        positive_scores,
        report,
        false_pos_rate,
        true_pos_rate,
        area_under_curve,
    )

In [7]:
def log_classification_report(y_test, y_pred):
    """Write the text classification report (6 digits) and log it to MLflow.

    The report is written to ``classification_report.txt`` inside a temporary
    directory, logged as an artifact, and the directory is removed afterwards.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
    """
    with TemporaryDirectory() as scratch_dir:
        report_file = Path(scratch_dir, "classification_report.txt")
        report_file.write_text(classification_report(y_test, y_pred, digits=6))
        # Log while the temporary directory still exists.
        mlflow.log_artifact(report_file)

In [8]:
def log_confusion_matrix(y_test, y_pred):
    """Draw the confusion matrix as a heatmap and log it to MLflow.

    Both axes are labelled 'Non-Fraud' / 'Fraud' and the figure is stored as
    the artifact ``confusion_matrix.png``.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
    """
    class_names = ('Non-Fraud', 'Fraud')

    plt.figure(figsize=(8, 6))
    counts = confusion_matrix(y_test, y_pred)
    # heatmap draws on the current axes and returns them, so the labels can be
    # set on that Axes object directly.
    heatmap_ax = sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=list(class_names),
        yticklabels=list(class_names),
    )
    heatmap_ax.set_title('Confusion Matrix')
    heatmap_ax.set_xlabel('Predicted Label')
    heatmap_ax.set_ylabel('True Label')

    mlflow.log_figure(plt.gcf(), "confusion_matrix.png")
    plt.close()

In [9]:
def log_roc_curve(fpr, tpr, roc_auc):
    """Plot the ROC curve against the chance diagonal and log it to MLflow.

    The figure is stored as the artifact ``roc_curve.png``.

    Args:
        fpr: False positive rates (x values of the curve).
        tpr: True positive rates (y values of the curve).
        roc_auc: Area under the curve, shown in the legend with two decimals.
    """
    figure = plt.figure(figsize=(8, 6))
    axes = figure.gca()

    axes.plot(fpr, tpr, color='darkorange', lw=2,
              label=f'ROC Curve (AUC = {roc_auc:.2f})')
    # Dashed diagonal from (0, 0) to (1, 1) for reference.
    axes.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    axes.set_xlim([-0.05, 1.0])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic')
    axes.legend(loc='lower right')

    mlflow.log_figure(figure, "roc_curve.png")
    plt.close()

In [10]:
def run_optuna_study(X_train, y_train, X_val, y_val, n_trials=100, n_jobs=6, cv=3):
    """Tune XGBoost hyperparameters with Optuna, maximising mean CV ROC-AUC.

    Every trial opens its own MLflow run, logs the sampled parameters and the
    mean cross-validated ROC-AUC (``cv_roc_auc``). After the search, the best
    parameters and ``best_cv_roc_auc`` are logged to the run that is active in
    the calling thread.

    Args:
        X_train: Features used for cross-validation.
        y_train: Labels used for cross-validation.
        X_val: Accepted for interface compatibility; not used by the search.
        y_val: Accepted for interface compatibility; not used by the search.
        n_trials: Number of Optuna trials (default 100).
        n_jobs: Number of parallel Optuna workers (default 6).
        cv: Cross-validation setting passed to ``cross_val_score`` (default 3).

    Returns:
        The completed ``optuna`` study.
    """
    # Capture the caller's run once, in the calling thread. With n_jobs != 1
    # the objective runs in Optuna worker threads.
    # NOTE(review): recent MLflow versions track the active run per thread, so
    # `nested=True` alone may not attach trial runs to this parent -- confirm
    # against the installed MLflow version. Setting the parent-run tag
    # explicitly keeps the linkage either way.
    parent_run = mlflow.active_run()
    trial_tags = (
        {"mlflow.parentRunId": parent_run.info.run_id}
        if parent_run is not None
        else None
    )

    def objective(trial):
        with mlflow.start_run(nested=True, tags=trial_tags):
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
                'random_state': 42,
                'eval_metric': 'logloss',
                'device': 'cuda',
                'n_jobs': -1
            }

            mlflow.log_params(params)
            model = XGBClassifier(**params)
            scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
            mean_score = scores.mean()
            mlflow.log_metric("cv_roc_auc", mean_score)
            return mean_score

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)

    # Summarise the search on the calling thread's active run.
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_cv_roc_auc", study.best_value)

    return study

In [11]:
def training_pipeline(config):
    """Run tuning, final training and test evaluation for one MLflow run.

    Steps:
        1. Resolve and log the upstream run IDs (once).
        2. Load the resampled training data, the selected-feature data and the
           processed targets from those runs.
        3. Append the validation split to the training data.
        4. Tune hyperparameters, fit the final model, log feature importance.
        5. Evaluate on the test split and log report, confusion matrix and
           ROC curve.

    Args:
        config: Mapping providing ``"dataset"``, ``"artifacts"["data"]`` (with
            ``"resampled"`` and ``"selected"`` entries) and the keys read by
            ``log_lineage_info``.
    """
    # Resolve lineage exactly once. Repeating the "latest run" lookup after the
    # long tuning step could pick up different upstream runs than the ones the
    # model was trained on, and would log the lineage params a second time.
    fs_run_id, preprocessing_run_id, resampling_run_id = log_lineage_info(config)

    dataset = config["dataset"]
    data_artifacts = config["artifacts"]["data"]

    resampled_data = get_data(resampling_run_id, dataset, data_artifacts["resampled"])
    resampled_targets = get_targets(resampling_run_id, dataset, "resampled")
    # The same selected-feature data and processed targets supply both the
    # validation and the test split, so each is fetched only once.
    selected_data = get_data(fs_run_id, dataset, data_artifacts["selected"])
    processed_targets = get_targets(preprocessing_run_id, dataset, "processed")

    X_val, y_val = selected_data["X_val"], processed_targets["y_val"]
    X_test, y_test = selected_data["X_test"], processed_targets["y_test"]

    # The validation split is appended to the resampled training data before
    # tuning and final training.
    X_train = pd.concat([resampled_data["X_train_resampled"], X_val], ignore_index=True)
    y_train = pd.concat([resampled_targets["y_train"], y_val], ignore_index=True)

    # Hyperparameter optimization
    study = run_optuna_study(X_train, y_train, X_val, y_val)

    # Final model training
    best_model = train_final_model(study.best_params, X_train, y_train)
    log_feature_importance(best_model)

    # Model evaluation on the test split
    y_pred, _y_proba, _clf_report, fpr, tpr, roc_auc = evaluate_model(best_model, X_test, y_test)
    log_classification_report(y_test, y_pred)
    log_confusion_matrix(y_test, y_pred)
    log_roc_curve(fpr, tpr, roc_auc)

In [12]:
if __name__ == "__main__":
    # Experiment and run names for the training stage come from the config
    # loaded into CONFIG.
    experiment_name = CONFIG["experiment_names"]["training"]
    run_name = CONFIG["run_names"]["training"]
    configure_mlflow(experiment_name)
    # Autologging is switched on before the run is opened.
    mlflow.sklearn.autolog()

    # training_pipeline executes inside this run; the run ID printed below is
    # read from the run that is active when the pipeline returns.
    with mlflow.start_run(run_name=run_name):
        training_pipeline(CONFIG)
        print("Training pipeline completed. Run ID:", mlflow.active_run().info.run_id)

[I 2025-02-07 20:16:14,618] A new study created in memory with name: no-name-d3f6d03f-5204-4ddb-bc89-aa34b031f18d
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2025-02-07 20:16:28,916] Trial 1 finished with value: 0.9973070552457171 and parameters: {'n_estimators': 266, 'max_depth': 3, 'learning_rate': 0.024957631761648467, 'subsample': 0.8999146578578006, 'colsample_bytree': 0.9338081759179531, 'gamma': 9.190956623697607e-08, 'min_child_weight': 9, 'reg_alpha': 3.2382641837282435e-05, 'reg_lambda': 1.4129881616454519e-05}. Best is trial 1 with value: 0.9973070552457171.
[I 2025-02-07 20:16:31,020] Trial 3 finished with value: 0.9997223887562104 and parameters: {'n_estimators': 115, 'max_depth': 8, 'learning_rate': 0.010550692854049781, 'subsample': 0.6440035535342551, 'colsample_bytree': 0.8950220997971637, 'gamma': 0.0007487699992168593, 'min_child_weight': 5, 'reg_alpha': 

Training pipeline completed. Run ID: 6fa94adeb2e94984a3973034a4025fe5
