In [1]:
import os
import sys
import pandas as pd
import numpy as np
import mlflow
from pathlib import Path
from tempfile import TemporaryDirectory
from imblearn.over_sampling import SMOTE

sys.path.append('..')
from src.mlflow_utils import configure_mlflow, find_latest_run_id_by_experiment_and_stage, get_targets, get_data, load_config

In [2]:
# Load the project configuration once via the shared helper from
# src.mlflow_utils; it is handed to resampling_pipeline() when the pipeline runs.
CONFIG = load_config()

In [3]:
def resampling_pipeline(config):
    """Oversample the training split with SMOTE and log the result to MLflow.

    Steps, in order:
      1. Tag the MLflow run with the stage ("resampling") and model type.
      2. Look up the latest feature-selection and preprocessing runs and log
         their run IDs as parameters so the lineage is traceable.
      3. Load the selected training features and the training targets from
         those runs.
      4. Fit SMOTE on the training data only and resample it.
      5. Write the resampled features/targets to parquet files in a temporary
         directory and log them as artifacts under
         ``<artifacts.data.resampled>/training``.
      6. Log the configuration as ``resampling_config.json``.

    All logging goes through the fluent ``mlflow`` API, so this is intended to
    be called inside an active run (as the notebook's final cell does).

    Args:
        config: Mapping that provides
            ``config["experiment_names"]["feature_selection" | "preprocessing"]``,
            ``config["run_names"]["feature_selection" | "preprocessing"]``,
            ``config["dataset"]`` and
            ``config["artifacts"]["data"]["selected" | "resampled"]``.

    Returns:
        None. Results are persisted as MLflow tags, params and artifacts.
    """
    mlflow.set_tags({
        "stage": "resampling",
        "model_type": "SMOTE"
    })

    fs_experiment_name = config["experiment_names"]["feature_selection"]
    fs_run_name = config["run_names"]["feature_selection"]
    preprocessing_experiment_name = config["experiment_names"]["preprocessing"]
    preprocessing_run_name = config["run_names"]["preprocessing"]

    # Find parent runs
    fs_run_id = find_latest_run_id_by_experiment_and_stage(fs_experiment_name, fs_run_name)
    preprocessing_run_id = find_latest_run_id_by_experiment_and_stage(preprocessing_experiment_name, preprocessing_run_name)

    # Log lineage
    mlflow.log_params({
        "feature_selection_run_id": fs_run_id,
        "preprocessing_run_id": preprocessing_run_id
    })

    # Load data from MLflow
    selected_data = get_data(fs_run_id, config["dataset"], config["artifacts"]["data"]["selected"])
    targets = get_targets(preprocessing_run_id, config["dataset"])

    X_train = selected_data["X_train"]
    y_train = targets["y_train"]

    # Apply SMOTE to training data only. The seed is fixed so repeated runs
    # produce the same synthetic samples; targets are flattened to 1-D first.
    sm = SMOTE(random_state=42, sampling_strategy='auto')
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train.values.ravel())

    artifacts = config["artifacts"]

    # The temporary directory only needs to live long enough for MLflow to
    # copy the parquet files into the run's artifact store.
    with TemporaryDirectory() as tmp_dir:
        X_data_path = Path(tmp_dir) / "X_train.parquet"
        y_data_path = Path(tmp_dir) / "y_train.parquet"
        # assumes X_train is a pandas DataFrame so the resampled features
        # expose .to_parquet — TODO confirm against get_data
        X_train_res.to_parquet(X_data_path)
        # The resampled targets are wrapped in a DataFrame for parquet output.
        # NOTE(review): the original target column name is presumably not
        # carried over here — verify downstream readers do not rely on it.
        pd.DataFrame(y_train_res).to_parquet(y_data_path)
        mlflow.log_artifact(X_data_path, f"{artifacts['data']['resampled']}/training")
        mlflow.log_artifact(y_data_path, f"{artifacts['data']['resampled']}/training")

    # Log configuration
    mlflow.log_dict(config, "resampling_config.json")

In [4]:
if __name__ == "__main__":
    # Point MLflow at the resampling experiment before opening the run.
    # NOTE(review): the experiment name is spelled "Resmapling"; it is kept
    # as-is because it is a runtime identifier — confirm before renaming.
    configure_mlflow("CreditCardFraudResmapling")

    # Execute the whole pipeline inside a single named run and report its ID.
    with mlflow.start_run(run_name="resampling") as resampling_run:
        resampling_pipeline(CONFIG)
        completed_run_id = resampling_run.info.run_id
        print("Resampling pipeline completed. Run ID:", completed_run_id)

Resampling pipeline completed. Run ID: 5e493c31d48c461f931a0d5e32fa3e85
