In [1]:
import sys
import os
import json
import tempfile
import mlflow
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mlflow.models.signature import infer_signature

sys.path.append('..')
from src.mlflow_utils import configure_mlflow, load_config, find_latest_run_id_by_experiment_and_stage, get_dataset

True

In [2]:
# Project configuration, loaded once for the whole notebook. The final cell
# reads the preprocessing experiment name from it and passes it to
# data_pipeline().
CONFIG = load_config()

In [3]:
def log_data_artifacts(data: pd.DataFrame, sample_size: int = 1000) -> None:
    """Log a CSV sample of ``data`` and its table schema as MLflow artifacts.

    Two files are written to a temporary directory and logged with
    ``mlflow.log_artifact``: ``data_sample.csv`` (a random sample of rows,
    without the index) and ``data_schema.json`` (the result of
    ``pandas.io.json.build_table_schema``).

    Args:
        data: Dataset to describe.
        sample_size: Maximum number of rows in the CSV sample. The request is
            capped at ``len(data)`` so datasets smaller than the sample size
            are logged in full instead of raising.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Log raw dataset sample. DataFrame.sample raises ValueError when asked
        # for more rows than exist (sampling without replacement), so cap it.
        sample_path = os.path.join(tmp_dir, "data_sample.csv")
        n_rows = min(sample_size, len(data))
        data.sample(n_rows).to_csv(sample_path, index=False)
        mlflow.log_artifact(sample_path)

        # Log dataset schema
        schema = pd.io.json.build_table_schema(data)
        schema_path = os.path.join(tmp_dir, "data_schema.json")
        with open(schema_path, "w") as outfile:
            json.dump(schema, outfile, indent=2)

        # Logged after the file handle is closed so the JSON is fully written.
        mlflow.log_artifact(schema_path)

def log_split_metrics(datasets: dict) -> None:
    """Record the dimensions of every dataset entry as MLflow metrics.

    For each ``name -> (data, kind, split)`` entry, ``<name>_samples`` is the
    first element of ``data.shape``; ``<name>_features`` is the second element
    and is only added when the shape has more than one dimension. All values
    are sent in a single ``mlflow.log_metrics`` call.
    """
    split_stats = {}
    for entry_name, entry in datasets.items():
        payload, _kind, _split = entry
        dims = payload.shape
        split_stats[f"{entry_name}_samples"] = dims[0]
        if len(dims) >= 2:
            split_stats[f"{entry_name}_features"] = dims[1]

    mlflow.log_metrics(split_stats)

def log_processed_data(datasets: dict, base_path: str) -> None:
    """Write each dataset to Parquet and log it under its split's folder.

    Every ``name -> (data, kind, split)`` entry is wrapped in a DataFrame,
    saved as ``<name>.parquet`` in a temporary directory, and logged with
    ``mlflow.log_artifact`` using ``os.path.join(base_path, split)`` as the
    artifact path.
    """
    with tempfile.TemporaryDirectory() as staging_dir:
        for dataset_name, entry in datasets.items():
            payload, _kind, split_label = entry
            parquet_path = os.path.join(staging_dir, f"{dataset_name}.parquet")
            frame = pd.DataFrame(payload)
            frame.to_parquet(parquet_path)
            artifact_dir = os.path.join(base_path, split_label)
            mlflow.log_artifact(parquet_path, artifact_dir)

In [4]:
def prepare_datasets(X_train, X_val, X_test, y_train, y_val, y_test) -> dict:
    """Bundle the six split objects into a name-keyed metadata dictionary.

    Returns a dict whose keys are ``X_train``, ``X_val``, ``X_test``,
    ``y_train``, ``y_val``, ``y_test`` (in that order). Each value is a
    ``(data, kind, split)`` tuple where ``kind`` is ``'Features'`` or
    ``'Target'`` and ``split`` is ``'training'``, ``'validation'`` or
    ``'testing'``.
    """
    suffixes = ('train', 'val', 'test')
    split_labels = ('training', 'validation', 'testing')
    groups = (
        ('X', 'Features', (X_train, X_val, X_test)),
        ('y', 'Target', (y_train, y_val, y_test)),
    )
    return {
        f"{prefix}_{suffix}": (payload, kind, label)
        for prefix, kind, payloads in groups
        for suffix, label, payload in zip(suffixes, split_labels, payloads)
    }

In [5]:
def data_pipeline(config: dict) -> None:
    """End-to-end data preparation pipeline with MLflow tracking.

    Everything is logged through the MLflow fluent API; the notebook calls
    this function inside ``mlflow.start_run(...)``.

    Steps:
        1. Tag the run and record library versions.
        2. Locate the latest EDA run and load the raw dataset from it.
        3. Log a data sample, the schema, and basic dataset characteristics.
        4. Drop ``Class``/``Time`` from the features and make stratified
           train / validation / test splits.
        5. Fit a ``StandardScaler`` on the training split only and apply it
           to all three splits.
        6. Log the scaler as a registered MLflow model, the scaled splits as
           Parquet artifacts, a few validation metrics, and the config.

    Args:
        config: Project configuration. Keys read here:
            ``run_names.{preprocessing, eda}``, ``experiment_names.eda``,
            ``dataset.{type, task}``,
            ``dataset.split.{test_size, val_size, random_state}``,
            ``artifacts.data.{raw, processed}`` and
            ``models.scaler.{name, registered_model_name}``.

    Raises:
        KeyError: If a required config key or the ``Class``/``Time`` columns
            are missing.
    """
    # Set experiment metadata
    mlflow.set_tags({
        "stage": config["run_names"]["preprocessing"],
        "dataset_type": config["dataset"]["type"],
        "task": config["dataset"]["task"]
    })

    # Log environment details
    mlflow.log_params({
        "pandas_version": pd.__version__,
        "sklearn_version": sklearn.__version__,
        "mlflow_version": mlflow.__version__
    })

    eda_run_id = find_latest_run_id_by_experiment_and_stage(
        config["experiment_names"]["eda"],
        config["run_names"]["eda"]
    )

    data = get_dataset(eda_run_id, config["artifacts"]["data"]["raw"])
    log_data_artifacts(data)

    # Log dataset characteristics. "class_ratio" is the fraction of rows with
    # Class == 1 (previously the raw count was logged under this name, and a
    # dataset without class 1 raised KeyError). The count stays available
    # under its own key.
    num_samples = data.shape[0]
    positive_count = int((data['Class'] == 1).sum())
    mlflow.log_params({
        "num_samples": num_samples,
        "original_features": data.shape[1],
        "class_ratio": positive_count / num_samples if num_samples else 0.0,
        "positive_class_count": positive_count
    })

    # Split data
    X = data.drop(columns=['Class', 'Time'])
    y = data['Class']

    split_cfg = config["dataset"]["split"]
    seed = split_cfg["random_state"]

    # Hold out the test set first, sized by test_size ...
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y,
        test_size=split_cfg["test_size"],
        stratify=y,
        random_state=seed
    )
    # ... then carve the validation set out of the remainder, sized by
    # val_size. When val_size is a float it is a fraction of the train+val
    # portion, not of the full dataset.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=split_cfg["val_size"],
        stratify=y_train_val,
        random_state=seed
    )

    # Process and log splits
    datasets = prepare_datasets(X_train, X_val, X_test, y_train, y_val, y_test)
    log_split_metrics(datasets)

    # Data scaling: statistics come from the training split only so that no
    # information from validation/test leaks into the transform.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Log scaler as MLflow model; the already-computed scaled training data
    # serves as the example output for the signature.
    signature = infer_signature(X_train, X_train_scaled)
    mlflow.sklearn.log_model(
        sk_model=scaler,
        artifact_path=config["models"]["scaler"]["name"],
        signature=signature,
        registered_model_name=config["models"]["scaler"]["registered_model_name"]
    )

    # Log processed datasets
    # NOTE(review): the scaled feature sets are passed on as returned by the
    # scaler; with scikit-learn's default output these are NumPy arrays, so
    # the original column names and index are not carried into the Parquet
    # files - confirm downstream consumers rely on positional columns.
    scaled_datasets = prepare_datasets(
        X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, y_test
    )
    log_processed_data(scaled_datasets, config["artifacts"]["data"]["processed"])

    # Add data validation checks. Class == 1 is treated as the positive class;
    # "negative" therefore counts every other label (assumes binary labels -
    # TODO confirm). Values are cast to plain ints for logging.
    mlflow.log_metrics({
        "train_nan_count": int(pd.DataFrame(X_train_scaled).isna().sum().sum()),
        "test_negative_samples": int((y_test != 1).sum()),
        "test_positive_samples": int((y_test == 1).sum())
    })

    # Log configuration
    mlflow.log_dict(config, "preprocessing_config.json")

In [6]:
if __name__ == "__main__":
    # Configure MLflow for the preprocessing experiment named in the config.
    experiment_name = CONFIG["experiment_names"]["preprocessing"]
    configure_mlflow(experiment_name)

    # data_pipeline logs through the fluent API, so everything it records
    # lands in the run opened here.
    with mlflow.start_run(run_name="data_preprocessing") as run:
        data_pipeline(CONFIG)
        # Use the handle bound by the context manager rather than looking the
        # active run up again.
        print("Data pipeline completed. Run ID:", run.info.run_id)

Registered model 'CreditCardScaler' already exists. Creating a new version of this model...
Created version '20' of model 'CreditCardScaler'.


Data pipeline completed. Run ID: 48285a75e94f47f58a1892a74f2b2226
