In [7]:
import sys
import os
import json
import tempfile
import mlflow
import pandas as pd
import joblib
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mlflow.models.signature import infer_signature

# Add the parent directory to the import path so the project-local `src` package
# resolves (presumably the notebook lives one level below the project root — confirm).
sys.path.append('..')
from dotenv import load_dotenv
from src.mlflow_init import configure_mlflow

# Populate os.environ from a .env file (if one is found) before RAW_DATA_DIR
# is read from the environment in the next cell.
load_dotenv()

True

In [8]:
# Directory that data_pipeline() reads 'creditcard.csv' from.
# Indexing os.environ raises KeyError right here if RAW_DATA_DIR is unset.
RAW_DATA_DIR = os.environ["RAW_DATA_DIR"]

In [17]:
def log_data_artifacts(data: pd.DataFrame, artifact_path: str, sample_size: int = 1000) -> None:
    """Log a CSV sample of ``data`` and its table schema as MLflow artifacts.

    Two files are written to a temporary directory and logged under
    ``artifact_path``: ``data_sample.csv`` (a random row sample) and
    ``data_schema.json`` (the output of ``build_table_schema``).

    Args:
        data: Dataset to sample and describe.
        artifact_path: Run-relative artifact directory both files are logged under.
        sample_size: Maximum number of rows written to the sample file. It is
            capped at ``len(data)`` because ``DataFrame.sample`` raises
            ``ValueError`` when asked for more rows than exist (sampling
            without replacement), which previously broke small datasets.
    """
    n_rows = min(sample_size, len(data))

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Random sample of the raw dataset (index dropped from the CSV).
        sample_path = os.path.join(tmp_dir, "data_sample.csv")
        data.sample(n_rows).to_csv(sample_path, index=False)
        mlflow.log_artifact(sample_path, artifact_path)

        # Schema is built from the full frame, not from the sample.
        schema = pd.io.json.build_table_schema(data)
        schema_path = os.path.join(tmp_dir, "data_schema.json")
        with open(schema_path, "w") as outfile:
            json.dump(schema, outfile, indent=2)

        # Log only after the file handle is closed so the content is flushed.
        mlflow.log_artifact(schema_path, artifact_path)

def log_split_metrics(
    datasets: dict,
    test_size: float = 0.2,
    val_size: float = 0.15,
    random_state: int = 42,
    stratify: bool = True,
) -> None:
    """Log per-split sizes as metrics and the split configuration as params.

    Args:
        datasets: Mapping of split name to a ``(data, dtype, role)`` tuple;
            only ``data.shape`` is read here.
        test_size: Value logged as the ``test_size`` param.
        val_size: Value logged as the ``val_size`` param.
        random_state: Value logged as the ``random_state`` param.
        stratify: Value logged as the ``stratify`` param.

    The keyword defaults reproduce the values that used to be hard-coded, so
    existing callers log exactly the same params. Callers that split with
    different settings should pass them in so the logged params stay truthful.
    """
    metrics = {}
    for name, (data, _, _) in datasets.items():
        shape = data.shape
        # Cast to plain int so the metric value does not depend on the shape's numeric type.
        metrics[f"{name}_samples"] = int(shape[0])
        # 1-D inputs (e.g. a target Series) have no feature dimension to report.
        if len(shape) > 1:
            metrics[f"{name}_features"] = int(shape[1])

    mlflow.log_metrics(metrics)
    mlflow.log_params({
        "test_size": test_size,
        "val_size": val_size,
        "random_state": random_state,
        "stratify": stratify
    })

def log_processed_data(datasets: dict, base_path: str) -> None:
    """Write each dataset to a parquet file and log it as an MLflow artifact.

    Args:
        datasets: Mapping of split name to a ``(data, dtype, role)`` tuple.
            ``data`` is anything ``pd.DataFrame`` accepts; the file is named
            ``<name>.parquet`` and logged under ``<base_path>/<role>/<dtype>``.
        base_path: Run-relative artifact directory that all files go under.
    """
    # Function-scope import: artifact paths are built with '/' separators
    # regardless of the host OS, unlike os.path.join.
    import posixpath

    with tempfile.TemporaryDirectory() as tmp_dir:
        for name, (data, dtype, role) in datasets.items():
            # ndarray input gets integer column labels from pd.DataFrame; the pandas
            # parquet docs list non-string column names as unsupported, so labels
            # are stringified. rename() returns a new frame, leaving `data` untouched.
            frame = pd.DataFrame(data).rename(columns=str)

            file_path = os.path.join(tmp_dir, f"{name}.parquet")
            frame.to_parquet(file_path)
            mlflow.log_artifact(file_path, posixpath.join(base_path, role, dtype))

In [14]:
def prepare_datasets(X_train, X_val, X_test, y_train, y_val, y_test) -> dict:
    """Bundle the six split objects into a name -> (data, kind, role) mapping.

    Keys are ``X_train``, ``X_val``, ``X_test``, ``y_train``, ``y_val``,
    ``y_test`` (in that order). Each value is a 3-tuple of the object passed
    in, the label ``'Features'`` or ``'Target'``, and the role ``'training'``,
    ``'validation'`` or ``'testing'``.
    """
    suffixes = ('train', 'val', 'test')
    roles = ('training', 'validation', 'testing')
    groups = (
        ('X', 'Features', (X_train, X_val, X_test)),
        ('y', 'Target', (y_train, y_val, y_test)),
    )

    # Outer loop over X/y, inner loop over splits, so all X_* keys come first.
    return {
        f"{prefix}_{suffix}": (payload, kind, role)
        for prefix, kind, payloads in groups
        for suffix, role, payload in zip(suffixes, roles, payloads)
    }

In [15]:
def data_pipeline(raw_data_dir: str) -> None:
    """End-to-end data preparation pipeline with MLflow tracking.

    Reads ``creditcard.csv`` from ``raw_data_dir``, splits it into stratified
    train/validation/test sets, fits a ``StandardScaler`` on the training
    features only, and records tags, params, metrics, the scaler model and the
    scaled splits through the MLflow fluent API.

    Args:
        raw_data_dir: Directory containing ``creditcard.csv``. The file must
            provide the ``Class`` and ``Time`` columns used below.
    """
    # Set experiment metadata
    mlflow.set_tags({
        "stage": "preprocessing",
        "data_version": "1.0.0",
        "dataset_type": "tabular",
        "task": "classification"
    })

    # Load and log raw data
    file_path = os.path.join(raw_data_dir, 'creditcard.csv')
    data = pd.read_csv(file_path)
    log_data_artifacts(data, "data/raw")

    # Log dataset characteristics.
    # `class_ratio` previously held the raw count of Class == 1 rows; it is now
    # the fraction its name promises, and the count gets its own param.
    is_positive = data['Class'] == 1
    mlflow.log_params({
        "num_samples": data.shape[0],
        "original_features": data.shape[1],
        "class_ratio": float(is_positive.mean()),
        "positive_class_count": int(is_positive.sum())
    })

    # Split data: `Class` is the target and `Time` is dropped from the features.
    X = data.drop(columns=['Class', 'Time'])
    y = data['Class']

    # The sizes and seed below are the same values log_split_metrics records as params.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # 0.15 is taken from the train+val remainder, not from the full dataset.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.15, stratify=y_train_val, random_state=42
    )

    # Process and log splits
    datasets = prepare_datasets(X_train, X_val, X_test, y_train, y_val, y_test)
    log_split_metrics(datasets)

    # Data scaling: fit on the training split only, then apply to val/test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Log scaler as MLflow model. The already-computed scaled training array is
    # reused as the signature's output example instead of transforming again.
    signature = infer_signature(X_train, X_train_scaled)
    mlflow.sklearn.log_model(
        sk_model=scaler,
        artifact_path="scaler",
        signature=signature,
        registered_model_name="CreditCardScaler"
    )

    # Log processed datasets
    scaled_datasets = prepare_datasets(
        X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, y_test
    )
    log_processed_data(scaled_datasets, "data/processed")

    # Log environment details
    mlflow.log_params({
        "pandas_version": pd.__version__,
        "sklearn_version": sklearn.__version__,
        "mlflow_version": mlflow.__version__
    })

    # Data validation checks.
    # `test_negative_samples` previously counted y_test == 1 (the positive
    # class); each key now counts the class it names. Values are cast to plain
    # ints so the metric payload does not carry numpy scalars.
    mlflow.log_metrics({
        "train_nan_count": int(pd.DataFrame(X_train_scaled).isna().sum().sum()),
        "test_positive_samples": int((y_test == 1).sum()),
        "test_negative_samples": int((y_test == 0).sum())
    })

In [16]:
if __name__ == "__main__":
    # Configure MLflow for this experiment before any run is started.
    configure_mlflow("CreditCardFraudPreprocessing")

    with mlflow.start_run(run_name="data_preprocessing") as run:
        data_pipeline(RAW_DATA_DIR)
        # Use the handle bound by the `with` statement rather than looking the
        # run up again through global state via mlflow.active_run().
        print("Data pipeline completed. Run ID:", run.info.run_id)

Registered model 'CreditCardScaler' already exists. Creating a new version of this model...
Created version '14' of model 'CreditCardScaler'.


/tmp/tmpiuth4i_m/X_train.parquet data/processed/training/Features
/tmp/tmpiuth4i_m/X_val.parquet data/processed/validation/Features
/tmp/tmpiuth4i_m/X_test.parquet data/processed/testing/Features
/tmp/tmpiuth4i_m/y_train.parquet data/processed/training/Target
/tmp/tmpiuth4i_m/y_val.parquet data/processed/validation/Target
/tmp/tmpiuth4i_m/y_test.parquet data/processed/testing/Target
Data pipeline completed. Run ID: 6d7d2b94dd084dfe8c97692ef7b1414f
