In [5]:
import os
from pathlib import Path

import joblib
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
# Project directories, all relative to the notebook's working directory.
raw_data_dir, processed_data_dir, model_dir = (
    '../data/raw/',        # where the input creditcard.csv is read from
    '../data/processed/',  # where the scaled splits and targets are written
    '../models/',          # where the fitted scaler is stored
)

In [7]:
# Point MLflow at the project's local, file-based tracking store.
# An explicit MLFLOW_TRACKING_URI environment variable takes precedence; otherwise the
# store is resolved relative to the working directory (like the data/model directories)
# instead of being pinned to one developer's home directory.
# NOTE(review): assumes the notebook runs one level below the project root, so that
# ../mlflow/mlruns is the same store the previous absolute path pointed to - confirm.
default_tracking_uri = Path('../mlflow/mlruns').resolve().as_uri()
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", default_tracking_uri))

In [8]:
# Select the experiment that the preprocessing runs are grouped under.
# MLflow creates it on first use when no experiment with this name exists yet.
experiment_name = "CreditCardFraudPreprocessing"
mlflow.set_experiment(experiment_name=experiment_name)

2025/02/05 23:58:07 INFO mlflow.tracking.fluent: Experiment with name 'CreditCardFraudPreprocessing' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/nick/Documents/code/fraud-detection-mlops/mlflow/mlruns/651438819155313292', creation_time=1738753087707, experiment_id='651438819155313292', last_update_time=1738753087707, lifecycle_stage='active', name='CreditCardFraudPreprocessing', tags={}>

In [11]:
# Preprocessing run: load the raw CSV, make stratified train/val/test splits, fit a
# StandardScaler on the training split only, and persist the scaler and all splits both
# on disk and as MLflow artifacts of this run.
with mlflow.start_run():
    # Split configuration, named once so both splits below share the same seed.
    SEED = 42
    TEST_FRACTION = 0.2   # share of all rows held out as the test set
    VAL_FRACTION = 0.15   # share of the remaining train+val rows used for validation

    # Make sure the output directories exist before anything is written to them;
    # joblib.dump and DataFrame.to_csv do not create missing parent directories.
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(processed_data_dir, exist_ok=True)

    # Load the data
    file_path = os.path.join(raw_data_dir, 'creditcard.csv')
    data = pd.read_csv(file_path)

    # Log basic information about the dataset
    mlflow.log_param("num_rows", data.shape[0])
    mlflow.log_param("num_columns", data.shape[1])

    # Split features and target: 'Class' is the target, 'Time' is dropped from the features
    X = data.drop(columns=['Class', 'Time'])
    y = data['Class']

    # Initial split (train_val + test), stratified on the target
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y,
        test_size=TEST_FRACTION,
        stratify=y,
        random_state=SEED
    )

    # Split train_val into train and validation, again stratified on the target
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=VAL_FRACTION,
        stratify=y_train_val,
        random_state=SEED
    )

    # Sanity check: the three splits must account for every row exactly once.
    assert len(X_train) + len(X_val) + len(X_test) == len(data), \
        "train/val/test splits do not partition the dataset"

    # Log split sizes
    mlflow.log_metric("train_samples", X_train.shape[0])
    mlflow.log_metric("val_samples", X_val.shape[0])
    mlflow.log_metric("test_samples", X_test.shape[0])

    # Scale the data: statistics are fitted on the training split only and then
    # applied unchanged to validation and test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Save the scaler
    scaler_path = os.path.join(model_dir, 'scaler.pkl')
    joblib.dump(scaler, scaler_path)
    mlflow.log_artifact(scaler_path)

    # Prepare datasets for saving (output filename -> values to write)
    datasets = {
        # Features
        'X_train_scaled.csv': X_train_scaled,
        'X_val_scaled.csv': X_val_scaled,
        'X_test_scaled.csv': X_test_scaled,
        # Targets
        'y_train.csv': y_train,
        'y_val.csv': y_val,
        'y_test.csv': y_test
    }

    # Save datasets and log them as artifacts.
    # The loop variables deliberately do not reuse `data` / `file_path`, so the raw
    # DataFrame and its path are still bound to those names after this cell has run.
    # NOTE(review): the scaled feature arrays are wrapped in a bare DataFrame, so the
    # original feature column names are not written to the CSVs; the on-disk format is
    # kept unchanged here so existing readers of these files are unaffected.
    for filename, split_values in datasets.items():
        out_path = os.path.join(processed_data_dir, filename)
        pd.DataFrame(split_values).to_csv(out_path, index=False)
        mlflow.log_artifact(out_path)

    # Print final dataset shapes
    print(f"Files in {processed_data_dir}:")
    print(os.listdir(processed_data_dir))
    print('\nDataset shapes:')
    print(f"Train: {X_train_scaled.shape} samples")
    print(f"Val:   {X_val_scaled.shape} samples")
    print(f"Test:  {X_test_scaled.shape} samples")

print("MLflow run completed.")

Files in ../data/processed/:
['y_test.csv', '.gitignore', 'X_train_scaled.csv', 'X_val_scaled.csv', 'X_test_scaled.csv', 'y_val.csv', '.gitkeep', 'y_train.csv']

Dataset shapes:
Train: (193668, 29) samples
Val:   (34177, 29) samples
Test:  (56962, 29) samples
MLflow run completed.
