In [1]:
import os
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import mlflow

In [2]:
# Location of the MLflow tracking store.
# The MLFLOW_TRACKING_URI environment variable takes precedence so the notebook
# can run on other machines / CI without editing this cell; when it is unset we
# fall back to the original local file store.
tracking_uri = os.environ.get(
    "MLFLOW_TRACKING_URI",
    "file:///home/nick/Documents/code/fraud-detection-mlops/mlflow/mlruns",
)
mlflow.set_tracking_uri(tracking_uri)

# All runs from this notebook are grouped under a single experiment.
experiment_name = "CreditCardFraudResampling"
# Kept as the last expression so the cell displays the returned Experiment.
mlflow.set_experiment(experiment_name)

2025/02/06 00:18:10 INFO mlflow.tracking.fluent: Experiment with name 'CreditCardFraudResampling' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/nick/Documents/code/fraud-detection-mlops/mlflow/mlruns/929632483163283303', creation_time=1738754290102, experiment_id='929632483163283303', last_update_time=1738754290102, lifecycle_stage='active', name='CreditCardFraudResampling', tags={}>

In [3]:
# Data directories, all resolved relative to the notebook's working directory.
data_root = '../data'

# Input: feature-selected training matrix (X_train_selected.csv).
feature_selected_data_dir = f'{data_root}/selected'
# Input: training labels (y_train.csv).
processed_data_dir = f'{data_root}/processed'
# Output: resampled training set written by the resampling cell.
resampled_data_dir = f'{data_root}/resampled'

In [5]:
with mlflow.start_run():
    # Resampling hyperparameters are defined once so that the values handed to
    # SMOTE and the values logged to MLflow can never drift apart.
    smote_random_state = 42
    smote_sampling_strategy = 'auto'

    # Load selected features data
    X_train = pd.read_csv(os.path.join(feature_selected_data_dir, 'X_train_selected.csv'))
    y_train = pd.read_csv(os.path.join(processed_data_dir, 'y_train.csv'))

    # Log basic information about the dataset
    mlflow.log_param("num_rows_train", X_train.shape[0])
    mlflow.log_param("num_columns_train", X_train.shape[1])

    # Apply SMOTE to training data only.
    # The labels are flattened to a 1-D array before being passed to SMOTE.
    sm = SMOTE(random_state=smote_random_state, sampling_strategy=smote_sampling_strategy)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train.values.ravel())

    # Log resampling details
    mlflow.log_param("smote_random_state", smote_random_state)
    mlflow.log_param("smote_sampling_strategy", smote_sampling_strategy)
    mlflow.log_metric("original_num_samples", len(y_train))
    mlflow.log_metric("resampled_num_samples", len(y_train_res))

    # Save resampled data.
    # The output directory is created up front: to_csv does not create missing
    # parent directories, so a fresh checkout would otherwise fail here.
    os.makedirs(resampled_data_dir, exist_ok=True)
    X_res_path = os.path.join(resampled_data_dir, 'X_train_res.csv')
    y_res_path = os.path.join(resampled_data_dir, 'y_train_res.csv')
    X_train_res.to_csv(X_res_path, index=False)
    # NOTE(review): the labels are written as an unnamed Series, so the CSV
    # header is the default `0` rather than the original label column name —
    # confirm downstream readers expect this before changing it.
    pd.Series(y_train_res).to_csv(y_res_path, index=False)

    # Log artifacts (the same files that were just written)
    mlflow.log_artifact(X_res_path)
    mlflow.log_artifact(y_res_path)

    # Verify the resampling by comparing class counts before and after
    print(f"Original class distribution: {pd.Series(y_train.values.ravel()).value_counts()}")
    print(f"Resampled class distribution: {pd.Series(y_train_res).value_counts()}")

# Reached only after the run context has exited without raising.
print("MLflow run completed.")

Original class distribution: 0    193333
1       335
Name: count, dtype: int64
Resampled class distribution: 0    193333
1    193333
Name: count, dtype: int64
MLflow run completed.
