In [37]:
import os
import tempfile
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import sys

sys.path.append('..')
from dotenv import load_dotenv
from src.download_data import download_kaggle_dataset

In [38]:
# Populate the process environment before the configuration cell reads
# os.environ (presumably from a local .env file -- confirm where it lives).
load_dotenv()

True

In [39]:
# Configuration management
# Both os.environ[...] lookups raise KeyError when the variable is unset, so
# the notebook fails here rather than later with a half-configured run.
MLFLOW_TRACKING_URI = os.environ["MLFLOW_TRACKING_URI"]
EXPERIMENT_NAME = "CreditCardFraudAnalysis"
# NOTE: mkdtemp creates a brand-new directory every time this cell executes;
# nothing in this notebook removes it afterwards.
ARTIFACT_DIR = tempfile.mkdtemp(prefix='eda_artifacts_')  # Local staging area for plot PNGs
RAW_DATA_DIR = os.environ["RAW_DATA_DIR"]

In [40]:
# Initialize MLflow
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///home/nick/Documents/code/fraud-detection-mlops/mlflow/mlruns/633464287265529281', creation_time=1738782014739, experiment_id='633464287265529281', last_update_time=1738782014739, lifecycle_stage='active', name='CreditCardFraudAnalysis', tags={}>

In [41]:
def generate_plot(
    data: pd.DataFrame,
    numerical_features: list,
    target_column: str,
    plot_type: str,
    plot_title: str,
    plot_kws: dict = None,
    figsize: tuple = (20, 30),
    subplot_layout: tuple = (8, 4)
) -> plt.Figure:
    """
    Build a grid of per-feature seaborn plots, each split by the target column.

    The figure is created through pyplot and is NOT saved or closed here; it is
    left as the current pyplot figure so the caller can save and close it.

    Parameters:
        data (pd.DataFrame): Input dataset
        numerical_features (list): Column names to plot, one subplot each
        target_column (str): Name of target variable column (used as ``hue``
            for kde/hist and as the x axis for box/violin)
        plot_type (str): Type of plot to generate (kde, box, hist, violin)
        plot_title (str): Overall title for the plot matrix
        plot_kws (dict): Extra keyword arguments forwarded to the seaborn
            plotting function. For kde/hist, ``common_norm`` defaults to False
            but may be overridden here.
        figsize (tuple): Figure dimensions (width, height) for a grid with
            ``subplot_layout[0]`` rows; the height is scaled proportionally
            when the grid ends up with a different number of rows
        subplot_layout (tuple): (rows, cols) for subplot arrangement. ``cols``
            is honoured as given; the row count is recomputed as the minimum
            needed to hold every feature.

    Returns:
        plt.Figure: Generated figure object

    Raises:
        ValueError: For unsupported plot types, an empty feature list, or a
            non-positive subplot layout
        RuntimeError: If seaborn/matplotlib fails while drawing a feature (the
            original exception is chained and the figure is closed)
    """
    # Supported plot types. Validation happens before any figure is created so
    # that a bad call does not leave an orphaned figure open.
    plot_functions = {
        'kde': sns.kdeplot,
        'box': sns.boxplot,
        'hist': sns.histplot,
        'violin': sns.violinplot,
    }
    if plot_type not in plot_functions:
        raise ValueError(f"Unsupported plot type: {plot_type}. Choose from {list(plot_functions.keys())}")

    n_features = len(numerical_features)
    if n_features == 0:
        raise ValueError("numerical_features must contain at least one feature to plot")

    layout_rows, cols = subplot_layout
    if layout_rows < 1 or cols < 1:
        raise ValueError(f"subplot_layout must contain positive integers, got {subplot_layout}")

    # Ceiling division: the fewest rows that fit every feature. This shrinks a
    # sparse grid and also grows it when there are more features than cells.
    rows = -(-n_features // cols)

    # Keep the per-row height implied by (figsize, subplot_layout) constant.
    fig = plt.figure(figsize=(figsize[0], figsize[1] * rows / layout_rows))
    fig.suptitle(plot_title, y=1.02, fontsize=14)

    draw = plot_functions[plot_type]
    splits_by_hue = plot_type in ('kde', 'hist')
    if splits_by_hue:
        # Caller-supplied kwargs win, so passing common_norm no longer raises
        # "got multiple values for keyword argument".
        call_kws = {'common_norm': False, **(plot_kws or {})}
    else:
        call_kws = dict(plot_kws or {})

    # Generate subplots
    for position, feature in enumerate(numerical_features, 1):
        try:
            ax = fig.add_subplot(rows, cols, position)
            if splits_by_hue:
                draw(data=data, x=feature, hue=target_column, ax=ax, **call_kws)
            else:
                draw(data=data, x=target_column, y=feature, ax=ax, **call_kws)

            ax.set_title(f"{feature}\n", fontsize=10)
            ax.tick_params(axis='x', labelsize=8)
            ax.tick_params(axis='y', labelsize=8)
        except Exception as e:
            # Close this specific figure (not merely "the current one") and
            # keep the original traceback attached to the re-raised error.
            plt.close(fig)
            raise RuntimeError(f"Error generating {plot_type} plot for {feature}: {str(e)}") from e

    fig.tight_layout()
    return fig

In [42]:
def log_plot(plot_name: str, fig: plt.Figure = None) -> None:
    """
    Save a figure as a timestamped PNG under ARTIFACT_DIR and log it to MLflow.

    Parameters:
        plot_name (str): Base name of the PNG; a ``_YYYYmmddHHMMSS`` suffix is
            appended
        fig (plt.Figure): Figure to save. Defaults to the current pyplot
            figure, which is what callers that omit it rely on.

    The figure is always closed afterwards, even if saving or logging fails,
    so a failed upload does not leave figures accumulating in memory.
    """
    target = fig if fig is not None else plt.gcf()
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    plot_path = os.path.join(ARTIFACT_DIR, f"{plot_name}_{timestamp}.png")
    try:
        target.savefig(plot_path, bbox_inches='tight')
        mlflow.log_artifact(plot_path)
    finally:
        plt.close(target)

def log_dataset_summary(data: pd.DataFrame) -> None:
    """
    Record the dataset's size and missing-value totals on the active MLflow run.

    Logs ``num_rows`` / ``num_columns`` (plus fixed version and source labels)
    as params, then the overall null count and the largest per-column null
    count as metrics.
    """
    n_rows, n_cols = data.shape
    dataset_params = {
        "num_rows": n_rows,
        "num_columns": n_cols,
        "dataset_version": "1.0.0",
        "dataset_source": "kaggle",
    }
    mlflow.log_params(dataset_params)

    # One null count per column; aggregated two ways below.
    nulls_per_column = data.isnull().sum()
    null_metrics = {
        "total_missing_values": nulls_per_column.sum(),
        "max_missing_in_column": nulls_per_column.max(),
    }
    mlflow.log_metrics(null_metrics)

In [43]:
def analyze_target_distribution(data: pd.DataFrame, target_col: str = 'Class') -> None:
    """
    Log class counts for the binary target and a count plot of its distribution.

    Parameters:
        data (pd.DataFrame): Input dataset
        target_col (str): Target column; label 1 is counted as fraud and
            label 0 as non-fraud

    Metrics logged: ``fraud_cases``, ``non_fraud_cases`` and, when at least
    one fraud case exists, ``imbalance_ratio`` (non-fraud / fraud). A class
    that is absent from the data is reported as 0 instead of raising KeyError,
    and the ratio is omitted rather than dividing by zero.
    """
    target_dist = data[target_col].value_counts()
    # .get() tolerates a missing label (e.g. a filtered sample with no fraud).
    fraud_cases = int(target_dist.get(1, 0))
    non_fraud_cases = int(target_dist.get(0, 0))

    metrics = {
        "fraud_cases": fraud_cases,
        "non_fraud_cases": non_fraud_cases,
    }
    if fraud_cases > 0:
        metrics["imbalance_ratio"] = non_fraud_cases / fraud_cases
    mlflow.log_metrics(metrics)

    # plt.subplots makes the new figure current, which is what log_plot saves.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x=target_col, data=data, ax=ax)
    ax.set_title('Class Distribution (0: Normal, 1: Fraud)')
    log_plot("class_distribution")

def log_feature_analysis(data: pd.DataFrame, numerical_features: list, target_col: str = 'Class') -> None:
    """
    Log a correlation heatmap, IQR outlier counts and per-feature plot grids.

    Parameters:
        data (pd.DataFrame): Input dataset
        numerical_features (list): Numerical column names to analyse
        target_col (str): Target column, included in the correlation matrix
            and used to split the per-feature plots

    Logged to the active MLflow run:
        - ``correlation_matrix`` heatmap over the features plus the target
        - one ``outliers_<feature>`` metric per feature: the number of values
          outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
        - ``boxplots``, ``kdes`` and ``violins`` plot grids

    If ``numerical_features`` is empty, only the correlation heatmap is
    logged; there is nothing to compute outliers or draw grids for.
    """
    # Correlation analysis
    corr_matrix = data[numerical_features + [target_col]].corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Matrix')
    log_plot("correlation_matrix")

    if not numerical_features:
        return

    # Outlier analysis using IQR, computed for all columns at once rather
    # than one quantile call pair per feature.
    features = data[numerical_features]
    q1 = features.quantile(0.25)
    q3 = features.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    # axis='columns' aligns each per-feature bound with its own column.
    is_outlier = (
        features.lt(q1 - fence, axis='columns')
        | features.gt(q3 + fence, axis='columns')
    )
    outlier_counts = is_outlier.sum()
    mlflow.log_metrics({f"outliers_{name}": int(count) for name, count in outlier_counts.items()})

    # Per-feature plot grids: (seaborn plot type, figure title, artifact name).
    # log_plot saves and closes the figure generate_plot leaves current.
    plot_specs = (
        ('box', 'Feature Boxplots', "boxplots"),
        ('kde', 'Feature KDEs', "kdes"),
        ('violin', 'Feature Violin Plots', "violins"),
    )
    for plot_type, title, artifact_name in plot_specs:
        generate_plot(data, numerical_features, target_col, plot_type, title)
        log_plot(artifact_name)

In [44]:
def main():
    """
    Run the full EDA inside a single MLflow run named "EDA_Run".

    Steps, in order: log library versions, download the Kaggle dataset and log
    the raw CSV as an artifact, load it, log the dataset summary, the target
    distribution and the feature analysis over all float64 columns, then tag
    the run.
    """
    with mlflow.start_run(run_name="EDA_Run"):
        # Environment details, for reproducing the run later
        environment_params = {
            "python_version": sys.version,
            "pandas_version": pd.__version__,
            "mlflow_version": mlflow.__version__,
        }
        mlflow.log_params(environment_params)

        # Fetch the dataset and keep a copy of the raw file with the run
        dataset_dir = download_kaggle_dataset('mlg-ulb/creditcardfraud', RAW_DATA_DIR)
        csv_path = os.path.join(dataset_dir, 'creditcard.csv')
        mlflow.log_artifact(csv_path, "raw_data")

        data = pd.read_csv(csv_path)

        log_dataset_summary(data)
        analyze_target_distribution(data)

        # Only float64 columns are treated as numerical features
        float_columns = data.select_dtypes(include=['float64']).columns
        log_feature_analysis(data, float_columns.tolist())

        # Tags make the run easy to find and filter in the MLflow UI
        run_tags = {
            "task_type": "eda",
            "dataset": "creditcard-fraud",
            "team": "fraud-detection",
        }
        mlflow.set_tags(run_tags)

In [45]:
# Entry point: run the whole EDA pipeline, then report where the local plot
# files were written.
if __name__ == "__main__":
    main()
    # ARTIFACT_DIR is the temporary staging directory that log_plot writes PNGs
    # into before handing them to mlflow.log_artifact.
    print("MLflow run completed. Artifacts stored in:", ARTIFACT_DIR)

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
Dataset downloaded to directory: /home/nick/Documents/code/fraud-detection-mlops/data/raw
MLflow run completed. Artifacts stored in: /tmp/eda_artifacts_z0r6fxb0
