In [77]:
import os
import sys
import tempfile
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from tempfile import TemporaryDirectory
from pathlib import Path

sys.path.append('..')
from dotenv import load_dotenv
from src.download_data import download_kaggle_dataset
from src.mlflow_utils import configure_mlflow, load_config

load_dotenv()

True

In [78]:
# Load the project configuration once for the whole notebook. Later cells read
# the experiment/run names, dataset settings and artifact paths from it.
CONFIG = load_config()

In [79]:
def generate_plot(
    data: pd.DataFrame,
    numerical_features: list,
    target_column: str,
    plot_type: str,
    plot_title: str,
    plot_kws: dict = None,
    figsize: tuple = (20, 30),
    subplot_layout: tuple = (8, 4)
) -> plt.Figure:
    """
    Build a grid of per-feature plots, one subplot per numerical feature.

    Distribution plots (kde, hist) draw the feature on the x axis and colour
    by the target column; box and violin plots put the target column on the
    x axis and the feature on the y axis.

    The figure is created through pyplot, so on return it is also pyplot's
    current figure. Nothing is written to disk here; the caller is responsible
    for saving and closing the figure.

    Parameters:
        data (pd.DataFrame): Input dataset
        numerical_features (list): Non-empty list of feature columns to plot
        target_column (str): Name of target variable column
        plot_type (str): Type of plot to generate (kde, box, hist, violin)
        plot_title (str): Overall title for the plot matrix
        plot_kws (dict): Extra keyword arguments for the seaborn plotting
            function. For kde/hist plots ``common_norm`` defaults to False
            and may be overridden here.
        figsize (tuple): Figure dimensions (width, height) for a grid with
            ``subplot_layout[0]`` rows; the height is scaled in proportion
            to the number of rows actually needed.
        subplot_layout (tuple): (rows, cols). ``cols`` fixes the grid width;
            the number of rows is derived from the number of features, so the
            grid shrinks or grows to fit every feature.

    Returns:
        plt.Figure: Generated figure object

    Raises:
        ValueError: For unsupported plot types, an empty feature list or a
            non-positive subplot layout
        RuntimeError: If seaborn fails while drawing one of the features
            (the partially drawn figure is closed first)
    """
    # Supported plot types. Validate before creating any figure so that a bad
    # argument does not leave an orphaned figure open.
    plot_functions = {
        'kde': sns.kdeplot,
        'box': sns.boxplot,
        'hist': sns.histplot,
        'violin': sns.violinplot,
    }

    if plot_type not in plot_functions:
        raise ValueError(f"Unsupported plot type: {plot_type}. Choose from {list(plot_functions.keys())}")

    n_features = len(numerical_features)
    if n_features == 0:
        raise ValueError("numerical_features must contain at least one feature to plot")

    layout_rows, cols = subplot_layout
    if layout_rows < 1 or cols < 1:
        raise ValueError(f"subplot_layout must contain positive integers, got {subplot_layout}")

    plot_fn = plot_functions[plot_type]
    plot_kws = plot_kws or {}

    # Ceiling division: the minimum number of rows that fits every feature.
    # This both trims unused rows and adds rows when the requested layout
    # would be too small.
    rows = (n_features + cols - 1) // cols

    # figsize describes a grid of `layout_rows` rows; keep the per-row height
    # constant when the actual number of rows differs.
    fig = plt.figure(figsize=(figsize[0], figsize[1] * rows / layout_rows))
    fig.suptitle(plot_title, y=1.02, fontsize=14)

    # Generate subplots
    for i, feature in enumerate(numerical_features, 1):
        ax = fig.add_subplot(rows, cols, i)

        try:
            if plot_type in ('kde', 'hist'):
                # Normalise each target class independently by default, but
                # let the caller override it without a duplicate-keyword error.
                dist_kws = {'common_norm': False, **plot_kws}
                plot_fn(
                    data=data,
                    x=feature,
                    hue=target_column,
                    ax=ax,
                    **dist_kws
                )
            else:  # 'box' or 'violin'
                plot_fn(
                    data=data,
                    x=target_column,
                    y=feature,
                    ax=ax,
                    **plot_kws
                )

            ax.set_title(f"{feature}\n", fontsize=10)
            ax.tick_params(axis='x', labelsize=8)
            ax.tick_params(axis='y', labelsize=8)

        except Exception as e:
            # Close this specific figure (not merely "the current one") and
            # keep the original exception chained for debugging.
            plt.close(fig)
            raise RuntimeError(f"Error generating {plot_type} plot for {feature}: {str(e)}") from e

    fig.tight_layout()
    return fig

In [80]:
def log_plot(plot_name: str, artifact_dir: str, fig: plt.Figure = None) -> None:
    """
    Save a figure as a timestamped PNG and log it as an MLflow artifact.

    Parameters:
        plot_name (str): Base name of the image file; a
            ``_YYYYmmddHHMMSS`` timestamp and ``.png`` are appended
        artifact_dir (str): Artifact path passed to ``mlflow.log_artifact``
        fig (plt.Figure): Figure to save. Defaults to pyplot's current figure.

    The figure is always closed afterwards, even if saving or logging fails,
    so a failed upload does not leave figures accumulating in memory.
    """
    figure = fig if fig is not None else plt.gcf()
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    try:
        # The file only has to exist until MLflow has copied it.
        with TemporaryDirectory() as tmpdir:
            plot_path = Path(tmpdir) / f"{plot_name}_{timestamp}.png"
            figure.savefig(plot_path, bbox_inches='tight')
            mlflow.log_artifact(str(plot_path), artifact_dir)
    finally:
        plt.close(figure)

def log_dataset_summary(
    data: pd.DataFrame,
    dataset_version: str = "1.0.0",
    dataset_source: str = "kaggle",
) -> None:
    """
    Log basic dataset statistics to the active MLflow run.

    Parameters:
        data (pd.DataFrame): Dataset to summarise
        dataset_version (str): Value logged as the ``dataset_version`` param
        dataset_source (str): Value logged as the ``dataset_source`` param

    Logged params: ``num_rows``, ``num_columns``, ``dataset_version``,
    ``dataset_source``.
    Logged metrics: ``total_missing_values`` (nulls across the whole frame)
    and ``max_missing_in_column`` (nulls in the worst column).
    """
    n_rows, n_columns = data.shape
    mlflow.log_params({
        "num_rows": n_rows,
        "num_columns": n_columns,
        "dataset_version": dataset_version,
        "dataset_source": dataset_source
    })

    missing_per_column = data.isnull().sum()
    # A frame without columns has an empty per-column series whose max() is
    # NaN; report 0 instead. Cast to built-in ints so MLflow receives plain
    # Python numbers rather than numpy scalars.
    max_missing = int(missing_per_column.max()) if len(missing_per_column) else 0
    mlflow.log_metrics({
        "total_missing_values": int(missing_per_column.sum()),
        "max_missing_in_column": max_missing
    })

In [81]:
def analyze_target_distribution(data: pd.DataFrame, artifact_dir: str, target_col: str = 'Class') -> None:
    """
    Log the class balance of the target column and a count plot of it.

    Parameters:
        data (pd.DataFrame): Dataset containing the target column
        artifact_dir (str): MLflow artifact path for the count plot
        target_col (str): Name of the target column; label 1 is counted as
            fraud and label 0 as non-fraud

    Logged metrics: ``fraud_cases``, ``non_fraud_cases`` and
    ``imbalance_ratio`` (non-fraud / fraud). A label that does not occur in
    the data is counted as zero, and ``imbalance_ratio`` is omitted when there
    are no fraud cases because the ratio is undefined.
    """
    target_dist = data[target_col].value_counts()

    # .get avoids a KeyError when one of the classes is absent (e.g. a sample
    # without any fraud rows); int() hands MLflow plain Python numbers.
    fraud_cases = int(target_dist.get(1, 0))
    non_fraud_cases = int(target_dist.get(0, 0))

    metrics = {
        "fraud_cases": fraud_cases,
        "non_fraud_cases": non_fraud_cases,
    }
    if fraud_cases > 0:
        metrics["imbalance_ratio"] = non_fraud_cases / fraud_cases
    mlflow.log_metrics(metrics)

    # plt.subplots makes the new figure pyplot's current figure, which is the
    # one log_plot saves and closes.
    _, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x=target_col, data=data, ax=ax)
    ax.set_title('Class Distribution (0: Normal, 1: Fraud)')
    log_plot("class_distribution", artifact_dir)

In [82]:
def _count_iqr_outliers(values: pd.Series) -> int:
    """Count values lying more than 1.5 * IQR below Q1 or above Q3."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    is_outlier = (values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))
    return int(is_outlier.sum())


def log_feature_analysis(data: pd.DataFrame, numerical_features: list, artifact_dir: str, target_col: str = 'Class') -> None:
    """
    Run the feature-level analysis and log its results to MLflow.

    Parameters:
        data (pd.DataFrame): Dataset containing the features and the target
        numerical_features (list): Feature columns to analyse
        artifact_dir (str): MLflow artifact path for the generated plots
        target_col (str): Name of the target column

    Logged artifacts: a correlation heatmap of the features plus the target,
    then box, KDE and violin plot grids of every feature split by the target.
    Logged metrics: ``outliers_<feature>`` for each feature, counted with the
    1.5 * IQR rule.
    """
    # Correlation analysis. Do not select the target twice if the caller
    # already listed it among the features.
    corr_columns = list(numerical_features)
    if target_col not in corr_columns:
        corr_columns.append(target_col)
    corr_matrix = data[corr_columns].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Matrix')
    log_plot("correlation_matrix", artifact_dir)

    # Outlier analysis using IQR
    mlflow.log_metrics({
        f"outliers_{feature}": _count_iqr_outliers(data[feature])
        for feature in numerical_features
    })

    # Per-feature plot grids: (plot type, figure title, artifact base name).
    # generate_plot leaves its figure as pyplot's current figure, which is
    # what log_plot saves and closes.
    plot_specs = (
        ('box', 'Feature Boxplots', "boxplots"),
        ('kde', 'Feature KDEs', "kdes"),
        ('violin', 'Feature Violin Plots', "violins"),
    )
    for plot_type, title, artifact_name in plot_specs:
        generate_plot(data, numerical_features, target_col, plot_type, title)
        log_plot(artifact_name, artifact_dir)

In [83]:
def eda_pipeline(config: dict):
    """
    Run the full EDA workflow inside the currently active MLflow run.

    Steps: tag the run, log environment versions, download the dataset and
    log the raw file as an artifact, then log the dataset summary, the target
    distribution and the feature analysis.

    Parameters:
        config (dict): Project configuration. The keys read here are
            ``config["dataset"]["name"]``, ``config["dataset"]["filename"]``,
            ``config["artifacts"]["data"]["raw"]`` and
            ``config["artifacts"]["plots"]``.
    """
    # Read every required setting up front so a malformed config fails before
    # anything is downloaded or logged.
    dataset_name = config["dataset"]["name"]
    dataset_filename = config["dataset"]["filename"]
    raw_data_artifact_path = config["artifacts"]["data"]["raw"]
    plot_artifact_path = config["artifacts"]["plots"]

    # Tag first: a run that fails part-way through is still identifiable.
    mlflow.set_tags({
        "task_type": "eda",
        "dataset": "creditcard-fraud",
        "team": "fraud-detection"
    })

    # Log environment details
    mlflow.log_params({
        "python_version": sys.version,
        "pandas_version": pd.__version__,
        "mlflow_version": mlflow.__version__
    })

    # Download, log and load the dataset. The temporary directory is only
    # needed until the CSV has been uploaded and read into memory.
    with TemporaryDirectory() as tmpdir:
        # NOTE(review): download_kaggle_dataset presumably downloads into
        # tmpdir and returns the directory holding the files - confirm.
        dataset_dir = download_kaggle_dataset(dataset_name, tmpdir)
        data_path = os.path.join(dataset_dir, dataset_filename)
        mlflow.log_artifact(data_path, raw_data_artifact_path)
        data = pd.read_csv(data_path)

    # Dataset summary
    log_dataset_summary(data)

    # Target analysis
    analyze_target_distribution(data, plot_artifact_path)

    # Feature analysis on float-typed columns only
    numerical_features = data.select_dtypes(include=[float]).columns.tolist()
    log_feature_analysis(data, numerical_features, plot_artifact_path)

In [84]:
if __name__ == "__main__":
    # Resolve the experiment and run names for the EDA stage from the config.
    experiment_name = CONFIG["experiment_names"]["eda"]
    run_name = CONFIG["run_names"]["eda"]
    configure_mlflow(experiment_name)

    # Everything eda_pipeline logs is attached to this run; the context
    # manager ends the run when the block exits.
    with mlflow.start_run(run_name=run_name) as run:
        eda_pipeline(CONFIG)
        # Use the run handle bound by the `with` statement instead of looking
        # the active run up again through global state.
        print("EDA pipeline completed. Run ID:", run.info.run_id)

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
Dataset downloaded to directory: /tmp/tmpjcrxxhrw
EDA pipeline completed. Run ID: a26cefbe123a44c7bab13858a8d38968
