In [1]:
import os
import sys
from datetime import datetime
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns

sys.path.append('..')
from src.download_data import download_kaggle_dataset
from src.mlflow_utils import configure_mlflow, load_config

In [2]:
# Project configuration returned by `load_config`; read by the cells below
# (experiment/run names and the argument passed to `eda_pipeline`).
CONFIG = load_config()

In [3]:
# Type aliases used in annotations throughout the notebook.
FeatureList = List[str]  # a list of column names
# `typing.Any` is the "any type" marker; the lowercase builtin `any` is a
# function, not a type, and must not be used as a type argument.
PlotConfig = Dict[str, Any]

In [4]:
class EDAAnalyzer:
    """Exploratory-data-analysis helper bound to one DataFrame and one config.

    Plotting methods return matplotlib figures (they do not show or save
    them); the computation methods return plain dictionaries / DataFrames.

    Attributes:
        config: The configuration mapping passed to the constructor.
        data: The DataFrame being analysed.
        numerical_features: Names of the float-typed columns of ``data``.
        categorical_features: Names of the object/category columns of ``data``.
        target_col: Value of ``config["dataset"]["target_column"]``.
    """

    def __init__(self, config: Dict, data: pd.DataFrame):
        """Store the inputs and derive the feature lists from ``data``.

        Args:
            config: Mapping that must contain ``["dataset"]["target_column"]``.
            data: Dataset to analyse.

        Raises:
            KeyError: If the target column entry is missing from ``config``.
        """
        self.config = config
        self.data = data
        self.numerical_features = self._identify_numerical_features()
        self.categorical_features = self._identify_categorical_features()
        self.target_col = config["dataset"]["target_column"]

    def _identify_numerical_features(self) -> FeatureList:
        """Return the names of the float-typed columns.

        Only float columns are selected, so integer-typed columns (for
        example an integer-encoded target) are not treated as features.
        """
        return self.data.select_dtypes(include=[float]).columns.tolist()

    def _identify_categorical_features(self) -> FeatureList:
        """Return the names of the object- and category-typed columns."""
        return self.data.select_dtypes(include=['object', 'category']).columns.tolist()

    def _validate_dataset(self) -> None:
        """Check that the dataset is usable.

        Raises:
            ValueError: If the DataFrame is empty or lacks the target column.
        """
        if self.data.empty:
            raise ValueError("Empty dataset loaded")
        if self.target_col not in self.data.columns:
            raise ValueError(f"Target column {self.target_col} not found in dataset")

    def generate_distribution_plots(self, plot_type: str, n_cols: int = 4) -> plt.Figure:
        """Draw one distribution plot per numerical feature on a subplot grid.

        Args:
            plot_type: ``'kde'`` (feature density, coloured by target),
                ``'box'`` or ``'violin'`` (feature against target).
            n_cols: Number of grid columns; rows are added as needed.

        Returns:
            The figure holding the grid. Grid cells without a feature are
            hidden so the figure has no empty frames.

        Raises:
            ValueError: For an unsupported ``plot_type``, ``n_cols < 1`` or
                when there are no numerical features to plot.
            RuntimeError: If drawing any single feature fails; the figure is
                closed first and the original error is chained as the cause.
        """
        plot_config = {
            'kde': {'func': sns.kdeplot, 'kwargs': {'common_norm': False}},
            'box': {'func': sns.boxplot, 'kwargs': {}},
            'violin': {'func': sns.violinplot, 'kwargs': {}},
        }

        if plot_type not in plot_config:
            raise ValueError(f"Unsupported plot type: {plot_type}")
        if n_cols < 1:
            raise ValueError(f"n_cols must be a positive integer, got {n_cols}")
        if not self.numerical_features:
            # plt.subplots would otherwise fail on a zero-row grid with a
            # message that does not point at the real cause.
            raise ValueError("No numerical features available to plot")

        plot_func = plot_config[plot_type]['func']
        plot_kwargs = plot_config[plot_type]['kwargs']

        n_features = len(self.numerical_features)
        n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division
        # squeeze=False keeps `axes` two-dimensional for every grid shape.
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
        )
        axes = axes.flatten()

        for ax, feature in zip(axes, self.numerical_features):
            try:
                if plot_type == 'kde':
                    plot_func(
                        data=self.data,
                        x=feature,
                        hue=self.target_col,
                        ax=ax,
                        **plot_kwargs
                    )
                else:
                    plot_func(
                        data=self.data,
                        x=self.target_col,
                        y=feature,
                        ax=ax,
                        **plot_kwargs
                    )
                ax.set_title(feature, fontsize=10)
                ax.tick_params(labelsize=8)
            except Exception as e:
                # Do not leave a half-drawn figure registered with pyplot.
                plt.close(fig)
                raise RuntimeError(
                    f"Error generating {plot_type} plot for {feature}: {str(e)}"
                ) from e

        # Hide the trailing grid cells that received no feature.
        for unused_ax in axes[n_features:]:
            unused_ax.set_visible(False)

        fig.tight_layout()
        return fig

    def analyze_correlations(self) -> plt.Figure:
        """Return a heatmap figure of the numerical features' correlations."""
        corr_matrix = self.data[self.numerical_features].corr()
        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
        # Title the heatmap axes explicitly rather than pyplot's current axes.
        ax.set_title('Feature Correlation Matrix')
        return fig

    def calculate_outliers(self) -> Dict[str, int]:
        """Count outliers per numerical feature using the 1.5 * IQR rule.

        Returns:
            Mapping of feature name to the number of values lying below
            ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, as Python ints.
        """
        outlier_counts = {}
        for feature in self.numerical_features:
            values = self.data[feature]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            is_outlier = (values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))
            # .sum() yields a numpy integer; convert to match the annotation.
            outlier_counts[feature] = int(is_outlier.sum())
        return outlier_counts

    def generate_target_analysis(self) -> plt.Figure:
        """Return a count-plot figure of the target column's classes."""
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(x=self.target_col, data=self.data, ax=ax)
        ax.set_title('Class Distribution (0: Normal, 1: Fraud)')
        return fig

    def full_analysis(self) -> Dict:
        """Validate the dataset and compute the non-graphical summaries.

        Returns:
            Dictionary with the keys ``target_distribution`` (class counts),
            ``missing_values`` (null count per column), ``outliers`` (see
            ``calculate_outliers``), ``correlation_matrix`` (DataFrame over
            the numeric and boolean columns) and ``statistical_summary``
            (``describe()`` as a dict).

        Raises:
            ValueError: If the dataset is empty or has no target column.
        """
        self._validate_dataset()

        # Correlation is only defined for numeric data. Recent pandas no
        # longer drops other columns silently in DataFrame.corr(), so restrict
        # the frame explicitly instead of failing on text columns.
        numeric_data = self.data.select_dtypes(include=['number', 'bool'])

        return {
            'target_distribution': self.data[self.target_col].value_counts().to_dict(),
            'missing_values': self.data.isnull().sum().to_dict(),
            'outliers': self.calculate_outliers(),
            'correlation_matrix': numeric_data.corr(),
            'statistical_summary': self.data.describe().to_dict()
        }

In [5]:
def log_plot(fig: plt.Figure, plot_name: str, artifact_dir: str) -> None:
    """Save a matplotlib figure as a PNG, log it to MLflow and close it.

    The image is written into a temporary directory under the name
    ``<plot_name>_<YYYYmmddHHMMSS>.png``, logged with ``mlflow.log_artifact``
    and the temporary copy is removed afterwards.

    Args:
        fig: Figure to save; it is closed before this function returns.
        plot_name: Prefix of the artifact file name.
        artifact_dir: Artifact sub-directory passed to ``mlflow.log_artifact``.

    Raises:
        Exception: Whatever ``fig.savefig`` or ``mlflow.log_artifact`` raise;
            the figure is still closed in that case.
    """
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    try:
        with TemporaryDirectory() as tmpdir:
            plot_path = Path(tmpdir) / f"{plot_name}_{timestamp}.png"
            fig.savefig(plot_path, bbox_inches='tight')
            mlflow.log_artifact(str(plot_path), artifact_dir)
    finally:
        # Release the figure even when saving or logging fails, so callers
        # that catch the error and carry on do not accumulate open figures.
        plt.close(fig)

def log_metrics(prefix: str, metrics: Dict) -> None:
    """Send a batch of metrics to MLflow, naming each ``<prefix>_<key>``.

    Args:
        prefix: Text prepended (with an underscore) to every metric key.
        metrics: Mapping of metric key to value.
    """
    prefixed = {}
    for name, value in metrics.items():
        prefixed[f"{prefix}_{name}"] = value
    mlflow.log_metrics(prefixed)

def log_dataset_metadata(data: pd.DataFrame, config: Dict) -> None:
    """Record the dataset's size and provenance on the MLflow run.

    Logs the row/column counts together with the configured version and
    source as params, then sets the configured task and type as tags.

    Args:
        data: Dataset whose ``shape`` is reported.
        config: Mapping providing ``["dataset"]`` with the keys ``version``,
            ``source``, ``task`` and ``type``.
    """
    row_count, column_count = data.shape[0], data.shape[1]
    dataset_cfg = config["dataset"]

    params = {
        "dataset_rows": row_count,
        "dataset_columns": column_count,
        "dataset_version": dataset_cfg["version"],
        "data_source": dataset_cfg["source"],
    }
    mlflow.log_params(params)

    # Tags are assembled only after the params have been logged.
    tags = {
        "dataset_task": dataset_cfg["task"],
        "dataset_type": dataset_cfg["type"],
    }
    mlflow.set_tags(tags)

In [6]:
def eda_pipeline(config: Dict) -> None:
    """Run the complete EDA and log every result to the active MLflow run.

    Steps: download the dataset into a temporary directory, read it, log the
    raw file, compute the summaries with ``EDAAnalyzer``, then log dataset
    metadata, missing-value and outlier metrics, the class-imbalance ratio,
    the target/correlation/distribution plots and the statistical summary.

    Args:
        config: Mapping providing ``["dataset"]["name"]``,
            ``["dataset"]["filename"]``, ``["artifacts"]["data"]["raw"]`` and
            ``["artifacts"]["plots"]``, plus whatever ``EDAAnalyzer`` and
            ``log_dataset_metadata`` read from it.

    Notes:
        * ``class_imbalance_ratio`` is count(0) / count(1). When the target
          does not have both labels the metric is skipped and a tag of the
          same name is set to ``"unavailable"`` instead of failing the run.
        * A failing distribution plot does not stop the pipeline; its error
          message is recorded as the param ``<plot_type>_plot_error``.
    """
    # Error text is shortened before being stored as a param because MLflow
    # enforces a (backend-dependent) length limit on param values and an
    # over-long message would raise inside the best-effort handler below.
    max_error_chars = 500

    # Download and load data
    with TemporaryDirectory() as tmpdir:
        dataset_dir = download_kaggle_dataset(
            config["dataset"]["name"],
            tmpdir
        )
        data_path = Path(dataset_dir) / config["dataset"]["filename"]
        raw_data = pd.read_csv(data_path)

        # Log the raw file while the temporary directory still exists.
        mlflow.log_artifact(str(data_path), config["artifacts"]["data"]["raw"])

    # Initialize analyzer and compute the summaries
    analyzer = EDAAnalyzer(config, raw_data)
    analysis_results = analyzer.full_analysis()

    # Log fundamental metrics
    log_dataset_metadata(raw_data, config)
    log_metrics("missing_values", analysis_results["missing_values"])
    log_metrics("outliers", analysis_results["outliers"])

    # Log target analysis. The ratio is only defined for a 0/1 target with at
    # least one positive sample; otherwise record that it is unavailable.
    target_counts = analysis_results["target_distribution"]
    negatives = target_counts.get(0)
    positives = target_counts.get(1)
    if negatives is not None and positives:
        mlflow.log_metrics({"class_imbalance_ratio": negatives / positives})
    else:
        mlflow.set_tag("class_imbalance_ratio", "unavailable")

    plots_dir = config["artifacts"]["plots"]
    target_plot = analyzer.generate_target_analysis()
    log_plot(target_plot, "class_distribution", plots_dir)

    # Log feature analysis
    correlation_plot = analyzer.analyze_correlations()
    log_plot(correlation_plot, "correlation_matrix", plots_dir)

    for plot_type in ('kde', 'box', 'violin'):
        try:
            fig = analyzer.generate_distribution_plots(plot_type)
            log_plot(fig, f"{plot_type}_plots", plots_dir)
        except Exception as e:
            # Best effort: note the failure and move on to the next plot type.
            mlflow.log_param(f"{plot_type}_plot_error", str(e)[:max_error_chars])

    # Log statistical summary
    with TemporaryDirectory() as tmpdir:
        stats_path = Path(tmpdir) / "statistical_summary.csv"
        pd.DataFrame(analysis_results["statistical_summary"]).to_csv(stats_path)
        mlflow.log_artifact(str(stats_path))

In [7]:
# Entry point: run the EDA pipeline inside a named MLflow run.
if __name__ == "__main__":
    experiment_name = CONFIG["experiment_names"]["eda"]
    run_name = CONFIG["run_names"]["eda"]

    configure_mlflow(experiment_name)

    # The context manager ends the run on exit and marks it FAILED when an
    # exception propagates. Do not call mlflow.end_run() in the handler: with
    # its default status it would close the failed run as FINISHED first.
    with mlflow.start_run(run_name=run_name) as active_run:
        try:
            mlflow.log_dict(CONFIG, "config.yaml")
            eda_pipeline(CONFIG)
            mlflow.set_tag("status", "completed")
        except Exception as e:
            # Shorten the message: MLflow limits param value length and a
            # failure here would mask the original error.
            mlflow.log_param("error", str(e)[:500])
            mlflow.set_tag("status", "failed")
            raise

        print(f"EDA completed. Run ID: {active_run.info.run_id}")

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
Dataset downloaded to directory: /tmp/tmpegswmzf0
EDA completed. Run ID: 6568e6dd938941c1be4d276ec45973b6
