This code is for data cleaning it focuses on field level and model level data 
It is also helpful for developers to clean the record using json plugin file
It supports extensible and modular deisgn 

Framework Overview: the code cleans the records with the help of field(each record) and model(whole dataset) cleaner.


Key Components:

Cleaner Registry -> used to register field and model cleaner get the fields/models,get the params or keys. 


Cleaner Manager -> used to take the cleaning rules from the json file(plugins)


Cleaner Config and Field Cleaner config -> defines config for single field


ModelCleanerConfig -> defines config for multiple fileds or whole dataset at a time


SchemaConfig -> used to define config for schema_name,field and model config as input

Classes


Fields: strips whitecase, converts to lowercase, fill na,type conversion, removes non numeric characters 


Models: drop duplicates, outlier treatment 



In [8]:
import logging
from typing import Any, Callable, Dict, List, Optional
from pathlib import Path
import json
import re
import pandas as pd
from datetime import date
from pydantic import BaseModel, PrivateAttr, create_model, ValidationError, model_validator
import numpy as np
from sklearn.ensemble import IsolationForest

# Initialize logger (if not done globally)
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [9]:
import logging
from typing import Any, Callable, Dict, List, Optional

class CleanerRegistry:
    """
    A registry for managing and retrieving different types of data cleaners,
    categorized into field-level cleaners and model-level cleaners.
    """
    def __init__(self):
        """
        Initializes the CleanerRegistry with dictionaries to store
        cleaners, their required parameters, and any associated metadata.
        """
        # Stores callable functions for field-level cleaners, mapped by their type string.
        self._field_cleaners: Dict[str, Callable] = {}
        # Stores callable functions for model-level cleaners, mapped by their type string.
        self._model_cleaners: Dict[str, Callable] = {}
        # Stores lists of required parameter names for each field cleaner type.
        self._field_cleaner_params: Dict[str, List[str]] = {}
        # Stores lists of required parameter names for each model cleaner type.
        self._model_cleaner_params: Dict[str, List[str]] = {}
        # Stores additional metadata (e.g., descriptions, default values) for field cleaners.
        self._field_cleaner_metadata: Dict[str, Dict[str, Any]] = {}
        # Stores additional metadata (e.g., descriptions, default values) for model cleaners.
        self._model_cleaner_metadata: Dict[str, Dict[str, Any]] = {}

    def register_field_cleaner(
        self,
        cleaner_type: str,
        cleaner_func: Callable[[str, Dict[str, Any]], Callable],
        required_params: List[str] = None,
        **kwargs: Any
    ) -> None:
        """
        Registers a field-level cleaner function with a unique type string.

        If a cleaner of the same type already exists, it will be overwritten.

        Args:
            cleaner_type: A unique string identifier for the cleaner.
            cleaner_func: The callable function that performs the field cleaning.
                          It typically takes a field value and a dictionary of parameters.
            required_params: An optional list of parameter names that this cleaner requires.
            **kwargs: Additional keyword arguments to store as metadata for the cleaner.
        """
        if cleaner_type in self._field_cleaners:
            logging.warning(f"Field cleaner type '{cleaner_type}' already registered. Overwriting.")
        self._field_cleaners[cleaner_type] = cleaner_func
        self._field_cleaner_params[cleaner_type] = required_params or []
        self._field_cleaner_metadata[cleaner_type] = kwargs
        logging.info(f"Registered field cleaner: {cleaner_type}")

    def get_field_cleaner(self, cleaner_type: str) -> Optional[Callable]:
        """
        Retrieves a registered field cleaner function by its type.

        Args:
            cleaner_type: The string identifier of the cleaner to retrieve.

        Returns:
            The callable field cleaner function if found, otherwise None.
        """
        return self._field_cleaners.get(cleaner_type)

    def get_field_params(self, cleaner_type: str) -> List[str]:
        """
        Retrieves the list of required parameters for a given field cleaner type.

        Args:
            cleaner_type: The string identifier of the cleaner.

        Returns:
            A list of strings representing the required parameter names.
            Returns an empty list if the cleaner type is not found.
        """
        return self._field_cleaner_params.get(cleaner_type, [])

    def get_field_cleaner_types(self) -> List[str]:
        """
        Retrieves a list of all registered field cleaner types.

        Returns:
            A list of strings, where each string is a registered field cleaner type.
        """
        return list(self._field_cleaners.keys())

    def register_model_cleaner(
        self,
        cleaner_type: str,
        cleaner_func: Callable[[List[str], Dict[str, Any]], Callable],
        required_params: List[str] = None,
        **kwargs: Any
    ) -> None:
        """
        Registers a model-level cleaner function with a unique type string.

        If a cleaner of the same type already exists, it will be overwritten.

        Args:
            cleaner_type: A unique string identifier for the cleaner.
            cleaner_func: The callable function that performs the model cleaning.
                          It typically takes a list of field values (representing a record/model)
                          and a dictionary of parameters.
            required_params: An optional list of parameter names that this cleaner requires.
            **kwargs: Additional keyword arguments to store as metadata for the cleaner.
        """
        if cleaner_type in self._model_cleaners:
            logging.warning(f"Model cleaner type '{cleaner_type}' already registered. Overwriting.")
        self._model_cleaners[cleaner_type] = cleaner_func
        self._model_cleaner_params[cleaner_type] = required_params or []
        self._model_cleaner_metadata[cleaner_type] = kwargs
        logging.info(f"Registered model cleaner: {cleaner_type}")

    def get_model_cleaner(self, cleaner_type: str) -> Optional[Callable]:
        """
        Retrieves a registered model cleaner function by its type.

        Args:
            cleaner_type: The string identifier of the cleaner to retrieve.

        Returns:
            The callable model cleaner function if found, otherwise None.
        """
        return self._model_cleaners.get(cleaner_type)

    def get_model_params(self, cleaner_type: str) -> List[str]:
        """
        Retrieves the list of required parameters for a given model cleaner type.

        Args:
            cleaner_type: The string identifier of the cleaner.

        Returns:
            A list of strings representing the required parameter names.
            Returns an empty list if the cleaner type is not found.
        """
        return self._model_cleaner_params.get(cleaner_type, [])

    def get_model_cleaner_types(self) -> List[str]:
        """
        Retrieves a list of all registered model cleaner types.

        Returns:
            A list of strings, where each string is a registered model cleaner type.
        """
        return list(self._model_cleaners.keys())

In [10]:
import json
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

# Assuming CleanerRegistry is defined in the same module or imported
# from .cleaner_registry import CleanerRegistry 

logger = logging.getLogger(__name__) # Initialize logger for this module

class CleanerManager():
    """
    Manages the registration and loading of data cleaner plugins.
    It acts as an intermediary, using a CleanerRegistry to store
    and retrieve different types of cleaners.
    """
    def __init__(self, registry: 'CleanerRegistry'): 
        """
        Initializes the CleanerManager with a reference to a CleanerRegistry.

        Args:
            registry: An instance of CleanerRegistry to delegate cleaner
                      registration and retrieval to.
        """
        self.registry = registry

    def register_field_cleaner(
        self,
        cleaner_type: str,
        # cleaner_func is a factory that takes field_name (str) and params (Dict),
        # and returns the actual cleaning function (Callable[[Any], Any]).
        cleaner_func: Callable[[str, Dict[str, Any]], Callable[[Any], Any]],
        required_params: List[str] = None,
        **kwargs: Any
    ) -> None:
        """
        Registers a field-level cleaner factory function with the underlying registry.

        This method is a wrapper around the registry's method, providing a
        consistent interface for cleaner registration.

        Args:
            cleaner_type: A unique string identifier for the cleaner.
            cleaner_func: A factory function that, when called, returns the actual
                          cleaning function for a specific field.
            required_params: An optional list of parameter names that this cleaner requires.
            **kwargs: Additional keyword arguments to store as metadata for the cleaner.
        """
        self.registry.register_field_cleaner(cleaner_type, cleaner_func, required_params, **kwargs)
        logger.info(f"Dynamically registered field cleaner: {cleaner_type}")

    def register_model_cleaner(
        self,
        cleaner_type: str,
        # Signature for model cleaner factory: takes fields (List[str]) and params (Dict),
        # and returns the actual model cleaning function (Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]]).
        cleaner_func: Callable[[List[str], Dict[str, Any]], Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]]],
        required_params: List[str] = None,
        **kwargs: Any
    ) -> None:
        """
        Registers a model-level cleaner factory function with the underlying registry.

        Similar to field cleaner registration, this method provides a wrapper
        for registering model-level cleaners.

        Args:
            cleaner_type: A unique string identifier for the cleaner.
            cleaner_func: A factory function that, when called, returns the actual
                          cleaning function for a list of records/models.
            required_params: An optional list of parameter names that this cleaner requires.
            **kwargs: Additional keyword arguments to store as metadata for the cleaner.
        """
        self.registry.register_model_cleaner(cleaner_type, cleaner_func, required_params, **kwargs)
        logger.info(f"Dynamically registered model cleaner: {cleaner_type}")

    def load_cleaner_plugins(self, config_path: str = "cleaners.json") -> None:
        """
        Loads cleaner configurations from a specified JSON file and registers them.

        This method reads a JSON file that defines field and model cleaners,
        dynamically imports their respective modules and functions, and then
        registers them with the CleanerRegistry.

        Args:
            config_path: The path to the JSON configuration file.
                         Defaults to "cleaners.json".

        Raises:
            ValueError: If the configuration file is invalid JSON or
                        if there are missing required fields in cleaner definitions,
                        or if a specified cleaner module/function cannot be loaded.
        """
        config_file = Path(config_path)
        if not config_file.exists():
            logger.info(f"Cleaner plugin file not found at {config_path}. Skipping plugin loading.")
            return

        try:
            with config_file.open() as f:
                config = json.load(f)
            logger.info(f"Loaded cleaner plugins from {config_path}.")
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in plugin file {config_path}: {e}")
            raise ValueError(f"Invalid JSON in plugin file {config_path}: {e}")

        for cleaner in config.get("field_cleaners", []):
            c_type = cleaner.get("type")
            module_name = cleaner.get("module")
            func_name = cleaner.get("function")
            required_params = cleaner.get("required_params", [])
            description = cleaner.get("description", "")

            if not all([c_type, func_name]):
                logger.error(f"Missing required fields in field cleaner configuration: {cleaner}")
                raise ValueError(f"Missing required fields in field cleaner: {cleaner}")

            try:
                # If no module name is provided, assume the function is in the global scope
                if not module_name:
                    module = globals()
                else:
                    # Dynamically import the module
                    module = __import__(module_name, fromlist=[func_name])

                # Get the cleaner function from the imported module
                cleaner_func = getattr(module, func_name)
                self.registry.register_field_cleaner(
                    c_type, cleaner_func, required_params, description=description
                )
                logger.debug(f"Successfully loaded field cleaner '{c_type}' from {module_name}.{func_name if module_name else 'global scope'}")
            except (ImportError, AttributeError) as e:
                logger.error(f"Failed to load field cleaner '{c_type}' from {module_name}.{func_name}: {e}")
                raise ValueError(f"Failed to load field cleaner '{c_type}' from {module_name}.{func_name}: {e}")

        for cleaner in config.get("model_cleaners", []):
            c_type = cleaner.get("type")
            module_name = cleaner.get("module")
            func_name = cleaner.get("function")
            required_params = cleaner.get("required_params", [])
            description = cleaner.get("description", "")

            if not all([c_type, func_name]):
                logger.error(f"Missing required fields in model cleaner configuration: {cleaner}")
                raise ValueError(f"Missing required fields in model cleaner: {cleaner}")

            try:
                # If no module name is provided, assume the function is in the global scope
                if not module_name:
                    module = globals()
                else:
                    # Dynamically import the module
                    module = __import__(module_name, fromlist=[func_name])

                # Get the cleaner function from the imported module
                cleaner_func = getattr(module, func_name)
                self.registry.register_model_cleaner(
                    c_type, cleaner_func, required_params, description=description
                )
                logger.debug(f"Successfully loaded model cleaner '{c_type}' from {module_name}.{func_name if module_name else 'global scope'}")
            except (ImportError, AttributeError) as e:
                logger.error(f"Failed to load model cleaner '{c_type}' from {module_name}.{func_name}: {e}")
                raise ValueError(f"Failed to load model cleaner '{c_type}' from {module_name}.{func_name}: {e}")

In [None]:
class CleanerConfig(BaseModel):
    """
    Represents the configuration for a single data cleaner.
    This model validates the cleaner type and its parameters against a provided registry.
    """
    # The type of cleaner, e.g., "remove_html_tags", "normalize_whitespace".
    type: str
    # A dictionary of parameters to be passed to the cleaner function. Defaults to an empty dict.
    params: dict = {}

    # Private attribute to store the CleanerRegistry instance.
    # It's not part of the Pydantic model's data fields and won't be serialized.
    _registry: Any = PrivateAttr()

    def __init__(self, registry: Any, **data):
        """
        Initializes the CleanerConfig.

        Args:
            registry: An instance of CleanerRegistry which is used to validate
                      the cleaner type and its required parameters.
            **data: Arbitrary keyword arguments that match the model's fields (type, params).
        """
        super().__init__(**data)
        self._registry = registry
        # Immediately validate the cleaner configuration against the registry upon initialization.
        self.validate_with_registry()  

    def validate_with_registry(self):
        """
        Validates the cleaner's type and parameters against the associated CleanerRegistry.

        Raises:
            ValueError: If the cleaner type is not registered or if
                        any required parameters are missing.
        """
        # Check if the specified cleaner type is registered in the field cleaner types.
        if self.type not in self._registry.get_field_cleaner_types():
            raise ValueError(f"Unknown field cleaner type '{self.type}'")

        # Retrieve the list of required parameters for this cleaner type from the registry.
        required_params = self._registry.get_field_params(self.type)
        # Identify any required parameters that are missing from the provided 'params' dictionary.
        missing = [k for k in required_params if k not in self.params]
        if missing:
            # If there are missing parameters, raise a ValueError.
            raise ValueError(f"Missing params for cleaner '{self.type}': {missing}")


class FieldCleaningConfig(BaseModel):
    """
    Represents the cleaning configuration for a specific field within a data structure.
    It includes the field's name and a list of CleanerConfig instances to apply.
    """
    # The name of the field to which these cleaning operations apply.
    name: str
    # A list of CleanerConfig instances, specifying the sequence of cleaners to apply to the field.
    # Defaults to an empty list, meaning no cleaners are applied by default.
    cleaners: List[CleanerConfig] = []

In [None]:
class ModelCleanerConfig(BaseModel):
    """
    Represents the configuration for a single model-level cleaner.
    This model validates the cleaner type and its parameters against a provided registry.
    """
    # The type of model cleaner, e.g., "deduplicate_records", "merge_fields".
    type: str
    # A list of field names that this model cleaner operates on. Defaults to an empty list.
    fields: List[str] = []
    # A dictionary of parameters to be passed to the model cleaner function. Defaults to an empty dict.
    params: dict = {}

    # Private attribute to store the CleanerRegistry instance.
    # It's not part of the Pydantic model's data fields and won't be serialized.
    _registry: Any = PrivateAttr()

    def __init__(self, registry: Any, **data):
        """
        Initializes the ModelCleanerConfig.

        Args:
            registry: An instance of CleanerRegistry which is used to validate
                      the model cleaner type and its required parameters.
            **data: Arbitrary keyword arguments that match the model's fields (type, fields, params).
        """
        super().__init__(**data)
        self._registry = registry
        # Immediately validate the model cleaner configuration against the registry upon initialization.
        self.validate_with_registry()

    def validate_with_registry(self):
        """
        Validates the model cleaner's type and parameters against the associated CleanerRegistry.

        Raises:
            ValueError: If the model cleaner type is not registered or if
                        any required parameters are missing.
        """
        # Check if the specified cleaner type is registered in the model cleaner types.
        if self.type not in self._registry.get_model_cleaner_types():
            raise ValueError(f"Unknown model cleaner type '{self.type}'")

        # Retrieve the list of required parameters for this model cleaner type from the registry.
        required_params = self._registry.get_model_params(self.type)
        # Identify any required parameters that are missing from the provided 'params' dictionary.
        missing = [k for k in required_params if k not in self.params]
        if missing:
            # If there are missing parameters, raise a ValueError.
            raise ValueError(f"Missing params for model cleaner '{self.type}': {missing}")

In [13]:
class SchemaCleaningConfig(BaseModel):
    schema_name: str
    fields: List[FieldCleaningConfig]
    model_cleaners: List[ModelCleanerConfig] = []

In [14]:
import logging
import re
import pandas as pd
import numpy as np
from datetime import date
from typing import Any, Callable, Dict, List, Optional
from sklearn.ensemble import IsolationForest

# Initialize logger for this module
logger = logging.getLogger(__name__)

# --- Individual Field Cleaner Functions  ---

def create_strip_whitespace_cleaner(field_name: str, params: Dict[str, Any]) -> Callable[[Any], Any]:
    """
    Creates a field cleaner that removes leading and trailing whitespace from string values.

    Args:
        field_name: The name of the field to which this cleaner will be applied (not directly used in the cleaner itself but helpful for context/logging).
        params: A dictionary of parameters for the cleaner (not used for this specific cleaner).

    Returns:
        A callable function that takes a value and returns the value with whitespace stripped if it's a string.
    """
    def strip_whitespace_cleaner(value: Any) -> Any:
        if isinstance(value, str):
            return value.strip()
        return value
    return strip_whitespace_cleaner

def create_to_lowercase_cleaner(field_name: str, params: Dict[str, Any]) -> Callable[[Any], Any]:
    """
    Creates a field cleaner that converts string values to lowercase.

    Args:
        field_name: The name of the field.
        params: A dictionary of parameters (not used for this specific cleaner).

    Returns:
        A callable function that takes a value and returns its lowercase version if it's a string.
    """
    def to_lowercase_cleaner(value: Any) -> Any:
        if isinstance(value, str):
            return value.lower()
        return value
    return to_lowercase_cleaner

def create_fill_na_cleaner(field_name: str, params: Dict[str, Any]) -> Callable[[Any], Any]:
    """
    Creates a field cleaner that replaces NaN or None values with a specified fill value.

    Args:
        field_name: The name of the field.
        params: A dictionary containing the 'value' to fill NaN/None with.

    Returns:
        A callable function that takes a value and returns the fill_value if the input is NaN or None.
    """
    # Extract the fill value from parameters; if not provided, it will be None.
    fill_value = params.get("value")
    def fill_na_cleaner(value: Any) -> Any:
        if pd.isna(value) or value is None:
            return fill_value
        return value
    return fill_na_cleaner

def create_type_conversion_cleaner(field_name: str, params: Dict[str, Any]) -> Callable[[Any], Any]:
    """
    Creates a field cleaner that attempts to convert values to a specified target type.

    Args:
        field_name: The name of the field.
        params: A dictionary containing 'target_type' (e.g., "int", "float", "str", "date").

    Returns:
        A callable function that attempts to convert the input value to the target type.
        Logs a warning and returns the original value if conversion fails.
    """
    # The target type to convert to, must be present in params.
    target_type = params["target_type"]
    def type_conversion_cleaner(value: Any) -> Any:
        try:
            if target_type == "int":
                return int(value)
            elif target_type == "float":
                return float(value)
            elif target_type == "str":
                return str(value)
            elif target_type == "date":
                # Assumes date strings are in ISO format (YYYY-MM-DD)
                return date.fromisoformat(str(value))
        except (ValueError, TypeError):
            logging.warning(f"Could not convert '{value}' to type '{target_type}' for field '{field_name}'.")
            return value
        return value
    return type_conversion_cleaner

def create_remove_non_numeric_cleaner(field_name: str, params: Dict[str, Any]) -> Callable[[Any], Any]:
    """
    Creates a field cleaner that removes all characters except digits and dots from a string.
    Useful for cleaning numeric strings that may contain currency symbols or other non-numeric text.

    Args:
        field_name: The name of the field.
        params: A dictionary of parameters (not used for this specific cleaner).

    Returns:
        A callable function that takes a value and returns a string with only numeric characters and dots.
    """
    def remove_non_numeric_cleaner(value: Any) -> Any:
        if isinstance(value, str):
            return re.sub(r'[^0-9.]', '', value)
        return value
    return remove_non_numeric_cleaner

def income_field_cleaner(value):
    """
    A specific cleaner function for 'Income' values, handling 'k' suffix and general string to float conversion.
    This function is intended to be used directly or wrapped, not as a factory function in the current registry design.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        val = value.strip().lower()
        # Regex to match numbers with optional decimal and 'k' suffix
        match = re.match(r'^(\d+(\.\d+)?)(k)?$', val)
        if match:
            num = float(match.group(1))
            if match.group(3) == 'k': # If 'k' suffix is present, multiply by 1000
                num *= 1000
            return num
        else:
            try:
                return float(val) # Attempt direct conversion if regex doesn't match
            except ValueError:
                logging.warning(f"Cannot convert Income value '{value}' to float.")
                return None
    return None


# --- Record-Level Model Cleaner (Applied to a single record/dictionary)  ---

def create_inconsistent_data_removal_cleaner(fields: List[str], params: Dict[str, Any]) -> Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]:
    """
    Creates a model cleaner that removes an entire record if specific fields
    ('Age', 'Income') contain inconsistent or invalid numeric values (e.g., non-positive).

    Args:
        fields: A list of field names to check for inconsistencies (e.g., ["Age", "Income"]).
        params: A dictionary of parameters (not used for this specific cleaner beyond general context).

    Returns:
        A callable function that takes a single record (dictionary) and returns
        the record if it's consistent, or None if it should be removed.
    """
    def inconsistent_data_removal_cleaner(data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        # This cleaner acts on a single record (dictionary) and can return None to indicate removal.
        if "Age" in data and data["Age"] is not None:
            try:
                # Check if Age is numeric and less than or equal to zero
                if float(data["Age"]) <= 0:
                    logging.debug(f"Removing row due to inconsistent Age: {data.get('Age')}")
                    return None
            except (ValueError, TypeError):
                # If Age cannot be converted to float, consider it invalid for this check
                logging.debug(f"Invalid Age value for removal check: {data.get('Age')}")
                pass

        if "Income" in data and data["Income"] is not None:
            try:
                # Check if Income is numeric and less than or equal to zero
                if float(data["Income"]) <= 0:
                    logging.debug(f"Removing row due to inconsistent Income: {data.get('Income')}")
                    return None
            except (ValueError, TypeError) as e:
                # If Income cannot be converted to float, consider it invalid for this check
                logging.debug(f"Income value invalid for removal check: {data.get('Income')} - {e}")
                pass
        return data # If no inconsistency found, return the original data
    return inconsistent_data_removal_cleaner


# --- Batch Model Cleaner Functions (Applied to a list of records/dictionaries) ---

### **Batch Deduplication Cleaner**


def create_drop_duplicate_rows_cleaner(fields: List[str], params: Dict[str, Any]) -> Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]]:
    """
    Creates a cleaner that removes duplicate records from a list of dictionaries.

    Args:
        fields: A list of keys (column names) to consider when identifying duplicates.
                If empty, all keys will be considered.
        params: A dictionary that can include a 'keep' strategy (e.g., 'first', 'last', False)
                to determine which duplicate to keep. Defaults to 'first'.

    Returns:
        A callable function that takes a list of dictionaries (records) and returns
        a new list with duplicate records removed.
    """
    # Determine the subset of fields to check for duplication. If `fields` is empty, check all columns.
    subset_fields = fields if fields else None
    # Determine the strategy for keeping duplicates (e.g., 'first' occurrence).
    keep_strategy = params.get('keep', 'first')

    def batch_deduplication_cleaner(data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        if not data_list:
            return []

        # Convert list of dicts to a Pandas DataFrame for efficient duplication handling.
        df = pd.DataFrame(data_list)
        initial_rows = df.shape[0] # Record initial row count for logging.
        # Drop duplicates based on the specified subset of fields and keeping strategy.
        df.drop_duplicates(subset=subset_fields, keep=keep_strategy, inplace=True)
        final_rows = df.shape[0] # Record final row count.
        logger.info(f"Batch deduplication removed: {initial_rows - final_rows} rows. {final_rows} rows remaining.")
        # Convert the cleaned DataFrame back to a list of dictionaries.
        return df.to_dict(orient='records')
    return batch_deduplication_cleaner


### **Batch Outlier Treatment Cleaner**


def create_treat_outliers_cleaner(fields: List[str], params: Dict[str, Any]) -> Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]]:
    """
    Creates a cleaner that detects and treats outliers in specified numeric fields within a batch of records.

    Args:
        fields: A list of numeric field names (columns) to apply outlier treatment on.
        params: A dictionary that can include:
            - 'percentile': The percentile threshold for outlier detection (e.g., 95 for top 5% outliers). Defaults to 95.
            - 'random_state': Seed for reproducibility of the Isolation Forest model. Defaults to 42.

    Returns:
        A callable function that takes a list of dictionaries (records) and returns
        a new list with outliers treated based on an auto-selected method (delete, cap, or transform).
    """
    numeric_fields = fields # These are the columns where outlier treatment will be applied.
    percentile = params.get('percentile', 95) # Percentile for defining outlier threshold.
    random_state = params.get('random_state', 42) # Random state for Isolation Forest.

    def batch_outlier_treatment_cleaner(data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        if not data_list:
            return []

        df = pd.DataFrame(data_list)
        cleaned_df = df.copy() # Create a copy to modify

        for col in numeric_fields:
            # Skip if column doesn't exist or is not numeric.
            if col not in cleaned_df.columns or not pd.api.types.is_numeric_dtype(cleaned_df[col]):
                logger.warning(f"Skipping outlier treatment for non-numeric or missing column: {col}")
                continue

            # Ensure sufficient data for outlier detection.
            if len(cleaned_df[col].dropna()) < 2 or cleaned_df[col].nunique() < 2:
                logger.debug(f"Not enough data or unique values for outlier detection in '{col}'. Skipping.")
                continue

            col_data = cleaned_df[col].dropna() # Isolate non-NaN numeric data for outlier detection.
            if col_data.empty:
                continue

            # Reshape data for IsolationForest (requires 2D array).
            X_batch = col_data.values.reshape(-1, 1)

            try:
                # Initialize and fit Isolation Forest model.
                model = IsolationForest(contamination=0.1, random_state=random_state)
                model.fit(X_batch)
                # Get anomaly scores (lower score indicates more anomalous).
                scores = -model.decision_function(X_batch)
                # Determine the threshold for outliers based on the specified percentile.
                threshold = np.percentile(scores, percentile)
                
                # Identify indices of outliers from the original DataFrame.
                outliers_series = pd.Series(scores >= threshold, index=col_data.index)
                outliers = outliers_series[outliers_series].index # Get indices of outliers

                outlier_ratio = len(outliers) / len(col_data) # Calculate ratio of outliers.

                # Auto-select treatment method based on outlier ratio.
                method = "transform" # Default method
                if outlier_ratio < 0.02:
                    method = "delete"
                elif 0.02 <= outlier_ratio < 0.1:
                    method = "cap"
                else:
                    method = "transform" # Fallback for higher ratios

                logger.info(f"Auto-selected outlier treatment method for column '{col}': {method} (Outlier ratio: {outlier_ratio:.2f})")

                if method == "delete":
                    # Remove rows identified as outliers.
                    cleaned_df = cleaned_df.drop(outliers)
                elif method == "cap":
                    # Cap outliers to the 5th and 95th percentile of non-outlier data.
                    non_outlier_vals = cleaned_df.loc[~cleaned_df.index.isin(outliers), col].dropna()
                    if not non_outlier_vals.empty:
                        lower_cap = np.percentile(non_outlier_vals, 5)
                        upper_cap = np.percentile(non_outlier_vals, 95)
                        # Apply capping only to actual outlier values that exceed the caps.
                        cleaned_df.loc[cleaned_df.index.isin(outliers) & (cleaned_df[col] < lower_cap), col] = lower_cap
                        cleaned_df.loc[cleaned_df.index.isin(outliers) & (cleaned_df[col] > upper_cap), col] = upper_cap
                    else:
                         logger.warning(f"Cannot cap column '{col}': no non-outlier values to determine caps.")
                elif method == "transform":
                    # Apply log1p transformation to reduce the impact of large values.
                    # This transformation is applied safely, handling non-numeric or negative values.
                    cleaned_df[col] = cleaned_df[col].apply(lambda x: np.log1p(x) if pd.notna(x) and x >= 0 else x)
            except Exception as e:
                logger.error(f"Error during outlier treatment for column '{col}': {e}", exc_info=True)
                continue

        return cleaned_df.to_dict(orient='records') # Return the cleaned data as a list of dictionaries.
    return batch_outlier_treatment_cleaner

In [15]:
# Initialize Cleaner Registry and Manager
# An instance of CleanerRegistry is created. This object will be responsible
# for storing and managing all the different types of cleaning functions.
cleaner_registry = CleanerRegistry()
# An instance of CleanerManager is created, passing the cleaner_registry to it.
# The manager acts as an interface for registering cleaners and loading them from configurations.
cleaner_manager = CleanerManager(cleaner_registry)

# Register field cleaners
# Each registration includes:
# - A unique 'type' string (e.g., "strip_whitespace").
# - The factory function itself (e.g., `create_strip_whitespace_cleaner`), which, when called
#   with field-specific parameters, will produce the actual cleaning function.
# - A list of `required_params` that the factory function expects in its `params` dictionary.
# - A `description` for better understanding and documentation.
cleaner_manager.register_field_cleaner("strip_whitespace", create_strip_whitespace_cleaner, required_params=[], description="Strips leading/trailing whitespace")
cleaner_manager.register_field_cleaner("to_lowercase", create_to_lowercase_cleaner, required_params=[], description="Converts string to lowercase")
cleaner_manager.register_field_cleaner("fill_na", create_fill_na_cleaner, required_params=["value"], description="Fills missing values with a specified value")
cleaner_manager.register_field_cleaner("type_conversion", create_type_conversion_cleaner, required_params=["target_type"], description="Converts field to target type")
cleaner_manager.register_field_cleaner("remove_non_numeric", create_remove_non_numeric_cleaner, required_params=[], description="Removes non-numeric characters from string")


# Register model cleaners
# Similar to field cleaners, these lines register model-level cleaner factory functions.
# Model cleaners typically operate on entire records or batches of records.
# - "drop_duplicates" uses `create_drop_duplicate_rows_cleaner` to handle batch deduplication.
# - "remove_inconsistent" uses `create_inconsistent_data_removal_cleaner` to remove records
#   based on inconsistencies in specific fields.
# - "treat_outliers" uses `create_treat_outliers_cleaner` for batch outlier detection and treatment.
cleaner_manager.register_model_cleaner("drop_duplicates", create_drop_duplicate_rows_cleaner, required_params=[], description="Drops duplicate rows from the dataset")
cleaner_manager.register_model_cleaner("remove_inconsistent", create_inconsistent_data_removal_cleaner, required_params=[], description="Removes rows with inconsistent data (e.g., age <= 0)")
cleaner_manager.register_model_cleaner("treat_outliers", create_treat_outliers_cleaner, required_params=["fields"], description="Treats outliers in specified numeric fields across the dataset")

2025-06-01 22:23:32,556 - INFO - Registered field cleaner: strip_whitespace
2025-06-01 22:23:32,557 - INFO - Dynamically registered field cleaner: strip_whitespace
2025-06-01 22:23:32,557 - INFO - Registered field cleaner: to_lowercase
2025-06-01 22:23:32,557 - INFO - Dynamically registered field cleaner: to_lowercase
2025-06-01 22:23:32,558 - INFO - Registered field cleaner: fill_na
2025-06-01 22:23:32,558 - INFO - Dynamically registered field cleaner: fill_na
2025-06-01 22:23:32,558 - INFO - Registered field cleaner: type_conversion
2025-06-01 22:23:32,558 - INFO - Dynamically registered field cleaner: type_conversion
2025-06-01 22:23:32,558 - INFO - Registered field cleaner: remove_non_numeric
2025-06-01 22:23:32,559 - INFO - Dynamically registered field cleaner: remove_non_numeric
2025-06-01 22:23:32,559 - INFO - Registered model cleaner: drop_duplicates
2025-06-01 22:23:32,559 - INFO - Dynamically registered model cleaner: drop_duplicates
2025-06-01 22:23:32,560 - INFO - Registere

In [16]:
import logging
from typing import Any, Callable, Dict, List, Optional
from pydantic import BaseModel, create_model, ValidationError, model_validator

# Assuming these classes are defined and imported from their respective modules:
# from .cleaner_registry import CleanerRegistry
# from .config_models import SchemaCleaningConfig, FieldCleaningConfig, ModelCleanerConfig

logger = logging.getLogger(__name__) # Initialize logger for this module

class DynamicCleaner:
    """
    A dynamic data cleaner that applies a series of field-level and model-level
    cleaning operations based on a provided `SchemaCleaningConfig`.
    It leverages Pydantic for dynamic model creation and validation,
    integrating with a CleanerRegistry for cleaner functions.
    """
    def __init__(self, cleaning_config: 'SchemaCleaningConfig', cleaner_registry: 'CleanerRegistry'):
        """
        Initializes the DynamicCleaner.

        Args:
            cleaning_config: An instance of SchemaCleaningConfig detailing
                             the cleaning operations to perform.
            cleaner_registry: An instance of CleanerRegistry containing
                              all available cleaner factory functions.
        """
        self.cleaning_config = cleaning_config
        self.cleaner_registry = cleaner_registry
        # Dynamically creates a Pydantic model based on the cleaning configuration.
        self.cleaning_model = self._create_cleaning_model()

    def _create_cleaning_model(self) -> type:
        """
        Dynamically creates a Pydantic BaseModel that incorporates the specified
        field-level and record-level model cleaning logic.

        This Pydantic model will have fields corresponding to the schema fields
        in `cleaning_config`, and will use a `model_validator(mode='after')`
        to apply the cleaning logic.

        Returns:
            A dynamically created Pydantic BaseModel class configured for cleaning.
        """
        fields = {}
        # Define fields for the dynamic Pydantic model.
        # Each field from the cleaning_config will be a model field, accepting Any type.
        for field_config in self.cleaning_config.fields:
            fields[field_config.name] = (Any, ...) # (type, default_value_or_ellipsis_for_required)

        # Create a base Pydantic model with the defined fields.
        DynamicBaseCleanModel = create_model(self.cleaning_config.schema_name + "CleanBase", **fields)

        def apply_field_cleaners(cls, values: Dict[str, Any]) -> Dict[str, Any]:
            """
            Applies field-level cleaners to a single record's values.
            This function is used within the Pydantic model's validator.
            """
            cleaned_values = values.copy()
            for field_config in self.cleaning_config.fields:
                field_name = field_config.name
                current_value = cleaned_values.get(field_name)
                for cleaner_config in field_config.cleaners:
                    # Retrieve the cleaner factory from the registry.
                    cleaner_func_factory = self.cleaner_registry.get_field_cleaner(cleaner_config.type)
                    if cleaner_func_factory:
                        # Create the actual cleaning function by calling the factory.
                        cleaner_func = cleaner_func_factory(field_name, cleaner_config.params)
                        # Apply the cleaner.
                        cleaned_value = cleaner_func(current_value)
                        logger.debug(f"Applied field cleaner '{cleaner_config.type}' on '{field_name}'. Original: {current_value}, Cleaned: {cleaned_value}")
                        current_value = cleaned_value # Update value for next cleaner in chain.
                cleaned_values[field_name] = current_value # Store the final cleaned value.
            return cleaned_values

        def apply_record_level_model_cleaners(cls, values: Dict[str, Any]) -> Optional[Dict[str, Any]]:
            """
            Applies record-level model cleaners (e.g., those that might remove an entire record).
            This function is used within the Pydantic model's validator.
            """
            cleaned_values = values.copy()
            for model_cleaner_config in self.cleaning_config.model_cleaners:
                # This check assumes 'remove_inconsistent' is designed as a record-level cleaner.
                # For more complex scenarios, you might need a metadata flag (e.g., `is_batch_cleaner: False`)
                # within `ModelCleanerConfig` or the `CleanerRegistry` to differentiate.
                if model_cleaner_config.type == "remove_inconsistent": # Example of a record-level model cleaner
                    cleaner_func_factory = self.cleaner_registry.get_model_cleaner(model_cleaner_config.type)
                    if cleaner_func_factory:
                        # Call the factory to get the specific cleaner function for this record.
                        cleaner_func = cleaner_func_factory(model_cleaner_config.fields, model_cleaner_config.params)
                        # Apply the record-level cleaner.
                        cleaned_values = cleaner_func(cleaned_values)
                        if cleaned_values is None:
                            logger.debug(f"Record-level model cleaner '{model_cleaner_config.type}' caused row removal.")
                            return None # Indicate this record should be discarded
            return cleaned_values

        class CustomCleanModel(DynamicBaseCleanModel):
            """
            A Pydantic model that extends the dynamically created base model
            and includes a `model_validator` to apply cleaning logic.
            """
            @model_validator(mode='after')
            def perform_record_level_cleaning(self) -> Any:
                """
                Pydantic validator that runs after initial model parsing.
                It orchestrates the application of field-level and record-level
                model cleaners.
                """
                data = self.model_dump() # Get the raw data from the Pydantic instance.
                
                # Apply all configured field-level cleaners.
                cleaned_data = apply_field_cleaners(self.__class__, data)

                # Apply record-level model cleaners (e.g., inconsistent data removal).
                cleaned_data = apply_record_level_model_cleaners(self.__class__, cleaned_data)
                
                # If a record-level cleaner returns None, it signals that the record should be removed.
                if cleaned_data is None:
                    # Raise a specific ValueError to be caught by the `clean_record` method,
                    # indicating that the record should be discarded.
                    raise ValueError("RECORD_TO_BE_REMOVED_BY_CLEANER")

                # Update the Pydantic model instance with the cleaned values.
                for key, value in cleaned_data.items():
                    setattr(self, key, value)
                return self

        return CustomCleanModel

    def clean_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Cleans a single data record (dictionary) by applying the configured
        field-level and record-level model cleaners through the dynamic Pydantic model.

        Args:
            record: The dictionary representing a single data record.

        Returns:
            The cleaned record as a dictionary, or None if the record is
            marked for removal by a record-level cleaner.
        """
        try:
            # Validate the record against the dynamically created Pydantic model.
            # This triggers the `perform_record_level_cleaning` validator.
            cleaned_instance = self.cleaning_model.model_validate(record)
            # Return the cleaned data as a dictionary.
            return cleaned_instance.model_dump()
        except ValidationError as e:
            # Log validation errors, typically for data that doesn't conform to Pydantic's
            # type hints after cleaning, though in this dynamic setup, it's less common
            # for basic type issues unless explicit type conversions fail badly.
            logger.error(f"Validation error during cleaning for record: {record}. Errors: {e.errors()}")
            return None
        except ValueError as e:
            # Catch the specific error indicating record removal by a cleaner.
            if str(e) == "RECORD_TO_BE_REMOVED_BY_CLEANER":
                logger.info(f"Record removed by cleaner: {record}")
                return None
            else:
                # Re-raise any other unexpected ValueErrors.
                raise e

    def clean_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Cleans a list of data records by applying both record-by-record
        cleaning (field and record-level model cleaners) and then
        batch-level model cleaners.

        Args:
            data: A list of dictionaries, where each dictionary is a data record.

        Returns:
            A list of cleaned data records.
        """
        # Step 1: Apply record-level field and model cleaners to each record individually.
        cleaned_records_step1 = []
        for i, record in enumerate(data):
            cleaned_record = self.clean_record(record)
            if cleaned_record is not None: # Only add records that were not marked for removal.
                cleaned_records_step1.append(cleaned_record)
        
        logger.info(f"After record-level cleaning, {len(cleaned_records_step1)} out of {len(data)} records remain.")

        # Step 2: Apply batch-level model cleaners to the list of records.
        final_cleaned_data = cleaned_records_step1 # Start with the data from step 1.
        for model_cleaner_config in self.cleaning_config.model_cleaners:
            # Identify batch-level model cleaners. This check needs to be robust.
            # A cleaner way might be to add a `is_batch_cleaner: bool` flag to ModelCleanerConfig.
            if model_cleaner_config.type in ["drop_duplicates", "treat_outliers"]:
                cleaner_func_factory = self.cleaner_registry.get_model_cleaner(model_cleaner_config.type)
                if cleaner_func_factory:
                    # Get the batch cleaner function (it expects a list of records).
                    cleaner_func = cleaner_func_factory(model_cleaner_config.fields, model_cleaner_config.params)
                    
                    # Apply the batch cleaner.
                    initial_count = len(final_cleaned_data)
                    final_cleaned_data = cleaner_func(final_cleaned_data)
                    logger.info(f"Applied batch cleaner '{model_cleaner_config.type}'. Records changed from {initial_count} to {len(final_cleaned_data)}.")

        logging.info(f"Cleaning complete. Final count: {len(final_cleaned_data)} records.")
        return final_cleaned_data

In [17]:
raw_data = [
    {"Name": " Alice ", "Age": 25, "Income": 50000.0, "Email": "ALICE@EXAMPLE.COM"},
    {"Name": "  Bob", "Age": None, "Income": 45000.0, "Email": "bob@example.com"},
    {"Name": "David", "Age": 30, "Income": None, "Email": "DAVID@EXAMPLE.COM"},
    {"Name": "Eve", "Age": 35, "Income": 75000.0, "Email": "EVE@EXAMPLE.COM"},
    {"Name": " Alice ", "Age": 25, "Income": 50000.0, "Email": "ALICE@EXAMPLE.COM"}, # Duplicate for testing
    {"Name": "Frank", "Age": -5, "Income": 80000.0, "Email": "frank@example.com"}, # Inconsistent age (will be removed)
    {"Name": "Grace", "Age": 40, "Income": -100.0, "Email": "grace@example.com"}, # Inconsistent income (will be removed)
    {"Name": "Henry", "Age": 50, "Income": 10000000.0, "Email": "henry@example.com"}, # **OUTLIER INCOME**
    {"Name": "Jane", "Age": 28, "Income": 52000.0, "Email": "jane@example.com"},
    {"Name": "Henry", "Age": 50, "Income": 10000000.0, "Email": "henry@example.com"} # Another duplicate/outlier
]


In [18]:

# --- UPDATED SchemaCleaningConfig to include batch cleaners ---
cleaning_schema_config = SchemaCleaningConfig(
    schema_name="CustomerDataCleaning",
    fields=[
        FieldCleaningConfig(
            name="Name",
            cleaners=[
                CleanerConfig(registry=cleaner_registry, type="strip_whitespace", params={}),
                CleanerConfig(registry=cleaner_registry, type="to_lowercase", params={})
            ]
        ),
        FieldCleaningConfig(
            name="Age",
            cleaners=[
                CleanerConfig(registry=cleaner_registry, type="fill_na", params={"value": 30}),
                CleanerConfig(registry=cleaner_registry, type="type_conversion", params={"target_type": "int"})
            ]
        ),
        FieldCleaningConfig(
            name="Income",
            cleaners=[
                CleanerConfig(registry=cleaner_registry, type="remove_non_numeric", params={}),
                CleanerConfig(registry=cleaner_registry, type="type_conversion", params={"target_type": "float"})
            ]
        ),
        FieldCleaningConfig(
            name="Email",
            cleaners=[
                CleanerConfig(registry=cleaner_registry, type="to_lowercase", params={})
            ]
        )
    ],
    model_cleaners=[
        ModelCleanerConfig(
            registry=cleaner_registry,
            type="remove_inconsistent", # Record-level cleaner
            fields=[],
            params={}
        ),
        ModelCleanerConfig(
            registry=cleaner_registry,
            type="drop_duplicates", # Batch-level cleaner
            fields=["Name", "Email"], # Consider Name and Email for identifying duplicates
            params={"keep": "first"} # Keep the first occurrence
        ),
        ModelCleanerConfig(
            registry=cleaner_registry,
            type="treat_outliers",
            fields=[], # <-- This `fields` can be an empty list for ModelCleanerConfig
            params={
                "fields": ["Income"],
                "percentile": 99
            }
        )
    ]
)




In [19]:
# --- Execute Cleaning ---
dynamic_cleaner = DynamicCleaner(cleaning_schema_config, cleaner_registry)
cleaned_data = dynamic_cleaner.clean_data(raw_data)

2025-06-01 22:23:32,585 - ERROR - Validation error during cleaning for record: {'Name': 'Frank', 'Age': -5, 'Income': 80000.0, 'Email': 'frank@example.com'}. Errors: [{'type': 'value_error', 'loc': (), 'msg': 'Value error, RECORD_TO_BE_REMOVED_BY_CLEANER', 'input': {'Name': 'Frank', 'Age': -5, 'Income': 80000.0, 'Email': 'frank@example.com'}, 'ctx': {'error': ValueError('RECORD_TO_BE_REMOVED_BY_CLEANER')}, 'url': 'https://errors.pydantic.dev/2.11/v/value_error'}]
2025-06-01 22:23:32,586 - ERROR - Validation error during cleaning for record: {'Name': 'Grace', 'Age': 40, 'Income': -100.0, 'Email': 'grace@example.com'}. Errors: [{'type': 'value_error', 'loc': (), 'msg': 'Value error, RECORD_TO_BE_REMOVED_BY_CLEANER', 'input': {'Name': 'Grace', 'Age': 40, 'Income': -100.0, 'Email': 'grace@example.com'}, 'ctx': {'error': ValueError('RECORD_TO_BE_REMOVED_BY_CLEANER')}, 'url': 'https://errors.pydantic.dev/2.11/v/value_error'}]
2025-06-01 22:23:32,587 - INFO - After record-level cleaning, 8 ou

In [20]:

print("\n--- Final Cleaned Data ---")
for record in cleaned_data:
    print(record)


--- Final Cleaned Data ---
{'Name': 'alice', 'Age': 25, 'Income': 50000.0, 'Email': 'alice@example.com'}
{'Name': 'bob', 'Age': 30, 'Income': 45000.0, 'Email': 'bob@example.com'}
{'Name': 'david', 'Age': 30, 'Income': nan, 'Email': 'david@example.com'}
{'Name': 'eve', 'Age': 35, 'Income': 75000.0, 'Email': 'eve@example.com'}
{'Name': 'henry', 'Age': 50, 'Income': 10000000.0, 'Email': 'henry@example.com'}
{'Name': 'jane', 'Age': 28, 'Income': 52000.0, 'Email': 'jane@example.com'}


In [None]:

# --- Load cleaning_schema_config from JSON ---
config_path = "cleaners.json" # Relative path
# If cleaners.json is in a different directory, use:
# config_path = "/path/to/your/configs/cleaners.json"
# or
# from pathlib import Path
# config_path = Path(__file__).parent / "configs" / "cleaners.json" # For a 'configs' subfolder

try:
    with open(config_path, 'r') as f:
        config_data = json.load(f)
    
    # 1. Create the SchemaCleaningConfig from the loaded dictionary
    cleaning_schema_config = SchemaCleaningConfig.model_validate(config_data)
    
    # 2. IMPORTANT: Manually inject the registry into all nested CleanerConfig and ModelCleanerConfig instances
    # This is necessary because Pydantic doesn't know about `_registry` during deserialization from JSON.
    cleaning_schema_config.set_all_cleaners_registry(cleaner_registry)

    logger.info(f"Successfully loaded cleaning schema from '{config_path}'.")

except FileNotFoundError:
    logger.error(f"Error: The file '{config_path}' was not found. Please ensure it exists in the correct directory.")
    exit() # Exit if the config file is crucial and missing
except json.JSONDecodeError as e:
    logger.error(f"Error decoding JSON from '{config_path}': {e}")
    exit()
except ValidationError as e:
    logger.error(f"Validation error when parsing cleaning schema from '{config_path}': {e.errors()}")
    exit()
except Exception as e:
    logger.error(f"An unexpected error occurred while loading cleaning schema: {e}")
    exit()


# --- Execute Cleaning ---
dynamic_cleaner = DynamicCleaner(cleaning_schema_config, cleaner_registry)
cleaned_data = dynamic_cleaner.clean_data(raw_data)

print("\n--- Final Cleaned Data ---")
for record in cleaned_data:
    print(record)

2025-06-01 22:23:32,608 - ERROR - An unexpected error occurred while loading cleaning schema: ModelCleanerConfig.__init__() missing 1 required positional argument: 'registry'
2025-06-01 22:23:32,609 - ERROR - Validation error during cleaning for record: {'Name': 'Frank', 'Age': -5, 'Income': 80000.0, 'Email': 'frank@example.com'}. Errors: [{'type': 'value_error', 'loc': (), 'msg': 'Value error, RECORD_TO_BE_REMOVED_BY_CLEANER', 'input': {'Name': 'Frank', 'Age': -5, 'Income': 80000.0, 'Email': 'frank@example.com'}, 'ctx': {'error': ValueError('RECORD_TO_BE_REMOVED_BY_CLEANER')}, 'url': 'https://errors.pydantic.dev/2.11/v/value_error'}]
2025-06-01 22:23:32,609 - ERROR - Validation error during cleaning for record: {'Name': 'Grace', 'Age': 40, 'Income': -100.0, 'Email': 'grace@example.com'}. Errors: [{'type': 'value_error', 'loc': (), 'msg': 'Value error, RECORD_TO_BE_REMOVED_BY_CLEANER', 'input': {'Name': 'Grace', 'Age': 40, 'Income': -100.0, 'Email': 'grace@example.com'}, 'ctx': {'error


--- Final Cleaned Data ---
{'Name': 'alice', 'Age': 25, 'Income': 50000.0, 'Email': 'alice@example.com'}
{'Name': 'bob', 'Age': 30, 'Income': 45000.0, 'Email': 'bob@example.com'}
{'Name': 'david', 'Age': 30, 'Income': nan, 'Email': 'david@example.com'}
{'Name': 'eve', 'Age': 35, 'Income': 75000.0, 'Email': 'eve@example.com'}
{'Name': 'henry', 'Age': 50, 'Income': 10000000.0, 'Email': 'henry@example.com'}
{'Name': 'jane', 'Age': 28, 'Income': 52000.0, 'Email': 'jane@example.com'}


: 