In [None]:
import os
import json
from loguru import logger
from typing import Any, Dict, List

In [8]:
def fetch_all_objects(directory: str) -> List[Dict[str, Any]]:
    """
    Load all JSON files from a specified directory into a list of dictionaries.

    Only regular files located directly inside ``directory`` whose names end
    with ``.json`` are read; sub-directories are not traversed. Files are read
    in file-name order so that repeated runs produce the same list ordering.

    Args:
        directory (str): Path to the directory containing JSON files.

    Returns:
        List[Dict[str, Any]]: Parsed contents of each JSON file, ordered by
        file name. Each element is whatever ``json.load`` returns for that
        file; no check is made here that the top-level value is an object.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.

    Note:
        This function mimics querying data from an API by reading local files.
        In production, replace this logic with actual API calls as needed.
    """
    # The context manager closes the directory handle deterministically.
    # Sorting removes the OS-dependent ordering of os.scandir().
    with os.scandir(directory) as entries:
        json_paths = sorted(
            entry.path
            for entry in entries
            if entry.is_file() and entry.name.endswith(".json")
        )

    json_list = []
    for json_path in json_paths:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(json_path, "r", encoding="utf-8") as json_file:
            json_list.append(json.load(json_file))

    logger.info("Source objects loaded successfully.")
    return json_list

In [19]:
def _validate_attributes(attributes: Dict[str, Any]) -> None:
    """Validate attribute structure and values"""
    if not isinstance(attributes, dict):
        raise ValueError(
            f"Invalid attribute type: expected dict, got {type(attributes)}"
        )

    required_fields = ["attributeName", "attributeValue"]

    if not attributes["data"]:
        raise ValueError(f"Missing `data` in `attributes`")

    attributes_data_keys = attributes["data"][0].keys()
    if not all(k in attributes_data_keys for k in required_fields):
        missing_fields = [k for k in required_fields if k not in attributes_data_keys]

        raise ValueError(f"Missing required fields: {missing_fields}")
    return None

In [10]:
def _validate_acm(acm: Dict[str, Any]) -> bool:
    """Validate the ACM structure"""
    required_fields = ["portion", "banner"]

    if not all(field in acm for field in required_fields):
        missing = [f for f in required_fields if f not in acm]

        logger.warning(f"Missing ACM fields: {missing}")
        return False

    return True

In [11]:
def _validate_required_fields(obj: Dict[str, Any]) -> bool:
    """Validate required object fields"""
    if not obj.get("id"):
        logger.warning("Raw object is missing 'id' attribute")
        return False

    if not _validate_acm(obj.get("acm", {})):
        logger.error(f"Failed ACM validation for object {obj.get('id')}")
        return False

    if not obj.get("attributes"):
        logger.warning(f"No attributes found for object {obj.get('id')}")
        return False

    return True

In [14]:
def preprocess_raw_data(raw_objects: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Preprocess and validate raw objects

    Each object is run through ``_validate_required_fields`` and then
    ``_validate_attributes``. Objects failing either step are logged and
    skipped; the rest are kept unchanged.

    Args:
        raw_objects: List of raw input objects

    Returns:
        List of the input objects that passed every validation step, in input
        order. The objects are the same instances that were passed in (they
        are neither copied nor reformatted).
    """
    processed_data = []

    for obj in raw_objects:
        # 1. First validate required fields
        if not _validate_required_fields(obj):
            continue

        # 2. Then validate attributes
        attributes = obj.get("attributes", {})

        try:
            _validate_attributes(attributes)
        except (ValueError, LookupError, TypeError, AttributeError) as e:
            # Structural problems (missing key, wrong container type) are
            # treated like any other validation failure so that one malformed
            # object is skipped instead of aborting the whole batch.
            logger.error(
                f"Attribute validation failed for object {obj.get('id')}: {str(e)}"
            )
            continue

        # 3. If all validations pass, keep the object as-is
        processed_data.append(obj)
    return processed_data

In [13]:
# Get the source objects: read every *.json file found directly inside the raw
# input directory into a list.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory — confirm the notebook is launched from its own folder.
data_path = "../data/1_raw/input"
source_objects = fetch_all_objects(data_path)

[32m2025-09-10 10:39:28.347[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_all_objects[0m:[36m23[0m - [1mSource objects loaded successfully.[0m


In [17]:
# Keep only the source objects that pass required-field, ACM and attribute
# validation; rejected objects are reported through the logger.
preprocessed_data = preprocess_raw_data(source_objects)

[32m2025-09-10 10:49:05.042[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mpreprocess_raw_data[0m:[36m24[0m - [31m[1mAttribute validation failed for object 76276379-bcde-fghi-jklm-nopqrstuvwxy: Mission `data` in `attributes`[0m
[32m2025-09-10 10:49:05.043[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mpreprocess_raw_data[0m:[36m24[0m - [31m[1mAttribute validation failed for object 12345678-abcd-efgh-ijkl-mnopqrstuvwy: Mission `data` in `attributes`[0m


In [18]:
# Number of objects that survived validation.
len(preprocessed_data)

12