In [1]:
import json
import os
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import yaml
from loguru import logger
from rich import print as prt

### Helper Functions

In [2]:
def fetch_all_objects(directory: str) -> List[Dict[str, Any]]:
    """
    Load every ``*.json`` file found directly inside a directory.

    Args:
        directory (str): Path to the directory containing JSON files.

    Returns:
        List[Dict[str, Any]]: Parsed contents of each JSON file, ordered by file
            name so that repeated runs yield the same sequence.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.

    Note:
        This function mimics querying data from an API by reading local files.
        In production, replace this logic with actual API calls as needed.
    """
    # The context manager releases the directory handle even when a later
    # step raises; sorting removes the OS-dependent ordering of scandir().
    with os.scandir(directory) as entries:
        json_paths = sorted(
            entry.path
            for entry in entries
            if entry.is_file() and entry.name.endswith(".json")
        )

    json_list = []
    for json_path in json_paths:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(json_path, "r", encoding="utf-8") as json_file:
            json_list.append(json.load(json_file))

    logger.info("Source objects loaded successfully.")
    return json_list

In [3]:
def get_source_objects(data_path: str):
    """
    Read a single JSON file that is expected to hold a list of raw source objects.

    Args:
        data_path (str): Path to the JSON file.

    Returns:
        The decoded list, or an empty list when the file is missing, is not
        valid JSON, or its top-level value is not a list. Each failure is logged.
    """
    try:
        with open(data_path, "r") as handle:
            loaded = json.load(handle)
    except FileNotFoundError:
        logger.error("Source objects file not found.")
        return []
    except json.JSONDecodeError:
        logger.error("Error decoding JSON from source objects file.")
        return []

    if isinstance(loaded, list):
        logger.info("Source objects loaded successfully.")
        return loaded

    # Anything other than a top-level list is rejected.
    logger.error("Source objects should be a list of dictionaries.")
    return []

In [4]:
def load_attribute_mapping(config_file: str) -> Dict[str, Dict[str, str]]:
    """
    Load attribute mapping configuration from a YAML file.

    Args:
        config_file (str): Path to the YAML file. It must contain a top-level
            'attribute_mapping' mapping whose values each define 'field' and
            'container'.

    Returns:
        Dict[str, Dict[str, str]]: The attribute mapping dictionary

    Raises:
        FileNotFoundError: If the path is empty or the configuration file doesn't exist
        yaml.YAMLError: If the YAML file is malformed
        KeyError: If required keys are missing from the configuration
            (this includes an empty file)
        ValueError: If 'attribute_mapping' or one of its entries is not a mapping
    """
    try:
        if not config_file:
            raise FileNotFoundError(f"Configuration file not found: {config_file}")

        with open(config_file, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        # safe_load() yields None for an empty document and may yield a list or
        # scalar; none of those can carry the required key.
        if not isinstance(config, dict) or "attribute_mapping" not in config:
            raise KeyError("'attribute_mapping' key not found in configuration file")

        attribute_mapping = config["attribute_mapping"]

        if not isinstance(attribute_mapping, dict):
            raise ValueError(
                f"Invalid 'attribute_mapping' section: expected dict, got {type(attribute_mapping)}"
            )

        # Validate the structure
        for attr_name, mapping in attribute_mapping.items():
            if not isinstance(mapping, dict):
                raise ValueError(
                    f"Invalid mapping for attribute '{attr_name}': expected dict, got {type(mapping)}"
                )

            if "field" not in mapping or "container" not in mapping:
                raise KeyError(
                    f"Missing required keys ('field', 'container') for attribute '{attr_name}'"
                )

        logger.info(f"Successfully loaded attribute mapping from {config_file}")
        return attribute_mapping

    except FileNotFoundError as e:
        logger.error(f"Configuration file not found: {e}")
        raise
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML configuration: {e}")
        raise
    except (KeyError, ValueError) as e:
        logger.error(f"Invalid configuration structure: {e}")
        raise

In [5]:
def load_classification_config(config_path: str) -> Dict[str, Any]:
    """
    Load the classification restrictions from a YAML configuration file.

    The file is parsed with ``yaml.safe_load``. JSON files are typically
    accepted by that parser as well, but verify before relying on it.

    Args:
        config_path (str): Path to the configuration file. It must contain a
            top-level 'restrictions' key.

    Returns:
        Dict[str, Any]: The value stored under 'restrictions'.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        ValueError: If the file cannot be parsed, is empty, or lacks the
            'restrictions' key.
    """
    try:
        with open(config_path, "r", encoding="utf-8") as file:
            config = yaml.safe_load(file)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Configuration file not found: {config_path}") from e
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing YAML file: {e}") from e

    # An empty document parses to None; treat it like a missing key instead of
    # letting the membership test fail with a TypeError.
    if not isinstance(config, dict) or "restrictions" not in config:
        raise ValueError("Missing 'restrictions' key in classification config")

    logger.info("Classification configuration loaded successfully.")
    return config["restrictions"]

In [6]:
def save_standard_objects(
    output_path: str, cleaned_objects: List[Dict[str, Any]]
) -> None:
    """
    Save each cleaned standard object to a separate JSON file.

    Each file is named ``<object id>_<timestamp>.json``; when an object has no
    'id', ``object_<index>`` is used instead. All files of one call share the
    same timestamp. The output directory is created if it does not exist.

    Args:
        output_path (str): Directory that receives the JSON files.
        cleaned_objects (List[Dict[str, Any]]): List of cleaned standard objects to save

    Returns:
        None: The function does not return a value, but logs the results of the save operation.

    Raises:
        OSError: If the output directory cannot be created or a file cannot be written
        ValueError: If an entry of cleaned_objects is not a dictionary
        TypeError: If an object contains values that are not JSON serializable
    """
    try:
        os.makedirs(output_path, exist_ok=True)
    except OSError as e:
        logger.error(f"Could not create output directory {output_path}: {e}")
        raise

    # Generate timestamp for this batch
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    for i, obj in enumerate(cleaned_objects):
        try:
            # Validate first: calling .get() on a non-dict would raise
            # AttributeError before this check could ever report the problem.
            if not isinstance(obj, dict):
                raise ValueError(f"Object {i} is not a valid dictionary")

            # Get object ID, fallback to index if ID is missing
            obj_id = obj.get("id", f"object_{i}")

            # Create filename with object ID and timestamp
            filename = f"{obj_id}_{timestamp}.json"
            file_path = os.path.join(output_path, filename)

            # Serialize before opening the file so that a non-serializable
            # object never leaves a truncated JSON file on disk.
            payload = json.dumps(obj, indent=2)

            with open(file_path, "w", encoding="utf-8") as f:
                f.write(payload)

            logger.info(f"Successfully saved object {obj_id} to {filename}")
        except (ValueError, TypeError) as e:
            error_msg = f"Object {i} serialization error: {e}"
            logger.error(error_msg)
            raise
        except OSError as e:
            error_msg = f"File write error for object {i}: {e}"
            logger.error(error_msg)
            raise
        except Exception as e:
            error_msg = f"Unexpected error saving object {i}: {e}"
            logger.error(error_msg)
            raise

### Parser Functions

In [8]:
def is_empty_container(container: Any) -> bool:
    """
    Decide whether a value carries no data.

    ``None`` is empty. A dict or list is empty when it has no members or when
    every member is itself empty (checked recursively). Any other value,
    including ``""``, ``0`` and ``False``, counts as non-empty.

    Args:
        container (Any): The value to inspect

    Returns:
        bool: True if the value is empty by the rules above, False otherwise

    Raises:
        ValueError: If inspecting the value fails for any reason
    """
    try:
        if container is None:
            return True

        # Pick the children to inspect; scalars have none and are never empty.
        if isinstance(container, dict):
            children = container.values()
        elif isinstance(container, list):
            children = container
        else:
            return False

        for child in children:
            if not is_empty_container(child):
                return False
        return True
    except Exception as e:
        logger.error(f"Error checking if container is empty: {e}")
        raise ValueError("Invalid container structure")


def clean_object(obj: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a copy of a nested structure with every empty member dropped.

    Dict entries and list items for which ``is_empty_container`` is true are
    left out; the remaining members are cleaned recursively. Values that are
    neither dict nor list are returned unchanged.

    Args:
        obj: The object to clean

    Returns:
        The cleaned object with empty containers removed

    Raises:
        ValueError: If cleaning fails for any reason
    """
    try:
        if isinstance(obj, dict):
            pruned = {}
            for name, child in obj.items():
                if is_empty_container(child):
                    continue
                pruned[name] = clean_object(child)
            return pruned

        if isinstance(obj, list):
            kept = []
            for child in obj:
                if not is_empty_container(child):
                    kept.append(clean_object(child))
            return kept

        # Scalars pass through untouched.
        return obj
    except Exception as e:
        logger.error(f"Error cleaning object: {e}")
        raise ValueError("An error occurred while cleaning the object")

In [9]:
def extract_ism(acm: dict) -> dict:
    """
    Extract the reduced 'ism' structure from any ACM dict.

    Args:
        acm (dict): ACM dictionary of an object or attribute. ``None`` or any
            other non-dict value is treated as an empty ACM, which happens when
            a source carries ``"acm": null``.

    Returns:
        dict: Always contains the keys 'banner', 'classification',
            'ownerProducer', 'releaseableTo' and 'disseminationControls'; a
            value is None when the ACM does not provide it.
    """
    if not isinstance(acm, dict):
        acm = {}

    return {
        "banner": acm.get("banner"),
        "classification": acm.get("classif"),
        "ownerProducer": acm.get("owner_prod"),
        # NOTE(review): the output key is spelled "releaseableTo"; kept as-is
        # because consumers of the output may depend on it. Confirm.
        "releaseableTo": acm.get("rel_to"),
        "disseminationControls": acm.get("dissem_ctrls"),
    }

In [10]:
def extract_created_date(source_object: Dict[str, Any]) -> Optional[int]:
    """
    Look up the integer value of the 'Date Of Introduction' attribute.

    The attribute name is matched case-insensitively after trimming
    whitespace. Matching attributes whose value is not an int are skipped, so
    a later match can still be returned. Any error during the search is logged
    and results in None.

    Args:
        source_object (Dict[str, Any]): The source dictionary containing object data.

    Returns:
        Optional[int]: The first integer value found for the attribute, otherwise None.
    """
    wanted_name = "date of introduction"

    try:
        attribute_rows = source_object.get("attributes", {}).get("data", [])
        for row in attribute_rows:
            normalized_name = row.get("attributeName", "").strip().lower()
            if normalized_name != wanted_name:
                continue

            candidate = row.get("attributeValue")
            if isinstance(candidate, int):
                return candidate
    except Exception as e:
        logger.error(f"There was an error extracting createdDate: {e}")

    return None

In [11]:
def extract_elevation(source_object: Dict[str, Any]) -> Optional[Any]:
    """
    Find the elevation attribute of a source object.

    The attribute name is compared case-insensitively against "elevation",
    "elevation(m)" and "elevation (m)". The first matching attribute with a
    non-None value wins. String values are converted to float; if that
    conversion fails the original string is returned. Structural problems
    (non-dict input, missing 'attributes.data') are logged and yield None.

    Args:
        source_object (Dict[str, Any]): The source JSON-like object.

    Returns:
        Optional[Any]: The elevation value if found, otherwise None.
    """
    accepted_names = ("elevation", "elevation(m)", "elevation (m)")
    found = None

    try:
        if not isinstance(source_object, dict):
            raise ValueError("source object must be a dictionary.")

        has_data = (
            "attributes" in source_object and "data" in source_object["attributes"]
        )
        if not has_data:
            raise KeyError(
                "source object does not contain the expected 'attributes.data' structure."
            )

        for row in source_object["attributes"]["data"]:
            if row.get("attributeName", "").lower() not in accepted_names:
                continue
            raw_value = row.get("attributeValue")
            if raw_value is not None:
                found = raw_value
                break  # first usable value wins

        if isinstance(found, str):
            try:
                found = float(found)
            except Exception as e:
                # Keep the raw string when it is not numeric.
                logger.error(f"Error transforming elevation into float: {e}")
    except Exception as e:
        # Log the exception for debugging purposes
        logger.error(f"Error occurred while retrieving elevation: {e}")

    return found

In [12]:
def prepare_dates(source_objects: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Converts date strings in 'lastVerified.timestamp' and 'Date Of Introduction' attributes to Unix timestamps.

    Recognized formats are ``YYYY-MM-DDTHH:MM:SS.ffffffZ``, ``YYYY-MM-DDTHH:MM:SSZ``
    and ``YYYY-MM-DD``. All of them are interpreted as UTC, so the result does
    not depend on the time zone of the machine running the pipeline. Strings
    in any other format, and values that are not strings, are left untouched.

    Args:
        source_objects (List[Dict[str, Any]]): Source objects; they are
            modified in place.

    Returns:
        List[Dict[str, Any]]: The same list that was passed in.
    """

    def to_unix(date_str: str) -> Optional[int]:
        """Parse date_str with the first matching format; None if none match."""
        for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%d"):
            try:
                parsed = datetime.strptime(date_str, fmt)
                # strptime() returns a naive datetime and timestamp() would
                # read it as local time; the trailing "Z" means UTC, so the
                # zone has to be attached explicitly.
                return int(parsed.replace(tzinfo=timezone.utc).timestamp())
            except (ValueError, OverflowError):
                continue
        return None

    for obj in source_objects:
        # lastVerified.timestamp (the key may be absent or null)
        last_verified = obj.get("lastVerified")
        if isinstance(last_verified, dict):
            ts = last_verified.get("timestamp")
            if isinstance(ts, str):
                unix_ts = to_unix(ts)
                if unix_ts is not None:
                    last_verified["timestamp"] = unix_ts

        # Date Of Introduction in attributes
        attributes = obj.get("attributes") or {}
        for attr in attributes.get("data") or []:
            attr_name = attr.get("attributeName") or ""
            if attr_name.strip().lower() == "date of introduction":
                date_str = attr.get("attributeValue")
                if isinstance(date_str, str):
                    unix_ts = to_unix(date_str)
                    if unix_ts is not None:
                        attr["attributeValue"] = unix_ts

    logger.info("Dates prepared successfully.")
    return source_objects

In [13]:
def prepare_attribute_index(source: Dict[str, Any]) -> Dict[str, Dict]:
    """
    Build a lookup of attribute name -> attribute record for one source object.

    The index first receives every item of ``source["attributes"]["data"]``
    keyed by its 'attributeName' (a later duplicate replaces an earlier one).
    A fixed set of top-level fields is then added under display names, each
    wrapped as ``{"attributeValue": ..., "acm": <top-level acm>}`` so that it
    has the same shape as a regular attribute.

    Args:
        source: Source dictionary containing object data

    Returns:
        Dict[str, Dict]: Attribute index with both regular attributes and transformed top-level fields
    """
    index: Dict[str, Dict] = {}
    for entry in source.get("attributes", {}).get("data", []):
        index[entry.get("attributeName")] = entry

    # (top-level source key, name used in the attribute mapping)
    promoted_fields = (
        ("domain", "Domain"),
        ("allegience", "Allegience"),
        ("allegienceAor", "Allegience Aor"),
        ("eoid", "Enterprise Object ID"),
    )

    for source_key, display_name in promoted_fields:
        if source_key not in source:
            continue
        index[display_name] = {
            "attributeValue": source[source_key],
            "acm": source.get("acm", {}),
        }

    return index

In [14]:
def parse_location(source_object: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Build the standard 'location' record from 'latestKnownLocation'.

    The record is only produced when the source has location data with a
    geometry whose 'coordinates' hold at least two values. The first
    coordinate is used as longitude and the second as latitude; additional
    values are ignored with a warning. Elevation comes from the object's
    attributes via extract_elevation. Every other measurement field is
    emitted as None.

    Args:
        source_object (Dict[str, Any]): The input object containing location data.

    Returns:
        Dict[str, Any]: Processed location data, or None if data is missing,
            invalid, or an unexpected error occurs (the cause is logged).
    """
    try:
        location = source_object.get("latestKnownLocation")
        if not location:
            logger.warning(
                f"No location data found for object {source_object.get('id')}"
            )
            return None

        geometry = location.get("geometry")
        if not geometry:
            logger.warning(
                f"No geometry data found for object {source_object.get('id')}"
            )
            return None

        # Accept coordinate arrays with two or more values
        position = geometry.get("coordinates", [])
        if not position or len(position) < 2:
            logger.warning(f"Invalid coordinates for object {source_object.get('id')}")
            return None

        if len(position) > 2:
            logger.warning(
                f"Using first two values from {len(position)}-element coordinate array for object {source_object.get('id')}."
            )

        def measurement(value: Any = None) -> Dict[str, Any]:
            """Return a measurement record that carries only the given value."""
            return {
                "value": value,
                "quality": None,
                "error": None,
                "units": {"value": None},
            }

        elevation = extract_elevation(source_object)

        location_record = {
            "ism": extract_ism(location.get("acm", {})),
            "id": location.get("id"),
            "timestamp": location.get("lastVerified", {}).get("timestamp"),
            "latitude": position[1],
            "longitude": position[0],
            "altitude": measurement(),
            "elevation": measurement(elevation),
            "derivation": geometry.get("type"),
            "quality": None,
            "locationName": None,
        }
        return location_record
    except Exception as e:
        logger.error(
            f"Unexpected error writing location values for object {source_object.get('id')}: {str(e)}"
        )
        return None

In [None]:
# TODO: Think about how to really handle setting the 'acm' for shipClass and shipName
# That value should come from the attributes 'acm' not the top-level 'acm'


def parse_ship_class_name(
    source_object: Dict[str, Any], standard_object: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Add shipClass and shipName to maritimeMetadata when the source is a ship.

    A source counts as a ship when one of its attributes is named "Echelon"
    with the value "SHIP". shipClass takes the source 'className'; shipName
    takes the value of the first attribute named "Name" and is only written
    when that value is truthy. Both are marked with the ISM derived from the
    top-level 'acm'. standard_object is modified in place.

    Args:
        source_object (Dict[str, Any]): Input dictionary containing vessel information.
        standard_object (Dict[str, Any]): Dictionary to be updated with vessel metadata.

    Returns:
        Dict[str, Any]: The updated standard_object (returned unchanged when
            the source is not a ship or an error is logged).
    """
    try:
        attribute_rows = source_object.get("attributes", {}).get("data", [])
        ship_class = source_object.get("className")
        top_level_acm = source_object.get("acm", {})

        is_ship = False
        for row in attribute_rows:
            if (
                row.get("attributeName") == "Echelon"
                and row.get("attributeValue") == "SHIP"
            ):
                is_ship = True
                break

        ship_name = None
        for row in attribute_rows:
            if row.get("attributeName") == "Name":
                ship_name = row.get("attributeValue")
                break

        if not is_ship:
            return standard_object

        # Replace a missing or non-dict container before writing into it.
        if not isinstance(standard_object.get("maritimeMetadata"), dict):
            standard_object["maritimeMetadata"] = {}
        maritime = standard_object["maritimeMetadata"]

        maritime["shipClass"] = {
            "value": ship_class,
            "ism": extract_ism(top_level_acm),
        }
        if ship_name:
            maritime["shipName"] = {
                "value": ship_name,
                "ism": extract_ism(top_level_acm),
            }

        logger.info(
            f"Set shipClass and shipName for object {standard_object.get('id')}"
        )
        return standard_object
    except Exception as e:
        logger.error(
            f"Error parsing ship class/name for object {standard_object.get('id', 'unknown')}: {e}"
        )
        return standard_object

In [None]:
# TODO: Think about how to really handle setting the 'acm'
# That value should come from the attributes 'acm' not the top-level 'acm'


def parse_facility_name_id(
    source_object: Dict[str, Any], standard_object: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Add facilityName and facilityId to 'facility' when the source is a facility.

    A source counts as a facility when its 'className' equals "Facility".
    facilityName takes the source 'name'; facilityId takes the first non-None
    value of an attribute named "OSuffix" and is only written when that value
    is truthy. Both are marked with the ISM derived from the top-level 'acm'.
    standard_object is modified in place.

    Args:
        source_object (Dict[str, Any]): Dictionary containing facility information.
        standard_object (Dict[str, Any]): Dictionary to be updated with facility metadata.

    Returns:
        Dict[str, Any]: The updated standard_object (returned unchanged when
            the source is not a facility or an error is logged).
    """
    try:
        attribute_rows = source_object.get("attributes", {}).get("data", [])
        top_level_acm = source_object.get("acm", {})
        name_value = source_object.get("name")

        id_value = None
        for row in attribute_rows:
            if (
                row.get("attributeName") == "OSuffix"
                and row.get("attributeValue") is not None
            ):
                id_value = row.get("attributeValue")
                break

        if source_object.get("className") != "Facility":
            return standard_object

        # Replace a missing or non-dict container before writing into it.
        if not isinstance(standard_object.get("facility"), dict):
            standard_object["facility"] = {}
        facility = standard_object["facility"]

        facility["facilityName"] = {
            "value": name_value,
            "ism": extract_ism(top_level_acm),
        }
        if id_value:
            facility["facilityId"] = {
                "value": id_value,
                "ism": extract_ism(top_level_acm),
            }

        logger.info(
            f"Set facilityName and facilityId for object {standard_object.get('id')}"
        )
        return standard_object
    except Exception as e:
        logger.error(
            f"Error parsing facility name/id for object {standard_object.get('id', 'unknown')}: {e}"
        )
        return standard_object

In [17]:
def is_classif_too_high(ism: Dict[str, Any], config: Dict[str, Any]) -> bool:
    """
    Return True if ISM is too highly classified or contains forbidden controls/terms.

    An ISM is rejected when its classification is "TS", when its
    'sciControls' or 'disseminationControls' share a value with the forbidden
    lists of the configuration, or when its upper-cased 'banner' contains one
    of the forbidden terms.

    Args:
        ism (Dict[str, Any]): ISM dictionary to inspect. Keys may be missing
            or present with a None value.
        config (Dict[str, Any]): Restrictions configuration providing
            'forbidden_sci', 'forbidden_controls' and 'forbidden_terms'.

    Returns:
        bool: True when the ISM must be withheld; False otherwise, including
            when the ISM is empty.
    """
    if not ism:
        logger.warning("ISM is empty, cannot determine classification level.")
        return False

    # extract_ism() emits every key even when the value is None, so a default
    # passed to .get() is not enough: set(None) and None.upper() would raise.
    sci_controls = set(ism.get("sciControls") or [])
    dissem_controls = set(ism.get("disseminationControls") or [])
    banner = (ism.get("banner") or "").upper()

    return bool(
        ism.get("classification") == "TS"
        or sci_controls & set(config["forbidden_sci"])
        or dissem_controls & set(config["forbidden_controls"])
        or any(term in banner for term in config["forbidden_terms"])
    )

In [18]:
def is_more_restrictive(
    ism1: Dict[str, Any], ism2: Dict[str, Any], config: Dict[str, Any]
) -> bool:
    """
    Return True if ism1 is more restrictive than ism2.

    The comparison stops at the first criterion that separates the two ISMs:

    1. Exactly one has an 'sciControls' entry starting with "FGI": that one
       is more restrictive.
    2. Exactly one has "NOFORN" in 'disseminationControls': that one is more
       restrictive.
    3. Both have "REL" in 'disseminationControls': the one released to fewer
       special groups, or otherwise to fewer entities, is more restrictive.
    4. Otherwise the rank in ``config["classifications"]`` decides.

    Args:
        ism1 (Dict[str, Any]): Candidate ISM.
        ism2 (Dict[str, Any]): ISM to compare against.
        config (Dict[str, Any]): Restrictions configuration providing
            'special_groups' and 'classifications'.

    Returns:
        bool: True when ism1 is more restrictive; False otherwise, including
            when either ISM is empty.
    """
    if not ism1 or not ism2:
        return False

    def values_of(ism: Dict[str, Any], *keys: str) -> List[Any]:
        """Return the first truthy value stored under keys, or an empty list."""
        # Keys can be present with a None value, so .get(key, []) is not enough.
        for key in keys:
            value = ism.get(key)
            if value:
                return value
        return []

    # FGI controls
    ism1_fgi = any(c.startswith("FGI") for c in values_of(ism1, "sciControls"))
    ism2_fgi = any(c.startswith("FGI") for c in values_of(ism2, "sciControls"))
    if ism1_fgi != ism2_fgi:
        return ism1_fgi

    # NOFORN
    ism1_dissem = values_of(ism1, "disseminationControls")
    ism2_dissem = values_of(ism2, "disseminationControls")
    ism1_noforn = "NOFORN" in ism1_dissem
    ism2_noforn = "NOFORN" in ism2_dissem
    if ism1_noforn != ism2_noforn:
        return ism1_noforn

    # REL controls
    if "REL" in ism1_dissem and "REL" in ism2_dissem:
        # extract_ism() stores the release list under "releaseableTo"; accept
        # that spelling as well as "releasableTo" so the lists are compared.
        ism1_release = set(values_of(ism1, "releasableTo", "releaseableTo"))
        ism2_release = set(values_of(ism2, "releasableTo", "releaseableTo"))
        ism1_groups = ism1_release & set(config["special_groups"])
        ism2_groups = ism2_release & set(config["special_groups"])
        # More restrictive if ism1 has fewer groups or fewer releasable entities
        if ism1_groups != ism2_groups:
            return len(ism1_groups) < len(ism2_groups)
        return len(ism1_release) < len(ism2_release)

    # Classification hierarchy
    classif1 = ism1.get("classification", "U")
    classif2 = ism2.get("classification", "U")
    return config["classifications"].get(classif1, 0) > config["classifications"].get(
        classif2, 0
    )

In [19]:
def find_most_restrictive_valid_ism(
    obj: Dict[str, Any], config: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    """
    Traverse a nested object to find the most restrictive valid ISM (Information Security Marking).

    Every dictionary reachable through nested dicts and lists is inspected.
    A dictionary contributes a candidate when it has a non-empty dict under
    the key 'ism' that is_classif_too_high does not reject. Candidates are
    compared with is_more_restrictive; on a tie the candidate found first is
    kept. Dictionaries are visited depth-first using an explicit stack, so the
    visiting order is not the document order.

    Args:
        obj (Dict[str, Any]): The object to search for ISMs.
        config (Dict[str, Any]): The classification configuration dictionary.

    Returns:
        Optional[Dict[str, Any]]: A shallow copy of the most restrictive valid ISM found,
            or None if no valid ISM exists.
    """
    most_restrictive = None
    stack = [obj]  # Use a stack to traverse the object hierarchy

    while stack:
        item = stack.pop()

        if isinstance(item, dict):
            ism = item.get("ism")
            # Only non-empty dict ISMs can be classified; anything else stored
            # under 'ism' is skipped instead of crashing the traversal.
            # No special case for "TS" is needed here: is_classif_too_high
            # already rejects that classification, so it never passes.
            if isinstance(ism, dict) and ism and not is_classif_too_high(ism, config):
                if most_restrictive is None or is_more_restrictive(
                    ism, most_restrictive, config
                ):
                    most_restrictive = ism.copy()

            # Add all dictionary values to the stack
            stack.extend(item.values())
        elif isinstance(item, list):
            # Add all list items to the stack
            stack.extend(item)

    return most_restrictive

In [20]:
def apply_restrictions(
    standard_object: Dict[str, Any], config: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    """
    Build a copy of a standard object in which data that is too highly
    classified has been withheld, according to the provided classification
    configuration.

    The function first finds the most restrictive valid ISM (Information
    Security Marking) within the object. It then rebuilds all nested
    dictionaries and lists; any dictionary whose 'ism' is rejected by
    is_classif_too_high is replaced by None (list positions and dict keys are
    kept, only the value becomes None). The result's 'overallClassification'
    field is set to the most restrictive valid ISM found. The input object is
    not modified; scalar values are shared between input and result.

    Args:
        standard_object (Dict[str, Any]): The object to process and apply restrictions to.
        config (Dict[str, Any]): The classification configuration dictionary.

    Returns:
        Optional[Dict[str, Any]]: The processed object with restricted data withheld, or None if no valid ISM is found.
    """
    # Find the most restrictive valid ISM in the object
    most_restrictive_ism = find_most_restrictive_valid_ism(standard_object, config)

    if not most_restrictive_ism:
        # If no valid ISM is found, return None
        logger.warning("No valid ISM found for object")
        return None

    def process_item(item: Any) -> Any:
        """Return a redacted copy of item; restricted dicts become None."""
        if isinstance(item, dict):
            # If the item has an ISM, check if it is too high
            if "ism" in item and is_classif_too_high(item["ism"], config):
                # Log only the field names: writing the content that is being
                # withheld into the log would defeat the redaction.
                logger.debug(
                    f"Removing item due to high classification (keys: {list(item.keys())})"
                )
                return None

            # Process all key-value pairs in the dictionary
            return {k: process_item(v) for k, v in item.items()}
        if isinstance(item, list):
            # Process all items in the list
            return [process_item(x) for x in item]

        return item  # Return the item as is if it's neither a dict nor a list

    # Process the object and add the overall classification
    processed_object = process_item(standard_object)
    if isinstance(processed_object, dict):
        processed_object["overallClassification"] = most_restrictive_ism

    return processed_object

In [21]:
def build_standard_object(
    target_structure: Dict[str, Any],
    attr_index: Dict[str, Dict],
    attribute_map: Dict[str, Dict[str, str]],
) -> Dict[str, Any]:
    """
    Copy mapped attributes from the attribute index into the target structure.

    For every entry of attribute_map whose attribute exists in attr_index, the
    attribute is written as ``{"value": <attributeValue>, "ism": <ISM of its acm>}``
    to the configured field of the configured container. target_structure is
    modified in place.

    Args:
        target_structure (Dict[str, Any]): Pre-initialized dictionary containing basic object metadata
            and empty containers (ontology, maritimeMetadata, equipment, facility)

        attr_index (Dict[str, Dict]): Index of attribute data items keyed by attribute name,
            where each item contains 'attributeValue' and 'acm' fields

        attribute_map (Dict[str, Dict[str, str]]): Mapping configuration where keys are attribute names
            and values are dicts with 'field' and 'container' specifications

    Returns:
        Dict[str, Any]: The populated target_structure, or an empty dict if an error occurs
            (target_structure may already be partially populated in that case)

    Note:
        - The "root" container means the field is written directly on target_structure
        - Any other container is a nested dict, created on demand
        - Attributes that are missing or falsy in attr_index are silently skipped
    """
    try:
        for source_name, target in attribute_map.items():
            entry = attr_index.get(source_name)
            if entry:
                field_name = target["field"]
                container_name = target["container"]

                wrapped = {
                    "value": entry.get("attributeValue"),
                    "ism": extract_ism(entry.get("acm", {})),
                }

                # "root" writes on the top level; other containers are nested.
                destination = (
                    target_structure
                    if container_name == "root"
                    else target_structure.setdefault(container_name, {})
                )
                destination[field_name] = wrapped

        return target_structure
    except Exception as e:
        logger.error(f"Error building standard object: {e}")
        return {}

In [22]:
def transform_source_object(
    source: Dict[str, Any],
    attribute_map: Dict[str, Dict[str, str]],
    restrictions_config: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Convert one source object into the standard object format.

    Steps, in order: create the skeleton with basic metadata and empty
    containers, fill it from the attribute mapping, add ship and facility
    details, then apply the classification restrictions.

    Args:
        source: The source dictionary containing object data with attributes, ACM, and metadata
        attribute_map: Dictionary mapping attribute names to their target field and container locations
        restrictions_config: Configuration dict for classification restrictions

    Returns:
        Dict containing the transformed object with structured fields, or an
        empty dict when the inputs are invalid, the restrictions step yields
        nothing, or an error occurs (each case is logged)
    """
    try:
        inputs_valid = isinstance(source, dict) and isinstance(attribute_map, dict)
        if not inputs_valid:
            logger.error("Invalid source object or attribute map.")
            return {}

        # createdDate needs a dedicated attribute lookup.
        created_date = extract_created_date(source)

        # Skeleton with basic metadata and one empty dict per container
        skeleton = {
            "version": source.get("version"),
            "overallClassification": extract_ism(source.get("acm", {})),
            "id": source.get("id"),
            "name": source.get("name"),
            "createdDate": created_date,
            "lastUpdatedDate": source.get("lastVerified", {}).get("timestamp"),
            "excerciseIndicator": source.get("gideId"),
            "location": parse_location(source),
            "maritimeMetadata": {},
            "landMetadata": {},
            "equipment": {},
            "unit": {},
            "ontology": {},
            "facility": {},
            "provenance": {},
        }

        # Fill the skeleton from the attribute index (attributes plus
        # selected top-level fields).
        standard = build_standard_object(
            skeleton, prepare_attribute_index(source), attribute_map
        )

        # Both helpers update `standard` in place.
        parse_ship_class_name(source, standard)
        parse_facility_name_id(source, standard)

        restricted = apply_restrictions(standard, restrictions_config)
        if restricted is None:
            logger.warning(
                f"Transformation resulted in None for object with ID: {source.get('id', 'unknown')}"
            )
            return {}

        logger.info(
            f"Finished transforming object with ID: {restricted.get('id', 'unknown')}"
        )
        return restricted
    except Exception as e:
        logger.error(
            f"Error transforming object with ID {source.get('id', 'unknown')}: {e}"
        )
        return {}

### Main Execution

In [None]:
def run_pipeline(
    data_path: str,
    attribute_mapping_path: str,
    output_path: str,
    restrictions_path: str,
) -> Optional[List[Dict[str, Any]]]:
    """
    Orchestrate the main data processing workflow.

    Steps, in order:
        1. Load the raw source objects from ``data_path``.
        2. Load the attribute mapping and the classification restrictions.
        3. Prepare the dates of the source objects.
        4. Transform every source object into the standard format.
        5. Clean every standard object.
        6. Save the cleaned standard objects to ``output_path``.

    Args:
        data_path (str): Directory containing the raw input JSON files
            (handed to ``fetch_all_objects``).
        attribute_mapping_path (str): Path to the YAML configuration file for attribute mapping.
        output_path (str): Location where the processed standard objects are saved
            (handed to ``save_standard_objects``).
        restrictions_path (str): Path to the configuration file for classification
            restrictions (handed to ``load_classification_config``).

    Returns:
        Optional[List[Dict[str, Any]]]: The cleaned standard objects, one entry per
        source object, or ``None`` if any step raised an exception.

    Note:
        This function does not raise: any exception is logged and ``None`` is
        returned instead, so callers must check for ``None``.
    """
    try:
        logger.info("Starting data fetching and processing pipeline...")

        # Get the source objects
        source_objects = fetch_all_objects(data_path)
        logger.info(f"Fetched {len(source_objects)} objects")

        # Load attribute mapping and restrictions from their configuration files
        attribute_mapping = load_attribute_mapping(attribute_mapping_path)
        restrictions_config = load_classification_config(restrictions_path)

        # Prepare dates in source objects
        prepared_objects = prepare_dates(source_objects)

        # Transform each source object into the standard format
        standard_objects = [
            transform_source_object(obj, attribute_mapping, restrictions_config)
            for obj in prepared_objects
        ]
        logger.info(f"Processed {len(standard_objects)} objects into standard format")

        # transform_source_object signals a failed transformation with an empty
        # result; surface the count so the total above is not mistaken for successes.
        failed_count = sum(1 for obj in standard_objects if not obj)
        if failed_count:
            logger.warning(
                f"{failed_count} of {len(standard_objects)} objects produced an empty result during transformation"
            )

        # Apply the cleanup function to all standard objects
        cleaned_standard_objects = [clean_object(obj) for obj in standard_objects]
        logger.info("Cleaned standard objects by removing any empty containers")

        # Save the cleaned standard objects to JSON files
        save_standard_objects(output_path, cleaned_standard_objects)

        return cleaned_standard_objects
    except Exception as e:
        # Deliberate catch-all: the pipeline reports failure through its return
        # value rather than by propagating the exception.
        logger.error(f"Error in pipeline execution: {e}")
        return None

In [25]:
# Relative paths used by the pipeline, grouped by role.

# Input: raw source objects
data_path = "../data/1_raw/input"

# Configuration: attribute mapping and classification restrictions
attribute_mapping_path = "../config/attribute_mapping.yaml"
restrictions_path = "../config/classifications_config.yaml"

# Output: processed standard objects
output_path = "../data/2_processed/output"

In [None]:
# Run the entire pipeline
# standard_objects = run_pipeline(data_path, attribute_mapping_path, output_path, restrictions_path)

### Execute the pipeline logic one cell at a time
* This can be used for debugging, or simply to observe the sequential behavior of the code.

In [26]:
# Get the source objects: fetch_all_objects reads every *.json file found
# directly inside data_path and returns their parsed contents as a list.
source_objects = fetch_all_objects(data_path)

[32m2025-09-09 15:10:18.455[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_all_objects[0m:[36m23[0m - [1mSource objects loaded successfully.[0m


In [27]:
# Load the attribute mapping (YAML) and the classification restrictions,
# each from its own configuration file.
attribute_mapping = load_attribute_mapping(attribute_mapping_path)
restrictions_config = load_classification_config(restrictions_path)

[32m2025-09-09 15:10:19.655[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_attribute_mapping[0m:[36m37[0m - [1mSuccessfully loaded attribute mapping from ../config/attribute_mapping.yaml[0m
[32m2025-09-09 15:10:19.657[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_classification_config[0m:[36m11[0m - [1mClassification configuration loaded successfully.[0m


In [28]:
# Prepare dates in source objects.
# NOTE: this rebinds source_objects to the result of prepare_dates, so re-running
# this cell feeds already-prepared objects back in. Re-run the loading cell first
# if prepare_dates is not safe to apply twice (not verified here).
source_objects = prepare_dates(source_objects)

[32m2025-09-09 15:10:39.346[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_dates[0m:[36m31[0m - [1mDates prepared successfully.[0m


In [29]:
# Transform each source object into the standard format.
# transform_source_object returns {} when a transformation fails or yields None,
# so standard_objects keeps one entry per source object, in the same order.
standard_objects = [
    transform_source_object(obj, attribute_mapping, restrictions_config)
    for obj in source_objects
]

[32m2025-09-09 15:12:36.113[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_facility_name_id[0m:[36m48[0m - [1mSet facilityName and facilityId for object 34567890-cdef-ghij-klmn-opqrstuvwxyz[0m
[32m2025-09-09 15:12:36.113[0m | [1mINFO    [0m | [36m__main__[0m:[36mtransform_source_object[0m:[36m60[0m - [1mFinished transforming object with ID: 34567890-cdef-ghij-klmn-opqrstuvwxyz[0m
[32m2025-09-09 15:12:36.113[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_ship_class_name[0m:[36m50[0m - [1mSet shipClass and shipName for object d134eafb-59bf-4c94-8dc2-e547f98e1b21[0m
[32m2025-09-09 15:12:36.114[0m | [1mINFO    [0m | [36m__main__[0m:[36mtransform_source_object[0m:[36m60[0m - [1mFinished transforming object with ID: d134eafb-59bf-4c94-8dc2-e547f98e1b21[0m
[32m2025-09-09 15:12:36.114[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_ship_class_name[0m:[36m50[0m - [1mSet shipClass and shipName for object 12345678-abcd-efgh-ijkl-mnopqrs

In [33]:
# Spot check: pretty-print one transformed object (index 3) with rich's print.
# Raises IndexError if fewer than four objects were loaded.
prt(standard_objects[3])

In [25]:
# Apply the cleanup function to all standard objects, collecting the results in
# a new list. standard_objects is not rebound; whether clean_object mutates its
# argument is not visible from this cell.
cleaned_standard_objects = [clean_object(obj) for obj in standard_objects]

In [27]:
# Spot check: pretty-print the first cleaned object with rich's print.
prt(cleaned_standard_objects[0])