In [None]:
import sys
import json
import yaml
import logging
from datetime import datetime
from rich import print as prt
from typing import Dict, Any, List

sys.path.append("../")
from src.utils.config_loader import get_input_path, get_output_path

# Configure standard logging for Jupyter: INFO and above, with the timestamp,
# logger name and level prefixed to every message.
# NOTE(review): basicConfig() is a no-op when the root logger already has
# handlers, so re-running this cell will not change an existing configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],  # No stream given, so records go to sys.stderr (shown in the notebook output)
)

# Logger used by the functions defined in the cells below.
logger = logging.getLogger(__name__)

In [None]:
def load_source_schema() -> Dict[str, Any]:
    """
    Load the source schema from the JSON file registered as "schema".

    The file location is resolved through get_input_path("schema"). The file is
    read as UTF-8 so the result does not depend on the platform's locale encoding.

    Returns:
        Dict[str, Any]: The loaded JSON schema

    Raises:
        FileNotFoundError: If the schema file doesn't exist
        json.JSONDecodeError: If the JSON file is malformed
    """
    try:
        schema_file = get_input_path("schema")
        if not schema_file.exists():
            raise FileNotFoundError(f"Schema file not found: {schema_file}")

        # Explicit encoding: open() would otherwise fall back to the locale default.
        with open(schema_file, "r", encoding="utf-8") as f:
            schema = json.load(f)

        logger.info(f"Successfully loaded source schema from {schema_file}")
        return schema

    except FileNotFoundError as e:
        # The exception text already names the problem and the path, so log it
        # as-is instead of prefixing it a second time.
        logger.error(str(e))
        raise
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing JSON schema: {e}")
        raise

In [None]:
def load_attribute_mapping() -> Dict[str, Dict[str, str]]:
    """
    Load and validate the attribute mapping configuration from a YAML file.

    The file location is resolved through get_input_path("attribute_mapping").
    The document must be a mapping with an 'attribute_mapping' key whose value
    maps each attribute name to a dict containing at least 'field' and
    'container'.

    Returns:
        Dict[str, Dict[str, str]]: The attribute mapping dictionary

    Raises:
        FileNotFoundError: If the configuration file doesn't exist
        yaml.YAMLError: If the YAML file is malformed
        KeyError: If required keys are missing from the configuration
        ValueError: If the document, the 'attribute_mapping' section, or an
            individual mapping is not a dict (an empty file is reported this way)
    """
    try:
        config_file = get_input_path("attribute_mapping")

        if not config_file.exists():
            raise FileNotFoundError(f"Configuration file not found: {config_file}")

        # Explicit encoding: open() would otherwise fall back to the locale default.
        with open(config_file, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        # safe_load() returns None for an empty document and may return a list or
        # scalar; reject those here instead of failing on the `in` test below.
        if not isinstance(config, dict):
            raise ValueError(
                f"Invalid configuration file: expected a mapping at the top level, got {type(config)}"
            )

        if "attribute_mapping" not in config:
            raise KeyError("'attribute_mapping' key not found in configuration file")

        attribute_mapping = config["attribute_mapping"]

        # The section itself must be a mapping before its entries can be checked.
        if not isinstance(attribute_mapping, dict):
            raise ValueError(
                f"Invalid 'attribute_mapping' section: expected dict, got {type(attribute_mapping)}"
            )

        # Validate the structure of every entry
        for attr_name, mapping in attribute_mapping.items():
            if not isinstance(mapping, dict):
                raise ValueError(
                    f"Invalid mapping for attribute '{attr_name}': expected dict, got {type(mapping)}"
                )

            if "field" not in mapping or "container" not in mapping:
                raise KeyError(
                    f"Missing required keys ('field', 'container') for attribute '{attr_name}'"
                )

        logger.info(f"Successfully loaded attribute mapping from {config_file}")
        return attribute_mapping

    except FileNotFoundError as e:
        # The exception text already names the problem and the path.
        logger.error(str(e))
        raise
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML configuration: {e}")
        raise
    except (KeyError, ValueError) as e:
        logger.error(f"Invalid configuration structure: {e}")
        raise

In [None]:
def get_source_objects() -> List[Dict[str, Any]]:
    """
    Load the raw source objects from the file registered as "raw_objects".

    This loader is best-effort: every failure is logged and reported to the
    caller as an empty list rather than as an exception.

    Returns:
        List[Dict[str, Any]]: The parsed JSON list, or [] when the file is
            missing, is not valid JSON, or its top-level value is not a list.
            The elements themselves are not validated here.
    """
    try:
        data_path = get_input_path("raw_objects")

        # Read as UTF-8 (the same encoding this notebook uses when writing its
        # output) instead of relying on the platform's locale default.
        with open(data_path, "r", encoding="utf-8") as f:
            source_objects = json.load(f)

        if not isinstance(source_objects, list):
            logger.error("Source objects should be a list of dictionaries.")
            return []

        logger.info("Source objects loaded successfully.")
        return source_objects
    except FileNotFoundError:
        logger.error("Source objects file not found.")
        return []
    except json.JSONDecodeError:
        logger.error("Error decoding JSON from source objects file.")
        return []

In [None]:
def extract_ism(acm: dict) -> dict:
    """
    Extract the reduced 'ism' structure from any ACM dict.

    Args:
        acm: ACM dictionary. None (e.g. a JSON null) or any other non-dict value
            is treated as an empty ACM instead of raising.

    Returns:
        dict: Always contains the five keys 'banner', 'classification',
            'ownerProducer', 'releaseableTo' and 'disseminationControls'; a key
            whose source entry is absent from the ACM maps to None.
    """
    # dict.get(key, {}) in callers still yields None for an explicit null, so
    # normalise here rather than fail with AttributeError.
    if not isinstance(acm, dict):
        acm = {}

    # NOTE(review): the output key is spelled "releaseableTo"; confirm against
    # the target schema before changing it, since consumers may rely on it.
    return {
        "banner": acm.get("banner"),
        "classification": acm.get("classif"),
        "ownerProducer": acm.get("owner_prod"),
        "releaseableTo": acm.get("rel_to"),
        "disseminationControls": acm.get("dissem_ctrls"),
    }

In [None]:
def build_standard_object(target_structure: Dict[str, Any], attr_index: Dict[str, Dict], attribute_map: Dict[str, Dict[str, str]]) -> Dict[str, Any]:
    """
    Build a standard object by mapping attributes from the source data to target fields.

    For every entry of attribute_map whose attribute is present in attr_index,
    a {"value": ..., "ism": ...} dict is written into target_structure at the
    location named by the mapping.

    Args:
        target_structure (Dict[str, Any]): Pre-initialized dictionary to populate.
            It is modified in place and is also the object returned on success.

        attr_index (Dict[str, Dict]): Attribute items keyed by attribute name;
            each item's 'attributeValue' and 'acm' entries are read.

        attribute_map (Dict[str, Dict[str, str]]): Keys are attribute names,
            values are dicts with 'field' and 'container' entries.

    Returns:
        Dict[str, Any]: target_structure after population, or an empty dict if
            any error occurs (the error is logged).

    Note:
        - Attributes mapped to the "root" container are placed directly in target_structure
        - Other containers are nested under their own key and created on demand
        - Attributes missing from attr_index (or with an empty item) are silently skipped
        - An item whose 'acm' is missing or null gets an ISM block of None values
        - On error target_structure may already be partially populated, because
          it is updated in place while the mapping is processed
    """
    try:
        for attr_name, mapping in attribute_map.items():
            item = attr_index.get(attr_name)

            if not item:
                continue

            target_field = mapping["field"]
            container = mapping["container"]

            transformed_value = {
                "value": item.get("attributeValue"),
                # `or {}` also covers an explicit null ACM, which .get("acm", {})
                # would pass through as None and fail the whole object.
                "ism": extract_ism(item.get("acm") or {}),
            }

            if container == "root":
                target_structure[target_field] = transformed_value
            else:
                # Create the nested container on first use
                target_structure.setdefault(container, {})[target_field] = transformed_value

        return target_structure
    except Exception as e:
        # Report through the notebook logger, like the loader functions do.
        logger.error(f"Error building standard object: {e}")
        return {}

In [None]:
def transform_source_object(source: Dict[str, Any], attribute_map: Dict[str, Dict[str, str]]) -> Dict[str, Any]:
    """
    Transform a source object into a structured format based on the provided attribute mapping.

    Args:
        source: The source dictionary. The keys read are 'version', 'acm', 'id',
            'name', 'lastVerified' (its 'timestamp'), 'gide_id' and
            'attributes' (its 'data' list). Missing or null entries are tolerated.
        attribute_map: Dictionary mapping attribute names to their target field and container locations

    Returns:
        Dict containing the transformed object with:
        - Basic metadata (version, id, name, lastUpdatedDate, excerciseIndicator)
        - Overall classification extracted from the object's ACM
        - The containers maritimeMetadata, ontology, equipment and facility,
          populated according to attribute_map
        An empty dict is returned (and the reason logged) when source is not a
        dict, attribute_map is empty or not a dict, or the transformation fails.
    """
    try:
        # Check if source is a valid dictionary
        if not isinstance(source, dict):
            logger.error("Source object is not a valid dictionary.")
            return {}

        # Check that attribute_map is provided
        if not attribute_map or not isinstance(attribute_map, dict):
            logger.error("Attribute map is not provided or is not a valid dictionary.")
            return {}

        # `x.get(key) or {}` instead of `x.get(key, {})`: the default is only
        # used for a missing key, so an explicit null would otherwise raise.
        last_verified = source.get("lastVerified") or {}

        # NOTE(review): the output key is spelled "excerciseIndicator"; confirm
        # against the target schema before changing it.
        target_structure = {
            "version": source.get("version"),
            "overallClassification": extract_ism(source.get("acm") or {}),
            "id": source.get("id"),
            "name": source.get("name"),
            "lastUpdatedDate": last_verified.get("timestamp"),
            "excerciseIndicator": source.get("gide_id"),
            # Pre-initialized containers
            "maritimeMetadata": {},
            "ontology": {},
            "equipment": {},
            "facility": {},
        }

        # Build quick index for attributes.data
        data_items = (source.get("attributes") or {}).get("data") or []
        attr_index = {item.get("attributeName"): item for item in data_items}

        standard_object = build_standard_object(target_structure, attr_index, attribute_map)

        # build_standard_object returns {} on failure; do not report that as success.
        if not standard_object:
            logger.error(f"Failed to build standard object with ID: {source.get('id', 'unknown')}")
            return {}

        logger.info(f"Finished transforming object with ID: {standard_object.get('id', 'unknown')}")
        return standard_object
    except Exception as e:
        object_id = source.get("id", "unknown") if isinstance(source, dict) else "unknown"
        logger.error(f"Error transforming object with ID {object_id}: {e}")
        return {}

In [None]:
def remove_empty_containers(obj: Dict[str, Any], container_keys: List[str]) -> Dict[str, Any]:
    """
    Return a copy of a standard object without its empty containers.

    A key listed in container_keys is dropped when it is present and its value
    is falsy (an empty dict, None, or any other falsy value). Keys that are not
    listed are never touched, and listed keys that are absent are ignored.

    Args:
        obj (Dict[str, Any]): The standard object with potentially empty containers
        container_keys (List[str]): Keys of the containers to check for emptiness

    Returns:
        Dict[str, Any]: A shallow copy of obj with the empty containers removed;
            obj itself keeps all of its keys.
    """
    # Work on a shallow copy so the caller's dict is left as it was
    pruned = obj.copy()

    for key in container_keys:
        if key not in pruned:
            continue
        if not pruned[key]:
            pruned.pop(key)

    return pruned

In [None]:
def save_standard_objects(cleaned_objects: List[Dict[str, Any]]) -> None:
    """
    Save each cleaned standard object to a separate JSON file.

    Files are written to the directory returned by get_output_path("processed_dir")
    and named "<id>_<timestamp>.json", where the timestamp is taken once per call
    so all files of one batch share it. Objects without a usable 'id' are named
    "object_<index>". Output is UTF-8, indented, with sorted keys.

    Args:
        cleaned_objects (List[Dict[str, Any]]): List of cleaned standard objects to save

    Returns:
        None: The function does not return a value, but logs the results of the save operation.

    Raises:
        OSError: If the output directory cannot be created or a file cannot be written
        ValueError: If an element of cleaned_objects is not a dictionary
        TypeError: If an object contains values that cannot be serialized to JSON

    Note:
        Saving stops at the first failing object; files written before it remain on disk.
    """
    output_path = get_output_path("processed_dir")

    # Make sure the target directory exists before writing into it.
    # Assumes output_path is a pathlib.Path (it is combined with `/` below).
    output_path.mkdir(parents=True, exist_ok=True)

    # Generate timestamp for this batch
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    for i, obj in enumerate(cleaned_objects):
        try:
            # Validate first: calling .get() on a non-dict would surface as an
            # "unexpected" AttributeError instead of this ValueError.
            if not isinstance(obj, dict):
                raise ValueError(f"Object {i} is not a valid dictionary")

            # Fall back to the index when the ID is missing, null or empty, so
            # such objects do not all end up in the same "None_..." file.
            obj_id = obj.get("id")
            if obj_id is None or obj_id == "":
                obj_id = f"object_{i}"

            # Keep the file inside output_path even if the ID contains path separators
            safe_id = str(obj_id).replace("/", "_").replace("\\", "_")

            # Create filename with object ID and timestamp
            filename = f"{safe_id}_{timestamp}.json"
            file_path = output_path / filename

            # Write JSON file with proper formatting
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(obj, f, indent=2, ensure_ascii=False, sort_keys=True)

            logger.info(f"Successfully saved object {obj_id} to {filename}")
        except (ValueError, TypeError) as e:
            error_msg = f"Object {i} serialization error: {e}"
            logger.error(error_msg)
            raise
        except OSError as e:
            error_msg = f"File write error for object {i}: {e}"
            logger.error(error_msg)
            raise
        except Exception as e:
            error_msg = f"Unexpected error saving object {i}: {e}"
            logger.error(error_msg)
            raise

In [None]:
# Get the source objects. get_source_objects() logs the problem and returns an
# empty list when the file is missing, is not valid JSON, or does not hold a list.
source_objects = get_source_objects()

In [None]:
# Load attribute mapping from configuration file.
# Unlike get_source_objects(), this call re-raises on a missing or invalid file,
# so this cell fails when the mapping cannot be loaded.
attribute_mapping = load_attribute_mapping()

In [None]:
# Transform every source object with the loaded mapping. An object that cannot
# be transformed is represented by an empty dict ({}) in the resulting list.
standard_objects = [transform_source_object(obj, attribute_mapping) for obj in source_objects]

In [None]:
# Define the container keys that should be checked for emptiness.
# These are the containers transform_source_object() pre-initializes to {}.
container_keys = ["maritimeMetadata", "ontology", "equipment", "facility"]

# Apply the cleanup function to all standard objects; each result is a new dict,
# so standard_objects keeps its original contents.
cleaned_standard_objects = [remove_empty_containers(obj, container_keys) for obj in standard_objects]

In [None]:
# Save the cleaned standard objects to JSON files: one file per object, written
# to the "processed_dir" output path. The call raises on the first object that
# cannot be saved.
save_standard_objects(cleaned_standard_objects)