In [1]:
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional

import yaml
from rich import print as prt

# Configure standard logging for Jupyter.
# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers (unless force=True is passed), so re-running this cell in a live
# kernel will not change an existing configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    # StreamHandler() with no argument writes to sys.stderr; the records show
    # up in the notebook below the cell that emitted them.
    handlers=[logging.StreamHandler()],  # This will output to the notebook
)
# Module-level logger shared by every function defined in this notebook
logger = logging.getLogger(__name__)

In [2]:
def load_attribute_mapping(config_file: str) -> Dict[str, Dict[str, str]]:
    """
    Load and validate the attribute mapping from a YAML configuration file.

    The document must be a YAML mapping with a top-level ``attribute_mapping``
    key. That key must itself be a mapping whose values are dicts containing at
    least the ``field`` and ``container`` keys.

    Args:
        config_file (str): Path to the YAML configuration file.

    Returns:
        Dict[str, Dict[str, str]]: The attribute mapping dictionary

    Raises:
        FileNotFoundError: If no path is given or the configuration file doesn't exist
        yaml.YAMLError: If the YAML file is malformed
        KeyError: If required keys are missing from the configuration
        ValueError: If the document, the 'attribute_mapping' section, or one of
            its entries is not a mapping
    """
    try:
        if not config_file:
            raise FileNotFoundError(f"Configuration file not found: {config_file}")

        # Decode explicitly as UTF-8 instead of relying on the platform locale
        with open(config_file, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        # safe_load returns None for an empty document and can also return a
        # list or scalar; reject those before using `in` / indexing on them.
        if not isinstance(config, dict):
            raise ValueError(
                f"Invalid configuration: expected a mapping at the top level, got {type(config)}"
            )

        if "attribute_mapping" not in config:
            raise KeyError("'attribute_mapping' key not found in configuration file")

        attribute_mapping = config["attribute_mapping"]

        if not isinstance(attribute_mapping, dict):
            raise ValueError(
                f"Invalid 'attribute_mapping': expected dict, got {type(attribute_mapping)}"
            )

        # Validate the structure of every entry
        for attr_name, mapping in attribute_mapping.items():
            if not isinstance(mapping, dict):
                raise ValueError(
                    f"Invalid mapping for attribute '{attr_name}': expected dict, got {type(mapping)}"
                )

            if "field" not in mapping or "container" not in mapping:
                raise KeyError(
                    f"Missing required keys ('field', 'container') for attribute '{attr_name}'"
                )

        logger.info(f"Successfully loaded attribute mapping from {config_file}")
        return attribute_mapping

    # Each handler logs for the notebook reader and re-raises so the caller
    # still sees the original exception type.
    except FileNotFoundError as e:
        logger.error(f"Configuration file not found: {e}")
        raise
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML configuration: {e}")
        raise
    except (KeyError, ValueError) as e:
        logger.error(f"Invalid configuration structure: {e}")
        raise

In [3]:
def get_source_objects(data_path: str) -> List[Dict[str, Any]]:
    """
    Load the raw source objects from a JSON file.

    This is a best-effort loader: every failure is logged and reported to the
    caller as an empty list rather than as an exception.

    Args:
        data_path (str): Path to a JSON file whose top-level value is a list.

    Returns:
        List[Dict[str, Any]]: The decoded list, or [] if the file is missing,
            is not valid JSON, or its top-level value is not a list.
    """
    try:
        # Decode as UTF-8 explicitly; the platform default encoding (e.g. a
        # legacy code page on Windows) would otherwise corrupt or reject
        # non-ASCII text.
        with open(data_path, "r", encoding="utf-8") as f:
            source_objects = json.load(f)

        if not isinstance(source_objects, list):
            logger.error("Source objects should be a list of dictionaries.")
            return []

        logger.info("Source objects loaded successfully.")
        return source_objects
    except FileNotFoundError:
        logger.error("Source objects file not found.")
        return []
    except json.JSONDecodeError:
        logger.error("Error decoding JSON from source objects file.")
        return []

In [4]:
def extract_ism(acm: dict) -> dict:
    """
    Extract the reduced 'ism' structure from any ACM dict.

    Args:
        acm (dict): ACM mapping. ``None`` or any object without a ``get``
            method (e.g. a JSON ``null`` passed through by a caller) is
            treated as an empty mapping.

    Returns:
        dict: A new dict with the keys 'banner', 'classification',
            'ownerProducer', 'releaseableTo' and 'disseminationControls';
            each value is None when the corresponding ACM key is absent.
    """
    # Callers use `x.get("acm", {})`, which still yields None when the key is
    # present with a null value; fall back to an empty mapping in that case.
    if not hasattr(acm, "get"):
        acm = {}

    return {
        "banner": acm.get("banner"),
        "classification": acm.get("classif"),
        "ownerProducer": acm.get("owner_prod"),
        "releaseableTo": acm.get("rel_to"),
        'disseminationControls': acm.get("dissem_ctrls"),
    }

In [5]:
def extract_elevation(source_object: Dict[str, Any]) -> Optional[Any]:
    """
    Retrieves the elevation value from the source object, handling variations
    in the attribute name (e.g., "Elevation", "Elevation(m)", "Elevation (m)").

    Names are compared case-insensitively and ignoring surrounding whitespace.
    Entries that are not dicts, or whose 'attributeName' is not a string, are
    skipped so that one malformed entry cannot hide a valid elevation that
    appears later in the list.

    Args:
        source_object (Dict[str, Any]): The source JSON-like object.

    Returns:
        Optional[Any]: The first non-None elevation value found (returned
            unconverted), otherwise None. This function never raises.
    """
    # Define possible variations of the "Elevation" attribute name (lower-cased)
    elevation_variations = ("elevation", "elevation(m)", "elevation (m)")

    if not isinstance(source_object, dict):
        logger.warning(
            "Error occurred while retrieving elevation: source object must be a dictionary."
        )
        return None

    attributes = source_object.get("attributes")
    data_items = attributes.get("data") if isinstance(attributes, dict) else None
    if not isinstance(data_items, (list, tuple)):
        logger.warning(
            "Error occurred while retrieving elevation: "
            "source object does not contain the expected 'attributes.data' structure."
        )
        return None

    for attr in data_items:
        if not isinstance(attr, dict):
            continue

        attribute_name = attr.get("attributeName")
        # The key may be present with a null value; `.get(..., "")` would not
        # protect against that, so check the type explicitly.
        if not isinstance(attribute_name, str):
            continue

        attribute_value = attr.get("attributeValue")
        if attribute_name.strip().lower() in elevation_variations and attribute_value is not None:
            return attribute_value  # First usable match wins

    return None

In [6]:
def build_standard_object(target_structure: Dict[str, Any], attr_index: Dict[str, Dict], attribute_map: Dict[str, Dict[str, str]]) -> Dict[str, Any]:
    """
    Build a standard object by mapping attributes from the source data to target fields.

    This function takes a pre-initialized target_structure dictionary and populates it
    (in place) with transformed attribute values based on the provided attribute
    mapping. Each attribute value is wrapped with ISM classification metadata.

    Args:
        target_structure (Dict[str, Any]): Pre-initialized dictionary containing basic object metadata
            and empty containers (ontology, maritimeMetadata, equipment, facility)

        attr_index (Dict[str, Dict]): Index of attribute data items keyed by attribute name,
            where each item contains 'attributeValue' and 'acm' fields

        attribute_map (Dict[str, Dict[str, str]]): Mapping configuration where keys are attribute names and values are dicts with
            'field' and 'container' specifications

    Returns:
        Dict[str, Any]: The populated target_structure dictionary with mapped attributes organized
            into their designated containers, or empty dict if an error occurs

    Note:
        - Attributes mapped to "root" container are placed directly in the target_structure dict
        - Other containers are nested under their respective keys; a container that is
          missing or None is created as an empty dict first
        - Each mapped value includes the original value and ISM classification metadata
        - Missing attributes in attr_index are silently skipped
        - A null 'acm' on an item is treated as an empty ACM
        - On error the target_structure may already have been partially populated
    """
    try:
        for attr_name, mapping in attribute_map.items():
            item = attr_index.get(attr_name)

            if not item:
                continue

            target_field = mapping["field"]
            container = mapping["container"]

            transformed_value = {
                "value": item.get("attributeValue"),
                # `or {}` covers an 'acm' key that is present but null
                "ism": extract_ism(item.get("acm") or {})
            }

            if container == "root":
                target_structure[target_field] = transformed_value
            else:
                # Ensure nested container exists (missing or explicitly None)
                if target_structure.get(container) is None:
                    target_structure[container] = {}
                target_structure[container][target_field] = transformed_value

        return target_structure
    except Exception as e:
        # Use the notebook logger, like the other functions in this file
        logger.error(f"Error building standard object: {e}")
        return {}

In [7]:
def prepare_attribute_index(source: Dict[str, Any]) -> Dict[str, Dict]:
    """
    Prepare a complete attribute index including both standard attributes and top-level fields.

    Regular attributes come from ``source["attributes"]["data"]`` and are keyed
    by their 'attributeName'. Selected top-level fields of ``source`` are added
    in the same shape (``{"attributeValue": ..., "acm": ...}``) using the
    object-level ACM, so they can be mapped like any other attribute.

    A missing or null 'attributes' / 'data' value is treated as "no attributes",
    and entries in the data list that are not dicts are ignored.

    Args:
        source: Source dictionary containing object data

    Returns:
        Dict[str, Dict]: Attribute index with both regular attributes and transformed top-level fields
    """
    # Get standard attributes from data items, tolerating null/malformed levels
    attributes = source.get("attributes")
    data_items = attributes.get("data") if isinstance(attributes, dict) else None
    if not isinstance(data_items, (list, tuple)):
        data_items = []

    attr_index = {
        item.get("attributeName"): item
        for item in data_items
        if isinstance(item, dict)
    }

    # Define top-level fields to be included in attribute mapping
    # (source field name -> attribute name used as the index key)
    top_level_fields = {
        "domain": "domain",
        "allegience": "allegience",
        "allegienceAor": "allegienceAor"
        # Add other top-level fields here as needed
    }

    # Add top-level fields to attribute index with proper structure
    for source_field, attr_name in top_level_fields.items():
        if source_field in source:
            attr_index[attr_name] = {
                "attributeValue": source[source_field],
                # A null object-level ACM becomes an empty dict
                "acm": source.get("acm") or {}
            }

    return attr_index

In [8]:
def parse_location(source_object: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Processes location information from the input object's geographic data.

    Reads ``latestKnownLocation.geometry.coordinates`` (expected as a
    two-element sequence; index 0 is used as longitude and index 1 as latitude)
    and combines it with the elevation attribute found by
    ``extract_elevation``. A string elevation is converted to float; if that
    conversion fails the elevation is set to None. Null 'acm' or 'lastVerified'
    values on the location are treated as empty mappings.

    Args:
        source_object (Dict[str, Any]): The input object containing location data.

    Returns:
        Dict[str, Any]: Processed location data, or None if data is invalid.
    """
    if not isinstance(source_object, dict):
        # Checked up front so the error handler below can safely call .get()
        logger.error("Cannot parse location: source object must be a dictionary.")
        return None

    try:
        location_data = source_object.get("latestKnownLocation")
        if not location_data:
            logger.debug(f"No location data found for object {source_object.get('id')}")
            return None

        geometry_data = location_data.get("geometry")
        if not geometry_data:
            logger.debug(f"No geometry data found for object {source_object.get('id')}")
            return None

        # NOTE(review): positions with a third (altitude) element are rejected
        # here - confirm whether the source data can contain them.
        coords = geometry_data.get("coordinates")
        if not coords or len(coords) != 2:
            logger.debug(f"Invalid coordinates for object {source_object.get('id')}")
            return None

        elevation_value = extract_elevation(source_object)
        if isinstance(elevation_value, str):
            try:
                elevation_value = float(elevation_value)
            except ValueError as e:
                logger.warning(f"Error transforming elevation into float: {e}")
                elevation_value = None

        # `or {}` guards against keys that are present but null, which would
        # otherwise discard the whole location.
        location_acm = location_data.get("acm") or {}
        last_verified = location_data.get("lastVerified") or {}

        return {
            "ism": extract_ism(location_acm),
            "id": location_data.get("id"),
            "timestamp": last_verified.get("timestamp"),
            "latitude": coords[1],
            "longitude": coords[0],
            "altitude": {
                "value": None,
                "quality": None,
                "error": None,
                "units": {"value": None},
            },
            "elevation": {
                "value": elevation_value,
                "quality": None,
                "error": None,
                "units": {"value": None},
            },
            "derivation": geometry_data.get("type"),
            "quality": None,
            "locationName": None,
        }
    except Exception as e:
        logger.error(f"Unexpected error writing location values for object {source_object.get('id')}: {str(e)}")
        return None

In [9]:
def parse_ship_class_name(source_object: Dict[str, Any], standard_object: Dict[str, Any]) -> Dict[str, Any]:
    """
    Add shipClass (and shipName, when available) to maritimeMetadata for ship objects.

    An object counts as a ship when one of its attributes is named 'Echelon'
    with the value 'SHIP'. The ship class is the source object's 'className';
    the ship name is the value of the first attribute named 'Name'. Both are
    wrapped together with the ISM derived from the object-level ACM.

    Args:
        source_object (Dict[str, Any]): Input dictionary containing vessel information.
        standard_object (Dict[str, Any]): Dictionary updated in place with vessel metadata.

    Returns:
        Dict[str, Any]: The same standard_object, updated for ships and left
            untouched otherwise (including when an error is logged).
    """
    try:
        data_items = source_object.get('attributes', {}).get('data', [])

        # Look for the Echelon == SHIP marker; stop at the first hit
        is_ship = False
        for entry in data_items:
            if entry.get('attributeName') == 'Echelon' and entry.get('attributeValue') == 'SHIP':
                is_ship = True
                break

        # The first attribute called 'Name' supplies the ship name
        ship_name = None
        for entry in data_items:
            if entry.get('attributeName') == 'Name':
                ship_name = entry.get('attributeValue')
                break

        if not is_ship:
            return standard_object

        acm = source_object.get('acm', {})

        # Replace a missing or non-dict container before writing into it
        if not isinstance(standard_object.get('maritimeMetadata'), dict):
            standard_object['maritimeMetadata'] = {}
        maritime = standard_object['maritimeMetadata']

        maritime['shipClass'] = {
            "value": source_object.get('className'),
            "ism": extract_ism(acm)
        }
        if ship_name:
            maritime['shipName'] = {
                "value": ship_name,
                "ism": extract_ism(acm)
            }

        logger.debug(f"Set shipClass and shipName for object {standard_object.get('id')}")
        return standard_object
    except Exception as e:
        logger.error(f"Error parsing ship class/name for object {standard_object.get('id', 'unknown')}: {e}")
        return standard_object

In [10]:
def parse_facility_name_id(source_object: Dict[str, Any], standard_object: Dict[str, Any]) -> Dict[str, Any]:
    """
    Add facilityName (and facilityId, when available) to the 'facility' container.

    An object counts as a facility when its 'className' equals 'Facility'. The
    facility name is the source object's top-level 'name'; the facility id is
    the value of the first 'OSuffix' attribute whose value is not None. Both
    are wrapped together with the ISM derived from the object-level ACM.

    Args:
        source_object (Dict[str, Any]): Dictionary containing facility information.
        standard_object (Dict[str, Any]): Dictionary updated in place with facility metadata.

    Returns:
        Dict[str, Any]: The same standard_object, updated for facilities and
            left untouched otherwise (including when an error is logged).
    """
    try:
        data_items = source_object.get('attributes', {}).get('data', [])

        # First non-null OSuffix value, if any
        facility_id = None
        for entry in data_items:
            if entry.get('attributeName') == 'OSuffix' and entry.get('attributeValue') is not None:
                facility_id = entry.get('attributeValue')
                break

        if source_object.get('className') != 'Facility':
            return standard_object

        acm = source_object.get('acm', {})

        # Replace a missing or non-dict container before writing into it
        if not isinstance(standard_object.get('facility'), dict):
            standard_object['facility'] = {}
        facility = standard_object['facility']

        facility['facilityName'] = {
            "value": source_object.get('name'),
            "ism": extract_ism(acm)
        }
        if facility_id:
            facility['facilityId'] = {
                "value": facility_id,
                "ism": extract_ism(acm)
            }

        logger.debug(f"Set facilityName and facilityId for object {standard_object.get('id')}")
        return standard_object
    except Exception as e:
        logger.error(f"Error parsing facility name/id for object {standard_object.get('id', 'unknown')}: {e}")
        return standard_object

In [None]:
# TODO: Create functions for handling invalid classifications, sci, controls, groups, and terms (e.g. TOP SECRET)

In [11]:
def transform_source_object(source: Dict[str, Any], attribute_map: Dict[str, Dict[str, str]]) -> Dict[str, Any]:
    """
    Transform a source object into a structured format based on the provided attribute mapping.

    Args:
        source: The source dictionary containing object data with attributes, ACM, and metadata
        attribute_map: Dictionary mapping attribute names to their target field and container locations

    Returns:
        Dict containing the transformed object with structured fields including:
        - Basic metadata (version, id, name, etc.)
        - Overall classification from ACM
        - Mapped attributes organized into appropriate containers (root, ontology, maritimeMetadata, facility)

        An empty dict is returned (and an error logged) when the inputs are not
        dictionaries, when the attribute mapping step fails, or when any
        unexpected error occurs.
    """
    try:
        if not isinstance(source, dict) or not isinstance(attribute_map, dict):
            logger.error("Invalid source object or attribute map.")
            return {}

        # Special handling for createdDate. This is done because "Date Of Introduction" is an attribute nested within data.
        # If the attribute occurs more than once, the last occurrence wins.
        created_date = None
        attributes = source.get("attributes")
        data_items = attributes.get("data") if isinstance(attributes, dict) else None
        if isinstance(data_items, (list, tuple)):
            for attr in data_items:
                if isinstance(attr, dict) and attr.get("attributeName") == "Date Of Introduction":
                    created_date = attr.get("attributeValue")
        else:
            logger.warning(
                f"There was an error setting createdDate for object {source.get('id', 'unknown')}: "
                "missing 'attributes.data' list"
            )

        # Initialize target structure with basic metadata.
        # `or {}` guards against keys that are present but null.
        target_structure = {
            "version": source.get("version"),
            "overallClassification": extract_ism(source.get("acm") or {}),
            "id": source.get("id"),
            "name": source.get("name"),
            "createdDate": created_date,  # Use the extracted created date
            "lastUpdatedDate": (source.get("lastVerified") or {}).get("timestamp"),
            "excerciseIndicator": source.get("gide_id"),
            "location": parse_location(source),
            # Initialize containers
            "maritimeMetadata": {},
            "ontology": {},
            "equipment": {},
            "facility": {}
        }

        # Get complete attribute index including top-level fields
        attr_index = prepare_attribute_index(source)

        # Build the standard object
        standard_object = build_standard_object(target_structure, attr_index, attribute_map)

        # build_standard_object signals failure with {}. Stop here so the
        # bespoke parsers below do not decorate an empty shell that would then
        # be logged as a successful transform with ID 'unknown'.
        if not standard_object:
            logger.error(
                f"Error transforming object with ID {source.get('id', 'unknown')}: attribute mapping failed"
            )
            return {}

        # Apply the bespoke functions that parse maritime and facility attributes
        # (both update standard_object in place)
        parse_ship_class_name(source, standard_object)
        parse_facility_name_id(source, standard_object)

        logger.info(f"Finished transforming object with ID: {standard_object.get('id', 'unknown')}")
        return standard_object

    except Exception as e:
        logger.error(f"Error transforming object with ID {source.get('id', 'unknown')}: {e}")
        return {}

In [12]:
def remove_empty_containers(obj: Dict[str, Any], container_keys: List[str]) -> Dict[str, Any]:
    """
    Return a shallow copy of a standard object without its empty containers.

    A container is dropped when its key is listed in ``container_keys``, is
    present in the object, and its value is falsy (for example ``{}`` or
    ``None``). Keys that are not listed are never removed, and the input
    object itself is left unmodified.

    Args:
        obj (Dict[str, Any]): The standard object with potentially empty containers
        container_keys (List[str]): List of keys representing containers to check for emptiness

    Returns:
        Dict[str, Any]: The cleaned object with empty containers removed
    """
    pruned = obj.copy()

    # Collect first, then drop, so the copy is not modified while it is inspected
    empty_keys = [key for key in container_keys if key in pruned and not pruned[key]]
    for key in empty_keys:
        pruned.pop(key, None)

    return pruned

In [13]:
def save_standard_objects(output_path: str, cleaned_objects: List[Dict[str, Any]]) -> None:
    """
    Save each cleaned standard object to a separate JSON file.

    Each object is written to ``<output_path>/<id>_<timestamp>.json`` where the
    timestamp is shared by the whole batch. When an object has no usable 'id'
    (missing, None or empty string) the name ``object_<index>`` is used instead.
    The output directory is created if it does not exist. The first failure is
    logged and re-raised, so later objects in the list are not written.

    Args:
        output_path (str): Directory to write into (a str or any path-like object)
        cleaned_objects (List[Dict[str, Any]]): List of cleaned standard objects to save

    Returns:
        None: The function does not return a value, but logs the results of the save operation.

    Raises:
        OSError: If the output directory cannot be created or accessed, or a file cannot be written
        ValueError: If an element of cleaned_objects is not a dictionary
        TypeError: If an object contains values that are not JSON serializable
    """
    # Convert once so that `/` works for plain strings as well as Path objects
    output_dir = Path(output_path)
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logger.error(f"Unable to create output directory {output_dir}: {e}")
        raise

    # Generate timestamp for this batch
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    for i, obj in enumerate(cleaned_objects):
        try:
            # Validate before touching the object so a bad element is reported
            # as a ValueError rather than an AttributeError from .get()
            if not isinstance(obj, dict):
                raise ValueError(f"Object {i} is not a valid dictionary")

            # Get object ID, fallback to index if ID is missing or null
            obj_id = obj.get("id")
            if obj_id is None or obj_id == "":
                obj_id = f"object_{i}"

            # Create filename with object ID and timestamp
            filename = f"{obj_id}_{timestamp}.json"
            file_path = output_dir / filename

            # Write JSON file with proper formatting
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(obj, f, indent=2, ensure_ascii=False)

            logger.info(f"Successfully saved object {obj_id} to {filename}")
        except (ValueError, TypeError) as e:
            error_msg = f"Object {i} serialization error: {e}"
            logger.error(error_msg)
            raise
        except OSError as e:
            error_msg = f"File write error for object {i}: {e}"
            logger.error(error_msg)
            raise
        except Exception as e:
            error_msg = f"Unexpected error saving object {i}: {e}"
            logger.error(error_msg)
            raise

In [14]:
# Get the source objects.
# get_source_objects logs an error and returns [] if the file is missing, is
# not valid JSON, or does not contain a top-level list.
data_path = "../data/1_raw/source_objects.json"
source_objects = get_source_objects(data_path)

2025-08-22 10:22:14,249 - __main__ - INFO - Source objects loaded successfully.


In [15]:
# Load attribute mapping from configuration file.
# load_attribute_mapping logs and re-raises on a missing, malformed, or
# structurally invalid file, so this cell stops the run in that case.
config_file = "../config/attribute_mapping.yaml"
attribute_mapping = load_attribute_mapping(config_file)

2025-08-22 10:22:15,064 - __main__ - INFO - Successfully loaded attribute mapping from ../config/attribute_mapping.yaml


In [16]:
# Transform every source object. An object that fails to transform is
# represented by {} in the result (transform_source_object logs the error).
standard_objects = [transform_source_object(obj, attribute_mapping) for obj in source_objects]

2025-08-22 10:22:16,190 - __main__ - INFO - Finished transforming object with ID: 23456789-bcde-fghi-jklm-nopqrstuvwxy
2025-08-22 10:22:16,190 - __main__ - INFO - Finished transforming object with ID: 34567890-cdef-ghij-klmn-opqrstuvwxyz
2025-08-22 10:22:16,191 - __main__ - INFO - Finished transforming object with ID: 12345678-abcd-efgh-ijkl-mnopqrstuvwx
2025-08-22 10:22:16,191 - __main__ - INFO - Finished transforming object with ID: 59a4e7ed-778c-4b5f-824b-521fca1b9ba7
2025-08-22 10:22:16,192 - __main__ - INFO - Finished transforming object with ID: e46080c7-6af2-4a58-b4b3-a0f60f00be7a
2025-08-22 10:22:16,192 - __main__ - INFO - Finished transforming object with ID: 99b3d2c7-8e5d-4c64-97a4-102dfd93d7e9
2025-08-22 10:22:16,193 - __main__ - INFO - Finished transforming object with ID: d134eafb-59bf-4c94-8dc2-e547f98e1b21
2025-08-22 10:22:16,193 - __main__ - INFO - Finished transforming object with ID: e5c61b13-0228-4813-8851-f69044bb4454
2025-08-22 10:22:16,193 - __main__ - INFO - Fini

In [17]:
# Spot check: pretty-print the first transformed object
# (raises IndexError if no source objects were loaded)
prt(standard_objects[0])

In [None]:
# Define the container keys that should be checked for emptiness
# (the four containers initialised by transform_source_object)
container_keys = ["maritimeMetadata", "ontology", "equipment", "facility"]

# Apply the cleanup function to all standard objects; each result is a
# shallow copy, so standard_objects itself is left unchanged
cleaned_standard_objects = [remove_empty_containers(obj, container_keys) for obj in standard_objects]

In [None]:
# Spot check: pretty-print the first cleaned object to confirm empty containers were dropped
# (raises IndexError if no source objects were loaded)
prt(cleaned_standard_objects[0])

In [None]:
# Save the cleaned standard objects to JSON files
# output_path = "../data/2_processed/"
# save_standard_objects(output_path, cleaned_standard_objects)