In [1]:
import os
import json
import yaml
from loguru import logger
from databricks import sql
from datetime import datetime
from rich import print as prt
from dotenv import load_dotenv
from difflib import get_close_matches
from typing import Dict, Any, List, Optional
from jsonschema import validate, ValidationError

load_dotenv()

True

### Helper Functions

In [2]:
def fetch_all_objects(directory: str) -> List[Dict[str, Any]]:
    """
    Load all JSON files from a specified directory into a list of dictionaries.

    Files are read in ascending path order so that the position of each object
    in the returned list is the same on every run and every platform.

    Args:
        directory (str): Path to the directory containing JSON files.

    Returns:
        List[Dict[str, Any]]: Parsed contents of every regular file whose name
        ends in ".json", one entry per file.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.

    Note:
        This function mimics querying data from an API by reading local files.
        In production, replace this logic with actual API calls as needed.
    """
    # The context manager closes the directory iterator promptly; sorting makes
    # the otherwise arbitrary scandir order deterministic.
    with os.scandir(directory) as entries:
        json_paths = sorted(
            entry.path
            for entry in entries
            if entry.is_file() and entry.name.endswith(".json")
        )

    json_list = []
    for json_path in json_paths:
        # JSON text is UTF-8 by specification; do not depend on the locale default.
        with open(json_path, "r", encoding="utf-8") as json_file:
            json_list.append(json.load(json_file))

    logger.info("Source objects loaded successfully.")
    return json_list

In [3]:
def load_attribute_mapping(config_file: str) -> Dict[str, Dict[str, str]]:
    """
    Load attribute mapping configuration from a YAML file.

    Args:
        config_file (str): Path to the YAML configuration file.

    Returns:
        Dict[str, Dict[str, str]]: The attribute mapping dictionary

    Raises:
        FileNotFoundError: If the configuration file doesn't exist
        yaml.YAMLError: If the YAML file is malformed
        KeyError: If required keys are missing from the configuration
        ValueError: If the mapping, or one of its entries, is not a dictionary
    """
    try:
        if not config_file:
            raise FileNotFoundError(f"Configuration file not found: {config_file}")

        with open(config_file, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        # An empty document parses to None and a scalar/list document has no
        # keys; both are reported as a missing key instead of a TypeError.
        if not isinstance(config, dict) or "attribute_mapping" not in config:
            raise KeyError("'attribute_mapping' key not found in configuration file")

        attribute_mapping = config["attribute_mapping"]
        if not isinstance(attribute_mapping, dict):
            raise ValueError(
                f"Invalid 'attribute_mapping': expected dict, got {type(attribute_mapping)}"
            )

        # Validate the structure
        for attr_name, mapping in attribute_mapping.items():
            if not isinstance(mapping, dict):
                raise ValueError(
                    f"Invalid mapping for attribute '{attr_name}': expected dict, got {type(mapping)}"
                )

            if "field" not in mapping or "container" not in mapping:
                raise KeyError(
                    f"Missing required keys ('field', 'container') for attribute '{attr_name}'"
                )

        logger.info(f"Successfully loaded attribute mapping from {config_file}")
        return attribute_mapping

    except FileNotFoundError as e:
        logger.error(f"Configuration file not found: {e}")
        raise
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML configuration: {e}")
        raise
    except (KeyError, ValueError) as e:
        logger.error(f"Invalid configuration structure: {e}")
        raise

In [4]:
def load_valid_attribute_names(file_path: str) -> list:
    """
    Loads the list of valid attribute names from a YAML file.

    This is a best-effort loader: every failure is logged and reported to the
    caller as an empty list instead of an exception.

    Args:
    - file_path (str): Path to the YAML file containing valid attribute names.

    Returns:
    - list: The value of the top-level ``valid_attribute_names`` key, or an
      empty list when the file cannot be read or parsed, the document is not a
      mapping, or the key is absent, null, or not a list.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = yaml.safe_load(file)
    except Exception as e:
        logger.error(f"Error loading valid attribute names: {e}")
        return []

    # An empty file parses to None; a scalar or list document has no keys.
    if not isinstance(data, dict):
        logger.error(
            f"Error loading valid attribute names: expected a mapping in '{file_path}', "
            f"got {type(data).__name__}"
        )
        return []

    names = data.get("valid_attribute_names")
    if names is None:
        # Key missing or explicitly null: callers always receive a list.
        names = []
    if not isinstance(names, list):
        logger.error(
            "Error loading valid attribute names: 'valid_attribute_names' must be a list, "
            f"got {type(names).__name__}"
        )
        return []

    logger.info("Loaded valid attribute names successfully.")
    return names

In [5]:
def load_classification_config(config_path: str) -> Dict[str, Any]:
    """
    Load the classification restriction rules from a YAML file.

    Args:
        config_path (str): Path to the YAML classification configuration.

    Returns:
        Dict[str, Any]: The value stored under the top-level ``restrictions`` key.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        ValueError: If the YAML cannot be parsed, or the document is empty or
            has no ``restrictions`` key.
    """
    try:
        with open(config_path, "r", encoding="utf-8") as file:
            config = yaml.safe_load(file)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Configuration file not found: {config_path}") from e
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing YAML file: {e}") from e

    # An empty file parses to None; report it like any other missing key
    # instead of failing with a TypeError on the membership test.
    if not isinstance(config, dict) or "restrictions" not in config:
        raise ValueError("Missing 'restrictions' key in classification config")

    logger.info("Classification configuration loaded successfully.")
    return config["restrictions"]

In [6]:
def load_standard_object_schema(schema_path: str) -> dict:
    """
    Read a JSON schema document from disk.

    Args:
        schema_path (str): Location of the JSON schema file.

    Returns:
        dict: The parsed schema, or an empty dict when the file is missing or
        does not contain valid JSON (both cases are logged as errors).
    """
    loaded_schema: dict = {}

    try:
        with open(schema_path, "r") as handle:
            loaded_schema = json.loads(handle.read())
        logger.info(f"Schema loaded successfully from {schema_path}")
    except FileNotFoundError:
        logger.error(f"Schema file not found at {schema_path}")
    except json.JSONDecodeError as decode_error:
        logger.error(f"Invalid JSON in schema file: {decode_error}")

    # Still the initial empty dict when either handler above ran.
    return loaded_schema

In [7]:
def save_standard_objects(
    output_path: str, cleaned_objects: List[Dict[str, Any]]
) -> None:
    """
    Save each cleaned standard object to a separate JSON file.

    Each file is named ``<object id>_<timestamp>.json``, where the timestamp is
    taken once per call so that every file of a batch shares it. Objects
    without an ``id`` key fall back to ``object_<index>``. The output directory
    is created when it does not exist yet.

    Args:
        output_path (str): Directory the JSON files are written to.
        cleaned_objects (List[Dict[str, Any]]): List of cleaned standard objects to save

    Returns:
        None: The function does not return a value, but logs the results of the save operation.

    Raises:
        ValueError: If an entry of ``cleaned_objects`` is not a dictionary
        TypeError: If an object contains values that cannot be serialized to JSON
        OSError: If the output directory cannot be created or accessed
    """
    # Generate timestamp for this batch
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # An empty path means "current directory", which needs no creation.
    if output_path:
        try:
            os.makedirs(output_path, exist_ok=True)
        except OSError as e:
            logger.error(f"Could not create output directory '{output_path}': {e}")
            raise

    for i, obj in enumerate(cleaned_objects):
        try:
            # Check the type before touching the object so that a bad entry is
            # reported as the documented ValueError, not an AttributeError.
            if not isinstance(obj, dict):
                raise ValueError(f"Object {i} is not a valid dictionary")

            # Get object ID, fallback to index if ID is missing
            obj_id = obj.get("id", f"object_{i}")

            # Create filename with object ID and timestamp
            filename = f"{obj_id}_{timestamp}.json"
            file_path = os.path.join(output_path, filename)

            # Serialize before opening the file so that a non-serializable
            # object does not leave a truncated file behind.
            payload = json.dumps(obj, indent=2)

            with open(file_path, "w", encoding="utf-8") as f:
                f.write(payload)

            logger.info(f"Successfully saved object {obj_id} to {filename}")
        except (ValueError, TypeError) as e:
            logger.error(f"Object {i} serialization error: {e}")
            raise
        except OSError as e:
            logger.error(f"File write error for object {i}: {e}")
            raise
        except Exception as e:
            logger.error(f"Unexpected error saving object {i}: {e}")
            raise

### Attribute Name Drift Detector Functions

In [8]:
def capture_unexpected_attributes(
    source_objects: List[Dict[str, Any]], valid_attribute_names: List[str]
) -> Dict[str, Any]:
    """
    Captures unexpected attribute names from source objects.

    Every entry of each object's ``attributes.data`` list is inspected, and any
    ``attributeName`` that is not in ``valid_attribute_names`` is recorded.
    Objects whose ``attributes`` or ``attributes.data`` is missing or null
    contribute no attributes.

    Args:
    - source_objects (List[Dict[str, Any]]): A list of JSON objects to check.
    - valid_attribute_names (List[str]): A list of valid attribute names.

    Returns:
    - Dict[str, Any]: Dictionary with the keys
      ``unexpected_attributes`` (name -> {"count", "objects"}),
      ``objects_with_issues`` (one entry per affected object),
      ``total_attributes_checked`` and ``total_objects_checked``.
    """
    # Hash lookup instead of a linear scan per attribute; fall back to the
    # original sequence if it holds unhashable entries.
    try:
        valid_lookup = frozenset(valid_attribute_names)
    except TypeError:
        valid_lookup = valid_attribute_names

    unexpected_attributes = {}
    total_attributes_checked = 0
    objects_with_issues = []

    for obj_index, json_object in enumerate(source_objects):
        # `or` also covers keys that are present but explicitly null.
        attributes = (json_object.get("attributes") or {}).get("data") or []
        object_id = json_object.get("id", "unknown")
        object_unexpected = []

        for attr_index, attribute in enumerate(attributes):
            total_attributes_checked += 1
            attribute_name = attribute.get("attributeName")

            if attribute_name in valid_lookup:
                continue

            details = unexpected_attributes.setdefault(
                attribute_name, {"count": 0, "objects": []}
            )
            details["count"] += 1
            details["objects"].append(
                {
                    "object_index": obj_index,
                    "attribute_index": attr_index,
                    "object_id": object_id,
                }
            )
            object_unexpected.append(attribute_name)

        if object_unexpected:
            objects_with_issues.append(
                {
                    "object_index": obj_index,
                    "object_id": object_id,
                    "unexpected_attributes": object_unexpected,
                }
            )

    return {
        "unexpected_attributes": unexpected_attributes,
        "objects_with_issues": objects_with_issues,
        "total_attributes_checked": total_attributes_checked,
        "total_objects_checked": len(source_objects),
    }

In [9]:
def add_fuzzy_matching(
    unexpected_attributes: Dict[str, Any], valid_attribute_names: List[str]
) -> None:
    """
    Adds fuzzy matching suggestions to unexpected attributes.

    The dictionary is modified in place: every entry gains a
    ``similar_valid_names`` list holding up to three valid names whose
    similarity ratio to the unexpected name is at least 0.6.

    Args:
    - unexpected_attributes (Dict[str, Any]): Dictionary of unexpected attributes to enhance.
    - valid_attribute_names (List[str]): List of valid attribute names for matching.
    """
    # difflib compares sequences, so only string candidates can be matched.
    candidates = [name for name in valid_attribute_names if isinstance(name, str)]

    for unexpected_name, details in unexpected_attributes.items():
        if isinstance(unexpected_name, str):
            similar_names = get_close_matches(
                unexpected_name, candidates, n=3, cutoff=0.6
            )
        else:
            # A missing attributeName is recorded under a non-string key such
            # as None; get_close_matches would raise TypeError on it.
            similar_names = []
        details["similar_valid_names"] = similar_names

In [10]:
def save_analysis_report(
    analysis_results: Dict[str, Any],
    attribute_report_path: str,
) -> bool:
    """
    Write the attribute analysis to a YAML report file.

    The report contains a short numeric summary, the list of unexpected
    attribute names, the per-attribute findings, and the affected objects.

    Args:
    - analysis_results (Dict[str, Any]): Results from the attribute analysis.
    - attribute_report_path (str): Path to save the report.

    Returns:
    - bool: True if the report was written, False if anything went wrong
      (the failure is logged, never raised).
    """
    saved = False

    try:
        summary = {
            "total_objects_checked": analysis_results["total_objects_checked"],
            "objects_with_issues": len(analysis_results["objects_with_issues"]),
            "unique_unexpected_attributes": len(
                analysis_results["unexpected_attributes"]
            ),
        }
        findings = analysis_results["unexpected_attributes"]

        report_data = dict(
            analysis_summary=summary,
            unexpected_attribute_names=list(findings.keys()),
            detailed_findings=findings,
            affected_objects=analysis_results["objects_with_issues"],
        )

        with open(attribute_report_path, "w") as report_file:
            yaml.dump(report_data, report_file, default_flow_style=False)

        logger.info(f"Analysis report saved to '{attribute_report_path}'")
        saved = True
    except Exception as e:
        logger.error(f"Error saving analysis report: {e}")

    return saved

In [11]:
def detect_unexpected_attribute_names(
    source_objects: List[Dict[str, Any]],
    valid_attribute_names: List[str],
    attribute_report_path: str,
) -> Dict[str, Any]:
    """
    Detect attribute names that are not on the valid list, log them, and write a report.

    When at least one unexpected name is found, fuzzy-match suggestions are
    attached to the findings, each finding is logged, and the detailed report
    is saved to ``attribute_report_path``. When nothing is found, no report is
    written.

    Args:
    - source_objects (List[Dict[str, Any]]): A list of JSON objects to check.
    - valid_attribute_names (List[str]): A list of valid attribute names.
    - attribute_report_path (str): Path to save the detailed report.

    Returns:
    - Dict[str, Any]: Counts of objects checked, attributes checked, objects
      with issues, and distinct unexpected attribute names.
    """
    findings = capture_unexpected_attributes(source_objects, valid_attribute_names)
    unexpected = findings["unexpected_attributes"]

    if not unexpected:
        logger.info("No unexpected attribute names detected.")
    else:
        # Attach suggestions first so the log lines below can show them.
        add_fuzzy_matching(unexpected, valid_attribute_names)

        logger.warning(
            f"Found {len(unexpected)} unexpected attribute names "
            f"across {len(findings['objects_with_issues'])} objects"
        )

        for name, info in unexpected.items():
            logger.warning(
                f"Unexpected attribute: '{name}' (found {info['count']} times)"
            )
            suggestions = info.get("similar_valid_names")
            if suggestions:
                logger.info(f"  Possible matches: {', '.join(suggestions)}")

        save_analysis_report(findings, attribute_report_path)

    summary = {
        "total_objects_checked": findings["total_objects_checked"],
        "total_attributes_checked": findings["total_attributes_checked"],
        "objects_with_issues": len(findings["objects_with_issues"]),
        "unique_unexpected_attributes": len(findings["unexpected_attributes"]),
    }
    return summary

### Preprocessor Functions

In [12]:
def _validate_attributes(attributes: Dict[str, Any]) -> None:
    """
    Validate the structure of an object's ``attributes`` container.

    Only the first entry of ``attributes["data"]`` is inspected for the
    required fields; later entries are not checked here.

    Args:
        attributes: The ``attributes`` dictionary of a raw object.

    Raises:
        ValueError: If ``attributes`` is not a dict, ``data`` is missing, empty
            or not a list, or the first entry is not a dict carrying both
            ``attributeName`` and ``attributeValue``.
    """
    if not isinstance(attributes, dict):
        raise ValueError(
            f"Invalid attribute type: expected dict, got {type(attributes)}"
        )

    required_fields = ["attributeName", "attributeValue"]

    # .get() so that a missing key is reported as the ValueError callers catch
    # rather than escaping as a KeyError.
    data = attributes.get("data")
    if not data:
        raise ValueError("Missing `data` in `attributes`")
    if not isinstance(data, (list, tuple)):
        raise ValueError(f"Invalid `data` type: expected list, got {type(data)}")

    first_entry = data[0]
    if not isinstance(first_entry, dict):
        raise ValueError(
            f"Invalid attribute entry type: expected dict, got {type(first_entry)}"
        )

    missing_fields = [k for k in required_fields if k not in first_entry]
    if missing_fields:
        raise ValueError(f"Missing required fields: {missing_fields}")
    return None

In [13]:
def _validate_acm(acm: Dict[str, Any]) -> bool:
    """
    Validate the ACM structure.

    Args:
        acm: The ``acm`` container of a raw object.

    Returns:
        bool: True when ``acm`` is a dict containing both ``portion`` and
        ``banner``; False (with a warning logged) otherwise.
    """
    required_fields = ["portion", "banner"]

    # A null or otherwise non-dict ACM is invalid, not a reason to crash on
    # the membership tests below.
    if not isinstance(acm, dict):
        logger.warning(f"Invalid ACM type: expected dict, got {type(acm).__name__}")
        return False

    missing = [f for f in required_fields if f not in acm]
    if missing:
        logger.warning(f"Missing ACM fields: {missing}")
        return False

    return True

In [14]:
def _validate_required_fields(obj: Dict[str, Any]) -> bool:
    """
    Check that a raw object carries an id, a valid ACM, and attributes.

    The checks run in that order and stop at the first failure, which is
    logged before False is returned.
    """
    is_valid = False

    if not obj.get("id"):
        logger.warning("Raw object is missing 'id' attribute")
    elif not _validate_acm(obj.get("acm", {})):
        logger.error(f"Failed ACM validation for object {obj.get('id')}")
    elif not obj.get("attributes"):
        logger.warning(f"No attributes found for object {obj.get('id')}")
    else:
        is_valid = True

    return is_valid

In [15]:
def handle_special_cases_raw(raw_object: Dict[str, Any]) -> None:
    """
    Handle special cases for attributes that need custom processing logic on raw objects.

    This function modifies the raw object's ``attributes.data`` list in place:
    - Target Restriction: a non-null value is converted to a boolean. String
      spellings of false ("false", "no", "0", "off", ...) become False; any
      other non-empty string becomes True.
    - Military Symbology Code: the attribute is removed from the list when its
      value is None or a string whose length is not 15.

    Entries that are not dictionaries, or have no attribute name, are left
    untouched. Errors are logged, never raised.

    Args:
        raw_object: The raw object containing attributes.data to process
    """
    # bool() alone treats every non-empty string, including "false", as True.
    false_strings = {"", "false", "f", "no", "n", "0", "off"}

    try:
        # `or` also covers keys that are present but explicitly null.
        attributes_data = (raw_object.get("attributes") or {}).get("data") or []

        # Use a list to track attributes to remove (to avoid modifying list during iteration)
        attributes_to_remove = []

        for i, attr in enumerate(attributes_data):
            if not isinstance(attr, dict):
                # One malformed entry must not abort processing of the rest.
                continue

            attr_name = attr.get("attributeName")
            attr_value = attr.get("attributeValue")

            if not attr_name:
                continue

            # Handle Target Restriction - ensure boolean value
            if attr_name == "Target Restriction":
                if attr_value is not None:
                    if isinstance(attr_value, str):
                        attr["attributeValue"] = (
                            attr_value.strip().lower() not in false_strings
                        )
                    else:
                        attr["attributeValue"] = bool(attr_value)
                    logger.debug(
                        f"Converted Target Restriction to boolean: {attr['attributeValue']}"
                    )

            # Handle Military Symbology Code - validate length
            elif attr_name == "Military Symbology Code":
                if attr_value is None or (
                    isinstance(attr_value, str) and len(attr_value) != 15
                ):
                    # Mark this attribute for removal instead of setting to None
                    attributes_to_remove.append(i)
                    logger.debug(
                        f"Removing invalid Military Symbology Code (length: {len(attr_value) if attr_value else 'None'})"
                    )

        # Remove invalid attributes in reverse order to maintain indices
        for i in reversed(attributes_to_remove):
            attributes_data.pop(i)

    except Exception as e:
        logger.error(
            f"Error handling special cases for object {raw_object.get('id', 'unknown')}: {e}"
        )

In [16]:
def preprocess_raw_data(raw_objects: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Preprocess and validate raw objects

    Each object must pass the required-field check and the attribute structure
    check; objects that fail either are logged and skipped. Surviving objects
    are then passed to the special-case handler, which modifies them in place.

    Args:
        raw_objects: List of raw input objects

    Returns:
        List of the objects that passed validation, in input order. These are
        the same object instances that were passed in, not copies.
    """
    processed_data = []

    for obj in raw_objects:
        # 1. First validate required fields
        if not _validate_required_fields(obj):
            continue

        # 2. Then validate attributes
        attributes = obj.get("attributes", {})

        try:
            _validate_attributes(attributes)
        except (ValueError, KeyError, TypeError, AttributeError, IndexError) as e:
            # A structurally broken object (e.g. no `data` key) is skipped like
            # any other invalid object instead of aborting the whole batch.
            logger.error(
                f"Attribute validation failed for object {obj.get('id')}: {str(e)}"
            )
            continue

        # 3. Handle special cases for attribute processing
        handle_special_cases_raw(obj)

        # 4. If all validations pass, format and add to processed data
        processed_data.append(obj)
    return processed_data

In [None]:
def prepare_dates(source_objects: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Converts date strings in 'lastVerified.timestamp' and 'Date Of Introduction' attributes to Unix timestamps.

    Recognised formats are ``YYYY-MM-DDTHH:MM:SS.ffffffZ``,
    ``YYYY-MM-DDTHH:MM:SSZ`` and ``YYYY-MM-DD``. All of them are interpreted as
    UTC, so the result does not depend on the time zone of the machine running
    the notebook. Values that are not strings, or do not match any format, are
    left unchanged.

    Args:
        source_objects: Objects to update. They are modified in place.

    Returns:
        The same list that was passed in.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    def to_unix(date_str: str) -> Optional[int]:
        """Parse a supported date string as UTC; None when no format matches."""
        for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%d"):
            try:
                parsed = datetime.strptime(date_str, fmt)
            except (ValueError, TypeError):
                continue
            # strptime yields a naive datetime and .timestamp() would read it
            # as local time; the trailing "Z" in the inputs means UTC.
            return int(parsed.replace(tzinfo=timezone.utc).timestamp())
        return None

    for obj in source_objects:
        # lastVerified.timestamp
        last_verified = obj.get("lastVerified")
        if isinstance(last_verified, dict):
            ts = last_verified.get("timestamp")
            if isinstance(ts, str):
                unix_ts = to_unix(ts)
                if unix_ts is not None:
                    last_verified["timestamp"] = unix_ts

        # TODO: Once I implement the fuzzy matching logic, this can be further simplified
        # Date Of Introduction in attributes
        for attr in (obj.get("attributes") or {}).get("data") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = attr.get("attributeName")
            if (
                isinstance(attr_name, str)
                and attr_name.strip().lower() == "date of introduction"
            ):
                date_str = attr.get("attributeValue")
                if isinstance(date_str, str):
                    unix_ts = to_unix(date_str)
                    if unix_ts is not None:
                        attr["attributeValue"] = unix_ts

    logger.info("Dates prepared successfully.")
    return source_objects

### Classification Restriction Functions

In [18]:
def is_classif_too_high(ism: Dict[str, Any], config: Dict[str, Any]) -> bool:
    """
    Return True if ISM is too highly classified or contains forbidden controls/terms.

    An ISM is rejected when its classification is "TS", when it shares any
    entry with ``config["forbidden_sci"]`` or ``config["forbidden_controls"]``,
    or when its banner contains one of ``config["forbidden_terms"]``. The
    banner comparison ignores case on both sides.

    Args:
        ism: The security marking to inspect.
        config: Classification configuration providing the three forbidden lists.

    Returns:
        bool: True when the ISM must be withheld. An empty ISM returns False
        after logging a warning, because nothing can be determined from it.
    """
    if not ism:
        logger.warning("ISM is empty, cannot determine classification level.")
        return False

    if ism.get("classification") == "TS":
        return True

    # `or` treats an explicit null the same as a missing key.
    if set(ism.get("sciControls") or []) & set(config["forbidden_sci"]):
        return True

    if set(ism.get("disseminationControls") or []) & set(config["forbidden_controls"]):
        return True

    # Upper-case both sides: upper-casing only the banner would let a term
    # configured in lower case slip through unmatched.
    banner = str(ism.get("banner") or "").upper()
    return any(str(term).upper() in banner for term in config["forbidden_terms"])

In [19]:
def is_more_restrictive(
    ism1: Dict[str, Any], ism2: Dict[str, Any], config: Dict[str, Any]
) -> bool:
    """
    Decide whether ``ism1`` is more restrictive than ``ism2``.

    The criteria are applied in order and the first one that differs decides:
    1. An "FGI..." entry in ``sciControls``.
    2. "NOFORN" in ``disseminationControls``.
    3. When both carry "REL": fewer ``config["special_groups"]`` among
       ``releasableTo``, then fewer ``releasableTo`` entries overall.
    4. Otherwise the higher rank in ``config["classifications"]``
       (missing classification counts as "U", unknown ranks as 0).

    Returns False when either marking is empty.
    """
    if not (ism1 and ism2):
        return False

    def _carries_fgi(marking: Dict[str, Any]) -> bool:
        return any(
            control.startswith("FGI") for control in marking.get("sciControls", [])
        )

    # 1. FGI controls
    first_fgi, second_fgi = _carries_fgi(ism1), _carries_fgi(ism2)
    if first_fgi != second_fgi:
        return first_fgi

    # 2. NOFORN
    first_dissem = ism1.get("disseminationControls", [])
    second_dissem = ism2.get("disseminationControls", [])
    first_noforn = "NOFORN" in first_dissem
    second_noforn = "NOFORN" in second_dissem
    if first_noforn != second_noforn:
        return first_noforn

    # 3. REL controls
    if "REL" in first_dissem and "REL" in second_dissem:
        first_release = set(ism1.get("releasableTo", []))
        second_release = set(ism2.get("releasableTo", []))
        special = set(config["special_groups"])
        first_groups = first_release & special
        second_groups = second_release & special
        if first_groups == second_groups:
            # Same special groups: the shorter release list is more restrictive.
            return len(first_release) < len(second_release)
        return len(first_groups) < len(second_groups)

    # 4. Classification hierarchy
    levels = config["classifications"]
    first_level = levels.get(ism1.get("classification", "U"), 0)
    second_level = levels.get(ism2.get("classification", "U"), 0)
    return first_level > second_level

In [20]:
def find_most_restrictive_valid_ism(
    obj: Dict[str, Any], config: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    """
    Traverse a nested object to find the most restrictive valid ISM (Information Security Marking).

    The function searches through all dictionaries and lists within the provided object,
    identifies ISMs that are not too highly classified (using is_classif_too_high),
    and returns the most restrictive valid ISM according to the is_more_restrictive function.
    ``ism`` values that are empty or not dictionaries are ignored.

    Args:
        obj (Dict[str, Any]): The object to search for ISMs.
        config (Dict[str, Any]): The classification configuration dictionary.

    Returns:
        Optional[Dict[str, Any]]: A shallow copy of the most restrictive valid ISM found,
        or None if no valid ISM exists.
    """
    most_restrictive = None
    stack = [obj]  # Use a stack to traverse the object hierarchy

    while stack:
        item = stack.pop()

        if isinstance(item, dict):
            ism = item.get("ism")
            # No special case for "TS" is needed here: is_classif_too_high
            # already rejects it, so such an ISM never passes this filter.
            if (
                isinstance(ism, dict)
                and ism
                and not is_classif_too_high(ism, config)
            ):
                if most_restrictive is None or is_more_restrictive(
                    ism, most_restrictive, config
                ):
                    most_restrictive = ism.copy()

            # Add all dictionary values to the stack
            stack.extend(item.values())
        elif isinstance(item, list):
            # Add all list items to the stack
            stack.extend(item)

    return most_restrictive

In [21]:
def apply_restrictions(
    standard_object: Dict[str, Any], config: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    """
    Recursively process a standard object to redact data that is too highly classified,
    according to the provided classification configuration.

    The function finds the most restrictive valid ISM (Information Security Marking) within the object,
    then traverses all nested dictionaries and lists. Every dictionary whose own ``ism`` is
    considered too high is replaced by ``None`` in the result (the key or list position is kept).
    The processed object includes an 'overallClassification' field set to the most restrictive
    valid ISM found. The input object is not modified; a new structure is built.

    Args:
        standard_object (Dict[str, Any]): The object to process and apply restrictions to.
        config (Dict[str, Any]): The classification configuration dictionary.

    Returns:
        Optional[Dict[str, Any]]: The processed object with restricted data redacted, or None if
        no valid ISM is found or the object's own top-level ISM is too high.
    """
    # Find the most restrictive valid ISM in the object
    most_restrictive_ism = find_most_restrictive_valid_ism(standard_object, config)

    if not most_restrictive_ism:
        # If no valid ISM is found, return None
        logger.warning("No valid ISM found for object")
        return None

    def process_item(item: Any) -> Any:
        if isinstance(item, dict):
            # If the item has an ISM, check if it is too high
            if "ism" in item and is_classif_too_high(item["ism"], config):
                # Never write the item itself to the log: it is exactly the
                # data that is being withheld.
                logger.debug(
                    "Removing item due to high classification (content withheld from log)"
                )
                return None

            # Process all key-value pairs in the dictionary
            return {k: process_item(v) for k, v in item.items()}
        if isinstance(item, list):
            # Process all items in the list
            return [process_item(x) for x in item]

        return item  # Return the item as is if it's neither a dict nor a list

    # Process the object and add the overall classification
    processed_object = process_item(standard_object)
    if isinstance(processed_object, dict):
        processed_object["overallClassification"] = most_restrictive_ism

    return processed_object

### Validate the final standard object against the schema
* Run a validation function to confirm that the final parsed object conforms to the `standard_object_schema`.

In [22]:
def validate_standard_object(
    standard_object: dict,
    schema: dict,
) -> bool:
    """
    Check one standard object against a JSON schema and log a detailed diagnosis on failure.

    Args:
        standard_object (dict): The processed standard object to validate
        schema (dict): The already-loaded JSON schema to validate against

    Returns:
        bool: True if validation passes, False otherwise (schema violations and
        unexpected errors are both logged and reported as False)
    """
    try:
        validate(instance=standard_object, schema=schema)

        logger.info("VALIDATION PASSED: Standard object conforms to schema")
        logger.info(f"   - Object ID: {standard_object.get('id', 'Unknown')}")
        return True

    except ValidationError as e:
        logger.error("VALIDATION FAILED: Object does not conform to schema")
        logger.error(f"Object ID: {standard_object.get('id', 'Unknown')}")

        error_path = e.absolute_path
        location = " -> ".join(map(str, error_path)) if error_path else "Root level"
        logger.error(f"   Error Located: {location}")
        logger.error(f"   Error Message: {e.message}")

        # Walk the error path into the object to show the offending value.
        if e.absolute_path:
            cursor = standard_object
            try:
                for step in e.absolute_path:
                    cursor = cursor[step]
                logger.warning(f"   Failing Value: {cursor}")
                logger.warning(f"   Value Type: {type(cursor).__name__}")
            except (KeyError, TypeError, IndexError):
                logger.error("   Could not retrieve failing value")

        # Report the part of the schema that was violated, when it is available.
        failed_schema = getattr(e, "schema", None)
        if isinstance(failed_schema, dict):
            for key, label in (("type", "Expected Type"), ("required", "Required Fields")):
                if key in failed_schema:
                    logger.warning(f"   {label}: {failed_schema[key]}")

        return False

    except Exception as e:
        logger.error("VALIDATION FAILED: Unexpected error during validation")
        logger.error(f"   Error Type: {type(e).__name__}")
        logger.error(f"   Error Message: {str(e)}")
        return False

In [23]:
def _empty_validation_summary() -> Dict[str, Any]:
    """Build the summary returned when nothing could be validated."""
    return {
        "all_valid": False,
        "total_objects": 0,
        "valid_count": 0,
        "failed_count": 0,
        "failed_objects": [],
        "validation_results": [],
    }


def _describe_failed_object(obj: Any, index: int) -> Dict[str, Any]:
    """Identify a failed object by index, id and name, tolerating non-dict entries."""
    source = obj if isinstance(obj, dict) else {}
    return {
        "index": index,
        "id": source.get("id", "Unknown"),
        "name": source.get("name", "Unknown"),
    }


def run_validations(
    cleaned_objects: List[Dict[str, Any]],
    schema_path: str,
) -> Dict[str, Any]:
    """
    Validate all standard objects against the JSON schema.

    Args:
        cleaned_objects: List of cleaned standard objects to validate
        schema_path: Path to the JSON schema file

    Returns:
        Dict with the keys ``all_valid``, ``total_objects``, ``valid_count``,
        ``failed_count``, ``failed_objects`` and ``validation_results``. The
        same keys are present when the schema cannot be loaded or there is
        nothing to validate; in those cases ``all_valid`` is False and all
        counts are zero.
    """
    schema = load_standard_object_schema(schema_path)
    if not schema:
        logger.warning("Cannot validate objects without a valid schema.")
        return _empty_validation_summary()

    if not cleaned_objects:
        logger.warning("No standard objects to validate.")
        return _empty_validation_summary()

    logger.info(f"Validating {len(cleaned_objects)} standard objects...")

    validation_results = []
    failed_objects = []

    for i, obj in enumerate(cleaned_objects):
        try:
            is_valid = validate_standard_object(obj, schema)
            validation_results.append(is_valid)

            if not is_valid:
                failed_objects.append(_describe_failed_object(obj, i))
        except Exception as e:
            logger.error(f"Unexpected error validating object {i}: {e}")
            validation_results.append(False)
            # The object may be the reason for the error (e.g. not a dict), so
            # it is described without assuming it has a .get() method.
            failure = _describe_failed_object(obj, i)
            failure["error"] = str(e)
            failed_objects.append(failure)

    # Calculate summary
    total_objects = len(cleaned_objects)
    valid_count = sum(validation_results)
    failed_count = len(failed_objects)
    all_valid = all(validation_results)

    # Log summary
    if all_valid:
        logger.info(
            f"ALL VALIDATION PASSED: {total_objects}/{total_objects} objects conform to schema"
        )
        logger.info("Ready to proceed with all valid standard objects.")
    else:
        logger.warning(" VALIDATION SUMMARY:")
        logger.warning(f"   Total objects: {total_objects}")
        logger.warning(f"   Valid objects: {valid_count}")
        logger.warning(f"   Failed objects: {failed_count}")
        logger.warning(f"   Success rate: {(valid_count/total_objects)*100:.1f}%")

        # Log details of failed objects (limit to first 10 to avoid spam)
        max_display = 10
        logger.warning(
            f"Failed object details (showing first {min(failed_count, max_display)}):"
        )
        for i, failed in enumerate(failed_objects[:max_display]):
            error_msg = f" - Error: {failed['error']}" if "error" in failed else ""
            logger.warning(
                f"   {i+1}. Index {failed['index']}: {failed['id']} ({failed['name']}){error_msg}"
            )

        if failed_count > max_display:
            logger.warning(
                f"   ... and {failed_count - max_display} more failed objects"
            )

    return {
        "all_valid": all_valid,
        "total_objects": total_objects,
        "valid_count": valid_count,
        "failed_count": failed_count,
        "failed_objects": failed_objects,
        "validation_results": validation_results,
    }

### Parser Functions

In [24]:
def is_empty_container(container: Any) -> bool:
    """
    Decide whether a value carries no data.

    ``None`` is empty. A dict is empty when every one of its values is empty
    and a list is empty when every one of its items is empty, which also makes
    ``{}`` and ``[]`` empty. Every other value (strings, numbers, booleans,
    including ``""`` and ``0``) counts as non-empty.

    Args:
        container (Any): The value to inspect

    Returns:
        bool: True when the value is None or a dict/list holding only empty values

    Raises:
        ValueError: If anything goes wrong while walking the structure
    """
    try:
        if container is None:
            return True

        if isinstance(container, dict):
            children = container.values()
        elif isinstance(container, list):
            children = container
        else:
            # Scalars and any other type are treated as real data
            return False

        # One non-empty child is enough to make the whole container non-empty
        for child in children:
            if not is_empty_container(child):
                return False
        return True
    except Exception as e:
        logger.error(f"Error checking if container is empty: {e}")
        raise ValueError("Invalid container structure")


def fix_container_types(objects: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Normalise selected containers from dicts to arrays.

    For each object, the ``location``, ``equipment`` and ``provenance`` entries
    are inspected. A non-empty dict is wrapped in a single-element list, an
    empty dict becomes an empty list, and anything else (missing key, list,
    None, ...) is left untouched.

    The objects are modified in place; the same list is returned for convenience.

    Args:
        objects: List of standard objects to fix

    Returns:
        The same list, with the listed containers converted where needed
    """
    # Containers that are converted from dict to array; extend as needed
    array_containers = {"location", "equipment", "provenance"}

    for record in objects:
        for key in array_containers:
            if key not in record:
                continue
            current = record[key]
            if not isinstance(current, dict):
                continue
            # Populated dict -> one-element array, empty dict -> empty array
            record[key] = [current] if current else []

    return objects


def clean_object(obj: Dict[str, Any]) -> Dict[str, Any]:
    """
    Recursively strip empty entries from a nested structure.

    Dict values and list items for which ``is_empty_container`` returns True
    are dropped; everything that remains is cleaned the same way. Values that
    are neither dicts nor lists are returned unchanged.

    Args:
        obj: The object to clean

    Returns:
        A new structure without the empty entries (the input is not modified)

    Raises:
        ValueError: If cleaning fails for any reason
    """
    try:
        if isinstance(obj, dict):
            pruned = {}
            for key, value in obj.items():
                if is_empty_container(value):
                    continue
                pruned[key] = clean_object(value)
            return pruned

        if isinstance(obj, list):
            kept = []
            for item in obj:
                if is_empty_container(item):
                    continue
                kept.append(clean_object(item))
            return kept

        # Non-container values pass through untouched
        return obj
    except Exception as e:
        logger.error(f"Error cleaning object: {e}")
        raise ValueError("An error occurred while cleaning the object")

In [25]:
# This might fail. If it does, then I will probably need to do something similar to fix_container_types
def extract_ism(acm: Optional[dict]) -> dict:
    """
    Extract the reduced 'ism' structure from any ACM dict.

    Args:
        acm: The ACM dictionary. ``None`` is treated like an empty ACM, because
            callers fetch it with ``.get("acm", {})``, which still yields None
            when the key is present with a null value.

    Returns:
        dict: The ism structure with the keys banner, classification,
            ownerProducer, releaseableTo, disseminationControls and sciControls.
            Scalar fields are None and list fields are [] when the ACM lacks them.
    """
    if acm is None:
        acm = {}

    return {
        "banner": acm.get("banner"),
        "classification": acm.get("classif"),
        "ownerProducer": acm.get("owner_prod", []),
        "releaseableTo": acm.get("rel_to", []),
        "disseminationControls": acm.get("dissem_ctrls", []),
        "sciControls": acm.get("f_clearance", []),
    }

In [26]:
def extract_created_date(source_object: Dict[str, Any]) -> Optional[int]:
    """
    Extracts the 'createdDate' (Unix timestamp) from the source object's attributes.

    The attribute list under ``attributes.data`` is scanned for an entry whose
    name equals "date of introduction" (ignoring case and surrounding
    whitespace) and whose value is an integer. Entries that are not dicts or
    whose name is not a string are skipped, so a single malformed attribute no
    longer hides a valid date later in the list.

    Args:
        source_object (Dict[str, Any]): The source dictionary containing object data.

    Returns:
        Optional[int]: The integer value of 'Date Of Introduction' if found, otherwise None.
    """
    try:
        # `or` guards against keys that are present but null
        attributes = source_object.get("attributes") or {}
        for attr in attributes.get("data") or []:
            if not isinstance(attr, dict):
                continue

            name = attr.get("attributeName")
            if not isinstance(name, str):
                continue
            if name.strip().lower() != "date of introduction":
                continue

            value = attr.get("attributeValue")
            # bool is a subclass of int; True/False is not a timestamp
            if isinstance(value, int) and not isinstance(value, bool):
                return value
    except Exception as e:
        logger.error(f"There was an error extracting createdDate: {e}")
    return None

In [None]:
# TODO: This will have to be updated once I implement the fuzzy matching logic
def extract_elevation(source_object: Dict[str, Any]) -> Optional[Any]:
    """
    Retrieves the elevation value from the source object, handling variations
    in the attribute name (e.g., "Elevation", "Elevation(m)", "Elevation (m)").

    Names are compared case-insensitively and with surrounding whitespace
    removed. Entries that are not dicts or whose name is not a string are
    skipped instead of aborting the whole search. The first matching entry
    with a non-None value wins; a string value is converted to float when
    possible and returned unchanged (after logging) when it is not numeric.

    Args:
        source_object (Dict[str, Any]): The source JSON-like object.

    Returns:
        Optional[Any]: The elevation value if found, otherwise None. None is
            also returned (after logging) when the object is not a dict or
            lacks the 'attributes.data' structure.
    """
    elevation_value = None

    # Accepted spellings of the "Elevation" attribute name (already lower-cased)
    elevation_variations = ("elevation", "elevation(m)", "elevation (m)")

    try:
        # Ensure the source object is a dictionary and contains the expected structure
        if not isinstance(source_object, dict):
            raise ValueError("source object must be a dictionary.")

        if (
            "attributes" not in source_object
            or "data" not in source_object["attributes"]
        ):
            raise KeyError(
                "source object does not contain the expected 'attributes.data' structure."
            )

        for attr in source_object["attributes"]["data"]:
            if not isinstance(attr, dict):
                continue

            raw_name = attr.get("attributeName")
            if not isinstance(raw_name, str):
                continue

            candidate = attr.get("attributeValue")
            if candidate is None:
                continue

            if raw_name.strip().lower() in elevation_variations:
                elevation_value = candidate
                break  # First usable match wins

        if isinstance(elevation_value, str):
            try:
                elevation_value = float(elevation_value)
            except Exception as e:
                # The raw string is kept so the caller can still see what was supplied
                logger.error(f"Error transforming elevation into float: {e}")
    except Exception as e:
        # Log the exception for debugging purposes
        logger.error(f"Error occurred while retrieving elevation: {e}")

    return elevation_value


def extract_desired_attribute(
    source_object: Dict[str, Any], attribute_name: str
) -> Optional[Any]:
    """
    Look up the value of a named attribute in the source object.

    The entries under ``attributes.data`` are scanned in order and the value of
    the first entry whose ``attributeName`` equals ``attribute_name`` exactly
    and whose ``attributeValue`` is not None is returned.

    Args:
        source_object (Dict[str, Any]): The source JSON-like object.
        attribute_name (str): Exact attribute name to search for.

    Returns:
        Optional[Any]: The attribute value if found, otherwise None. Problems
            (non-dict input, missing structure, malformed entries) are logged
            and also result in None.
    """
    try:
        # Validate the overall shape before touching the attribute list
        if not isinstance(source_object, dict):
            raise ValueError("source object must be a dictionary.")

        has_attribute_data = (
            "attributes" in source_object and "data" in source_object["attributes"]
        )
        if not has_attribute_data:
            raise KeyError(
                "source object does not contain the expected 'attributes.data' structure."
            )

        for entry in source_object["attributes"]["data"]:
            if entry.get("attributeName") != attribute_name:
                continue
            candidate = entry.get("attributeValue")
            if candidate is not None:
                # First populated match wins
                return candidate

    except Exception as e:
        logger.error(f"Error occurred while retrieving {attribute_name}: {e}")

    return None

In [28]:
def prepare_attribute_index(source: Dict[str, Any]) -> Dict[str, Dict]:
    """
    Build a lookup of attribute items keyed by attribute name.

    The index starts with every item under ``attributes.data`` (keyed by its
    ``attributeName``). Selected top-level fields of the source (domain,
    allegiance, allegianceAor, eoid) are then added under their attribute-style
    names, each wrapped as ``{"attributeValue": ..., "acm": ...}`` using the
    object's own ACM. A top-level field replaces a data item of the same name.

    Args:
        source: Source dictionary containing object data

    Returns:
        Dict[str, Dict]: Attribute index with both regular attributes and transformed top-level fields
    """
    attr_index: Dict[str, Dict] = {}

    # Regular attributes; later duplicates overwrite earlier ones
    for entry in source.get("attributes", {}).get("data", []):
        attr_index[entry.get("attributeName")] = entry

    # (source key, attribute name) pairs for the promoted top-level fields
    promoted_fields = (
        ("domain", "Domain"),
        ("allegiance", "Allegiance"),
        ("allegianceAor", "Allegiance Aor"),
        ("eoid", "Enterprise Object ID"),
    )

    for source_field, attr_name in promoted_fields:
        if source_field not in source:
            continue
        attr_index[attr_name] = {
            "attributeValue": source[source_field],
            "acm": source.get("acm", {}),
        }

    return attr_index

In [None]:
def parse_location(source_object: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Build the standard location record from the object's latest known location.

    The record is taken from ``latestKnownLocation``. Its geometry coordinates
    are read as ``[longitude, latitude, ...]``; any values beyond the first two
    are ignored with a warning. Elevation comes from ``extract_elevation``;
    altitude and the remaining descriptive fields are emitted as None.

    Args:
        source_object (Dict[str, Any]): The input object containing location data.

    Returns:
        Optional[Dict[str, Any]]: The location record, or None when the location,
            geometry or coordinates are missing/too short, or when an
            unexpected error occurs (all cases are logged).
    """
    try:
        location_data = source_object.get("latestKnownLocation")
        if not location_data:
            logger.warning(
                f"No location data found for object {source_object.get('id')}"
            )
            return None

        geometry_data = location_data.get("geometry")
        if not geometry_data:
            logger.warning(
                f"No geometry data found for object {source_object.get('id')}"
            )
            return None

        # At least a longitude/latitude pair is required
        coords = geometry_data.get("coordinates", [])
        if not (coords and len(coords) >= 2):
            logger.warning(f"Invalid coordinates for object {source_object.get('id')}")
            return None

        if len(coords) > 2:
            logger.warning(
                f"Using first two values from {len(coords)}-element coordinate array for object {source_object.get('id')}."
            )

        elevation_value = extract_elevation(source_object)
        # NOTE(review): extract_elevation already attempts this conversion, so
        # this branch only runs when that attempt failed (and will log again).
        if isinstance(elevation_value, str):
            try:
                elevation_value = float(elevation_value)
            except Exception as e:
                logger.error(f"Error transforming elevation into float: {e}")

        ism = extract_ism(location_data.get("acm", {}))
        location_id = location_data.get("id")
        timestamp = location_data.get("lastVerified", {}).get("timestamp")
        latitude = coords[1]
        longitude = coords[0]

        # Altitude is never populated here; only the structure is emitted
        altitude = {
            "value": None,
            "quality": None,
            "error": None,
            "units": {"value": None},
        }
        elevation = {
            "value": elevation_value,
            "quality": None,
            "error": None,
            "units": {"value": None},
        }

        location = {
            "ism": ism,
            "id": location_id,
            "timestamp": timestamp,
            "latitude": latitude,
            "longitude": longitude,
            "altitude": altitude,
            "elevation": elevation,
            "derivation": geometry_data.get("type"),
            "quality": None,
            "locationName": None,
            "countryCode": None,
        }
        return location
    except Exception as e:
        logger.error(
            f"Unexpected error writing location values for object {source_object.get('id')}: {str(e)}"
        )
        return None

In [None]:
def parse_ship_class_name(
    source_object: Dict[str, Any], standard_object: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Add shipClass (and shipName, when available) to maritimeMetadata for ships.

    An object counts as a ship when it has an attribute named "Echelon" with
    the value "SHIP". For ships, ``shipClass`` is set from the source's
    ``className`` with the object-level ACM, and ``shipName`` is set from the
    first attribute named "Name" (with that attribute's ACM) when its value is
    truthy. ``standard_object`` is modified in place; non-ship objects are
    returned untouched.

    Args:
        source_object (Dict[str, Any]): Input dictionary containing vessel information.
        standard_object (Dict[str, Any]): Dictionary to be updated with vessel metadata.

    Returns:
        Dict[str, Any]: The same standard_object (also returned when an error is logged).
    """
    try:
        attributes = source_object.get("attributes", {}).get("data", [])

        # Ship detection: an Echelon attribute whose value is exactly "SHIP"
        is_ship = False
        for attr in attributes:
            if (
                attr.get("attributeName") == "Echelon"
                and attr.get("attributeValue") == "SHIP"
            ):
                is_ship = True
                break

        if not is_ship:
            return standard_object

        class_name = source_object.get("className")
        acm = source_object.get("acm", {})

        # The first "Name" attribute supplies both the ship name and its ACM
        ship_name_attr = None
        for attr in attributes:
            if attr.get("attributeName") == "Name":
                ship_name_attr = attr
                break

        if ship_name_attr:
            ship_name = ship_name_attr.get("attributeValue")
            ship_name_acm = ship_name_attr.get("acm", {})
        else:
            ship_name, ship_name_acm = None, {}

        # Replace a missing or non-dict container before writing into it
        maritime = standard_object.get("maritimeMetadata")
        if not isinstance(maritime, dict):
            maritime = {}
            standard_object["maritimeMetadata"] = maritime

        maritime["shipClass"] = {
            "value": class_name,
            "ism": extract_ism(acm),
        }
        if ship_name:
            maritime["shipName"] = {
                "value": ship_name,
                "ism": extract_ism(ship_name_acm),
            }

        logger.info(
            f"Set shipClass and shipName for object {standard_object.get('id')}"
        )
        return standard_object
    except Exception as e:
        logger.error(
            f"Error parsing ship class/name for object {standard_object.get('id', 'unknown')}: {e}"
        )
        return standard_object

In [None]:
def parse_facility_name_id(
    source_object: Dict[str, Any], standard_object: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Add facilityName (and facilityId, when available) for facility objects.

    An object counts as a facility when its ``className`` equals "Facility".
    For facilities, ``facilityName`` is set from the source's ``name`` with the
    object-level ACM, and ``facilityId`` is set from the first "OSuffix"
    attribute with a non-None value (with that attribute's ACM) when the value
    is truthy. ``standard_object`` is modified in place; other objects are
    returned untouched.

    Args:
        source_object (Dict[str, Any]): Dictionary containing facility information.
        standard_object (Dict[str, Any]): Dictionary to be updated with facility metadata.

    Returns:
        Dict[str, Any]: The same standard_object (also returned when an error is logged).
    """
    try:
        attributes = source_object.get("attributes", {}).get("data", [])
        class_name = source_object.get("className")
        acm = source_object.get("acm", {})
        facility_name = source_object.get("name")

        # First OSuffix attribute that actually carries a value
        facility_id_attr = None
        for attr in attributes:
            if attr.get("attributeName") != "OSuffix":
                continue
            if attr.get("attributeValue") is None:
                continue
            facility_id_attr = attr
            break

        if facility_id_attr:
            facility_id = facility_id_attr.get("attributeValue")
            facility_id_acm = facility_id_attr.get("acm", {})
        else:
            facility_id, facility_id_acm = None, {}

        if class_name != "Facility":
            return standard_object

        # Replace a missing or non-dict container before writing into it
        facility = standard_object.get("facility")
        if not isinstance(facility, dict):
            facility = {}
            standard_object["facility"] = facility

        facility["facilityName"] = {
            "value": facility_name,
            "ism": extract_ism(acm),
        }
        if facility_id:
            facility["facilityId"] = {
                "value": facility_id,
                "ism": extract_ism(facility_id_acm),
            }

        logger.info(
            f"Set facilityName and facilityId for object {standard_object.get('id')}"
        )
        return standard_object
    except Exception as e:
        logger.error(
            f"Error parsing facility name/id for object {standard_object.get('id', 'unknown')}: {e}"
        )
        return standard_object

In [None]:
def build_standard_object(
    target_structure: Dict[str, Any],
    attr_index: Dict[str, Dict],
    attribute_map: Dict[str, Dict[str, str]],
) -> Dict[str, Any]:
    """
    Build a standard object by mapping attributes from the source data to target fields.

    This function takes a pre-initialized target_structure dictionary and populates it with
    transformed attribute values based on the provided attribute mapping. Each
    attribute value is wrapped with ISM classification metadata.

    Args:
        target_structure (Dict[str, Any]): Pre-initialized dictionary containing basic object metadata
            and empty containers (ontology, maritimeMetadata, equipment, facility).
            It is modified in place.

        attr_index (Dict[str, Dict]): Index of attribute data items keyed by attribute name,
            where each item contains 'attributeValue' and 'acm' fields

        attribute_map (Dict[str, Dict[str, str]]): Mapping configuration where keys are attribute names and values are dicts with
            'field' and 'container' specifications

    Returns:
        Dict[str, Any]: The populated target_structure dictionary with mapped attributes organized
            into their designated containers, or empty dict if an error occurs

    Note:
        - Attributes mapped to "root" container are placed directly in the target_structure dict
        - Other containers are nested under their respective keys
        - A container that is missing or None is created as an empty dict first
          (e.g. "location" is None when no location could be parsed)
        - A container holding any other non-dict value is left alone and the
          attribute is skipped with a warning
        - Each mapped value includes the original value and ISM classification metadata
        - Missing attributes in attr_index are silently skipped
    """
    try:
        for attr_name, mapping in attribute_map.items():
            item = attr_index.get(attr_name)

            if not item:
                continue

            target_field = mapping["field"]
            container = mapping["container"]

            transformed_value = {
                "value": item.get("attributeValue"),
                "ism": extract_ism(item.get("acm", {})),
            }

            if container == "root":
                target_structure[target_field] = transformed_value
                continue

            # Ensure the nested container exists and can take item assignment.
            # Checking only for key presence would crash on a None placeholder
            # and discard the whole object.
            nested = target_structure.get(container)
            if nested is None:
                nested = {}
                target_structure[container] = nested
            elif not isinstance(nested, dict):
                logger.warning(
                    f"Skipping attribute '{attr_name}': container '{container}' is not a dict"
                )
                continue

            nested[target_field] = transformed_value

        return target_structure
    except Exception as e:
        logger.error(f"Error building standard object: {e}")
        return {}

In [None]:
def transform_source_object(
    source: Dict[str, Any], attribute_map: Dict[str, Dict[str, str]]
) -> Dict[str, Any]:
    """
    Transform a source object into a structured format based on the provided attribute mapping.

    Steps: initialise the target structure from top-level source fields, map
    the configured attributes into it, then apply the ship and facility
    parsers. If the mapping step fails (it returns an empty dict), the
    transformation stops there, so the bespoke parsers never decorate an empty
    object and no success message is logged for it.

    Args:
        source: The source dictionary containing object data with attributes, ACM, and metadata
        attribute_map: Dictionary mapping attribute names to their target field and container locations

    Returns:
        Dict containing the transformed object with structured fields, or an
        empty dict when the inputs are invalid or the transformation fails
    """
    try:
        if not isinstance(source, dict) or not isinstance(attribute_map, dict):
            logger.error("Invalid source object or attribute map.")
            return {}

        # Special handling for createdDate.
        created_date = extract_created_date(source)

        # `or {}` keeps a present-but-null lastVerified from dropping the object
        last_verified = source.get("lastVerified") or {}

        # Initialize target structure with basic metadata
        target_structure = {
            "version": source.get("version"),
            "overallClassification": extract_ism(source.get("acm", {})),
            "id": source.get("id"),
            "name": source.get("name"),
            "sourceId": source.get("gideId"),
            "createdDate": created_date,
            "lastUpdatedDate": last_verified.get("timestamp"),
            "maritimeMetadata": {},
            "landMetadata": {},
            "location": parse_location(source),
            "equipment": {},
            "unit": {},
            "ontology": {},
            "facility": {},
        }

        # Get complete attribute index including top-level fields
        attr_index = prepare_attribute_index(source)

        standard_object = build_standard_object(
            target_structure, attr_index, attribute_map
        )

        # build_standard_object signals failure with an empty dict (never None)
        if not standard_object:
            logger.warning(
                f"Transformation produced an empty object for ID: {source.get('id', 'unknown')}"
            )
            return {}

        # Apply the bespoke functions that parse maritime and facility attributes
        parse_ship_class_name(source, standard_object)
        parse_facility_name_id(source, standard_object)

        logger.info(
            f"Finished transforming object with ID: {standard_object.get('id', 'unknown')}"
        )
        return standard_object
    except Exception as e:
        logger.error(
            f"Error transforming object with ID {source.get('id', 'unknown')}: {e}"
        )
        return {}

In [None]:
def process_objects(
    source_objects: List[Dict[str, Any]],
    attribute_mapping: Dict[str, Dict[str, str]],
    restrictions_config: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """
    Run the per-object pipeline over a list of source objects.

    The input is first passed through ``preprocess_raw_data``. Each resulting
    object is transformed with ``transform_source_object`` and then handed to
    ``apply_restrictions``; results that are not None are collected. Errors
    for a single object are logged and do not stop the loop. Finally the
    collected objects go through ``fix_container_types``.

    Args:
        source_objects (List[Dict[str, Any]]): List of objects to process.
        attribute_mapping (Dict[str, Dict[str, str]]): Mapping configuration for attributes.
        restrictions_config (Dict[str, Any]): Configuration for classification restrictions.

    Returns:
        List[Dict[str, Any]]: The processed objects.
    """
    logger.info(f"Processing total objects: {len(source_objects)}")

    # Work from the preprocessed set, not the raw input
    preprocessed_objects = preprocess_raw_data(source_objects)
    logger.info(f"Successfully pre-processed {len(preprocessed_objects)} object(s)")

    collected: List[Dict[str, Any]] = []
    for obj in preprocessed_objects:
        try:
            obj_id = obj.get("id")
            logger.info(f"Processing object with ID: {obj_id}")

            # NOTE(review): transform_source_object returns {} on failure and that
            # empty dict is still passed on to apply_restrictions — confirm intended.
            standard_obj = transform_source_object(obj, attribute_mapping)
            processed_obj = apply_restrictions(standard_obj, restrictions_config)

            if processed_obj is None:
                continue
            collected.append(processed_obj)
        except Exception as e:
            logger.error(
                f"Unexpected error processing object {obj.get('id')}: {str(e)}"
            )

    # Convert dict containers to arrays where the schema expects arrays
    normalised = fix_container_types(collected)
    logger.info("Fixed container types to match schema requirements")

    return normalised

### Load into Databricks Functions

In [None]:
# TODO: Add the functions for loading the standard objects into a Delta table
def get_databricks_connection():
    """
    Create a connection to Databricks using credentials from environment variables.

    Reads SERVER_HOSTNAME, HTTP_PATH and ACCESS_TOKEN from the environment and
    passes them to ``sql.connect``.

    Returns:
        The object returned by ``sql.connect`` on success, or None when the
        connection attempt raises (the error is logged).
    """
    try:
        connection = sql.connect(
            server_hostname=os.getenv("SERVER_HOSTNAME"),
            http_path=os.getenv("HTTP_PATH"),
            access_token=os.getenv("ACCESS_TOKEN"),
        )
    except Exception as e:
        logger.error(f"Failed to connect to Databricks: {e}")
        return None

    # Only report success once connect() has actually returned
    logger.info("Connection Established...")
    return connection

## Main Execution

#### Execute the pipeline logic one cell at a time
* The code below is taken from [`oms_data_pipeline.py`](../src/pipelines/oms_data_pipeline.py) and runs the same steps one cell at a time
* This can be used for debugging or simply to see the sequential behavior of the code

In [None]:
# Set the paths for data, config, and output (all relative paths)
data_path = "../data/1_raw/input"
output_path = "../data/2_processed/output"

# Configuration files read by the load_* helpers in the cells below;
# attribute_report_path is handed to detect_unexpected_attribute_names
attribute_mapping_path = "../configs/attribute_mapping.yaml"
valid_attributes_path = "../configs/valid_attributes_6_19_25.yaml"
restrictions_path = "../configs/classifications_config.yaml"
attribute_report_path = "../configs/unexpected_attributes_report.yaml"
# Schema passed to run_validations; the v1.4 line below is a disabled alternative
# schema_path = "../configs/schemas/standard_object_schema_v1.4.json"
schema_path = "../configs/schemas/standard_object_schema_v1.5.json"

In [None]:
# Get the source objects: every *.json file in data_path is loaded into a list of dicts
# (a local stand-in for querying an API, per fetch_all_objects)
source_objects = fetch_all_objects(data_path)

In [None]:
# Load the attribute mapping, the valid attribute names, and the classification
# restrictions from their configuration files
attribute_mapping = load_attribute_mapping(attribute_mapping_path)
valid_attribute_names = load_valid_attribute_names(valid_attributes_path)
restrictions_config = load_classification_config(restrictions_path)

In [None]:
# Detect unexpected attribute names across all input objects
# (attribute_report_path is presumably where the report is written — confirm)
detect_unexpected_attribute_names(
    source_objects, valid_attribute_names, attribute_report_path
)

In [None]:
# Prepare dates in source objects
# NOTE: this rebinds source_objects, so re-running the cell feeds already-prepared
# objects back into prepare_dates; re-run the fetch cell first for a clean start
source_objects = prepare_dates(source_objects)

In [None]:
# Transform each source object into the standard format: preprocess, map attributes,
# apply classification restrictions, then normalise container types
standard_objects = process_objects(
    source_objects, attribute_mapping, restrictions_config
)

In [None]:
# Apply the cleanup function to all standard objects: None values and containers
# holding only empty values are dropped; a new cleaned copy is built per object
cleaned_standard_objects = [clean_object(obj) for obj in standard_objects]

In [None]:
# Run the validations on the standard objects; the returned summary dict includes
# all_valid, total_objects, valid_count, failed_count, and failed_objects
validation_summary = run_validations(cleaned_standard_objects, schema_path)

In [None]:
# Save the cleaned standard objects to JSON files (output_path is passed as the target location)
save_standard_objects(output_path, cleaned_standard_objects)

### Load to Databricks
* The final step is to load the data into a Databricks Delta table