In [1]:
import os
import json
import yaml
from loguru import logger
from pathlib import Path
from difflib import get_close_matches
from typing import Dict, Any, List, Tuple

In [2]:
def fetch_all_objects(directory: str) -> List[Dict[str, Any]]:
    """
    Load every ``*.json`` file found directly inside a directory.

    Files are read in ascending file-name order so that the position of each
    object in the returned list is identical on every run and platform
    (``os.scandir`` yields entries in arbitrary order). Sub-directories are
    not descended into and entries that are not regular files are skipped.

    Args:
        directory (str): Path to the directory containing JSON files.

    Returns:
        List[Dict[str, Any]]: The parsed content of each JSON file, ordered by
        file name. Empty when the directory holds no ``.json`` files.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.

    Note:
        This function mimics querying data from an API by reading local files.
        In production, replace this logic with actual API calls as needed.
    """
    # The context manager releases the directory handle even if a later step raises.
    with os.scandir(directory) as entries:
        # All entries share the same parent, so sorting paths sorts by file name.
        json_paths = sorted(
            entry.path
            for entry in entries
            if entry.is_file() and entry.name.endswith(".json")
        )

    json_list = []
    for json_path in json_paths:
        # Binary mode lets the json module detect the UTF-8/16/32 encoding itself
        # instead of decoding with the platform's locale encoding.
        with open(json_path, "rb") as json_file:
            json_list.append(json.load(json_file))

    if json_list:
        logger.info("Source objects loaded successfully.")
    else:
        logger.warning(f"No JSON files found in '{directory}'.")
    return json_list

In [3]:
def load_valid_attribute_names(file_path: str) -> List[str]:
    """
    Loads the list of valid attribute names from a YAML file.

    The file is expected to hold a mapping with a ``valid_attribute_names``
    key whose value is a list. Loading is best-effort: every problem is logged
    and an empty list is returned instead of raising.

    Args:
    - file_path (str): Path to the YAML file containing valid attribute names.

    Returns:
    - List[str]: The valid attribute names, or an empty list when the file
      cannot be read or parsed, is empty, is not a mapping, or has a missing,
      null or non-list ``valid_attribute_names`` entry.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = yaml.safe_load(file)
    except Exception as e:
        logger.error(f"Error loading valid attribute names: {e}")
        return []

    # An empty document parses to None; a scalar or list document has no keys.
    if not isinstance(data, dict):
        logger.error(
            "Error loading valid attribute names: expected a mapping in "
            f"'{file_path}', found {type(data).__name__}"
        )
        return []

    names = data.get("valid_attribute_names")
    if names is None:
        # Covers both a missing key and an explicit `valid_attribute_names: null`.
        logger.warning(f"No valid attribute names defined in '{file_path}'.")
        return []
    if not isinstance(names, list):
        # A scalar (e.g. a bare string) is not a usable collection of names.
        logger.error(
            "Error loading valid attribute names: 'valid_attribute_names' must "
            f"be a list, found {type(names).__name__}"
        )
        return []

    # Report success only once the content has actually been validated.
    logger.info("Loaded valid attribute names successfully.")
    return names

In [4]:
def capture_unexpected_attributes(
    source_objects: List[Dict[str, Any]], valid_attribute_names: List[str]
) -> Dict[str, Any]:
    """
    Captures unexpected attribute names from source objects.

    Each object's attributes are read from ``object["attributes"]["data"]``;
    an attribute is unexpected when its ``"attributeName"`` is not one of the
    valid names. A missing or null ``"attributes"`` / ``"data"`` entry is
    treated as "no attributes". An attribute without ``"attributeName"`` is
    recorded under the name ``None``.

    Args:
    - source_objects (List[Dict[str, Any]]): A list of JSON objects to check.
    - valid_attribute_names (List[str]): A list of valid attribute names
      (must be hashable; ``None`` is treated as an empty list).

    Returns:
    - Dict[str, Any]: Dictionary with the keys
      ``unexpected_attributes`` (name -> ``{"count", "objects"}`` where
      ``objects`` lists every occurrence as object index, attribute index and
      object id), ``objects_with_issues`` (one entry per object that has at
      least one unexpected attribute), ``total_attributes_checked`` and
      ``total_objects_checked``.
    """
    # Membership is tested once per attribute of every object, so use a set.
    valid_names = set(valid_attribute_names or [])

    unexpected_attributes: Dict[Any, Dict[str, Any]] = {}
    objects_with_issues: List[Dict[str, Any]] = []
    total_attributes_checked = 0

    for obj_index, json_object in enumerate(source_objects):
        object_id = json_object.get("id", "unknown")
        # `or` also covers keys that are present but null in the payload,
        # which a `.get(key, default)` chain would not.
        attributes = (json_object.get("attributes") or {}).get("data") or []
        object_unexpected = []

        for attr_index, attribute in enumerate(attributes):
            total_attributes_checked += 1
            attribute_name = attribute.get("attributeName")
            if attribute_name in valid_names:
                continue

            entry = unexpected_attributes.setdefault(
                attribute_name, {"count": 0, "objects": []}
            )
            entry["count"] += 1
            entry["objects"].append(
                {
                    "object_index": obj_index,
                    "attribute_index": attr_index,
                    "object_id": object_id,
                }
            )
            object_unexpected.append(attribute_name)

        if object_unexpected:
            objects_with_issues.append(
                {
                    "object_index": obj_index,
                    "object_id": object_id,
                    "unexpected_attributes": object_unexpected,
                }
            )

    return {
        "unexpected_attributes": unexpected_attributes,
        "objects_with_issues": objects_with_issues,
        "total_attributes_checked": total_attributes_checked,
        "total_objects_checked": len(source_objects),
    }

In [5]:
def add_fuzzy_matching(
    unexpected_attributes: Dict[str, Any],
    valid_attribute_names: List[str],
    max_suggestions: int = 3,
    cutoff: float = 0.6,
) -> None:
    """
    Adds fuzzy matching suggestions to unexpected attributes.

    Every entry of ``unexpected_attributes`` is updated in place with a
    ``"similar_valid_names"`` list holding the closest valid names, best match
    first. Keys that are not strings (for example ``None``, recorded when an
    attribute has no name) receive an empty list, because
    ``difflib.get_close_matches`` can only compare sequences.

    Args:
    - unexpected_attributes (Dict[str, Any]): Dictionary of unexpected attributes to enhance.
    - valid_attribute_names (List[str]): List of valid attribute names for matching.
    - max_suggestions (int): Maximum number of suggestions per name (default 3).
    - cutoff (float): Minimum similarity in [0, 1] for a name to be suggested (default 0.6).
    """
    # Non-string candidates (e.g. numbers parsed from YAML) cannot be compared.
    candidates = [name for name in valid_attribute_names if isinstance(name, str)]

    for unexpected_name, details in unexpected_attributes.items():
        if isinstance(unexpected_name, str):
            similar_names = get_close_matches(
                unexpected_name, candidates, n=max_suggestions, cutoff=cutoff
            )
        else:
            similar_names = []
        details["similar_valid_names"] = similar_names

In [6]:
def save_analysis_report(
    analysis_results: Dict[str, Any],
    output_file: str,
) -> bool:
    """
    Saves the analysis results to a YAML file.

    The report holds a summary (object and attribute counts), the list of
    unexpected attribute names, the per-name findings and the affected
    objects. It is serialised in memory before the destination is opened, so
    a serialisation failure leaves an existing report untouched. Missing
    parent directories of ``output_file`` are created.

    Args:
    - analysis_results (Dict[str, Any]): Results from the attribute analysis;
      must provide ``total_objects_checked``, ``objects_with_issues`` and
      ``unexpected_attributes``.
    - output_file (str): Path to save the report.

    Returns:
    - bool: True if successful, False otherwise (the error is logged).
    """
    try:
        unexpected = analysis_results["unexpected_attributes"]
        affected_objects = analysis_results["objects_with_issues"]

        report_data = {
            "analysis_summary": {
                "total_objects_checked": analysis_results["total_objects_checked"],
                "objects_with_issues": len(affected_objects),
                "unique_unexpected_attributes": len(unexpected),
            },
            "unexpected_attribute_names": list(unexpected.keys()),
            "detailed_findings": unexpected,
            "affected_objects": affected_objects,
        }

        # With no stream argument yaml.dump returns the document as a string;
        # opening the file first would truncate it even if the dump then failed.
        report_text = yaml.dump(report_data, default_flow_style=False)

        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(report_text)
        logger.info(f"Analysis report saved to '{output_file}'")
        return True

    except Exception as e:
        logger.error(f"Error saving analysis report: {e}")
        return False

In [7]:
def detect_unexpected_attribute_names(
    source_objects: List[Dict[str, Any]],
    valid_attribute_names: List[str],
    output_file: str,
) -> Dict[str, Any]:
    """
    Run the full unexpected-attribute check and report what was found.

    The objects are scanned with ``capture_unexpected_attributes``. When at
    least one unexpected name turns up, the findings are enriched with
    look-alike valid names, logged, and written to ``output_file``; otherwise
    a single informational message is logged and no report is written.

    Args:
    - source_objects (List[Dict[str, Any]]): A list of JSON objects to check.
    - valid_attribute_names (List[str]): A list of valid attribute names.
    - output_file (str): Path to save the detailed report.

    Returns:
    - Dict[str, Any]: Summary counts: objects checked, attributes checked,
      objects with issues and distinct unexpected attribute names.
    """
    findings = capture_unexpected_attributes(source_objects, valid_attribute_names)
    unexpected = findings["unexpected_attributes"]
    flagged_objects = findings["objects_with_issues"]

    if not unexpected:
        logger.info("No unexpected attribute names detected.")
    else:
        # Attach "similar_valid_names" to every finding before logging them.
        add_fuzzy_matching(unexpected, valid_attribute_names)

        logger.warning(
            f"Found {len(unexpected)} unexpected attribute names "
            f"across {len(flagged_objects)} objects"
        )

        for name, info in unexpected.items():
            logger.warning(
                f"Unexpected attribute: '{name}' (found {info['count']} times)"
            )
            suggestions = info.get("similar_valid_names")
            if suggestions:
                logger.info(f"  Possible matches: {', '.join(suggestions)}")

        # The save helper logs its own outcome; its result is not used here.
        save_analysis_report(findings, output_file)

    summary = {
        "total_objects_checked": findings["total_objects_checked"],
        "total_attributes_checked": findings["total_attributes_checked"],
        "objects_with_issues": len(flagged_objects),
        "unique_unexpected_attributes": len(unexpected),
    }
    return summary

In [8]:
# Get the source objects: every *.json file in the raw input directory is parsed
# into memory. The path is relative, so it resolves against the kernel's current
# working directory.
data_path = "../data/1_raw/input"
source_objects = fetch_all_objects(data_path)

[32m2025-09-12 11:25:25.492[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_all_objects[0m:[36m23[0m - [1mSource objects loaded successfully.[0m


In [9]:
# Path handed to detect_unexpected_attribute_names() for the YAML report.
output_file = "../configs/unexpected_attributes_report.yaml"
# Allow-list of attribute names read from the "valid_attribute_names" key.
# NOTE: load_valid_attribute_names() returns an empty list when loading fails,
# in which case every attribute is reported as unexpected. Check the log output.
valid_attribute_names = load_valid_attribute_names(
    "../configs/valid_attributes_6_19_25.yaml"
)

[32m2025-09-12 11:25:33.859[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_valid_attribute_names[0m:[36m14[0m - [1mLoaded valid attribute names successfully.[0m


In [10]:
# Detect unexpected attribute names across all input objects.
# The call is the cell's last expression, so the returned summary dict is shown
# as the cell output.
detect_unexpected_attribute_names(source_objects, valid_attribute_names, output_file)

[32m2025-09-12 11:25:35.710[0m | [1mINFO    [0m | [36m__main__[0m:[36mdetect_unexpected_attribute_names[0m:[36m39[0m - [1m  Possible matches: Elevation[0m
[32m2025-09-12 11:25:35.711[0m | [1mINFO    [0m | [36m__main__[0m:[36mdetect_unexpected_attribute_names[0m:[36m39[0m - [1m  Possible matches: MIDB Facility Surrogate Key, MIDB Unit Surrogate Key, MIDB Equipment Surrogate Key[0m
[32m2025-09-12 11:25:35.712[0m | [1mINFO    [0m | [36m__main__[0m:[36mdetect_unexpected_attribute_names[0m:[36m39[0m - [1m  Possible matches: Date Of Introduction[0m
[32m2025-09-12 11:25:35.712[0m | [1mINFO    [0m | [36m__main__[0m:[36mdetect_unexpected_attribute_names[0m:[36m39[0m - [1m  Possible matches: Date Of Introduction[0m
[32m2025-09-12 11:25:35.712[0m | [1mINFO    [0m | [36m__main__[0m:[36mdetect_unexpected_attribute_names[0m:[36m39[0m - [1m  Possible matches: Date Of Introduction[0m
[32m2025-09-12 11:25:35.713[0m | [1mINFO    [0m | [36m

{'total_objects_checked': 14,
 'total_attributes_checked': 151,
 'objects_with_issues': 8,
 'unique_unexpected_attributes': 6}