In [1]:
import json

In [2]:
def load_json_file(filepath):
    """Load and parse a JSON document from disk.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8; decode explicitly rather than relying on the
    # platform's locale encoding, which differs between systems.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
_REQUIRED_FIELDS = ('areas', 'areas_ua', 'frontline', 'geos', 'unit_count', 'units')
_SIDES = ('ru', 'ua')


def _is_int(value):
    """Return True for a genuine integer; bool is rejected although it subclasses int."""
    return isinstance(value, int) and not isinstance(value, bool)


def _is_number(value):
    """Return True for an int or float, excluding bool."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _is_pair(value):
    """Return True when value is a list of exactly two items."""
    return isinstance(value, list) and len(value) == 2


def _validate_areas(areas):
    """Return the structure errors for the 'areas' field."""
    if not isinstance(areas, list):
        return ["'areas' must be an array"]
    errors = []
    for idx, area in enumerate(areas):
        if not isinstance(area, list):
            errors.append(f"areas[{idx}] must be an array of coordinates")
        elif len(area) < 3:
            errors.append(f"areas[{idx}] must have at least 3 coordinate pairs")
        else:
            for cidx, coord in enumerate(area):
                if not _is_pair(coord):
                    errors.append(f"areas[{idx}][{cidx}] must be a coordinate pair [lat, lon]")
                elif not all(_is_number(c) for c in coord):
                    errors.append(f"areas[{idx}][{cidx}] coordinates must be numbers")
    return errors


def _validate_frontline(frontline):
    """Return the structure errors for the 'frontline' field."""
    if not isinstance(frontline, list):
        return ["'frontline' must be an array"]
    errors = []
    for idx, line in enumerate(frontline):
        if not isinstance(line, list):
            errors.append(f"frontline[{idx}] must be an array of coordinates")
        elif len(line) < 2:
            errors.append(f"frontline[{idx}] must have at least 2 coordinate pairs")
    return errors


def _validate_geos(geos):
    """Return the structure errors for the 'geos' field."""
    if not isinstance(geos, dict):
        return ["'geos' must be an object"]

    # Missing-side errors are reported first, before any per-event errors.
    errors = [f"'geos' must have '{side}' field" for side in _SIDES if side not in geos]

    for side in _SIDES:
        if side not in geos:
            continue
        events = geos[side]
        if not isinstance(events, list):
            errors.append(f"geos.{side} must be an array")
            continue
        for idx, event in enumerate(events):
            if not isinstance(event, dict):
                errors.append(f"geos.{side}[{idx}] must be an object")
                continue

            if 'c' not in event:
                errors.append(f"geos.{side}[{idx}] missing 'c' (coordinates)")
            elif not _is_pair(event['c']):
                errors.append(f"geos.{side}[{idx}].c must be [lon, lat]")
            elif not all(_is_number(c) for c in event['c']):
                # The coordinates are later range-checked numerically, so
                # non-numeric members must be rejected here.
                errors.append(f"geos.{side}[{idx}].c coordinates must be numbers")

            if 'd' not in event:
                errors.append(f"geos.{side}[{idx}] missing 'd' (description)")
            elif not isinstance(event['d'], str) or len(event['d']) == 0:
                errors.append(f"geos.{side}[{idx}].d must be a non-empty string")
    return errors


def _validate_unit_count(unit_count):
    """Return the structure errors for the 'unit_count' field."""
    if not isinstance(unit_count, dict):
        return ["'unit_count' must be an object"]
    errors = []
    for side in _SIDES:
        if side not in unit_count:
            errors.append(f"unit_count.{side} is required")
        elif not _is_int(unit_count[side]):
            errors.append(f"unit_count.{side} must be an integer")
        elif unit_count[side] < 0:
            errors.append(f"unit_count.{side} must be non-negative")
    return errors


def _validate_units(units):
    """Return the structure errors for the 'units' field."""
    if not isinstance(units, dict):
        return ["'units' must be an object"]
    errors = []
    for side in _SIDES:
        if side not in units:
            errors.append(f"units.{side} is required")
        elif not isinstance(units[side], list):
            errors.append(f"units.{side} must be an array")
        else:
            for idx, unit in enumerate(units[side]):
                if not _is_pair(unit):
                    errors.append(f"units.{side}[{idx}] must be [unit_id, [lon, lat]]")
                    continue
                unit_id, position = unit
                if not _is_int(unit_id):
                    errors.append(f"units.{side}[{idx}][0] (unit_id) must be integer")
                if not _is_pair(position):
                    errors.append(f"units.{side}[{idx}][1] must be [lon, lat]")
                elif not all(_is_number(c) for c in position):
                    errors.append(f"units.{side}[{idx}][1] coordinates must be numbers")
    return errors


def validate_structure(data):
    """Validate the basic structure of the data.

    Checks that the required top-level fields exist and that each field
    present has the expected container type and element layout. Booleans
    are not accepted where an integer or a number is required.

    Args:
        data: The decoded JSON value to validate.

    Returns:
        A list of human-readable error strings; empty when the structure is
        valid. A non-dict ``data`` yields a single top-level error.
    """
    if not isinstance(data, dict):
        return ["Top-level JSON value must be an object"]

    errors = [
        f"Missing required field: {field}"
        for field in _REQUIRED_FIELDS
        if field not in data
    ]

    if 'areas' in data:
        errors.extend(_validate_areas(data['areas']))

    # 'areas_ua' is only checked for its container type.
    if 'areas_ua' in data and not isinstance(data['areas_ua'], list):
        errors.append("'areas_ua' must be an array")

    if 'frontline' in data:
        errors.extend(_validate_frontline(data['frontline']))

    if 'geos' in data:
        errors.extend(_validate_geos(data['geos']))

    if 'unit_count' in data:
        errors.extend(_validate_unit_count(data['unit_count']))

    if 'units' in data:
        errors.extend(_validate_units(data['units']))

    return errors

In [4]:
def validate_business_rules(data):
    """Validate additional business rules.

    Runs three checks, in this order: declared unit counts against the
    number of listed units, coordinate ranges of geos events and units, and
    duplicate unit IDs within each side. Malformed containers are skipped
    rather than raising, so the function can be called on data that failed
    structure validation.

    Args:
        data: The decoded JSON value to check.

    Returns:
        A list of warning dicts, each with a 'type' and a 'message' key.
        Empty when no rule is violated or when ``data`` is not a dict.
    """
    from collections import Counter

    warnings = []
    if not isinstance(data, dict):
        return warnings

    side_labels = {'ru': 'Russian', 'ua': 'Ukrainian'}

    def is_int(value):
        # bool subclasses int but is not a meaningful count.
        return isinstance(value, int) and not isinstance(value, bool)

    def is_number(value):
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def as_list(value):
        # Non-list values are treated as "nothing to iterate".
        return value if isinstance(value, list) else []

    units = data.get('units')
    unit_count = data.get('unit_count')
    geos = data.get('geos')

    # Check if unit counts match actual units
    if isinstance(units, dict) and isinstance(unit_count, dict):
        for side, label in side_labels.items():
            side_units = units.get(side, [])
            expected = unit_count.get(side, 0)
            if not isinstance(side_units, list) or not is_int(expected):
                continue  # not comparable; skip instead of raising
            actual = len(side_units)
            if actual != expected:
                warnings.append({
                    'type': 'COUNT_MISMATCH',
                    'message': f"{label} unit count mismatch: expected {expected}, found {actual} (off by {abs(expected - actual)})"
                })

    # Check for valid coordinate ranges
    def check_coordinates(coords, context):
        if len(coords) < 2:
            return
        lon, lat = coords[0], coords[1]
        if not (is_number(lon) and is_number(lat)):
            # Range comparison on non-numbers would raise TypeError.
            warnings.append({
                'type': 'INVALID_COORDINATE',
                'message': f"Non-numeric coordinates {coords[:2]!r} in {context}"
            })
            return
        if not (-180 <= lon <= 180):
            warnings.append({
                'type': 'INVALID_COORDINATE',
                'message': f"Invalid longitude {lon} in {context} (must be -180 to 180)"
            })
        if not (-90 <= lat <= 90):
            warnings.append({
                'type': 'INVALID_COORDINATE',
                'message': f"Invalid latitude {lat} in {context} (must be -90 to 90)"
            })

    # Validate geos coordinates
    if isinstance(geos, dict):
        for side in side_labels:
            for idx, event in enumerate(as_list(geos.get(side))):
                if isinstance(event, dict) and isinstance(event.get('c'), list):
                    check_coordinates(event['c'], f"geos.{side}[{idx}]")

    if isinstance(units, dict):
        # Validate unit coordinates
        for side in side_labels:
            for idx, unit in enumerate(as_list(units.get(side))):
                if isinstance(unit, list) and len(unit) >= 2 and isinstance(unit[1], list):
                    check_coordinates(unit[1], f"units.{side}[{idx}]")

        # Check for duplicate unit IDs
        for side in side_labels:
            id_counts = Counter()
            for unit in as_list(units.get(side)):
                if isinstance(unit, list) and len(unit) >= 1:
                    try:
                        id_counts[unit[0]] += 1
                    except TypeError:
                        continue  # unhashable id (e.g. a list) cannot be counted

            # Counter keeps insertion order, so the sample lists duplicates
            # in order of first appearance.
            duplicates = [uid for uid, seen in id_counts.items() if seen > 1]
            if duplicates:
                warnings.append({
                    'type': 'DUPLICATE_UNIT_ID',
                    'message': f"Duplicate unit IDs in {side}: {len(duplicates)} duplicates found (e.g., {duplicates[:5]})"
                })

    return warnings

In [5]:
# Report driver: load the data file, run both validators, and print a
# three-part text report followed by an overall verdict.
heavy_rule = "=" * 40
light_rule = "-" * 40

print(heavy_rule)
print("JSON DATA VALIDATION REPORT")
print(heavy_rule)

# --- Load ---------------------------------------------------------------
print("\nLoading data")
data = load_json_file('../data/20250901.json')

# --- Part 1: structure --------------------------------------------------
print("\n1. STRUCTURE VALIDATION")
print(light_rule)
structure_errors = validate_structure(data)

if structure_errors:
    print(f"Found {len(structure_errors)} structure errors:\n")
    for idx, error in enumerate(structure_errors, start=1):
        print(f"  {idx}. {error}")
else:
    print("Data structure is VALID!")

# --- Part 2: business rules ---------------------------------------------
print("\n\n2. BUSINESS RULES VALIDATION")
print(light_rule)
business_warnings = validate_business_rules(data)

if business_warnings:
    print(f"Found {len(business_warnings)} warnings:\n")
    for idx, warning in enumerate(business_warnings, start=1):
        print(f"  {idx}. [{warning['type']}] {warning['message']}")
else:
    print("All business rules passed!")

# --- Part 3: summary counts ---------------------------------------------
print("\n\n3. DATA SUMMARY")
print(light_rule)
print(f"Areas (polygons): {len(data.get('areas', []))}")
print(f"Ukrainian areas: {len(data.get('areas_ua', []))}")
print(f"Frontline segments: {len(data.get('frontline', []))}")

print("\nGeographic events:")
geo_events = data.get('geos', {})
for side_label, side_key in (("Russian", 'ru'), ("Ukrainian", 'ua')):
    print(f"  - {side_label} incidents: {len(geo_events.get(side_key, []))}")

print("\nMilitary units:")
listed_units = data.get('units', {})
declared_counts = data.get('unit_count', {})
for side_label, side_key in (("Russian", 'ru'), ("Ukrainian", 'ua')):
    print(f"  - {side_label}: {len(listed_units.get(side_key, []))} units (expected: {declared_counts.get(side_key, 0)})")

# --- Overall verdict ----------------------------------------------------
# Structure errors take precedence over business-rule warnings.
print("\n" + heavy_rule)
if structure_errors:
    print("VALIDATION FAILED - Structure errors found")
elif business_warnings:
    print("VALIDATION COMPLETE - Structure valid but has warnings")
else:
    print("VALIDATION COMPLETE - All checks passed!")
print(heavy_rule)

JSON DATA VALIDATION REPORT

Loading data

1. STRUCTURE VALIDATION
----------------------------------------
Data structure is VALID!


2. BUSINESS RULES VALIDATION
----------------------------------------

  1. [COUNT_MISMATCH] Russian unit count mismatch: expected 867, found 866 (off by 1)
  2. [COUNT_MISMATCH] Ukrainian unit count mismatch: expected 493, found 492 (off by 1)
  3. [DUPLICATE_UNIT_ID] Duplicate unit IDs in ru: 3 duplicates found (e.g., [1393, 1149, 1085])
  4. [DUPLICATE_UNIT_ID] Duplicate unit IDs in ua: 1 duplicates found (e.g., [480])


3. DATA SUMMARY
----------------------------------------
Areas (polygons): 11
Ukrainian areas: 0
Frontline segments: 2

Geographic events:
  - Russian incidents: 17
  - Ukrainian incidents: 20

Military units:
  - Russian: 866 units (expected: 867)
  - Ukrainian: 492 units (expected: 493)

