In [1]:
import json
import pandas as pd
from collections import Counter

# Path to the sample JSON export that the rest of the notebook inspects.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
json_file = r'C:\Users\mmorr\Desktop\Apps\SP_Streamlit\data\results-2025-10-08-2025-10-14.json'

# Parse the whole file into memory; later cells iterate over `data`.
with open(json_file, mode='r', encoding='utf-8') as source:
    data = json.load(source)

print(f"Total documents in file: {len(data)}")

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Analyze event-name field presence and values
#
# Builds one record per document (first 100 only) describing how far the
# expected structure doc['auto']['gai'][1]['value'] -> {'type': 'event-name'}
# could be followed, then loads the records into `df_analysis`.
results = []

for doc in data[:100]:  # Sample first 100 docs
    # Start from the "nothing found" record and fill in whatever is present,
    # so every row has the same keys in the same order.
    record = {
        'doc_id': doc.get('id', 'unknown'),
        'has_auto': 'auto' in doc,
        'has_gai': False,
        'gai_count': 0,
        'has_event_name_field': False,
        'event_name_value': None,
    }
    results.append(record)

    if not record['has_auto']:
        continue

    gai = doc['auto'].get('gai', [])
    # Report whether the key actually exists; previously this was always True
    # for any doc that had 'auto', even when 'gai' was missing.
    record['has_gai'] = 'gai' in doc['auto']
    record['gai_count'] = len(gai)
    if len(gai) < 2:
        continue

    # The event-name entry is looked up in the second gai entry (index 1).
    for item in gai[1].get('value', []):
        if item.get('type') == 'event-name':
            # Field presence is tracked separately from its value, so an
            # entry whose value is None still counts as "field present".
            record['has_event_name_field'] = True
            record['event_name_value'] = item.get('value')
            break  # first event-name entry wins

df_analysis = pd.DataFrame(results)
df_analysis

In [None]:
# Summary statistics
#
# A value counts as "empty" when it is missing (None/NaN) or the empty string;
# the same mask drives both the non-empty count and the empty-field query so
# the two numbers cannot disagree.
event_values = df_analysis['event_name_value']
is_empty_value = event_values.isna() | (event_values == '')

print("\n=== SUMMARY STATISTICS (first 100 docs) ===")
print(f"Docs with 'auto' field: {df_analysis['has_auto'].sum()}")
print(f"Docs with 'gai' field: {df_analysis['has_gai'].sum()}")
print(f"Docs with event-name field present: {df_analysis['has_event_name_field'].sum()}")
# Previously notna() alone was used here, which counted '' as non-empty.
print(f"Docs with non-empty event-name value: {(~is_empty_value).sum()}")

# Count empty/None event names
empty_event_names = df_analysis[df_analysis['has_event_name_field'] & is_empty_value]
print(f"\nDocs with event-name field but EMPTY/NONE value: {len(empty_event_names)}")

In [None]:
# Print up to 20 of the event-name values that are actually populated
print("\n=== SAMPLE EVENT-NAME VALUES ===")
sample_events = df_analysis['event_name_value'].dropna().head(20)
for position, event_value in enumerate(sample_events, start=1):
    print(f"{position}. {event_value}")

In [None]:
# Test the normalization logic from dsr.py
def normalize_value(response_value):
    """Replicate the normalization logic from dsr.py lines 138-142.

    Args:
        response_value: The raw value to normalize. Falsy values are treated
            as missing; truthy values must support ``.strip()``.

    Returns:
        ``None`` when the value is falsy or, after stripping whitespace and
        lower-casing, is one of the placeholder tokens ('n/a', 'na', 'none',
        'null', ''). Otherwise the original value, unmodified (not stripped).
    """
    placeholder_tokens = ['n/a', 'na', 'none', 'null', '']

    # Falsy input (None, '', ...) is missing by definition.
    if not response_value:
        return None

    # Placeholder text is compared case-insensitively, ignoring outer whitespace.
    if response_value.strip().lower() in placeholder_tokens:
        return None

    return response_value

# Run the normalizer over the first 10 populated event names and show before/after
print("\n=== TESTING NORMALIZATION LOGIC ===")
populated_events = df_analysis['event_name_value'].dropna()
test_values = populated_events.head(10).tolist()
for raw_value in test_values:
    print(f"Original: '{raw_value}' -> Normalized: '{normalize_value(raw_value)}'")

In [None]:
# Check for documents where event-name might be missing but projects exists
#
# Scans the first 100 docs and collects those whose event-name is absent or a
# placeholder while a truthy 'projects' value is available in the same entry.
print("\n=== CHECKING FALLBACK TO PROJECTS ===")

# Same placeholder tokens used by the normalization check elsewhere in this notebook.
placeholder_tokens = ['n/a', 'na', 'none', 'null', '']

fallback_cases = []
for doc in data[:100]:
    # Skip docs that do not have a second gai entry to inspect.
    if 'auto' not in doc or len(doc['auto'].get('gai', [])) < 2:
        continue

    gai_values = doc['auto']['gai'][1].get('value', [])

    event_name = None
    projects = None

    # No early break: both fields are needed, and a later duplicate entry
    # overwrites an earlier one.
    for item in gai_values:
        if item.get('type') == 'event-name':
            event_name = item.get('value')
        elif item.get('type') == 'projects':
            projects = item.get('value')

    # Case where event-name is empty but projects exists
    event_name_unusable = (
        not event_name or event_name.strip().lower() in placeholder_tokens
    )
    if event_name_unusable and projects:
        fallback_cases.append({
            # .get() keeps a doc without an 'id' from aborting the scan,
            # matching how doc ids are read in the per-document analysis.
            'doc_id': doc.get('id', 'unknown'),
            'event_name': event_name,
            'projects': projects
        })

print(f"Found {len(fallback_cases)} cases where projects could be used as fallback")
if fallback_cases:
    for case in fallback_cases[:10]:
        print(f"  Doc {case['doc_id']}: event-name='{case['event_name']}', projects='{case['projects']}'")

In [None]:
# Full analysis on entire file
#
# Walks every document and tallies whether the second gai entry carries an
# 'event-name' item and whether that item's value is usable or a placeholder.
print("\n=== ANALYZING ENTIRE FILE ===")

total_docs = len(data)
docs_with_event_field = 0
docs_with_empty_event = 0
docs_with_valid_event = 0
docs_missing_auto = 0

for doc in data:
    if 'auto' not in doc:
        docs_missing_auto += 1
        continue

    gai = doc['auto'].get('gai', [])
    if len(gai) < 2:
        # No second gai entry, so there is nowhere to look for event-name.
        continue

    gai_values = gai[1].get('value', [])

    for item in gai_values:
        if item.get('type') == 'event-name':
            docs_with_event_field += 1
            value = item.get('value', '')

            # Check if value is empty or invalid
            if not value or value.strip().lower() in ['n/a', 'na', 'none', 'null', '']:
                docs_with_empty_event += 1
            else:
                docs_with_valid_event += 1
            break  # count each document at most once

# Percentages are relative to ALL documents; guard against an empty file so
# the report prints 0.0% instead of raising ZeroDivisionError.
valid_pct = docs_with_valid_event / total_docs * 100 if total_docs else 0.0
empty_pct = docs_with_empty_event / total_docs * 100 if total_docs else 0.0

print(f"Total documents: {total_docs}")
print(f"Documents missing 'auto' field: {docs_missing_auto}")
print(f"Documents with 'event-name' field: {docs_with_event_field}")
print(f"Documents with VALID event-name: {docs_with_valid_event}")
print(f"Documents with EMPTY/INVALID event-name: {docs_with_empty_event}")
print(f"\nPercentage with valid event-name: {valid_pct:.1f}%")
print(f"Percentage with empty event-name: {empty_pct:.1f}%")