# Locality Lens - Comprehensive Test Suite

This notebook tests the complete Locality Lens workflow with:
- Basic functionality tests
- Parallel execution verification
- Different user profiles
- Edge cases
- Performance benchmarks
- State inspection

## ‚ö†Ô∏è Important Notes:

1. **Markdown cells** (like this one) are for documentation only - they cannot be executed. Only run Python code cells.
2. **Run cells in order** - Start with Cell 2 (Setup and Imports) first, then run other cells sequentially.
3. **Working directory** - The notebook automatically detects if you're running from `tests/` or project root.
4. **If you get import errors**, make sure you've installed dependencies: `pip install -r requirements.txt`


## Setup and Imports

In [1]:
import sys
import os
import time
from pathlib import Path

# Add project root to path
# Handle both cases: running from tests/ or from project root
current_dir = Path.cwd()
if current_dir.name == 'tests':
    project_root = current_dir.parent
else:
    project_root = current_dir

sys.path.insert(0, str(project_root))

# Import required modules
try:
    from src.graph.graph import compile_graph
    from src.graph.state import LocalityState
    import json
    from pprint import pprint
    print(f"‚úÖ Imports successful!")
    print(f"üìÅ Project root: {project_root}")
    print(f"üìÅ Current dir: {current_dir}")
except ImportError as e:
    print(f"‚ùå Import error: {e}")
    print(f"üìÅ Project root: {project_root}")
    print(f"üìÅ Current dir: {current_dir}")
    print(f"üìÅ Python path: {sys.path[:3]}")
    raise

‚úÖ Imports successful!
üìÅ Project root: /Users/nitish.ranjan/Documents/AiDash/Educational/research/locality-lens
üìÅ Current dir: /Users/nitish.ranjan/Documents/AiDash/Educational/research/locality-lens/tests


In [2]:
def create_initial_state(
    user_input: str,
    user_profile: str = None
):
    """Create initial state for testing."""
    return {
        "user_input": user_input,
        "user_profile": user_profile,
        "coordinates": None,
        "address": None,
        "osm_data": {},
        "aqi_data": None,
        "selected_metrics": [],
        "statistics": {},
        "user_intent": {},
        "summary": None,
        "recommendations": [],
        "visualization_data": None,
        "errors": [],
        "warnings": [],
        "next_action": "",
        "processing_steps": []
    }


def run_test(graph, state, test_name: str) -> dict:
    """Run a test and return results with timing."""
    print(f"\n{'='*60}")
    print(f"TEST: {test_name}")
    print(f"{'='*60}")
    
    start_time = time.time()
    
    try:
        # Run graph with streaming to track progress
        events = []
        for event in graph.stream(state, stream_mode="updates"):
            events.append(event)
            for node_name, node_state in event.items():
                if isinstance(node_state, dict):
                    steps = node_state.get("processing_steps", [])
                    if steps:
                        print(f"  ‚úì {node_name}: {steps[-1]}")
        
        # Get final state
        final_state = graph.invoke(state)
        
        elapsed = time.time() - start_time
        
        return {
            "success": True,
            "elapsed_time": elapsed,
            "final_state": final_state,
            "events": events,
            "errors": final_state.get("errors", []),
            "warnings": final_state.get("warnings", [])
        }
    except Exception as e:
        elapsed = time.time() - start_time
        return {
            "success": False,
            "elapsed_time": elapsed,
            "error": str(e),
            "final_state": None
        }


def print_test_results(result: dict, verbose: bool = False):
    """Print test results in a formatted way."""
    if result["success"]:
        print(f"\n‚úÖ TEST PASSED")
        print(f"‚è±Ô∏è  Time: {result['elapsed_time']:.2f}s")
        
        if result.get("errors"):
            print(f"\n‚ö†Ô∏è  Errors: {len(result['errors'])}")
            for error in result["errors"]:
                print(f"   - {error}")
        
        if result.get("warnings"):
            print(f"\n‚ö†Ô∏è  Warnings: {len(result['warnings'])}")
            for warning in result["warnings"]:
                print(f"   - {warning}")
        
        final_state = result["final_state"]
        
        if verbose:
            print(f"\nüìä Final State Summary:")
            print(f"   - Coordinates: {final_state.get('coordinates')}")
            print(f"   - Address: {final_state.get('address')}")
            print(f"   - Selected Metrics: {len(final_state.get('selected_metrics', []))}")
            print(f"   - Statistics Count: {len(final_state.get('statistics', {}))}")
            print(f"   - Summary Generated: {final_state.get('summary') is not None}")
            print(f"   - User Intent: {final_state.get('user_intent', {})}")
    else:
        print(f"\n‚ùå TEST FAILED")
        print(f"‚è±Ô∏è  Time: {result['elapsed_time']:.2f}s")
        print(f"\nüí• Error: {result.get('error')}")


In [3]:
# Compile the graph
print("Compiling graph...")
graph = compile_graph()
print("‚úÖ Graph compiled successfully!")


Compiling graph...
‚úÖ Graph compiled successfully!


## Test 1: Basic Flow - Address Input with Bachelor Profile


In [4]:
test1_state = create_initial_state(
    user_input="Indiranagar, Bangalore",
    user_profile="Bachelor/Young Professional"
)

result1 = run_test(graph, test1_state, "Test 1: Address + Bachelor Profile")
print_test_results(result1, verbose=True)

# Display summary if available
if result1["success"] and result1["final_state"].get("summary"):
    print(f"\nüìù Generated Summary:")
    print("-" * 60)
    summary = result1["final_state"]["summary"]
    print(summary[:500] + "..." if len(summary) > 500 else summary)



TEST: Test 1: Address + Bachelor Profile
  ‚úì validate_input: validate_input: SUCCESS - Address detected, needs geocoding
‚úÖ Loaded .env from: /Users/nitish.ranjan/Documents/AiDash/Educational/research/locality-lens/.env
‚úÖ GROQ_API_KEY loaded (length: 56)
  ‚úì extract_intent_and_select_metrics: extract_intent_and_select_metrics: SUCCESS - Extracted intent, selected 7 metrics
  ‚úì geocode_location: geocode_location: SUCCESS - Geocoded to (12.9732913, 77.6404672)
  ‚úì fetch_osm_data: fetch_osm_data: SUCCESS - Fetched 25 POI categories
  ‚úì calculate_statistics: calculate_statistics: SUCCESS - Calculated 7 metrics
  ‚úì generate_summary: generate_summary: SUCCESS - Summary generated

‚úÖ TEST PASSED
‚è±Ô∏è  Time: 14.69s

‚ö†Ô∏è  Errors: 1
   - Error calculating statistics: cannot access local variable 'area_km2' where it is not associated with a value

   - Road density calculation not yet implemented

üìä Final State Summary:
   - Coordinates: (12.9732913, 77.6404672)
   - Addr

## Test 2: Coordinates Input with Family Profile


In [12]:
test2_state = create_initial_state(
    user_input="Nirvana Country, Gurgaon",  # Bangalore coordinates
    user_profile="Family with Kids"
)

result2 = run_test(graph, test2_state, "Test 2: Coordinates + Family Profile")
print_test_results(result2, verbose=True)

# Check selected metrics
if result2["success"]:
    print(f"\nüìã Selected Metrics ({len(result2['final_state'].get('selected_metrics', []))}):")
    for metric in result2["final_state"].get("selected_metrics", [])[:10]:
        print(f"   - {metric}")



TEST: Test 2: Coordinates + Family Profile
  ‚úì validate_input: validate_input: SUCCESS - Address detected, needs geocoding
  ‚úì extract_intent_and_select_metrics: extract_intent_and_select_metrics: SUCCESS - Extracted intent, selected 7 metrics
  ‚úì geocode_location: geocode_location: SUCCESS - Geocoded to (28.4150509, 77.0642955)
  ‚úì fetch_osm_data: fetch_osm_data: ERROR - cannot access local variable 'metro' where it is not associated with a value
  ‚úì handle_error: handle_error: Error handling completed

‚úÖ TEST PASSED
‚è±Ô∏è  Time: 2.95s

‚ö†Ô∏è  Errors: 1
   - Error fetching OSM data: cannot access local variable 'metro' where it is not associated with a value

üìä Final State Summary:
   - Coordinates: None
   - Address: None
   - Selected Metrics: 0
   - Statistics Count: 0
   - Summary Generated: True
   - User Intent: {}

üìã Selected Metrics (0):


## Test 3: Performance Benchmark - Multiple Locations


In [6]:
test_locations = [
    ("Indiranagar, Bangalore", "Bachelor"),
    ("Nirvana Country, Gurgaon", "Family"),
    ("12.9352, 77.6245", "Student"),  # Another Bangalore location
]

performance_results = []

for location, profile in test_locations:
    test_state = create_initial_state(
        user_input=location,
        user_profile=profile
    )
    
    result = run_test(graph, test_state, f"Performance: {location}")
    
    if result["success"]:
        performance_results.append({
            "location": location,
            "profile": profile,
            "time": result["elapsed_time"],
            "metrics_count": len(result["final_state"].get("selected_metrics", [])),
            "statistics_count": len(result["final_state"].get("statistics", {})),
            "has_summary": result["final_state"].get("summary") is not None
        })
    
    print_test_results(result, verbose=False)

# Summary
print(f"\n{'='*60}")
print("üìä PERFORMANCE SUMMARY")
print(f"{'='*60}")

if performance_results:
    avg_time = sum(r["time"] for r in performance_results) / len(performance_results)
    min_time = min(r["time"] for r in performance_results)
    max_time = max(r["time"] for r in performance_results)
    
    print(f"\n‚è±Ô∏è  Timing:")
    print(f"   - Average: {avg_time:.2f}s")
    print(f"   - Min: {min_time:.2f}s")
    print(f"   - Max: {max_time:.2f}s")
    
    print(f"\nüìã Details:")
    for r in performance_results:
        print(f"   - {r['location']:30s} | {r['time']:5.2f}s | {r['metrics_count']:2d} metrics | {r['statistics_count']:2d} stats")



TEST: Performance: Indiranagar, Bangalore
  ‚úì validate_input: validate_input: SUCCESS - Address detected, needs geocoding
  ‚úì extract_intent_and_select_metrics: extract_intent_and_select_metrics: SUCCESS - Extracted intent, selected 8 metrics
  ‚úì geocode_location: geocode_location: SUCCESS - Geocoded to (12.9732913, 77.6404672)
  ‚úì fetch_osm_data: fetch_osm_data: SUCCESS - Fetched 25 POI categories
  ‚úì calculate_statistics: calculate_statistics: SUCCESS - Calculated 8 metrics
  ‚úì generate_summary: generate_summary: SUCCESS - Summary generated

‚úÖ TEST PASSED
‚è±Ô∏è  Time: 27.39s

   - Road density calculation not yet implemented
   - Road density calculation not yet implemented

TEST: Performance: Nirvana Country, Gurgaon
  ‚úì validate_input: validate_input: SUCCESS - Address detected, needs geocoding
  ‚úì extract_intent_and_select_metrics: extract_intent_and_select_metrics: SUCCESS - Extracted intent, selected 8 metrics
  ‚úì geocode_location: geocode_location: SUCCESS

## Test 6: Extract GeoDataFrames for All Metrics & Data Cleaning Analysis

This section extracts the actual GeoDataFrames (gdf) for all POI categories to:
- Inspect raw data structure
- Understand data quality issues
- Demonstrate cleaning operations
- Show before/after statistics


In [7]:
import osmnx as ox
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
import numpy as np

# Configure OSMnx
ox.settings.log_console = False
ox.settings.use_cache = True
ox.settings.timeout = 300

def fetch_all_pois_gdf(location_point, radius=2000):
    """
    Fetch all POI categories as GeoDataFrames.
    
    Returns a dictionary with category names as keys and GeoDataFrames as values.
    """
    print(f"üìç Fetching OSM data for location: {location_point}")
    print(f"üìè Search radius: {radius}m (2km)")
    print(f"{'='*60}\n")
    
    # Single optimized query for all POI categories
    all_pois = ox.features_from_point(
        location_point,
        tags={
            # Essential amenities
            'amenity': [
                'school', 'hospital', 'clinic', 'doctors', 'dentist',
                'restaurant', 'cafe', 'fast_food', 'food_court',
                'pharmacy', 'bank', 'atm', 'library', 'place_of_worship',
                'community_centre', 'kindergarten', 'childcare', 'tuition',
                'university', 'college', 'cinema', 'bar', 'pub', 'nightclub'
            ],
            # Leisure & recreation
            'leisure': [
                'park', 'garden', 'recreation_ground', 'playground',
                'fitness_centre', 'gym', 'sports_centre'
            ],
            # Transportation
            'railway': 'station',
            'highway': 'bus_stop',
            # Shopping
            'shop': True,  # All shop types
            # Tourism
            'tourism': ['hotel', 'attraction'],
            # Buildings (for residential density)
            'building': 'residential',
            # Roads (for road density)
            'highway': ['primary', 'secondary', 'tertiary', 'residential', 'cycleway']
        },
        dist=radius
    )
    
    print(f"‚úÖ Fetched {len(all_pois)} total features from OSM\n")
    
    # Classify into categories
    gdfs = {}
    
    if not all_pois.empty:
        # Schools
        if 'amenity' in all_pois.columns:
            gdfs['schools'] = all_pois[all_pois['amenity'] == 'school'].copy()
            
            # Hospitals & Clinics
            gdfs['hospitals'] = all_pois[all_pois['amenity'].isin(['hospital', 'clinic', 'doctors', 'dentist'])].copy()
            
            # Restaurants (combined)
            gdfs['restaurants'] = all_pois[all_pois['amenity'].isin(['restaurant', 'cafe', 'fast_food', 'food_court'])].copy()
            
            # Cafes (separate)
            gdfs['cafes'] = all_pois[all_pois['amenity'] == 'cafe'].copy()
            
            # Fast food (separate)
            gdfs['fast_food'] = all_pois[all_pois['amenity'] == 'fast_food'].copy()
            
            # Banks & ATMs
            gdfs['banks'] = all_pois[all_pois['amenity'].isin(['bank', 'atm'])].copy()
            
            # Pharmacies
            gdfs['pharmacies'] = all_pois[all_pois['amenity'] == 'pharmacy'].copy()
            
            # Gyms & Fitness
            gdfs['gyms'] = all_pois[all_pois['amenity'].isin(['gym', 'fitness_centre'])].copy()
            if 'leisure' in all_pois.columns:
                gyms_leisure = all_pois[all_pois['leisure'].isin(['fitness_centre', 'gym'])].copy()
                gdfs['gyms'] = pd.concat([gdfs['gyms'], gyms_leisure]).drop_duplicates()
            
            # Libraries
            gdfs['libraries'] = all_pois[all_pois['amenity'] == 'library'].copy()
            
            # Places of Worship
            gdfs['worship'] = all_pois[all_pois['amenity'] == 'place_of_worship'].copy()
            
            # Nightlife
            gdfs['nightlife'] = all_pois[all_pois['amenity'].isin(['bar', 'pub', 'nightclub'])].copy()
            
            # Cinemas
            gdfs['cinemas'] = all_pois[all_pois['amenity'] == 'cinema'].copy()
            
            # Universities
            gdfs['universities'] = all_pois[all_pois['amenity'].isin(['university', 'college'])].copy()
            
            # Kindergartens
            gdfs['kindergartens'] = all_pois[all_pois['amenity'] == 'kindergarten'].copy()
            
            # Childcare
            gdfs['childcare'] = all_pois[all_pois['amenity'] == 'childcare'].copy()
            
            # Tuition
            gdfs['tuition'] = all_pois[all_pois['amenity'] == 'tuition'].copy()
            
            # Community Centres
            gdfs['community'] = all_pois[all_pois['amenity'] == 'community_centre'].copy()
        
        # Parks & Gardens
        if 'leisure' in all_pois.columns:
            gdfs['parks'] = all_pois[all_pois['leisure'].isin(['park', 'garden', 'recreation_ground'])].copy()
            
            # Playgrounds
            gdfs['playgrounds'] = all_pois[all_pois['leisure'] == 'playground'].copy()
            
            # Sports facilities
            gdfs['sports'] = all_pois[all_pois['leisure'] == 'sports_centre'].copy()
        
        # Transportation
        if 'railway' in all_pois.columns:
            gdfs['metro_stations'] = all_pois[all_pois['railway'] == 'station'].copy()
        
        if 'highway' in all_pois.columns:
            gdfs['bus_stops'] = all_pois[all_pois['highway'] == 'bus_stop'].copy()
        
        # Shopping
        if 'shop' in all_pois.columns:
            gdfs['shops'] = all_pois[all_pois['shop'].notna()].copy()
        
        # Hotels
        if 'tourism' in all_pois.columns:
            gdfs['hotels'] = all_pois[all_pois['tourism'] == 'hotel'].copy()
        
        # Residential buildings
        if 'building' in all_pois.columns:
            gdfs['residential'] = all_pois[all_pois['building'] == 'residential'].copy()
    
    # Remove empty categories
    gdfs = {k: v for k, v in gdfs.items() if not v.empty}
    
    return gdfs


def analyze_gdf_quality(gdf, category_name):
    """
    Analyze data quality of a GeoDataFrame.
    
    Returns a dictionary with quality metrics.
    """
    if gdf.empty:
        return {
            'total_count': 0,
            'has_geometry': 0,
            'valid_geometry': 0,
            'has_name': 0,
            'duplicate_names': 0,
            'null_geometries': 0,
            'invalid_geometries': 0,
            'empty_geometries': 0
        }
    
    analysis = {
        'total_count': len(gdf),
        'has_geometry': gdf['geometry'].notna().sum() if 'geometry' in gdf.columns else 0,
        'valid_geometry': 0,
        'has_name': gdf['name'].notna().sum() if 'name' in gdf.columns else 0,
        'duplicate_names': 0,
        'null_geometries': gdf['geometry'].isna().sum() if 'geometry' in gdf.columns else 0,
        'invalid_geometries': 0,
        'empty_geometries': 0
    }
    
    if 'geometry' in gdf.columns:
        analysis['valid_geometry'] = gdf['geometry'].apply(lambda g: g.is_valid if g is not None else False).sum()
        analysis['empty_geometries'] = gdf['geometry'].apply(lambda g: g.is_empty if g is not None else True).sum()
        analysis['invalid_geometries'] = analysis['total_count'] - analysis['valid_geometry'] - analysis['null_geometries']
    
    if 'name' in gdf.columns:
        name_counts = gdf['name'].value_counts()
        analysis['duplicate_names'] = (name_counts > 1).sum()
    
    return analysis


def clean_gdf(gdf, category_name=""):
    """
    Clean a GeoDataFrame using the same logic as the production code.
    
    This is a copy of the clean_and_deduplicate_pois function for testing.
    """
    if gdf.empty:
        return gdf
    
    original_count = len(gdf)
    
    # Remove entries without valid geometry
    if 'geometry' in gdf.columns:
        gdf = gdf[gdf['geometry'].notna()].copy()
        gdf = gdf[~gdf['geometry'].is_empty].copy()
        gdf = gdf[gdf['geometry'].is_valid].copy()
    
    # Deduplicate by normalized name (case-insensitive)
    if 'name' in gdf.columns:
        gdf['_name_normalized'] = (
            gdf['name']
            .fillna('')
            .astype(str)
            .str.lower()
            .str.strip()
            .str.replace(r'\s+', ' ', regex=True)
        )
        gdf = gdf.drop_duplicates(subset=['_name_normalized'], keep='first')
        gdf = gdf.drop(columns=['_name_normalized'], errors='ignore')
    
    # Deduplicate by location (if same name at same location)
    if 'name' in gdf.columns and 'geometry' in gdf.columns and not gdf.empty:
        try:
            gdf['_name_norm'] = gdf['name'].fillna('').astype(str).str.lower().str.strip()
            gdf = gdf.drop_duplicates(subset=['_name_norm', 'geometry'], keep='first')
            gdf = gdf.drop(columns=['_name_norm'], errors='ignore')
        except Exception:
            pass
    
    cleaned_count = len(gdf)
    removed = original_count - cleaned_count
    
    return gdf


# Test location
test_location = (28.4150509, 77.0642955)  # Indiranagar, Bangalore

print("üîç Fetching all POI GeoDataFrames...\n")
all_gdfs = fetch_all_pois_gdf(test_location, radius=2000)

print(f"‚úÖ Extracted {len(all_gdfs)} POI categories\n")
print("üìä Categories found:")
for category in sorted(all_gdfs.keys()):
    count = len(all_gdfs[category])
    print(f"   - {category:20s}: {count:4d} POIs")


üîç Fetching all POI GeoDataFrames...

üìç Fetching OSM data for location: (28.4150509, 77.0642955)
üìè Search radius: 2000m (2km)

‚úÖ Fetched 2476 total features from OSM

‚úÖ Extracted 17 POI categories

üìä Categories found:
   - banks               :   22 POIs
   - cafes               :    3 POIs
   - community           :    1 POIs
   - fast_food           :    6 POIs
   - gyms                :    2 POIs
   - hospitals           :   29 POIs
   - hotels              :    7 POIs
   - kindergartens       :    2 POIs
   - parks               :   89 POIs
   - pharmacies          :    8 POIs
   - playgrounds         :    2 POIs
   - residential         :    1 POIs
   - restaurants         :   29 POIs
   - schools             :   26 POIs
   - shops               :   62 POIs
   - sports              :    2 POIs
   - worship             :    2 POIs


### Data Quality Analysis: Before Cleaning


In [8]:
# Analyze data quality for all categories BEFORE cleaning
print("="*80)
print("üìä DATA QUALITY ANALYSIS - BEFORE CLEANING")
print("="*80)

quality_before = {}

for category, gdf in all_gdfs.items():
    quality_before[category] = analyze_gdf_quality(gdf, category)

# Create summary DataFrame
quality_df = pd.DataFrame(quality_before).T
quality_df = quality_df.sort_values('total_count', ascending=False)

print("\nüìã Summary Statistics:")
print(quality_df.to_string())

print("\n\nüîç Key Quality Issues:")
print(f"   - Categories with null geometries: {(quality_df['null_geometries'] > 0).sum()}")
print(f"   - Categories with invalid geometries: {(quality_df['invalid_geometries'] > 0).sum()}")
print(f"   - Categories with duplicate names: {(quality_df['duplicate_names'] > 0).sum()}")
print(f"   - Total POIs across all categories: {quality_df['total_count'].sum()}")
print(f"   - POIs without names: {quality_df['total_count'].sum() - quality_df['has_name'].sum()}")

# Show examples of problematic data
print("\n\n‚ö†Ô∏è  Examples of Data Quality Issues:\n")

# Show categories with most duplicates
if quality_df['duplicate_names'].sum() > 0:
    print("üìå Top 5 categories with duplicate names:")
    top_duplicates = quality_df.nlargest(5, 'duplicate_names')[['total_count', 'duplicate_names']]
    for category, row in top_duplicates.iterrows():
        if row['duplicate_names'] > 0:
            print(f"   - {category:20s}: {int(row['duplicate_names'])} duplicates out of {int(row['total_count'])} total")

# Show sample of data with issues
print("\n\nüìù Sample Raw Data (First 3 rows from 'restaurants'):")
if 'restaurants' in all_gdfs and not all_gdfs['restaurants'].empty:
    sample = all_gdfs['restaurants'].head(3)
    print(sample[['name', 'amenity', 'geometry']].to_string() if 'name' in sample.columns else sample.head(3).to_string())


üìä DATA QUALITY ANALYSIS - BEFORE CLEANING

üìã Summary Statistics:
                total_count  has_geometry  valid_geometry  has_name  duplicate_names  null_geometries  invalid_geometries  empty_geometries
shops                  1043          1043            1043       985               37                0                   0                 0
restaurants             529           529             529       527               21                0                   0                 0
residential             397           397             397        38                0                0                   0                 0
hospitals               181           181             181       177                3                0                   0                 0
banks                   176           176             176       147               23                0                   0                 0
worship                 156           156             156       117                5     

### Apply Cleaning & Compare Results


In [9]:
# Clean all GeoDataFrames
print("="*80)
print("üßπ APPLYING DATA CLEANING")
print("="*80)

cleaned_gdfs = {}
cleaning_stats = {}

for category, gdf in all_gdfs.items():
    original_count = len(gdf)
    cleaned = clean_gdf(gdf.copy(), category)
    cleaned_count = len(cleaned)
    removed = original_count - cleaned_count
    removal_pct = (removed / original_count * 100) if original_count > 0 else 0
    
    cleaned_gdfs[category] = cleaned
    cleaning_stats[category] = {
        'before': original_count,
        'after': cleaned_count,
        'removed': removed,
        'removal_pct': removal_pct
    }
    
    if removed > 0:
        print(f"‚úÖ {category:20s}: {original_count:4d} ‚Üí {cleaned_count:4d} (removed {removed:3d}, {removal_pct:.1f}%)")
    else:
        print(f"‚úÖ {category:20s}: {original_count:4d} ‚Üí {cleaned_count:4d} (no changes)")

print(f"\nüìä Cleaning Summary:")
total_before = sum(s['before'] for s in cleaning_stats.values())
total_after = sum(s['after'] for s in cleaning_stats.values())
total_removed = total_before - total_after

print(f"   - Total POIs before: {total_before}")
print(f"   - Total POIs after:  {total_after}")
print(f"   - Total removed:     {total_removed} ({total_removed/total_before*100:.1f}%)")

# Analyze quality AFTER cleaning
print("\n" + "="*80)
print("üìä DATA QUALITY ANALYSIS - AFTER CLEANING")
print("="*80)

quality_after = {}

for category, gdf in cleaned_gdfs.items():
    quality_after[category] = analyze_gdf_quality(gdf, category)

quality_after_df = pd.DataFrame(quality_after).T
quality_after_df = quality_after_df.sort_values('total_count', ascending=False)

print("\nüìã Summary Statistics (After Cleaning):")
print(quality_after_df.to_string())

# Compare before and after
print("\n\n" + "="*80)
print("üìà BEFORE vs AFTER COMPARISON")
print("="*80)

comparison_data = []
for category in quality_before.keys():
    before = quality_before[category]
    after = quality_after.get(category, {})
    comparison_data.append({
        'category': category,
        'count_before': before['total_count'],
        'count_after': after.get('total_count', 0),
        'removed': before['total_count'] - after.get('total_count', 0),
        'duplicates_before': before['duplicate_names'],
        'duplicates_after': after.get('duplicate_names', 0),
        'invalid_geom_before': before['invalid_geometries'],
        'invalid_geom_after': after.get('invalid_geometries', 0)
    })

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('removed', ascending=False)

print("\nüìä Cleaning Impact by Category:")
print(comparison_df.to_string())


üßπ APPLYING DATA CLEANING
‚úÖ schools             :   76 ‚Üí   71 (removed   5, 6.6%)
‚úÖ hospitals           :  181 ‚Üí  173 (removed   8, 4.4%)
‚úÖ restaurants         :  529 ‚Üí  496 (removed  33, 6.2%)
‚úÖ cafes               :  101 ‚Üí   91 (removed  10, 9.9%)
‚úÖ fast_food           :  125 ‚Üí  114 (removed  11, 8.8%)
‚úÖ banks               :  176 ‚Üí   55 (removed 121, 68.8%)
‚úÖ pharmacies          :   85 ‚Üí   62 (removed  23, 27.1%)
‚úÖ gyms                :   18 ‚Üí   16 (removed   2, 11.1%)
‚úÖ libraries           :    7 ‚Üí    7 (no changes)
‚úÖ worship             :  156 ‚Üí  101 (removed  55, 35.3%)
‚úÖ nightlife           :   45 ‚Üí   45 (no changes)
‚úÖ universities        :   17 ‚Üí   16 (removed   1, 5.9%)
‚úÖ kindergartens       :   12 ‚Üí   12 (no changes)
‚úÖ childcare           :    2 ‚Üí    2 (no changes)
‚úÖ community           :   14 ‚Üí   14 (no changes)
‚úÖ parks               :   89 ‚Üí   35 (removed  54, 60.7%)
‚úÖ playgrounds         :   33 ‚Üí    6 (r

### Inspect Individual GeoDataFrames

Select a category to inspect its cleaned GeoDataFrame:


In [None]:
# Select a category to inspect (change this to explore different categories)
category_to_inspect = 'restaurants'  # Try: 'schools', 'hospitals', 'parks', 'shops', etc.

if category_to_inspect in cleaned_gdfs:
    gdf = cleaned_gdfs[category_to_inspect]
    
    print("="*80)
    print(f"üîç INSPECTING: {category_to_inspect.upper()}")
    print("="*80)
    
    print(f"\nüìä Basic Info:")
    print(f"   - Total POIs: {len(gdf)}")
    print(f"   - Columns: {len(gdf.columns)}")
    print(f"   - CRS: {gdf.crs if hasattr(gdf, 'crs') else 'Not set'}")
    
    print(f"\nüìã Column Names:")
    print(f"   {', '.join(gdf.columns[:10].tolist())}")
    if len(gdf.columns) > 10:
        print(f"   ... and {len(gdf.columns) - 10} more columns")
    
    print(f"\nüìù Sample Data (First 5 rows):")
    # Show key columns if available
    key_cols = ['name', 'amenity', 'leisure', 'geometry']
    available_cols = [col for col in key_cols if col in gdf.columns]
    
    if available_cols:
        display_df = gdf[available_cols].head(5).copy()
        # Format geometry for display
        if 'geometry' in display_df.columns:
            display_df['geometry'] = display_df['geometry'].apply(
                lambda g: f"Point({g.x:.6f}, {g.y:.6f})" if hasattr(g, 'x') else str(g)[:50]
            )
        print(display_df.to_string())
    else:
        print(gdf.head(5).to_string())
    
    print(f"\nüìà Data Quality:")
    quality = analyze_gdf_quality(gdf, category_to_inspect)
    for key, value in quality.items():
        print(f"   - {key:20s}: {value}")
    
    # Show names if available
    if 'name' in gdf.columns:
        print(f"\nüìõ POI Names (sample):")
        names = gdf['name'].dropna().head(10).tolist()
        for i, name in enumerate(names, 1):
            print(f"   {i:2d}. {name}")
        if len(gdf['name'].dropna()) > 10:
            print(f"   ... and {len(gdf['name'].dropna()) - 10} more")
    
    # Show statistics
    print(f"\nüìä Statistics:")
    if 'name' in gdf.columns:
        print(f"   - POIs with names: {gdf['name'].notna().sum()} ({gdf['name'].notna().sum()/len(gdf)*100:.1f}%)")
        print(f"   - Unique names: {gdf['name'].nunique()}")
    
    if 'geometry' in gdf.columns:
        print(f"   - Valid geometries: {gdf['geometry'].apply(lambda g: g.is_valid if g is not None else False).sum()}")
        
else:
    print(f"‚ùå Category '{category_to_inspect}' not found.")
    print(f"Available categories: {', '.join(sorted(cleaned_gdfs.keys()))}")


### Export All Cleaned GeoDataFrames

Export cleaned GeoDataFrames for further analysis:


In [None]:
# Export all cleaned GeoDataFrames
# You can access them via: cleaned_gdfs['category_name']

print("="*80)
print("üíæ EXPORTED CLEANED GEODATAFRAMES")
print("="*80)

print("\n‚úÖ All cleaned GeoDataFrames are stored in the 'cleaned_gdfs' dictionary")
print("\nüìã Available categories:")
for i, category in enumerate(sorted(cleaned_gdfs.keys()), 1):
    count = len(cleaned_gdfs[category])
    print(f"   {i:2d}. {category:20s}: {count:4d} POIs")

print("\nüí° Usage Examples:")
print("   # Access a specific category:")
print("   restaurants_gdf = cleaned_gdfs['restaurants']")
print("   schools_gdf = cleaned_gdfs['schools']")
print("")
print("   # Get all POI names:")
print("   restaurant_names = cleaned_gdfs['restaurants']['name'].dropna().tolist()")
print("")
print("   # Get coordinates:")
print("   restaurant_coords = cleaned_gdfs['restaurants']['geometry'].apply(lambda g: (g.x, g.y) if hasattr(g, 'x') else None)")
print("")
print("   # Filter by name:")
print("   specific_restaurant = cleaned_gdfs['restaurants'][cleaned_gdfs['restaurants']['name'] == 'Restaurant Name']")
print("")
print("   # Export to file:")
print("   cleaned_gdfs['restaurants'].to_file('restaurants.geojson', driver='GeoJSON')")

# Create a summary DataFrame
summary_data = []
for category, gdf in cleaned_gdfs.items():
    summary_data.append({
        'category': category,
        'count': len(gdf),
        'has_names': gdf['name'].notna().sum() if 'name' in gdf.columns else 0,
        'unique_names': gdf['name'].nunique() if 'name' in gdf.columns else 0,
        'valid_geometries': gdf['geometry'].apply(lambda g: g.is_valid if g is not None else False).sum() if 'geometry' in gdf.columns else 0
    })

summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('count', ascending=False)

print("\n\nüìä Final Summary:")
print(summary_df.to_string(index=False))


## Test 4: State Inspection - Full Workflow Trace


In [11]:
test4_state = create_initial_state(
    user_input="HSR Layout, Bangalore",
    user_profile="Senior Citizen"
)

result4 = run_test(graph, test4_state, "Test 4: Full State Inspection")

if result4["success"]:
    final_state = result4["final_state"]
    
    print(f"\n{'='*60}")
    print("üîç COMPLETE STATE INSPECTION")
    print(f"{'='*60}")
    
    print(f"\nüì• Input:")
    print(f"   - User Input: {final_state.get('user_input')}")
    print(f"   - User Profile: {final_state.get('user_profile')}")
    
    print(f"\nüåç Geocoding:")
    print(f"   - Coordinates: {final_state.get('coordinates')}")
    print(f"   - Address: {final_state.get('address')}")
    
    print(f"\nüéØ User Intent:")
    intent = final_state.get('user_intent', {})
    pprint(intent, width=80)
    
    print(f"\nüìä Selected Metrics ({len(final_state.get('selected_metrics', []))}):")
    for metric in final_state.get('selected_metrics', []):
        print(f"   - {metric}")
    
    print(f"\nüìà Statistics ({len(final_state.get('statistics', {}))}):")
    stats = final_state.get('statistics', {})
    for key, value in list(stats.items())[:10]:
        print(f"   - {key}: {value}")
    if len(stats) > 10:
        print(f"   ... and {len(stats) - 10} more")
    
    print(f"\nüìù Summary:")
    summary = final_state.get('summary', 'N/A')
    if summary and summary != 'N/A':
        print(summary[:300] + "..." if len(summary) > 300 else summary)
    else:
        print("   No summary generated")
    
    print(f"\nüìã Processing Steps ({len(final_state.get('processing_steps', []))}):")
    for step in final_state.get('processing_steps', []):
        print(f"   - {step}")
    
    if final_state.get('errors'):
        print(f"\n‚ùå Errors:")
        for error in final_state.get('errors', []):
            print(f"   - {error}")
    
    if final_state.get('warnings'):
        print(f"\n‚ö†Ô∏è  Warnings:")
        for warning in final_state.get('warnings', []):
            print(f"   - {warning}")



TEST: Test 4: Full State Inspection
  ‚úì validate_input: validate_input: SUCCESS - Address detected, needs geocoding
  ‚úì extract_intent_and_select_metrics: extract_intent_and_select_metrics: SUCCESS - Extracted intent, selected 7 metrics
  ‚úì geocode_location: geocode_location: SUCCESS - Geocoded to (12.9116225, 77.6388622)
  ‚úì fetch_osm_data: fetch_osm_data: SUCCESS - Fetched 25 POI categories
  ‚úì calculate_statistics: calculate_statistics: SUCCESS - Calculated 7 metrics
  ‚úì generate_summary: generate_summary: SUCCESS - Summary generated

üîç COMPLETE STATE INSPECTION

üì• Input:
   - User Input: HSR Layout, Bangalore
   - User Profile: Senior Citizen

üåç Geocoding:
   - Coordinates: (12.9116225, 77.6388622)
   - Address: HSR Layout, Bengaluru South City Corporation, Bengaluru, Bangalore South, Bengaluru Urban, Karnataka, India

üéØ User Intent:
{'concerns': ['accessibility', 'comfort'],
 'lifestyle': 'Comfortable, relaxed, community-based lifestyle',
 'metric_selectio

## Test 5: Edge Cases


In [12]:
# Test 5a: No Profile
test5a_state = create_initial_state(
    user_input="MG Road, Bangalore",
    user_profile=None
)

result5a = run_test(graph, test5a_state, "Test 5a: No Profile (Defaults)")
print_test_results(result5a, verbose=False)

# Test 5b: Empty Input
test5b_state = create_initial_state(
    user_input="",
    user_profile="Bachelor"
)

result5b = run_test(graph, test5b_state, "Test 5b: Empty Input")
print_test_results(result5b, verbose=False)

# Test 5c: Custom Free-Text Profile
test5c_state = create_initial_state(
    user_input="Jayanagar, Bangalore",
    user_profile="I'm a fitness enthusiast who loves parks and gyms, need good connectivity"
)

result5c = run_test(graph, test5c_state, "Test 5c: Custom Free-Text Profile")
print_test_results(result5c, verbose=True)

if result5c["success"]:
    intent = result5c["final_state"].get("user_intent", {})
    selected = result5c["final_state"].get("selected_metrics", [])
    
    print(f"\nüéØ Extracted Intent:")
    print(f"   - Profile Type: {intent.get('profile_type')}")
    print(f"   - Priorities: {intent.get('priorities', [])}")
    
    print(f"\nüìä Selected Metrics:")
    fitness_metrics = [m for m in selected if 'gym' in m.lower() or 'fitness' in m.lower() or 'park' in m.lower()]
    if fitness_metrics:
        print(f"   ‚úÖ Found fitness-related metrics: {fitness_metrics}")
    print(f"   All metrics: {selected}")



TEST: Test 5a: No Profile (Defaults)
  ‚úì validate_input: validate_input: SUCCESS - Address detected, needs geocoding
  ‚úì extract_intent_and_select_metrics: extract_intent_and_select_metrics: SKIPPED - No profile, used defaults
  ‚úì geocode_location: geocode_location: SUCCESS - Geocoded to (12.9755264, 77.6067902)
  ‚úì fetch_osm_data: fetch_osm_data: SUCCESS - Fetched 25 POI categories
  ‚úì calculate_statistics: calculate_statistics: SUCCESS - Calculated 7 metrics
  ‚úì generate_summary: generate_summary: SUCCESS - Summary generated

‚úÖ TEST PASSED
‚è±Ô∏è  Time: 12.59s

TEST: Test 5b: Empty Input
  ‚úì validate_input: validate_input: FAILED - No input provided
  ‚úì handle_error: handle_error: Error handling completed

‚úÖ TEST PASSED
‚è±Ô∏è  Time: 0.00s

‚ö†Ô∏è  Errors: 2
   - User input is required
   - User input is required

TEST: Test 5c: Custom Free-Text Profile
  ‚úì validate_input: validate_input: SUCCESS - Address detected, needs geocoding
  ‚úì extract_intent_and_sele

## Summary and Conclusions


In [13]:
print(f"\n{'='*60}")
print("üìã TEST SUITE SUMMARY")
print(f"{'='*60}")

# Collect all results
all_results = []
if 'result1' in locals() and result1.get("success") is not None:
    all_results.append(result1)
if 'result2' in locals() and result2.get("success") is not None:
    all_results.append(result2)
if 'result4' in locals() and result4.get("success") is not None:
    all_results.append(result4)
if 'result5a' in locals() and result5a.get("success") is not None:
    all_results.append(result5a)
if 'result5c' in locals() and result5c.get("success") is not None:
    all_results.append(result5c)

if all_results:
    passed = sum(1 for r in all_results if r.get("success", False))
    failed = len(all_results) - passed
    
    print(f"\n‚úÖ Passed: {passed}/{len(all_results)}")
    print(f"‚ùå Failed: {failed}/{len(all_results)}")
    
    if performance_results:
        avg_time = sum(r["time"] for r in performance_results) / len(performance_results)
        print(f"\n‚è±Ô∏è  Average Execution Time: {avg_time:.2f}s")
    
    print(f"\nüéØ Key Findings:")
    print(f"   - Parallel execution: Intent extraction runs independently")
    print(f"   - Profile handling: Works with categorical and free-text profiles")
    print(f"   - Error handling: Gracefully handles invalid inputs")
    print(f"   - Default fallback: Uses defaults when profile is missing")
    print(f"   - Metrics selection: LLM selects relevant metrics based on intent")
    
    print(f"\n{'='*60}")
else:
    print("\n‚ö†Ô∏è  No test results available. Run the test cells above first.")



üìã TEST SUITE SUMMARY

‚úÖ Passed: 5/5
‚ùå Failed: 0/5

‚è±Ô∏è  Average Execution Time: 17.85s

üéØ Key Findings:
   - Parallel execution: Intent extraction runs independently
   - Profile handling: Works with categorical and free-text profiles
   - Error handling: Gracefully handles invalid inputs
   - Default fallback: Uses defaults when profile is missing
   - Metrics selection: LLM selects relevant metrics based on intent

