# Regions Data Cleaning Notebook

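This notebook analyzes and cleans the regions data by removing states, districts, and cities whose names are missing or placeholder values ("Na", "N/A", "None", "null", "-", or empty), along with any districts or states left empty as a result.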

## Goals:
1. Load and explore the regions data
2. Identify entries with "Na" values
3. Develop cleaning strategy
4. Apply cleaning and verify results

In [None]:
# Import necessary libraries
import json  # reads and writes the regions JSON file
import pandas as pd  # NOTE(review): not referenced in the cells shown here - confirm before removing
from collections import defaultdict  # NOTE(review): not referenced in the cells shown here - confirm before removing
import re  # used by is_na_value to recognise placeholder names

# Banner so the notebook output shows where the run starts
print("=== Regions Data Cleaning ===")
print("Loading and analyzing regions data...")

In [None]:
# Load the regions data
# The path is relative, so it is resolved against the notebook's current
# working directory.
with open('apps/web/public/regions_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Quick sanity check of what was loaded. The cells below rely on two
# top-level keys: 'regions' (a list of state records) and 'summary'
# (stored totals).
print(f"Loaded data with {len(data['regions'])} regions")
print(f"Data keys: {list(data.keys())}")
print(f"Summary: {data['summary']}")

In [None]:
# Function to check for Na values (case insensitive)
# One precompiled, case-insensitive pattern covering every placeholder
# spelling: "na", "n/a", "none", "null", a lone "-", or nothing at all, each
# optionally surrounded by whitespace. Compiled once because is_na_value is
# called for every state, district and city.
_NA_PATTERN = re.compile(r'^\s*(?:na|n/a|none|null|-|)\s*$', re.IGNORECASE)


def is_na_value(value):
    """Return True when *value* stands for a missing name.

    Args:
        value: The raw value read from the data; any type is accepted.

    Returns:
        True for ``None``, for a float NaN, and for strings that are empty,
        whitespace-only, or one of the placeholders "na", "n/a", "none",
        "null" or "-" (case-insensitive, surrounding whitespace ignored).
        False for everything else, including all other non-string values.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return _NA_PATTERN.match(value) is not None
    if isinstance(value, float):
        # json.load turns a bare NaN token into float('nan'); NaN is the
        # only float that compares unequal to itself.
        return value != value
    return False

# Walk the state -> district -> city hierarchy and record every name that
# is_na_value flags as missing.
state_hits, district_hits, city_hits = [], [], []
na_issues = {
    'states_with_na': state_hits,
    'districts_with_na': district_hits,
    'cities_with_na': city_hits,
    'total_na_districts': 0,
    'total_na_cities': 0
}

print("Scanning for Na values...")

for s_pos, state_entry in enumerate(data['regions']):
    s_label = state_entry['name']

    if is_na_value(s_label):
        state_hits.append({'index': s_pos, 'name': s_label, 'type': 'state'})

    # Districts are inspected even when the parent state itself was flagged.
    for d_pos, district_entry in enumerate(state_entry['districts']):
        d_label = district_entry['name']

        if is_na_value(d_label):
            district_hits.append({
                'state_index': s_pos,
                'state_name': s_label,
                'district_index': d_pos,
                'district_name': d_label,
                'type': 'district'
            })

        # Likewise, cities are inspected regardless of the district's status.
        city_hits.extend(
            {
                'state_index': s_pos,
                'state_name': s_label,
                'district_index': d_pos,
                'district_name': d_label,
                'city_index': c_pos,
                'city_name': city_entry['name'],
                'type': 'city'
            }
            for c_pos, city_entry in enumerate(district_entry['cities'])
            if is_na_value(city_entry['name'])
        )

# The totals mirror the lengths of the corresponding hit lists.
na_issues['total_na_districts'] = len(district_hits)
na_issues['total_na_cities'] = len(city_hits)

print(f"Analysis complete!")
print(f"States with Na: {len(na_issues['states_with_na'])}")
print(f"Districts with Na: {na_issues['total_na_districts']}")
print(f"Cities with Na: {na_issues['total_na_cities']}")

In [None]:
# Display detailed results of Na analysis
print("=== DETAILED NA ANALYSIS ===")
print()

# States: every hit is listed together with its recorded index.
if not na_issues['states_with_na']:
    print("✅ No states with Na values found")
else:
    print("STATES WITH NA VALUES:")
    for state in na_issues['states_with_na']:
        print(f"  - Index {state['index']}: '{state['name']}'")
print()

# Districts: only the first 10 hits are shown, then a count of the rest.
if not na_issues['districts_with_na']:
    print("✅ No districts with Na values found")
else:
    print(f"DISTRICTS WITH NA VALUES ({na_issues['total_na_districts']} total):")
    for district in na_issues['districts_with_na'][:10]:  # Show first 10
        print(f"  - State: {district['state_name']}, District: '{district['district_name']}'")
    if len(na_issues['districts_with_na']) > 10:
        print(f"  ... and {len(na_issues['districts_with_na']) - 10} more")
print()

# Cities: same preview rule as for districts.
if not na_issues['cities_with_na']:
    print("✅ No cities with Na values found")
else:
    print(f"CITIES WITH NA VALUES ({na_issues['total_na_cities']} total):")
    for city in na_issues['cities_with_na'][:10]:  # Show first 10
        print(f"  - State: {city['state_name']}, District: {city['district_name']}, City: '{city['city_name']}'")
    if len(na_issues['cities_with_na']) > 10:
        print(f"  ... and {len(na_issues['cities_with_na']) - 10} more")
print()

In [None]:
# Create a comprehensive cleaning strategy
def clean_regions_data(original_data):
    """Return a cleaned copy of the regions data plus removal counts.

    Cleaning rules, applied top-down:
    1. Drop states whose name is_na_value flags as missing.
    2. Drop districts with such names from the remaining states.
    3. Drop cities with such names from the remaining districts.
    4. Drop districts left with no cities.
    5. Drop states left with no districts.

    One line is printed for every state, district and city that is dropped.

    Args:
        original_data: Mapping with a 'regions' list. Every region is read
            for 'name' and 'districts', every district for 'name' and
            'cities', every city for 'name'; kept regions and districts are
            also read for 'type'. The mapping and its nested lists are not
            modified.

    Returns:
        A ``(cleaned_data, stats)`` tuple.

        ``cleaned_data`` holds a 'summary' with freshly computed
        'total_states', 'total_districts' and 'total_cities', and a 'regions'
        list of rebuilt region and district records. Only 'name', 'type' and
        the child list are carried over; city records are reused as-is.

        ``stats`` counts 'states_removed', 'districts_removed' and
        'cities_removed'. Districts and states dropped for being empty are
        included; cities nested under a dropped state or a dropped Na-named
        district are not visited and therefore not counted.
    """
    cleaned_regions = []
    states_removed = 0
    districts_removed = 0
    cities_removed = 0

    print("Starting data cleaning process...")

    for region in original_data['regions']:
        state_name = region['name']

        # Rule 1: a state without a usable name is dropped wholesale.
        if is_na_value(state_name):
            states_removed += 1
            print(f"  ❌ Removing state with Na name: '{state_name}'")
            continue

        valid_districts = []
        for district in region['districts']:
            district_name = district['name']

            # Rule 2: a district without a usable name is dropped wholesale.
            if is_na_value(district_name):
                districts_removed += 1
                print(f"    ❌ Removing district with Na name: '{district_name}' (State: {state_name})")
                continue

            # Rule 3: keep only the cities that have a usable name.
            valid_cities = []
            for city in district['cities']:
                city_name = city['name']
                if is_na_value(city_name):
                    cities_removed += 1
                    print(f"      ❌ Removing city with Na name: '{city_name}' (District: {district_name})")
                    continue
                valid_cities.append(city)

            # Rule 4: a district with nothing left in it is dropped too.
            if not valid_cities:
                districts_removed += 1
                print(f"    ❌ Removing empty district: '{district_name}' (no valid cities)")
                continue

            valid_districts.append({
                'name': district_name,
                'type': district['type'],
                'cities': valid_cities
            })

        # Rule 5: a state with nothing left in it is dropped too.
        if not valid_districts:
            states_removed += 1
            print(f"  ❌ Removing empty state: '{state_name}' (no valid districts)")
            continue

        cleaned_regions.append({
            'name': state_name,
            'type': region['type'],
            'districts': valid_districts
        })

    # The summary is recomputed from what survived rather than derived from
    # the input's stored summary, so the input does not need one.
    total_districts = sum(len(region['districts']) for region in cleaned_regions)
    total_cities = sum(
        len(district['cities'])
        for region in cleaned_regions
        for district in region['districts']
    )

    cleaned_data = {
        'summary': {
            'total_states': len(cleaned_regions),
            'total_districts': total_districts,
            'total_cities': total_cities
        },
        'regions': cleaned_regions
    }

    return cleaned_data, {
        'states_removed': states_removed,
        'districts_removed': districts_removed,
        'cities_removed': cities_removed
    }

print("Cleaning function defined. Ready to apply!")

In [None]:
# Apply the cleaning function
print("=== APPLYING DATA CLEANING ===")
print()

# "Original" totals are the ones stored in the file's own summary; they are
# not recomputed from data['regions'] here.
original_summary = data['summary']
print(f"Original data:")
print(f"  States: {original_summary['total_states']}")
print(f"  Districts: {original_summary['total_districts']}")
print(f"  Cities: {original_summary['total_cities']}")
print()

# Returns the cleaned dataset and a dict of removal counters; `data` itself
# is left as loaded.
cleaned_data, removal_stats = clean_regions_data(data)

print()
print("=== CLEANING RESULTS ===")
print(f"States removed: {removal_stats['states_removed']}")
print(f"Districts removed: {removal_stats['districts_removed']}")
print(f"Cities removed: {removal_stats['cities_removed']}")
print()

# Cleaned totals are recomputed by clean_regions_data from the surviving
# records.
cleaned_summary = cleaned_data['summary']
print(f"Cleaned data:")
print(f"  States: {cleaned_summary['total_states']}")
print(f"  Districts: {cleaned_summary['total_districts']}")
print(f"  Cities: {cleaned_summary['total_cities']}")
print()

# Cleaned minus original; the "+d" format always shows the sign. Because the
# original side comes from the stored summary, any pre-existing mismatch
# between that summary and the actual records shows up here as well.
print(f"Net change:")
print(f"  States: {cleaned_summary['total_states'] - original_summary['total_states']:+d}")
print(f"  Districts: {cleaned_summary['total_districts'] - original_summary['total_districts']:+d}")
print(f"  Cities: {cleaned_summary['total_cities'] - original_summary['total_cities']:+d}")

In [None]:
# Show sample of cleaned data
# Preview only: the first 3 states, and under each the first 2 districts and
# the first 2 cities. Districts and cities beyond the preview are summarised
# with a "... and N more" line; states beyond the first 3 are simply omitted.
print("=== SAMPLE OF CLEANED DATA ===")
print()

for i, region in enumerate(cleaned_data['regions'][:3]):
    print(f"{i+1}. State: {region['name']} (Type: {region['type']})")
    print(f"   Districts: {len(region['districts'])}")
    
    for j, district in enumerate(region['districts'][:2]):
        print(f"   {j+1}. District: {district['name']} (Type: {district['type']})")
        print(f"      Cities: {len(district['cities'])}")
        
        # Reads city['type']; assumes every city record carries that key - TODO confirm
        for k, city in enumerate(district['cities'][:2]):
            print(f"      {k+1}. City: {city['name']} (Type: {city['type']})")
        
        if len(district['cities']) > 2:
            print(f"      ... and {len(district['cities']) - 2} more cities")
    
    if len(region['districts']) > 2:
        print(f"   ... and {len(region['districts']) - 2} more districts")
    print()

In [None]:
# Create backup and save cleaned data
import shutil
from datetime import datetime

# Single source of truth for the file being rewritten, so the backup copy,
# the write and the verification read cannot drift apart.
regions_path = 'apps/web/public/regions_data.json'

# Create a backup with timestamp (local time, second resolution) so repeated
# runs do not overwrite each other's backups.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
backup_filename = f'apps/web/public/regions_data_before_cleaning_{timestamp}.json'

shutil.copy(regions_path, backup_filename)
print(f"✅ Created backup: {backup_filename}")

# Save the cleaned data over the original file. ensure_ascii=False keeps
# non-ASCII names as readable UTF-8 instead of \u escapes.
with open(regions_path, 'w', encoding='utf-8') as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

print("✅ Saved cleaned data to regions_data.json")
print()
print("=== FINAL VERIFICATION ===")

# Load the file back and compare it with the in-memory result.
with open(regions_path, 'r', encoding='utf-8') as f:
    verification_data = json.load(f)

# Only claim success when the round-trip actually matches.
data_matches = verification_data == cleaned_data
print("Verification successful!" if data_matches else "Verification failed!")
print(f"Saved data contains: {verification_data['summary']}")
print(f"Data integrity: {'✅ PASS' if data_matches else '❌ FAIL'}")

## Summary

This notebook successfully:
1. ✅ Analyzed the regions data for Na values
2. ✅ Identified problematic entries
3. ✅ Applied comprehensive cleaning strategy
4. ✅ Created a timestamped backup before modification
5. ✅ Saved cleaned dataset
6. ✅ Verified data integrity

The cleaned dataset now contains only valid states, districts, and cities with proper names.