# Data Cleaning 

Here we clean the data, guided by the cached data quality results that were generated automatically at the end of the `01_data_quality_check.ipynb` notebook.

## Environment Setup

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import pickle
import json
from pathlib import Path
from datetime import datetime
import sys

# Add paths for our custom modules
sys.path.append('../../shared')
sys.path.append('../../step2_data_ingestion')

# Import our modules
from sheets_client import open_sheet
from config_manager import load_settings
from schema import SchemaManager

print("All imports successful")

All imports successful


## Fetching the Data

### Load Original Datasets

In [3]:
# Load original datasets (same loading logic as quality check notebook)
# Re-running this cell must not keep appending the same entry to sys.path.
if '../../shared' not in sys.path:
    sys.path.append('../../shared')
# NOTE(review): the star import hides where load_analysis_results, load_sheet_data
# and save_analysis_results come from; switch to explicit imports once the full set
# of notebook_utils names used by this notebook is confirmed.
from notebook_utils import *

# Define dataset configurations: source worksheet and cache (pickle) name per dataset
datasets_config = {
    'business_licenses': {
        'worksheet': 'Business_Licenses_Full',
        'pickle_name': 'licenses_df'
    },
    'building_permits': {
        'worksheet': 'Building_Permits_Full',
        'pickle_name': 'permits_df'
    },
    'cta_boardings': {
        'worksheet': 'CTA_Full',
        'pickle_name': 'cta_df'
    }
}

# Load datasets from cache first
datasets = {}
load_from_sheets = False

print("Loading original datasets for cleaning...")
for dataset_name, config in datasets_config.items():
    try:
        df = load_analysis_results(config['pickle_name'])
        # Treat a missing (None) or empty cached frame as a cache miss.
        # NOTE(review): what load_analysis_results returns for a missing file is not
        # visible here; the None check covers the case where it returns None
        # instead of raising FileNotFoundError.
        if df is None or df.empty:
            raise FileNotFoundError(f"{config['pickle_name']} is empty")
        datasets[dataset_name] = df
        print(f"   LOADED {dataset_name}: {len(df)} rows from cache")
    except FileNotFoundError:
        print(f"   CACHE MISS {dataset_name}: will load from sheets")
        load_from_sheets = True

# Only open the Google Sheet when at least one dataset was not served from cache
if load_from_sheets:
    print("\nLoading fresh data from Google Sheets...")
    settings = load_settings()
    sh = open_sheet(settings.sheet_id, settings.google_creds_path)

    for dataset_name, config in datasets_config.items():
        if dataset_name not in datasets:
            df = load_sheet_data(sh, config['worksheet'])
            datasets[dataset_name] = df
            # Cache the freshly loaded frame so the next run can skip the sheet
            save_analysis_results(df, config['pickle_name'])
            print(f"   LOADED {dataset_name}: {len(df)} rows from sheets and cached")

# Extract for easier access
# These copies are the "before" snapshots; the cleaning pipeline below is run on
# the `datasets` dict itself, not on these copies.
licenses_df = datasets['business_licenses'].copy()  # Use .copy() so we don't modify originals
permits_df = datasets['building_permits'].copy()
cta_df = datasets['cta_boardings'].copy()

print(f"\nDATASETS READY FOR CLEANING:")
print(f"   Business Licenses: {len(licenses_df):,} rows")
print(f"   Building Permits: {len(permits_df):,} rows")
print(f"   CTA Boardings: {len(cta_df):,} rows")
print(f"   Total records: {len(licenses_df) + len(permits_df) + len(cta_df):,}")

print(f"\nOriginal datasets loaded successfully!")

Loading original datasets for cleaning...
✅ Loaded analysis results from ../data/processed/licenses_df.pkl
   LOADED business_licenses: 2040 rows from cache
✅ Loaded analysis results from ../data/processed/permits_df.pkl
   LOADED building_permits: 8647 rows from cache
✅ Loaded analysis results from ../data/processed/cta_df.pkl
   LOADED cta_boardings: 668 rows from cache

DATASETS READY FOR CLEANING:
   Business Licenses: 2,040 rows
   Building Permits: 8,647 rows
   CTA Boardings: 668 rows
   Total records: 11,355

Original datasets loaded successfully!


### Quality-Driven Cleaning Strategy

In [5]:
# Enhanced Quality Analysis Results Loader
def load_quality_analysis_results(batch_id=None):
    """
    Load quality analysis results from the latest batch or a specific batch.

    Results are read from ``../../data/quality_analysis`` relative to the current
    working directory. Each component pickle that exists in the batch directory is
    loaded under its own key; missing components are reported and skipped.

    Args:
        batch_id (str, optional): Specific batch ID to load. If None, loads the
            ``latest`` directory and tries to read the batch ID from
            ``latest_batch_info.txt``.

    Returns:
        dict | None: A dict with ``'batch_id'`` plus whichever of ``'quality'``,
        ``'contamination'``, ``'anomalies'``, ``'business_validation'``,
        ``'completeness'`` and ``'summary'`` could be loaded. ``None`` when the
        results directory or the requested batch directory does not exist.
        ``'batch_id'`` is ``None`` when the latest batch ID cannot be determined.
    """
    print("LOADING QUALITY ANALYSIS RESULTS")
    print("=" * 50)

    results_dir = Path("../../data/quality_analysis")

    if not results_dir.exists():
        print("❌ No quality analysis results found!")
        print("   Please run 01_data_quality_check.ipynb first")
        return None

    # Determine which batch to load
    if batch_id:
        batch_dir = results_dir / batch_id
        if not batch_dir.exists():
            print(f"❌ Batch '{batch_id}' not found!")
            return None
        print(f"   📂 Loading specific batch: {batch_id}")
    else:
        # Load from latest directory
        batch_dir = results_dir / "latest"
        if not batch_dir.exists():
            print("❌ No latest results found!")
            return None

        # Get batch info from latest. A missing file or a file without a
        # "Batch ID:" line leaves batch_id as None instead of raising.
        batch_info_file = batch_dir / "latest_batch_info.txt"
        if batch_info_file.exists():
            with open(batch_info_file, 'r') as f:
                batch_info = f.read()
            for line in batch_info.split('\n'):
                if 'Batch ID:' in line:
                    # Everything after the marker is the ID; strip spaces / '\r'
                    batch_id = line.partition('Batch ID:')[2].strip() or None
                    break
        print(f"   📂 Loading latest batch: {batch_id}")

    # Load all analysis components
    analysis_results = {'batch_id': batch_id}

    # Load files with error handling
    files_to_load = {
        'quality': 'quality_matrix.pkl',
        'contamination': 'contamination_analysis.pkl',
        'anomalies': 'anomaly_detection.pkl',
        'business_validation': 'business_validation.pkl',
        'completeness': 'completeness_analysis.pkl'
    }

    for key, filename in files_to_load.items():
        file_path = batch_dir / filename
        if file_path.exists():
            # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
            # file. Only load batches written by the trusted quality-check notebook.
            with open(file_path, 'rb') as f:
                analysis_results[key] = pickle.load(f)
            print(f"   ✅ {key}: {filename}")
        else:
            print(f"   ⚠️  {key}: {filename} (missing)")

    # Load summary if available
    summary_file = batch_dir / "analysis_summary.json"
    if summary_file.exists():
        with open(summary_file, 'r') as f:
            analysis_results['summary'] = json.load(f)
        print(f"   ✅ Summary: analysis_summary.json")

    print(f"\n🎯 QUALITY ANALYSIS RESULTS READY!")
    summary = analysis_results.get('summary')
    if isinstance(summary, dict):
        # Use .get with defaults so a partial summary does not abort loading
        quality_summary = summary.get('quality_summary', {})
        completeness_summary = summary.get('completeness_summary', {})
        print(f"   📊 Quality Score: {quality_summary.get('overall_score', 0):.3f}")
        print(f"   🔍 Contaminated Fields: {quality_summary.get('contaminated_fields', 0)}")
        print(f"   🔧 Recommendations: {completeness_summary.get('total_recommendations', 0)}")

    return analysis_results

# Function to list available batches
def list_available_batches():
    """
    Print an overview of recorded quality analysis batches and return them.

    Reads ``batch_index.json`` from ``../../data/quality_analysis`` (relative to the
    current working directory). Only the first 10 entries of the index are printed;
    the complete ``batches`` list is returned.

    Returns:
        list: The ``batches`` list from the index file, or an empty list when the
        index file does not exist.
    """
    index_path = Path("../../data/quality_analysis") / "batch_index.json"

    if not index_path.exists():
        print("📂 No batch index found")
        return []

    with open(index_path, 'r') as handle:
        index_data = json.load(handle)

    print("AVAILABLE QUALITY ANALYSIS BATCHES")
    print("=" * 40)

    # Only the leading 10 index entries are displayed
    # (presumably the index is stored newest-first -- confirm against the writer).
    for position, entry in enumerate(index_data['batches'][:10], start=1):
        print(f"{position:2d}. {entry['batch_id']}")
        print(f"     📅 {entry['datetime'][:19]}")
        print(f"     📊 Quality: {entry['summary_scores']['overall_quality']:.3f}")
        print(f"     🔧 Recommendations: {entry['summary_scores']['total_recommendations']}")
        print()

    return index_data['batches']

# Load latest results by default
# quality_results is None when the results directory or the "latest" batch is
# missing; the cells below check for a falsy value before using it.
quality_results = load_quality_analysis_results()

# Optionally list available batches
# available_batches is an empty list when no batch index file exists.
available_batches = list_available_batches()

LOADING QUALITY ANALYSIS RESULTS
   📂 Loading latest batch: data_quality_batch_20250901_153632
   ✅ quality: quality_matrix.pkl
   ✅ contamination: contamination_analysis.pkl
   ✅ anomalies: anomaly_detection.pkl
   ✅ business_validation: business_validation.pkl
   ✅ completeness: completeness_analysis.pkl
   ✅ Summary: analysis_summary.json

🎯 QUALITY ANALYSIS RESULTS READY!
   📊 Quality Score: 0.988
   🔍 Contaminated Fields: 7
   🔧 Recommendations: 2
AVAILABLE QUALITY ANALYSIS BATCHES
 1. data_quality_batch_20250901_153632
     📅 2025-09-01T15:36:32
     📊 Quality: 0.988
     🔧 Recommendations: 2



In [6]:
# Extract key insights from quality analysis for cleaning priorities
def analyze_cleaning_priorities(quality_results):
    """
    Turn loaded quality analysis results into a prioritized cleaning plan.

    Prints the quality and completeness summaries, then scans the per-dataset
    contamination info. For every dataset that reports at least one contamination
    issue, each of its recommendations becomes a ``high_priority`` action.

    Args:
        quality_results (dict | None): Mapping as returned by
            ``load_quality_analysis_results``; the ``'summary'`` and
            ``'contamination'`` keys are read.

    Returns:
        dict | None: Plan with the keys ``'critical_fixes'``, ``'high_priority'``,
        ``'medium_priority'`` (lists) and ``'field_specific_actions'`` (dict).
        Only ``'high_priority'`` is populated here. ``None`` when
        ``quality_results`` is falsy.
    """
    print("DATA CLEANING PRIORITY ANALYSIS")
    print("=" * 50)

    if not quality_results:
        print("ERROR: No quality results available")
        return None

    summary = quality_results.get('summary', {})
    contamination = quality_results.get('contamination', {})

    # Skeleton of the plan; only 'high_priority' is filled below
    cleaning_plan = {
        'critical_fixes': [],
        'high_priority': [],
        'medium_priority': [],
        'field_specific_actions': {}
    }

    print(f"SUMMARY FROM QUALITY ANALYSIS:")
    if 'quality_summary' in summary:
        quality_part = summary['quality_summary']
        print(f"   Overall Quality Score: {quality_part.get('overall_score', 0):.3f}")
        print(f"   Contaminated Fields: {quality_part.get('contaminated_fields', 0)}")
        print(f"   Critical Issues: {quality_part.get('critical_issues', 0)}")

    print(f"\nCLEANING RECOMMENDATIONS:")
    if 'completeness_summary' in summary:
        completeness_part = summary['completeness_summary']
        print(f"   Total Recommendations: {completeness_part.get('total_recommendations', 0)}")
        print(f"   Critical Imputation Needed: {completeness_part.get('critical_imputation_needed', 0)}")
        print(f"   Low-value Fields: {completeness_part.get('low_value_fields', 0)}")

    # Issue categories whose list lengths are added up per dataset
    issue_categories = ('mixed_types', 'special_chars', 'encoding_issues', 'format_inconsistencies')

    print(f"\nCONTAMINATION ISSUES BY DATASET:")
    for dataset_name, info in contamination.items():
        if not isinstance(info, dict):
            continue

        issue_count = sum(len(info.get(category, [])) for category in issue_categories)
        if issue_count <= 0:
            # Datasets without counted issues contribute no actions
            continue

        print(f"   {dataset_name}: {issue_count} contamination issues")

        # Every recommendation of an affected dataset becomes a high-priority action
        cleaning_plan['high_priority'].extend(
            {
                'dataset': dataset_name,
                'action': recommendation,
                'type': 'contamination_fix'
            }
            for recommendation in info.get('recommendations', [])
        )

    return cleaning_plan

# Analyze cleaning priorities from our quality results
# cleaning_priorities is None when quality_results is falsy (nothing was loaded).
cleaning_priorities = analyze_cleaning_priorities(quality_results)

print(f"\nREADY TO BEGIN TARGETED DATA CLEANING!")
if cleaning_priorities:
    print(f"   High Priority Actions: {len(cleaning_priorities['high_priority'])}")
    # analyze_cleaning_priorities never appends to 'critical_fixes', so this is 0
    print(f"   Critical Fixes: {len(cleaning_priorities['critical_fixes'])}")
else:
    print("   Will proceed with standard cleaning workflow")

DATA CLEANING PRIORITY ANALYSIS
SUMMARY FROM QUALITY ANALYSIS:
   Overall Quality Score: 0.988
   Contaminated Fields: 7
   Critical Issues: 0

CLEANING RECOMMENDATIONS:
   Total Recommendations: 2
   Critical Imputation Needed: 0
   Low-value Fields: 2

CONTAMINATION ISSUES BY DATASET:
   business_licenses: 1 contamination issues
   building_permits: 3 contamination issues

READY TO BEGIN TARGETED DATA CLEANING!
   High Priority Actions: 1
   Critical Fixes: 0


## Data Cleaning

### Priority-Based Cleaning Pipeline

In [11]:
# Dynamic Data Cleaning Based on Quality Analysis
def create_cleaning_pipeline(quality_results, datasets):
    """
    Build the ordered list of cleaning steps from the quality analysis results.

    Args:
        quality_results (dict | None): Loaded quality analysis results; the
            ``'contamination'`` and ``'summary'`` keys are read. When falsy, the
            pipeline consists of the standard cleaning step only.
        datasets (dict): Accepted for the caller's convenience; it is not read
            while the pipeline is assembled.

    Returns:
        list[tuple[str, callable]]: ``(step_name, step_function)`` pairs in
        execution order. Each step function takes the datasets dict as its only
        argument.
    """
    print("CREATING DYNAMIC CLEANING PIPELINE")
    print("=" * 50)

    # Fallback: without analysis results only the generic cleaning is scheduled
    if not quality_results:
        print("No quality results - using standard cleaning only")
        return [('standard_cleaning', standard_data_cleaning)]

    def bind_contamination_fix(ds_name, recs):
        """Freeze one dataset name and its recommendations into a pipeline step."""
        def contamination_fixer(datasets):
            return fix_contamination_issues(ds_name, recs, datasets)
        return contamination_fixer

    contamination = quality_results.get('contamination', {})
    summary = quality_results.get('summary', {})
    pipeline = []

    # 1. CRITICAL FIXES FIRST (data integrity issues)
    violation_count = summary.get('business_rules_summary', {}).get('total_violations', 0)
    if violation_count > 0:
        pipeline.append(('fix_business_logic', fix_business_logic_issues))

    # 2. HIGH PRIORITY: one contamination step per dataset that has recommendations
    for dataset_name, info in contamination.items():
        if not isinstance(info, dict):
            continue
        dataset_recs = info.get('recommendations', [])
        if dataset_recs:
            step_label = f'fix_contamination_{dataset_name}'
            pipeline.append((step_label, bind_contamination_fix(dataset_name, dataset_recs)))

    # 3. MEDIUM PRIORITY: Type conversions and standardization
    pipeline.append(('standardize_types', standardize_data_types))

    # 4. LOW PRIORITY: Optional field cleanup
    low_value_count = summary.get('completeness_summary', {}).get('low_value_fields', 0)
    if low_value_count > 0:
        pipeline.append(('clean_optional_fields', clean_optional_fields))

    # 5. ALWAYS: Final validation
    pipeline.append(('final_validation', validate_cleaned_data))

    print(f"Created pipeline with {len(pipeline)} cleaning steps:")
    for number, step in enumerate(pipeline, 1):
        print(f"  {number}. {step[0]}")

    return pipeline

def execute_cleaning_pipeline(cleaning_steps, datasets):
    """
    Execute the cleaning pipeline and return the cleaned datasets.

    Every value of ``datasets`` that offers a ``copy()`` method (e.g. a pandas
    DataFrame) is copied before the first step runs, so steps that assign columns
    in place cannot alter the caller's frames and re-running the pipeline on the
    same input gives the same result.

    Args:
        cleaning_steps (list[tuple[str, callable]]): ``(step_name, step_function)``
            pairs; each function receives the current datasets dict.
        datasets (dict): Mapping of dataset name to data. Left unmodified.

    Returns:
        dict: The datasets after all steps. A step that raises is reported and
        skipped (best effort); a step returning a falsy value (e.g. ``None``)
        leaves the current datasets in place.
    """
    print(f"\nEXECUTING CLEANING PIPELINE")
    print("=" * 40)

    # dict.copy() alone is shallow: the frames themselves would still be shared
    cleaned_datasets = {
        name: (data.copy() if hasattr(data, 'copy') else data)
        for name, data in datasets.items()
    }

    for step_name, cleaning_func in cleaning_steps:
        print(f"\nRunning: {step_name}")
        try:
            result = cleaning_func(cleaned_datasets)
            if result:
                cleaned_datasets = result
            print(f"  COMPLETED: {step_name}")
        except Exception as e:
            # Deliberate best effort: report the failure and keep going
            print(f"  ERROR in {step_name}: {str(e)}")
            print(f"  Continuing with next step...")

    return cleaned_datasets

### Core Cleaning Functions (Simple & Targeted)


In [13]:
# 1. Fix contamination issues dynamically
def _recommendation_field(recommendation):
    """Return the column name inside the first ``df['...']`` reference of a recommendation."""
    marker = "df['"
    start = recommendation.find(marker)
    if start == -1:
        raise ValueError("no df['<field>'] reference found")
    start += len(marker)
    end = recommendation.find("']", start)
    if end == -1:
        raise ValueError("unterminated df['<field>'] reference")
    return recommendation[start:end]


def _clean_text_keep_nulls(series, transform):
    """Apply ``transform`` to the string form of the values while keeping nulls null."""
    # astype(str) alone can turn missing values into literal 'nan' / 'None' text,
    # which would hide them from later isna() checks; mask them back to null.
    return transform(series.astype(str)).mask(series.isna())


def fix_contamination_issues(dataset_name, recommendations, datasets):
    """
    Apply contamination fixes based on quality analysis recommendations.

    Each recommendation is a text that embeds a pandas expression. The operation is
    recognized by a substring (``pd.to_numeric``, ``str.replace`` or ``str.strip``)
    and the target column is taken from the first ``df['<field>']`` reference.
    Recommendations that cannot be parsed or applied are reported and skipped.

    Args:
        dataset_name (str): Key of the dataset to fix.
        recommendations (list[str]): Recommendation texts for that dataset.
        datasets (dict): Mapping of dataset name to DataFrame.

    Returns:
        dict: The same ``datasets`` dict, with ``dataset_name`` replaced by a fixed
        copy of its frame. Returned untouched when the dataset is not present.
    """
    if dataset_name not in datasets:
        return datasets

    df = datasets[dataset_name].copy()
    print(f"    Fixing contamination in {dataset_name}")

    for recommendation in recommendations:
        try:
            # Parse the recommendation to extract field and operation
            if 'pd.to_numeric' in recommendation:
                # e.g. "Convert zip_code to numeric: pd.to_numeric(df['zip_code'], errors='coerce')"
                field_name = _recommendation_field(recommendation)

                # Apply numeric conversion (unparseable values become NaN)
                df[field_name] = pd.to_numeric(df[field_name], errors='coerce')
                print(f"      Applied: Convert {field_name} to numeric")

            elif 'str.replace' in recommendation:
                # e.g. "Strip quotes from zip_code: df['zip_code'].str.replace(r'[\'"]', '', regex=True)"
                field_name = _recommendation_field(recommendation)

                # NOTE(review): this marker contains a backslash before the double
                # quote; confirm it matches the text the quality notebook emits,
                # otherwise quote recommendations fall through to the generic branch.
                if "str.replace(r'[\\'\\\"]'" in recommendation:
                    # Remove quotes
                    df[field_name] = _clean_text_keep_nulls(
                        df[field_name],
                        lambda text: text.str.replace(r'[\'"]', '', regex=True),
                    )
                    print(f"      Applied: Strip quotes from {field_name}")
                else:
                    # Generic string replacement: drop everything except word
                    # characters and whitespace
                    df[field_name] = _clean_text_keep_nulls(
                        df[field_name],
                        lambda text: text.str.replace(r'[^\w\s]', '', regex=True),
                    )
                    print(f"      Applied: Clean special characters from {field_name}")

            elif 'str.strip' in recommendation:
                field_name = _recommendation_field(recommendation)

                # Apply strip
                df[field_name] = _clean_text_keep_nulls(df[field_name], lambda text: text.str.strip())
                print(f"      Applied: Strip whitespace from {field_name}")

            else:
                print(f"      Skipped: Unknown recommendation format")

        except Exception as e:
            # Best effort: one bad recommendation must not block the others
            print(f"      FAILED: {recommendation[:40]}... ({str(e)})")

    datasets[dataset_name] = df
    return datasets

# 2. Standardize data types based on schema
def _coerce_with_report(series, converter, field, type_label):
    """Convert one column with ``errors='coerce'`` and report values that became null."""
    try:
        converted = converter(series, errors='coerce')
    except Exception as exc:
        # Best effort: keep the column as it is, but say so instead of hiding it
        print(f"      SKIPPED {field}: could not convert to {type_label} ({exc})")
        return series

    print(f"      Converted {field} to {type_label}")
    # errors='coerce' silently replaces unparseable values with nulls; surface that
    newly_null = int(converted.isna().sum() - series.isna().sum())
    if newly_null > 0:
        print(f"      WARNING: {newly_null} values in {field} could not be parsed and are now null")
    return converted


def standardize_data_types(datasets):
    """
    Convert fields to proper types based on schema definitions.

    Date fields are taken from ``SchemaManager.get_date_fields(dataset_name)``
    (assumed to return an iterable of column names -- confirm against the schema
    module) and converted with ``pd.to_datetime``. A fixed list of numeric
    candidate columns is converted with ``pd.to_numeric``. Both use
    ``errors='coerce'``, so unparseable values become null; the number of values
    lost that way is printed per column.

    Args:
        datasets (dict): Mapping of dataset name to DataFrame.

    Returns:
        dict: The same ``datasets`` dict, with every frame replaced by a converted
        copy (the frames passed in are not modified).
    """
    print("    Standardizing data types using schema")

    # Columns that should be numeric wherever they occur
    numeric_candidates = ['community_area', 'ward', 'precinct', 'zip_code', 'latitude', 'longitude']

    for dataset_name, source_df in datasets.items():
        # Work on a copy, like the other cleaning steps, so the input is untouched
        df = source_df.copy()
        date_fields = SchemaManager.get_date_fields(dataset_name)

        # Convert date fields
        for field in date_fields:
            if field in df.columns:
                df[field] = _coerce_with_report(df[field], pd.to_datetime, field, 'datetime')

        # Convert numeric fields that should be numeric
        for field in numeric_candidates:
            if field in df.columns:
                df[field] = _coerce_with_report(df[field], pd.to_numeric, field, 'numeric')

        datasets[dataset_name] = df

    return datasets

# 3. Clean optional fields with low value
def clean_optional_fields(datasets, drop_below=0.05, fill_below=0.25):
    """
    Handle optional fields with very low completion rates.

    Optional fields are the schema field names minus the required fields, both
    obtained from ``SchemaManager`` (assumed to return iterables of column names --
    confirm against the schema module). The completion rate of a field is its
    share of non-null values.

    Args:
        datasets (dict): Mapping of dataset name to DataFrame.
        drop_below (float): Fields with a completion rate below this value are
            dropped. Defaults to 0.05 (5%).
        fill_below (float): Object-dtype fields with a completion rate below this
            value (and not dropped) get their nulls filled with ``'UNKNOWN'``.
            Defaults to 0.25 (25%).

    Returns:
        dict: The same ``datasets`` dict, with every non-empty frame replaced by a
        cleaned copy. Empty frames are left as they are.
    """
    print("    Cleaning optional fields")

    for dataset_name, source_df in datasets.items():
        if len(source_df) == 0:
            # No rows means no meaningful completion rate (and a division by zero)
            continue

        df = source_df.copy()
        optional_fields = set(SchemaManager.get_field_names(dataset_name)) - set(SchemaManager.get_required_fields(dataset_name))

        # Sorted so the processing and log order is the same on every run
        for field in sorted(optional_fields, key=str):
            if field in df.columns:
                completion_rate = df[field].notna().sum() / len(df)

                # Drop fields that are almost entirely empty
                if completion_rate < drop_below:
                    print(f"      Dropping {field} ({completion_rate:.1%} complete)")
                    df = df.drop(columns=[field])

                # Sparse text fields: make the gaps explicit with 'UNKNOWN'
                elif completion_rate < fill_below and df[field].dtype == 'object':
                    df[field] = df[field].fillna('UNKNOWN')
                    print(f"      Filled {field} nulls with 'UNKNOWN'")

        datasets[dataset_name] = df

    return datasets

# 4. Fix business logic issues
def fix_business_logic_issues(datasets):
    """
    Fix basic business logic violations.

    Currently handles one rule for ``business_licenses``: a license whose
    ``license_start_date`` is later than its ``expiration_date`` gets its
    expiration set to the start date plus one year.

    This step is scheduled before type standardization, so the two date columns
    may still hold text. They are converted to datetime here when needed
    (unparseable values become NaT); without that, the comparison would be a text
    comparison and the date arithmetic would fail.

    Args:
        datasets (dict): Mapping of dataset name to DataFrame.

    Returns:
        dict: The same ``datasets`` dict, with ``business_licenses`` (if present)
        replaced by a fixed copy.
    """
    print("    Fixing business logic issues")

    # Example: Fix date logic issues in licenses
    if 'business_licenses' in datasets:
        df = datasets['business_licenses'].copy()
        start_col = 'license_start_date'
        end_col = 'expiration_date'

        # Fix licenses where start date > expiration date
        if start_col in df.columns and end_col in df.columns:
            for date_col in (start_col, end_col):
                if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
                    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

            # Rows with a NaT on either side compare as False and are left alone
            invalid_dates = df[start_col] > df[end_col]
            if invalid_dates.sum() > 0:
                # Set expiration to start date + 1 year for invalid cases
                df.loc[invalid_dates, end_col] = df.loc[invalid_dates, start_col] + pd.DateOffset(years=1)
                print(f"      Fixed {invalid_dates.sum()} invalid date sequences")

        datasets['business_licenses'] = df

    return datasets

# 5. Standard cleaning (fallback: create_cleaning_pipeline schedules this step
#    only when no quality analysis results are available)
def standard_data_cleaning(datasets):
    """
    Apply standard cleaning operations to all datasets.

    For every dataset: rows in which all values are null are removed, and
    surrounding whitespace is stripped from all object-dtype columns. Missing
    values in those columns stay missing instead of being turned into the
    literal text ``'nan'`` / ``'None'``.

    Args:
        datasets (dict): Mapping of dataset name to DataFrame.

    Returns:
        dict: The same ``datasets`` dict, with every frame replaced by its cleaned
        version.
    """
    print("    Applying standard cleaning")

    for dataset_name, df in datasets.items():
        # Remove completely empty rows
        before_rows = len(df)
        df = df.dropna(how='all')
        if len(df) < before_rows:
            print(f"      Removed {before_rows - len(df)} empty rows from {dataset_name}")

        # Strip whitespace from text fields
        text_fields = df.select_dtypes(include=['object']).columns
        for field in text_fields:
            column = df[field]
            # astype(str) can stringify nulls; mask them back so isna() still sees them
            df[field] = column.astype(str).str.strip().mask(column.isna())

        datasets[dataset_name] = df

    return datasets

# 6. Final validation
def validate_cleaned_data(datasets):
    """
    Report whether each cleaned dataset still carries its required fields.

    For every dataset the required field names are requested from
    ``SchemaManager.get_required_fields``. A warning is printed for each required
    field that is no longer a column or whose column contains only nulls. Nothing
    is modified; this step only prints.

    Args:
        datasets (dict): Mapping of dataset name to DataFrame.

    Returns:
        dict: The ``datasets`` argument, unchanged.
    """
    print("    Validating cleaned data")

    for dataset_name in datasets:
        frame = datasets[dataset_name]

        # Check required fields still exist and aren't empty
        for required in SchemaManager.get_required_fields(dataset_name):
            if required not in frame.columns:
                print(f"      WARNING: Required field {required} missing from {dataset_name}")
                continue
            if frame[required].isna().all():
                print(f"      WARNING: Required field {required} is completely empty in {dataset_name}")

        row_total = len(frame)
        column_total = len(frame.columns)
        print(f"      {dataset_name}: {row_total} rows, {column_total} columns after cleaning")

    return datasets

### Simple Execution

In [19]:
# Execute the dynamic cleaning pipeline
def run_data_cleaning():
    """
    Build and run the cleaning pipeline on the notebook's loaded datasets.

    Reads the notebook-level variables ``quality_results`` and ``datasets``, so the
    loading cells must have been executed first.

    Returns:
        tuple: ``(licenses, permits, cta)`` -- the cleaned entries for the keys
        ``'business_licenses'``, ``'building_permits'`` and ``'cta_boardings'``.
    """
    print("STARTING DATA CLEANING PROCESS")
    print("=" * 50)

    # Assemble the steps from the quality analysis, then run them in order
    steps = create_cleaning_pipeline(quality_results, datasets)
    cleaned = execute_cleaning_pipeline(steps, datasets)

    # Pull the three cleaned frames out of the result mapping
    licenses_clean, permits_clean, cta_clean = (
        cleaned[key]
        for key in ('business_licenses', 'building_permits', 'cta_boardings')
    )

    print(f"\nDATA CLEANING COMPLETE!")
    print(f"  Business Licenses: {len(licenses_clean)} rows")
    print(f"  Building Permits: {len(permits_clean)} rows")
    print(f"  CTA Boardings: {len(cta_clean)} rows")

    return licenses_clean, permits_clean, cta_clean

# Run the cleaning process
# The three *_cleaned frames are the "after" side of the verification cells below;
# licenses_df / permits_df / cta_df from the loading cell are the "before" side.
licenses_df_cleaned, permits_df_cleaned, cta_df_cleaned = run_data_cleaning()

STARTING DATA CLEANING PROCESS
CREATING DYNAMIC CLEANING PIPELINE
Created pipeline with 5 cleaning steps:
  1. fix_business_logic
  2. fix_contamination_business_licenses
  3. standardize_types
  4. clean_optional_fields
  5. final_validation

EXECUTING CLEANING PIPELINE

Running: fix_business_logic
    Fixing business logic issues
      Fixed 67 invalid date sequences
  COMPLETED: fix_business_logic

Running: fix_contamination_business_licenses
    Fixing contamination in business_licenses
      Applied: Convert zip_code to numeric
  COMPLETED: fix_contamination_business_licenses

Running: standardize_types
    Standardizing data types using schema
      Converted application_created_date to datetime
      Converted application_requirements_complete to datetime
      Converted payment_date to datetime
      Converted conditional_approval to datetime
      Converted license_approved_for_issuance to datetime
      Converted date_issued to datetime
      Converted license_start_date to d

# Data Quality Verification

Trust, but verify: before shipping the cleaned data out for analysis, we confirm that it is actually ready.

In [None]:
# Cell: Before/After Comparison
def compare_before_after_cleaning():
    """
    Print before/after metrics for each dataset.

    Compares the notebook-level original frames (``licenses_df``, ``permits_df``,
    ``cta_df``) with their ``*_cleaned`` counterparts: row and column counts, the
    number of numeric and datetime columns, and the total number of nulls in the
    required fields reported by ``SchemaManager.get_required_fields``.
    Prints only; returns nothing.
    """
    def count_by_dtype(frame, dtype_spec):
        """Number of columns of ``frame`` selected by ``dtype_spec``."""
        return len(frame.select_dtypes(include=dtype_spec).columns)

    def null_total(frame, fields):
        """Total nulls over those ``fields`` that exist as columns in ``frame``."""
        return sum(frame[name].isnull().sum() for name in fields if name in frame.columns)

    print("\nBEFORE vs AFTER CLEANING COMPARISON")
    print("=" * 60)

    # dataset name -> (frame before cleaning, frame after cleaning)
    frame_pairs = {
        'business_licenses': (licenses_df, licenses_df_cleaned),
        'building_permits': (permits_df, permits_df_cleaned),
        'cta_boardings': (cta_df, cta_df_cleaned)
    }

    for dataset_name, pair in frame_pairs.items():
        before, after = pair
        heading = dataset_name.upper().replace('_', ' ')
        print(f"\n{heading}")
        print("-" * 40)

        # Basic metrics
        print(f"  Rows:           {len(before):,} → {len(after):,}")
        print(f"  Columns:        {len(before.columns)} → {len(after.columns)}")

        # Data type improvements
        numeric_before = count_by_dtype(before, [np.number])
        numeric_after = count_by_dtype(after, [np.number])
        datetime_before = count_by_dtype(before, ['datetime'])
        datetime_after = count_by_dtype(after, ['datetime'])

        print(f"  Numeric Fields: {numeric_before} → {numeric_after}")
        print(f"  Date Fields:    {datetime_before} → {datetime_after}")

        # Required fields completeness
        required = SchemaManager.get_required_fields(dataset_name)
        nulls_before = null_total(before, required)
        nulls_after = null_total(after, required)

        print(f"  Required Nulls: {nulls_before:,} → {nulls_after:,}")

# Print the before/after metrics (needs both the original and the *_cleaned frames)
compare_before_after_cleaning()


BEFORE vs AFTER CLEANING COMPARISON

BUSINESS LICENSES
----------------------------------------
  Rows:           2,040 → 2,040
  Columns:        39 → 37
  Numeric Fields: 5 → 11
  Date Fields:    9 → 7
  Required Nulls: 0 → 244

BUILDING PERMITS
----------------------------------------
  Rows:           8,647 → 8,647
  Columns:        31 → 31
  Numeric Fields: 16 → 17
  Date Fields:    2 → 2
  Required Nulls: 0 → 0

CTA BOARDINGS
----------------------------------------
  Rows:           668 → 668
  Columns:        2 → 2
  Numeric Fields: 1 → 1
  Date Fields:    1 → 1
  Required Nulls: 0 → 0


In [None]:
# Cell: Final Validation & Readiness Check
def validate_data_readiness():
    """
    Validate that data is ready for analysis and next steps.

    Checks the notebook-level ``*_cleaned`` frames. A dataset needs attention when
    a required field (per ``SchemaManager.get_required_fields``) is missing as a
    column or contains any null, or when a date field (per
    ``SchemaManager.get_date_fields``) that exists as a column does not have a
    datetime dtype. Any datetime64 dtype is accepted, including timezone-aware
    columns and resolutions other than nanoseconds.

    Returns:
        bool: True when every dataset passes all checks, otherwise False.
    """
    print("\nDATA READINESS VALIDATION")
    print("=" * 40)

    ready_for_analysis = True

    datasets_to_validate = {
        'business_licenses': licenses_df_cleaned,
        'building_permits': permits_df_cleaned,
        'cta_boardings': cta_df_cleaned
    }

    for dataset_name, df in datasets_to_validate.items():
        print(f"\n{dataset_name.replace('_', ' ').title()}:")

        # Check required fields
        required_fields = SchemaManager.get_required_fields(dataset_name)
        missing_required = [field for field in required_fields if field not in df.columns]
        null_required = [field for field in required_fields if field in df.columns and df[field].isnull().any()]

        # Report both problems independently so neither hides the other
        if missing_required:
            print(f"  MISSING Required Fields: {missing_required}")
        if null_required:
            print(f"  NULL Required Fields: {null_required}")
        if missing_required or null_required:
            ready_for_analysis = False
        else:
            print(f"  REQUIRED Fields: ALL PRESENT AND COMPLETE")

        # Check data types
        date_fields = SchemaManager.get_date_fields(dataset_name)
        # Comparing against the exact string 'datetime64[ns]' would reject valid
        # datetime columns with another unit or with a timezone
        incorrect_types = [
            field for field in date_fields
            if field in df.columns and not pd.api.types.is_datetime64_any_dtype(df[field])
        ]

        if incorrect_types:
            print(f"  INCORRECT Types: {incorrect_types}")
            ready_for_analysis = False
        else:
            print(f"  DATA TYPES: CORRECT")

        print(f"  STATUS: {'READY FOR ANALYSIS' if not (missing_required or null_required or incorrect_types) else 'NEEDS ATTENTION'}")

    print(f"\nOVERALL STATUS: {'READY FOR STEP 4 (LOAD & VALIDATE)' if ready_for_analysis else 'REQUIRES FIXES'}")
    return ready_for_analysis

# data_ready is True only when every dataset passed all readiness checks
data_ready = validate_data_readiness()


DATA READINESS VALIDATION

Business Licenses:
  NULL Required Fields: ['community_area']
  DATA TYPES: CORRECT
  STATUS: NEEDS ATTENTION

Building Permits:
  REQUIRED Fields: ALL PRESENT AND COMPLETE
  DATA TYPES: CORRECT
  STATUS: READY FOR ANALYSIS

Cta Boardings:
  REQUIRED Fields: ALL PRESENT AND COMPLETE
  DATA TYPES: CORRECT
  STATUS: READY FOR ANALYSIS

OVERALL STATUS: REQUIRES FIXES


In [1]:
# Fix Cell: Handle community_area nulls intelligently
def fix_community_area_nulls(df=None):
    """
    Fix community_area nulls using available geographic data.

    Strategy 1: when a ``community_area_name`` column exists, build a
    name -> area lookup from the rows that have a ``community_area`` and use it to
    fill the rows that do not.
    Strategy 2: fill whatever is still null with 0 (used here as "unknown area").

    Args:
        df (pandas.DataFrame, optional): Frame to fix. Defaults to the
            notebook-level ``licenses_df_cleaned``, which must already exist in
            the kernel.

    Returns:
        pandas.DataFrame: The input frame itself when it has no null
        ``community_area``; otherwise a fixed copy (the input is not modified).
    """
    print("FIXING COMMUNITY_AREA NULLS")
    print("=" * 30)

    if df is None:
        df = licenses_df_cleaned

    # Check what we're working with
    null_count = df['community_area'].isnull().sum()
    print(f"Records with null community_area: {null_count}")

    if null_count == 0:
        print("No nulls to fix!")
        return df

    # Work on a copy so re-running the cell starts from the same input
    fixed = df.copy()

    # Strategy 1: Use community_area_name to fill missing community_area
    if 'community_area_name' in fixed.columns:
        # Create mapping from community_area_name to community_area
        known_rows = fixed[fixed['community_area'].notna()]
        area_mapping = known_rows.groupby('community_area_name')['community_area'].first()

        # Fill nulls using the mapping (names without a known area map to NaN)
        null_mask = fixed['community_area'].isnull()
        mapped_area = fixed['community_area_name'].map(area_mapping)
        fixed.loc[null_mask, 'community_area'] = mapped_area[null_mask].to_numpy()

        fixed_count = null_count - fixed['community_area'].isnull().sum()
        print(f"Fixed {fixed_count} nulls using community_area_name mapping")

    # Strategy 2: For remaining nulls, use a default value or remove from required list
    remaining_nulls = fixed['community_area'].isnull().sum()
    if remaining_nulls > 0:
        print(f"Still have {remaining_nulls} nulls after mapping")
        print("Options:")
        print("1. Fill with default value (e.g., 0 for 'Unknown')")
        print("2. Remove community_area from required fields list")
        print("3. Drop these records (if very few)")

        # For now, let's fill with 0 (representing unknown area)
        fixed['community_area'] = fixed['community_area'].fillna(0)
        print(f"Filled remaining {remaining_nulls} nulls with 0 (Unknown area)")

    return fixed

# Apply the fix
# Requires licenses_df_cleaned from the cleaning cell above; run on a fresh kernel
# this cell fails with NameError (as in the saved output below).
licenses_df_cleaned = fix_community_area_nulls()

# Re-run validation to confirm fix
# Only the null count is re-checked here; data_ready is not recomputed.
print(f"\nRE-VALIDATION:")
null_count_after = licenses_df_cleaned['community_area'].isnull().sum()
print(f"Remaining nulls in community_area: {null_count_after}")

FIXING COMMUNITY_AREA NULLS


NameError: name 'licenses_df_cleaned' is not defined

# Save Cleaned Data to Google Sheets

In [None]:
# # Cell: Save Cleaned Data to Google Sheets
# def save_cleaned_data_to_sheets():
#     """Save cleaned datasets to Google Sheets with '_Cleaned' suffix."""
#     print("SAVING CLEANED DATA TO GOOGLE SHEETS")
#     print("=" * 50)

#     # Load Google Sheets connection
#     settings = load_settings()
#     sh = open_sheet(settings.sheet_id, settings.google_creds_path)

#     # Define cleaned datasets and their target worksheet names
#     cleaned_data_mapping = {
#         'Business_Licenses_Cleaned': licenses_df_cleaned,
#         'Building_Permits_Cleaned': permits_df_cleaned,
#         'CTA_Cleaned': cta_df_cleaned
#     }

#     for worksheet_name, df in cleaned_data_mapping.items():
#         try:
#             print(f"\nSaving {worksheet_name}...")
#             print(f"  Rows: {len(df):,}")
#             print(f"  Columns: {len(df.columns)}")

#             # Create or update worksheet with cleaned data
#             upsert_worksheet(sh, worksheet_name)
#             overwrite_with_dataframe(sh, worksheet_name, df)

#             print(f"  SUCCESS: Saved to '{worksheet_name}' tab")

#         except Exception as e:
#             print(f"  ERROR saving {worksheet_name}: {str(e)}")

#     print(f"\nCLEANED DATA SAVED TO GOOGLE SHEETS!")
#     print(f"   Original tabs: Business_Licenses_Full, Building_Permits_Full, CTA_Full")
#     print(f"   Cleaned tabs:  Business_Licenses_Cleaned, Building_Permits_Cleaned, CTA_Cleaned")

#     return True

# # Save to Google Sheets
# if data_ready:
#     save_success = save_cleaned_data_to_sheets()
# else:
#     print("\nDATA NOT READY - Fix validation issues before saving to Google Sheets")
#     print("Available options:")
#     print("1. Fix the community_area nulls and re-validate")
#     print("2. Save anyway for manual review in Google Sheets")

#     # Option to save anyway for review
#     save_anyway = input("Save to Google Sheets anyway for manual review? (y/n): ")
#     if save_anyway.lower() == 'y':
#         save_success = save_cleaned_data_to_sheets()
#         print("WARNING: Data saved but requires manual review and fixes")


DATA NOT READY - Fix validation issues before saving to Google Sheets
Available options:
1. Fix the community_area nulls and re-validate
2. Save anyway for manual review in Google Sheets
