# Data Cleaning with Great Expectations Integration

This notebook demonstrates the new Great Expectations-powered data cleaning workflow that replaces manual cleaning with automated, validated transformations.

## Key Benefits:
- **Automated field type detection** based on naming patterns
- **Chicago-specific business rules** and validation
- **Comprehensive quality reporting** with 60+ validation checks
- **Reliable fallback** to manual cleaning if needed
- **Drop-in replacement** for existing workflow


## Environment Setup


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import pickle
import json
from pathlib import Path
from datetime import datetime
import sys

# Add paths for our custom modules
sys.path.append('../../shared')
sys.path.append('../../step2_data_ingestion')
sys.path.append('../')  # For step3_transform_model modules

# Import existing modules
from sheets_client import open_sheet
from config_manager import load_settings
from schema import SchemaManager
from notebook_utils import *

print("✅ Standard imports successful")

# Import GX modules
# The Great Expectations stack is treated as optional: an ImportError raised by
# any import in the try block (the project wrapper modules or the
# great_expectations package itself) is caught and recorded in GX_AVAILABLE
# instead of aborting the notebook.
try:
    from gx_data_cleaning import SmartDataCleaner, batch_clean_datasets
    from desired_schema import DesiredSchemaManager, FieldTypeDetector
    from expectation_suites import ChicagoSMBExpectationSuites
    from pipeline_integration import enhanced_clean_and_save, compare_cleaning_methods
    GX_AVAILABLE = True
    print("✅ Great Expectations modules imported successfully")

    # Imported after the flag is set: if this import fails, the success message
    # above has already been printed, but the except branch below still resets
    # GX_AVAILABLE to False.
    import great_expectations as gx
    print(f"✅ Great Expectations {gx.__version__} available")

except ImportError as e:
    print(f"❌ GX module import error: {e}")
    print("   Falling back to manual cleaning workflow")
    GX_AVAILABLE = False

# Final status line; GX_AVAILABLE is the flag the cleaning cell branches on.
print(f"\n🎯 Setup Status:")
print(f"   GX Integration: {'✅ Ready' if GX_AVAILABLE else '❌ Manual Fallback'}")


## Load Raw Data from Google Sheets


In [None]:
# Raw datasets to clean. Each entry maps a dataset key to the worksheet it is
# read from and to the name under which its cached copy is stored.
datasets_config = dict(
    business_licenses=dict(worksheet='Business_Licenses_Full', pickle_name='licenses_df'),
    building_permits=dict(worksheet='Building_Permits_Full', pickle_name='permits_df'),
    cta_boardings=dict(worksheet='CTA_Full', pickle_name='cta_df'),
)

# Loaded frames keyed by dataset name, plus a flag that is raised as soon as
# at least one dataset could not be served from the cache.
datasets = {}
load_from_sheets = False

print("📊 LOADING RAW DATA FOR CLEANING...")
print("=" * 50)

# Pass 1: try the cache for every dataset. An empty cached frame is treated
# exactly like a missing cache file by raising the same exception type.
for dataset_name, config in datasets_config.items():
    pickle_name = config['pickle_name']
    try:
        df = load_analysis_results(pickle_name)
        if df.empty:
            raise FileNotFoundError(f"{pickle_name} is empty")
    except FileNotFoundError:
        print(f"   📥 CACHE MISS {dataset_name}: will load from sheets")
        load_from_sheets = True
    else:
        datasets[dataset_name] = df
        print(f"   📦 LOADED {dataset_name}: {len(df):,} rows from cache")

# Pass 2: open the spreadsheet once and fetch only the datasets that pass 1
# did not provide, caching each one right after it is loaded.
if load_from_sheets:
    print("\n🔄 Loading fresh data from Google Sheets...")
    settings = load_settings()
    sh = open_sheet(settings.sheet_id, settings.google_creds_path)

    for dataset_name, config in datasets_config.items():
        if dataset_name in datasets:
            continue
        df = load_sheet_data(sh, config['worksheet'])
        datasets[dataset_name] = df
        save_analysis_results(df, config['pickle_name'])
        print(f"   📊 LOADED {dataset_name}: {len(df):,} rows from sheets and cached")

# Summary: per-dataset shape followed by the overall row count.
print("\n✅ RAW DATA LOADED SUCCESSFULLY")
total_records = sum(map(len, datasets.values()))
for name, df in datasets.items():
    row_count, column_count = len(df), len(df.columns)
    print(f"   {name}: {row_count:,} rows, {column_count} columns")
print(f"   📈 Total records: {total_records:,}")


## 🚀 Great Expectations Data Cleaning

The cell below replaces the entire manual cleaning workflow with automated, validated transformations.


In [None]:
# Run the automated cleaning only when the GX imports succeeded; otherwise
# just record that the manual path would be needed. Both branches set
# data_ready and cleaning_method.
if not GX_AVAILABLE:
    print("⚠️  GX not available - would fall back to manual cleaning")
    print("   (Manual cleaning cells would go here)")
    data_ready = False
    cleaning_method = "Manual Fallback"

else:
    print("🚀 ENHANCED DATA CLEANING WITH GREAT EXPECTATIONS")
    print("=" * 60)

    # Single call that cleans every loaded dataset and returns the cleaned
    # frames together with a report dictionary describing the run.
    cleaned_datasets, cleaning_report = enhanced_clean_and_save(datasets, use_gx=True)

    # Expose the cleaned frames under the variable names used by the manual workflow.
    licenses_df_cleaned = cleaned_datasets['business_licenses']
    permits_df_cleaned = cleaned_datasets['building_permits']
    cta_df_cleaned = cleaned_datasets['cta_boardings']

    # Headline numbers from the report.
    print("\n✅ GX CLEANING COMPLETE!")
    print(f"   Strategy used: {cleaning_report['strategy_used']}")
    print(f"   Datasets processed: {len(cleaning_report['datasets_processed'])}")
    print(f"   Errors: {len(cleaning_report['errors'])}")
    print(f"   Google Sheets saved: {cleaning_report['save_success']}")

    # One line per processed dataset: status icon and shape before/after.
    print("\n📊 CLEANING RESULTS BY DATASET:")
    for entry in cleaning_report['datasets_processed']:
        label = entry['name']
        status_icon = "✅" if entry['success'] else "❌"
        shape_before = entry['original_shape']
        shape_after = entry['cleaned_shape']
        print(f"   {status_icon} {label}: {shape_before} → {shape_after}")

    # Validation section is optional in the report; entries without a
    # success rate are skipped.
    validation_results = cleaning_report.get('validation_results', {})
    if validation_results:
        print("\n🔍 VALIDATION RESULTS:")
        for dataset_key, outcome in validation_results.items():
            if 'success_rate' not in outcome:
                continue
            pass_rate = outcome['success_rate']
            expectation_count = outcome.get('total_expectations', 0)
            print(f"   {dataset_key}: {pass_rate:.1%} success rate ({expectation_count} expectations)")

    # Quality section is optional as well; only entries carrying data-type
    # counts are reported. All four counts are read before anything is printed.
    quality_improvements = cleaning_report.get('quality_improvements', {})
    if quality_improvements:
        print("\n🎯 QUALITY IMPROVEMENTS:")
        for dataset_key, gains in quality_improvements.items():
            if 'data_types' not in gains:
                continue
            type_counts = gains['data_types']
            numeric_before, numeric_after = (
                type_counts['original_numeric'],
                type_counts['cleaned_numeric'],
            )
            datetime_before, datetime_after = (
                type_counts['original_datetime'],
                type_counts['cleaned_datetime'],
            )

            print(f"   {dataset_key}:")
            print(f"      Numeric fields: {numeric_before} → {numeric_after}")
            print(f"      DateTime fields: {datetime_before} → {datetime_after}")

    print("\n🎉 AUTOMATED CLEANING COMPLETE!")
    print("   📋 Check Google Sheets tabs ending in '_GX_Cleaned'")
    print("   📊 All datasets are now analysis-ready")

    # Flags for the rest of the notebook.
    data_ready = True
    cleaning_method = "Great Expectations"


## 🎯 Next Steps

If the Great Expectations cleaning cell ran (`data_ready` is `True`), your cleaned data is now ready for analysis! The datasets are available as:
- `licenses_df_cleaned` - Business licenses with validated fields
- `permits_df_cleaned` - Building permits with proper data types  
- `cta_df_cleaned` - CTA ridership data with temporal validation

**Google Sheets tabs created:**
- `Business_Licenses_GX_Cleaned`
- `Building_Permits_GX_Cleaned` 
- `CTA_GX_Cleaned`

**What Great Expectations provided:**
- ✅ Automated field type detection and conversion
- ✅ Chicago-specific business rule validation
- ✅ Comprehensive quality checks (60+ expectations)
- ✅ Detailed validation reporting
- ✅ Reliable fallback to manual cleaning if needed
