# Data Cleaning 

Here we'll dig into cleaning the data, importing the cached data quality checks that were generated automatically at the tail end of our `01_data_quality_check.ipynb` notebook.

## Environment Setup

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import pickle
import json
from pathlib import Path
from datetime import datetime
import sys

# Add paths for our custom modules
sys.path.append('../../shared')
sys.path.append('../../step2_data_ingestion')

# Import our modules
from sheets_client import open_sheet
from config_manager import load_settings
from schema import SchemaManager

print("All imports successful")

All imports successful


## Fetching the Data

### Load Original Datasets

In [3]:
# Reload the raw datasets for cleaning (same loading logic as the quality check
# notebook): try the pickle cache first, fall back to Google Sheets on a miss.
sys.path.append('../../shared')
from notebook_utils import *

# Per dataset: the worksheet to read from and the name of its cached pickle
datasets_config = {
    name: {'worksheet': worksheet, 'pickle_name': pickle_name}
    for name, worksheet, pickle_name in (
        ('business_licenses', 'Business_Licenses_Full', 'licenses_df'),
        ('building_permits', 'Building_Permits_Full', 'permits_df'),
        ('cta_boardings', 'CTA_Full', 'cta_df'),
    )
}

# Pass 1: serve whatever we can from the cache
datasets = {}
load_from_sheets = False

print("Loading original datasets for cleaning...")
for dataset_name, config in datasets_config.items():
    try:
        df = load_analysis_results(config['pickle_name'])
        if df.empty:
            # An empty cached frame is handled exactly like a missing cache file
            raise FileNotFoundError(f"{config['pickle_name']} is empty")
    except FileNotFoundError:
        print(f"   CACHE MISS {dataset_name}: will load from sheets")
        load_from_sheets = True
    else:
        datasets[dataset_name] = df
        print(f"   LOADED {dataset_name}: {len(df)} rows from cache")

# Pass 2: fetch only the datasets the cache could not provide, then cache them
if load_from_sheets:
    print("\nLoading fresh data from Google Sheets...")
    settings = load_settings()
    sh = open_sheet(settings.sheet_id, settings.google_creds_path)

    for dataset_name, config in datasets_config.items():
        if dataset_name in datasets:
            continue  # already loaded from the cache above
        df = load_sheet_data(sh, config['worksheet'])
        datasets[dataset_name] = df
        save_analysis_results(df, config['pickle_name'])
        print(f"   LOADED {dataset_name}: {len(df)} rows from sheets and cached")

# Work on copies so the frames held in `datasets` stay unmodified
licenses_df, permits_df, cta_df = (
    datasets[key].copy()
    for key in ('business_licenses', 'building_permits', 'cta_boardings')
)

print("\nDATASETS READY FOR CLEANING:")
print(f"   Business Licenses: {len(licenses_df):,} rows")
print(f"   Building Permits: {len(permits_df):,} rows")
print(f"   CTA Boardings: {len(cta_df):,} rows")
print(f"   Total records: {len(licenses_df) + len(permits_df) + len(cta_df):,}")

print("\nOriginal datasets loaded successfully!")

Loading original datasets for cleaning...
✅ Loaded analysis results from ../data/processed/licenses_df.pkl
   LOADED business_licenses: 2040 rows from cache
✅ Loaded analysis results from ../data/processed/permits_df.pkl
   LOADED building_permits: 8647 rows from cache
✅ Loaded analysis results from ../data/processed/cta_df.pkl
   LOADED cta_boardings: 668 rows from cache

DATASETS READY FOR CLEANING:
   Business Licenses: 2,040 rows
   Building Permits: 8,647 rows
   CTA Boardings: 668 rows
   Total records: 11,355

Original datasets loaded successfully!


### Quality-Driven Cleaning Strategy

In [5]:
# Enhanced Quality Analysis Results Loader
def load_quality_analysis_results(batch_id=None, results_dir=None):
    """
    Load quality analysis results from the latest batch or a specific batch.

    Progress is printed while loading. Component pickles that are absent from
    the batch directory are reported and skipped rather than treated as errors.

    Args:
        batch_id (str, optional): Specific batch ID (sub-directory name) to
            load. If None, the ``latest`` directory is loaded.
        results_dir (str or Path, optional): Directory containing the batch
            sub-directories. Defaults to ``../../data/quality_analysis``.

    Returns:
        dict or None: None when the results directory, the requested batch,
        or the ``latest`` directory does not exist. Otherwise a dict with
        ``'batch_id'`` plus one entry per component file that was found
        (``'quality'``, ``'contamination'``, ``'anomalies'``,
        ``'business_validation'``, ``'completeness'``) and ``'summary'`` when
        ``analysis_summary.json`` is present. ``'batch_id'`` is None when the
        latest batch's ID could not be read from ``latest_batch_info.txt``.
    """
    print("LOADING QUALITY ANALYSIS RESULTS")
    print("=" * 50)

    if results_dir is None:
        results_dir = Path("../../data/quality_analysis")
    else:
        results_dir = Path(results_dir)

    if not results_dir.exists():
        print("❌ No quality analysis results found!")
        print("   Please run 01_data_quality_check.ipynb first")
        return None

    # Determine which batch to load
    if batch_id:
        batch_dir = results_dir / batch_id
        if not batch_dir.exists():
            print(f"❌ Batch '{batch_id}' not found!")
            return None
        print(f"   📂 Loading specific batch: {batch_id}")
    else:
        # Load from latest directory
        batch_dir = results_dir / "latest"
        if not batch_dir.exists():
            print("❌ No latest results found!")
            return None

        # Recover the batch ID from the info file. A missing file or a file
        # without a "Batch ID:" line leaves batch_id unset instead of raising.
        batch_info_file = batch_dir / "latest_batch_info.txt"
        if batch_info_file.exists():
            with open(batch_info_file, 'r') as f:
                for line in f:
                    if 'Batch ID:' in line:
                        batch_id = line.split('Batch ID:', 1)[1].strip() or None
                        break
        print(f"   📂 Loading latest batch: {batch_id}")

    # Load all analysis components
    analysis_results = {'batch_id': batch_id}

    # Result key -> pickle file expected inside the batch directory
    files_to_load = {
        'quality': 'quality_matrix.pkl',
        'contamination': 'contamination_analysis.pkl',
        'anomalies': 'anomaly_detection.pkl',
        'business_validation': 'business_validation.pkl',
        'completeness': 'completeness_analysis.pkl'
    }

    for key, filename in files_to_load.items():
        file_path = batch_dir / filename
        if file_path.exists():
            # SECURITY: pickle.load can execute arbitrary code. Only point this
            # loader at batch directories produced by a trusted source.
            with open(file_path, 'rb') as f:
                analysis_results[key] = pickle.load(f)
            print(f"   ✅ {key}: {filename}")
        else:
            print(f"   ⚠️  {key}: {filename} (missing)")

    # Load summary if available
    summary_file = batch_dir / "analysis_summary.json"
    if summary_file.exists():
        with open(summary_file, 'r') as f:
            analysis_results['summary'] = json.load(f)
        print("   ✅ Summary: analysis_summary.json")

    print("\n🎯 QUALITY ANALYSIS RESULTS READY!")
    if 'summary' in analysis_results:
        summary = analysis_results['summary']
        # Tolerate partially populated summaries: absent sections/fields are
        # reported as 0 instead of raising KeyError.
        quality_summary = summary.get('quality_summary', {})
        completeness_summary = summary.get('completeness_summary', {})
        print(f"   📊 Quality Score: {quality_summary.get('overall_score', 0):.3f}")
        print(f"   🔍 Contaminated Fields: {quality_summary.get('contaminated_fields', 0)}")
        print(f"   🔧 Recommendations: {completeness_summary.get('total_recommendations', 0)}")

    return analysis_results

# Function to list available batches
def list_available_batches(results_dir=None, limit=10):
    """List the quality analysis batches recorded in ``batch_index.json``.

    Args:
        results_dir (str or Path, optional): Directory containing
            ``batch_index.json``. Defaults to ``../../data/quality_analysis``.
        limit (int, optional): Maximum number of index entries to print.
            ``None`` prints every entry. Defaults to 10.

    Returns:
        list: All entries stored under the index's ``'batches'`` key (not
        only the printed ones). An empty list when the index file does not
        exist or has no ``'batches'`` key.
    """
    if results_dir is None:
        results_dir = Path("../../data/quality_analysis")
    else:
        results_dir = Path(results_dir)
    index_file = results_dir / "batch_index.json"

    if not index_file.exists():
        print("📂 No batch index found")
        return []

    with open(index_file, 'r') as f:
        batch_index = json.load(f)

    batches = batch_index.get('batches', [])

    print("AVAILABLE QUALITY ANALYSIS BATCHES")
    print("=" * 40)

    # Prints the FIRST `limit` entries in index order.
    # NOTE(review): presumably the index is stored newest-first — confirm.
    for position, batch in enumerate(batches[:limit], start=1):
        scores = batch.get('summary_scores', {})
        quality = scores.get('overall_quality')
        # Entries missing a field are shown with a placeholder instead of raising
        quality_text = f"{quality:.3f}" if isinstance(quality, (int, float)) else "n/a"

        print(f"{position:2d}. {batch.get('batch_id', 'unknown')}")
        print(f"     📅 {str(batch.get('datetime', 'unknown'))[:19]}")
        print(f"     📊 Quality: {quality_text}")
        print(f"     🔧 Recommendations: {scores.get('total_recommendations', 'n/a')}")
        print()

    return batches

# With no batch_id given, the loader reads from the "latest" batch directory
quality_results = load_quality_analysis_results(batch_id=None)

# Print the batch index and keep the returned entries
available_batches = list_available_batches()

LOADING QUALITY ANALYSIS RESULTS
   📂 Loading latest batch: data_quality_batch_20250901_153632
   ✅ quality: quality_matrix.pkl
   ✅ contamination: contamination_analysis.pkl
   ✅ anomalies: anomaly_detection.pkl
   ✅ business_validation: business_validation.pkl
   ✅ completeness: completeness_analysis.pkl
   ✅ Summary: analysis_summary.json

🎯 QUALITY ANALYSIS RESULTS READY!
   📊 Quality Score: 0.988
   🔍 Contaminated Fields: 7
   🔧 Recommendations: 2
AVAILABLE QUALITY ANALYSIS BATCHES
 1. data_quality_batch_20250901_153632
     📅 2025-09-01T15:36:32
     📊 Quality: 0.988
     🔧 Recommendations: 2



In [6]:
# Extract key insights from quality analysis for cleaning priorities
def analyze_cleaning_priorities(quality_results):
    """Extract actionable cleaning priorities from quality analysis.

    Prints the headline figures found under ``quality_results['summary']``
    and, for each dataset in ``quality_results['contamination']``, the number
    of contamination issues it reports.

    Args:
        quality_results (dict or None): Loaded quality analysis results. The
            keys read here are ``'summary'`` and ``'contamination'``.

    Returns:
        dict or None: None when ``quality_results`` is empty or None.
        Otherwise a cleaning plan with the keys ``'critical_fixes'``,
        ``'high_priority'``, ``'medium_priority'`` and
        ``'field_specific_actions'``. Only ``'high_priority'`` is filled by
        this function: one ``{'dataset', 'action', 'type'}`` entry per
        recommendation of every dataset that has at least one contamination
        issue. The other three are returned empty.
    """
    print("DATA CLEANING PRIORITY ANALYSIS")
    print("=" * 50)

    if not quality_results:
        print("ERROR: No quality results available")
        return None

    # `or {}` also covers keys that are present but hold None
    summary = quality_results.get('summary') or {}
    contamination = quality_results.get('contamination') or {}

    # Extract high-priority cleaning actions
    cleaning_plan = {
        'critical_fixes': [],
        'high_priority': [],
        'medium_priority': [],
        'field_specific_actions': {}
    }

    print("SUMMARY FROM QUALITY ANALYSIS:")
    if 'quality_summary' in summary:
        qs = summary['quality_summary']
        print(f"   Overall Quality Score: {qs.get('overall_score', 0):.3f}")
        print(f"   Contaminated Fields: {qs.get('contaminated_fields', 0)}")
        print(f"   Critical Issues: {qs.get('critical_issues', 0)}")

    print("\nCLEANING RECOMMENDATIONS:")
    if 'completeness_summary' in summary:
        cs = summary['completeness_summary']
        print(f"   Total Recommendations: {cs.get('total_recommendations', 0)}")
        print(f"   Critical Imputation Needed: {cs.get('critical_imputation_needed', 0)}")
        print(f"   Low-value Fields: {cs.get('low_value_fields', 0)}")

    # Contamination categories whose entries are counted as issues
    issue_categories = (
        'mixed_types',
        'special_chars',
        'encoding_issues',
        'format_inconsistencies',
    )

    print("\nCONTAMINATION ISSUES BY DATASET:")
    for dataset_name, contamination_info in contamination.items():
        if not isinstance(contamination_info, dict):
            continue

        # A category that is absent or None counts as zero issues
        total_issues = sum(
            len(contamination_info.get(category) or [])
            for category in issue_categories
        )
        if not total_issues:
            continue

        print(f"   {dataset_name}: {total_issues} contamination issues")

        # Recommendations are only collected for datasets that have issues
        for rec in contamination_info.get('recommendations') or []:
            cleaning_plan['high_priority'].append({
                'dataset': dataset_name,
                'action': rec,
                'type': 'contamination_fix'
            })

    return cleaning_plan

# Turn the loaded quality results into a cleaning plan
cleaning_priorities = analyze_cleaning_priorities(quality_results)

print("\nREADY TO BEGIN TARGETED DATA CLEANING!")
if not cleaning_priorities:
    # No plan was returned, so there is nothing to count
    print("   Will proceed with standard cleaning workflow")
else:
    print(f"   High Priority Actions: {len(cleaning_priorities['high_priority'])}")
    print(f"   Critical Fixes: {len(cleaning_priorities['critical_fixes'])}")

DATA CLEANING PRIORITY ANALYSIS
SUMMARY FROM QUALITY ANALYSIS:
   Overall Quality Score: 0.988
   Contaminated Fields: 7
   Critical Issues: 0

CLEANING RECOMMENDATIONS:
   Total Recommendations: 2
   Critical Imputation Needed: 0
   Low-value Fields: 2

CONTAMINATION ISSUES BY DATASET:
   business_licenses: 1 contamination issues
   building_permits: 3 contamination issues

READY TO BEGIN TARGETED DATA CLEANING!
   High Priority Actions: 1
   Critical Fixes: 0
