In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import os

# Set up paths
data_dir = Path("../data")  # Adjust if your CSV files are in a different directory
mapping_file = data_dir / "area_region_mapping.csv"

# Load the area mapping.
# area_code is read as text on purpose: the `.str` filter below needs string
# values, and pandas would otherwise infer an integer column (which has no
# `.str` accessor) whenever every code in the file happens to be numeric.
area_mapping = pd.read_csv(mapping_file, dtype={'area_code': str})
print("Area mapping loaded:")
print(area_mapping.head())
print(f"\nTotal areas in mapping: {len(area_mapping)}")

# Get list of all area numbers from mapping (excluding regional aggregates).
# Blank codes are replaced by '' first so they count as non-numeric instead of
# putting NaN into the boolean mask.
is_numeric_code = area_mapping['area_code'].fillna('').str.isdigit()
area_codes = area_mapping.loc[is_numeric_code, 'area_code'].astype(int).tolist()
print(f"\nArea codes to process: {sorted(area_codes)}")

In [None]:
# Function to combine all area CSV files
def combine_area_dataframes(data_directory, area_codes, area_mapping):
    """
    Combine the per-area CSV files into a single DataFrame.

    For each code in ``area_codes`` the file ``"{area_code}_combine_data.csv"``
    is looked up in ``data_directory``. Every file that exists is read and
    tagged with three extra columns -- ``area_code``, ``region_type`` and
    ``location_name`` -- the last two taken from the first row of
    ``area_mapping`` whose ``area_code`` matches. Progress is printed as the
    files are processed.

    Parameters:
    - data_directory: Directory containing the CSV files (str or Path)
    - area_codes: Iterable of area codes to process
    - area_mapping: DataFrame with 'area_code', 'region_type' and
      'location_name' columns

    Returns:
    - combined_df: Single DataFrame with the rows of every file that was
      loaded (index reset), or an empty DataFrame when nothing was loaded

    Notes:
    - Processing is best-effort: a missing file, a file that fails to load, or
      an area code without a mapping row is reported on stdout and skipped.
    """
    # Accept plain strings as well as Path objects
    data_directory = Path(data_directory)

    combined_data = []

    for area_code in area_codes:
        # Files follow the pattern "{area_code}_combine_data.csv"
        csv_pattern = f"{area_code}_combine_data.csv"
        csv_file = data_directory / csv_pattern

        if not csv_file.exists():
            print(f"File not found: {csv_pattern}")
            continue

        print(f"Processing {csv_file.name}...")

        try:
            df = pd.read_csv(csv_file)

            # Compare codes as text so the lookup works whether the mapping
            # column holds strings or integers; comparing an integer column
            # against str(area_code) would never match.
            mapping_codes = area_mapping['area_code'].astype(str)
            matches = area_mapping[mapping_codes == str(area_code)]
            if matches.empty:
                raise LookupError(f"area code {area_code} not found in area mapping")
            area_info = matches.iloc[0]

            # Add area metadata
            df['area_code'] = area_code
            df['region_type'] = area_info['region_type']
            df['location_name'] = area_info['location_name']

            combined_data.append(df)
            print(f"  - Added {len(df)} rows for Area {area_code} ({area_info['location_name']})")

        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole run
            print(f"  - Error processing {csv_file.name}: {e}")

    if not combined_data:
        print("No data files found to combine!")
        return pd.DataFrame()

    # Stack the per-area frames into one
    combined_df = pd.concat(combined_data, ignore_index=True)
    print("\nCombination complete!")
    print(f"Total rows: {len(combined_df):,}")
    print(f"Total areas processed: {len(combined_data)}")
    print(f"Columns: {list(combined_df.columns)}")

    return combined_df

# Build the combined dataset from the per-area CSV files in data_dir
combined_df = combine_area_dataframes(
    data_directory=data_dir,
    area_codes=area_codes,
    area_mapping=area_mapping,
)

In [None]:
# Data quality checks and summary
if not combined_df.empty:
    print("=== DATA QUALITY SUMMARY ===")
    print(f"Shape: {combined_df.shape}")

    # combined_df['datetime'] may hold text (the per-area CSVs are loaded with
    # a plain pd.read_csv), and min()/max() on strings compare
    # lexicographically. Parse first so the reported range is chronological;
    # values that cannot be parsed become NaT and are counted below.
    if 'datetime' in combined_df.columns:
        timestamps = pd.to_datetime(combined_df['datetime'], errors='coerce')
        print(f"Date range: {timestamps.min()} to {timestamps.max()}")
        unparsed_count = int(timestamps.isna().sum())
        if unparsed_count:
            print(f"  ({unparsed_count:,} rows have a missing or unparseable datetime)")
    else:
        print("Date range: unavailable ('datetime' column not found)")

    # Check for missing values
    print(f"\nMissing values:")
    missing_counts = combined_df.isnull().sum()
    if (missing_counts > 0).any():
        print(missing_counts[missing_counts > 0])
    else:
        print("  None")

    # Regional summary: number of rows per (region_type, location_name)
    print(f"\nRegional breakdown:")
    regional_summary = combined_df.groupby(['region_type', 'location_name']).size().reset_index(name='row_count')
    print(regional_summary.to_string(index=False))

    # Urban vs Rural summary.
    # Only aggregate columns that are actually present so a dataset without
    # load or temperature readings still produces a summary instead of a KeyError.
    print(f"\nUrban vs Rural summary:")
    summary_spec = {
        'area_code': 'nunique',
        'load_mw': ['count', 'mean', 'std'],
        'temperature_c': 'mean'
    }
    absent_columns = [column for column in summary_spec if column not in combined_df.columns]
    if absent_columns:
        print(f"  (columns not present, skipped: {absent_columns})")
    urban_rural_summary = combined_df.groupby('region_type').agg(
        {column: how for column, how in summary_spec.items() if column in combined_df.columns}
    ).round(2)
    print(urban_rural_summary)

    # Sample of combined data
    print(f"\nSample of combined dataset:")
    print(combined_df.head(10).to_string(index=False))

else:
    print("No data to analyze - combined DataFrame is empty.")

In [None]:
# Persist the combined dataset, plus separate urban and rural extracts
if combined_df.empty:
    print("No data to save - please check that CSV files exist in the correct format.")

else:
    # Full dataset as CSV
    output_file = data_dir / "combined_all_areas.csv"
    combined_df.to_csv(output_file, index=False)
    print(f"Combined dataset saved to: {output_file}")

    # Optional Parquet copy; skipped with a hint when no Parquet engine
    # can be imported
    try:
        parquet_file = data_dir / "combined_all_areas.parquet"
        combined_df.to_parquet(parquet_file, index=False)
        print(f"Also saved as Parquet: {parquet_file}")
    except ImportError:
        print("Parquet not available - install pyarrow or fastparquet for better performance")

    # Split rows by region type; .copy() gives each subset its own data
    is_urban = combined_df['region_type'] == 'urban'
    is_rural = combined_df['region_type'] == 'rural'
    urban_df = combined_df.loc[is_urban].copy()
    rural_df = combined_df.loc[is_rural].copy()

    # Each extract is written only when it contains at least one row
    if len(urban_df):
        urban_file = data_dir / "urban_areas_combined.csv"
        urban_df.to_csv(urban_file, index=False)
        print(f"Urban dataset saved: {urban_file} ({len(urban_df):,} rows)")

    if len(rural_df):
        rural_file = data_dir / "rural_areas_combined.csv"
        rural_df.to_csv(rural_file, index=False)
        print(f"Rural dataset saved: {rural_file} ({len(rural_df):,} rows)")

    print(f"\nData combination complete!")
    print(f"Ready for machine learning model development")