In [1]:
import cdsapi
import xarray as xr
import pandas as pd

# ERA5 Land downloading data with APIs

In [18]:
import cdsapi
import os

def download_era5_precip(lat, lon, years, output_dir="data"):
    """
    Download ERA5 precipitation data for specific coordinates and years
    
    Args:
        lat (float): Latitude
        lon (float): Longitude  
        years (list): List of years (e.g., [2020, 2021, 2022])
        output_dir (str): Directory to save files
    
    Returns:
        str: Path to downloaded ZIP file
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Setup CDS API client
    c = cdsapi.Client()
    
    # Format coordinates for filename (replace dots with underscores)
    lat_str = str(lat).replace('.', '_').replace('-', 'neg')
    lon_str = str(lon).replace('.', '_').replace('-', 'neg')
    
    # Create filename with years
    years_str = "_".join(map(str, sorted(years)))
    filename = f"era5_precip_{lat_str}_{lon_str}_{years_str}.zip"
    filepath = os.path.join(output_dir, filename)
    
    # Define small area around the point (±0.1 degrees)
    area = [lat + 0.1, lon - 0.1, lat - 0.1, lon + 0.1]  # North, West, South, East
    
    print(f"🌍 Downloading ERA5 precipitation data...")
    print(f"📍 Location: {lat}°N, {lon}°E")
    print(f"📅 Years: {years}")
    print(f"📦 Area: {area}")
    print(f"💾 Output: {filename}")
    
    try:
        c.retrieve(
            'reanalysis-era5-single-levels',
            {
                'product_type': 'reanalysis',
                'variable': 'total_precipitation',
                'year': [str(year) for year in years],
                'month': [f'{i:02d}' for i in range(1, 13)],
                'day': [f'{i:02d}' for i in range(1, 32)],
                'time': [f'{i:02d}:00' for i in range(0, 24)],
                'area': area,
                'format': 'netcdf',
            },
            filepath
        )
        print(f"✅ Download complete: {filename}")
        file_size = os.path.getsize(filepath) / (1024 * 1024)  # Convert to MB
        print(f"📏 File size: {file_size:.2f} MB")
        return filepath
        
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return None



In [19]:
lat, lon = 51.5074, -0.1278  # London coordinates
years = [2023, 2024]  # Start with just one year for testing
    
print("Testing download function...")
result = download_era5_precip(lat, lon, years)
    
if result:
    print(f"Success! File saved at: {result}")
else:
    print("Download failed!")

Testing download function...


2025-07-02 14:06:27,360 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.


🌍 Downloading ERA5 precipitation data...
📍 Location: 51.5074°N, -0.1278°E
📅 Years: [2023, 2024]
📦 Area: [51.6074, -0.2278, 51.407399999999996, -0.02779999999999999]
💾 Output: era5_precip_51_5074_neg0_1278_2023_2024.zip


2025-07-02 14:06:27,787 INFO Request ID is bfeebeef-2b4f-48dc-8583-0e34092b8d95
2025-07-02 14:06:27,871 INFO status has been updated to accepted
2025-07-02 14:06:41,540 INFO status has been updated to successful


7675b9b99b97dc377c4010f2a0a2eb92.nc:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

✅ Download complete: era5_precip_51_5074_neg0_1278_2023_2024.zip
📏 File size: 1.15 MB
Success! File saved at: data\era5_precip_51_5074_neg0_1278_2023_2024.zip


In [None]:
def extract_and_convert_to_csv(zip_filepath, output_dir=None):
    """
    Extract precipitation data from ZIP file and convert to CSV
    
    Args:
        zip_filepath (str): Path to downloaded ZIP file
        output_dir (str): Directory for CSV output (default: same as ZIP file)
    
    Returns:
        list: Paths to created CSV files
    """
    import zipfile
    import pandas as pd
    import xarray as xr
    
    if output_dir is None:
        output_dir = os.path.dirname(zip_filepath)
    
    csv_files = []
    
    print(f"📦 Processing ZIP file: {os.path.basename(zip_filepath)}")
    
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            # List files in archive
            file_list = zip_ref.namelist()
            print(f"📄 Files in archive: {len(file_list)}")
            for file_name in file_list:
                print(f"  - {file_name}")
            
            # Process each NetCDF file
            for file_name in file_list:
                if file_name.endswith('.nc'):
                    print(f"🔄 Processing: {file_name}")
                    
                    # Extract to temporary location
                    temp_path = os.path.join(output_dir, file_name)
                    zip_ref.extract(file_name, output_dir)
                    
                    # Open with xarray
                    ds = xr.open_dataset(temp_path)
                    
                    # Get coordinates info for filename
                    lat_vals = ds.latitude.values
                    lon_vals = ds.longitude.values
                    lat_center = float(lat_vals.mean())
                    lon_center = float(lon_vals.mean())
                    
                    # Format coordinates for filename (replace dots with underscores)
                    lat_str = f"{lat_center:.2f}".replace('.', '_').replace('-', 'neg')
                    lon_str = f"{lon_center:.2f}".replace('.', '_').replace('-', 'neg')
                    
                    # Get years from time dimension
                    years = sorted(list(set(pd.to_datetime(ds.time.values).year)))
                    years_str = "_".join(map(str, years))
                    
                    # Create CSV filename: lat_lon_Y1_Y2
                    csv_filename = f"{lat_str}_{lon_str}_{years_str}.csv"
                    csv_filepath = os.path.join(output_dir, csv_filename)
                    
                    print(f"📊 Dataset info:")
                    print(f"  Variables: {list(ds.data_vars.keys())}")
                    print(f"  Dimensions: {dict(ds.dims)}")
                    print(f"  Time range: {pd.to_datetime(ds.time.values).min()} to {pd.to_datetime(ds.time.values).max()}")
                    
                    # Convert to DataFrame
                    df_list = []
                    
                    # Extract precipitation data
                    precip_data = ds['tp']  # total precipitation variable
                    
                    print("🔄 Converting to DataFrame...")
                    for time_idx, time_val in enumerate(ds.time.values):
                        timestamp = pd.to_datetime(time_val)
                        
                        for lat_idx, lat_val in enumerate(ds.latitude.values):
                            for lon_idx, lon_val in enumerate(ds.longitude.values):
                                precip_value = float(precip_data.isel(
                                    time=time_idx, 
                                    latitude=lat_idx, 
                                    longitude=lon_idx
                                ).values)
                                
                                df_list.append({
                                    'datetime': timestamp,
                                    'latitude': lat_val,
                                    'longitude': lon_val,
                                    'precipitation_mm': precip_value * 1000  # Convert m to mm
                                })
                    
                    # Create DataFrame and save to CSV
                    df = pd.DataFrame(df_list)
                    df = df.sort_values(['datetime', 'latitude', 'longitude'])
                    df.to_csv(csv_filepath, index=False)
                    
                    print(f"✅ Created CSV: {csv_filename}")
                    print(f"📊 Records: {len(df)}")
                    print(f"📅 Date range: {df['datetime'].min()} to {df['datetime'].max()}")
                    print(f"🌧️  Precipitation range: {df['precipitation_mm'].min():.3f} to {df['precipitation_mm'].max():.3f} mm")
                    
                    csv_files.append(csv_filepath)
                    
                    # Clean up temporary NetCDF file
                    os.remove(temp_path)
                    ds.close()
    
    except Exception as e:
        print(f"❌ Error processing ZIP file: {e}")
        import traceback
        traceback.print_exc()
        return []
    
    return csv_files

# Test both functions
if __name__ == "__main__":
    # Example usage
    lat, lon = 51.5074, -0.1278  # London coordinates
    years = [2024]  # Start with just one year for testing
    
    print("Testing download function...")
    zip_file = download_era5_precip(lat, lon, years)
    
    if zip_file:
        print(f"Success! File saved at: {zip_file}")
        
        print("\nTesting extraction function...")
        csv_files = extract_and_convert_to_csv(zip_file)
        
        if csv_files:
            print(f"🎉 Created {len(csv_files)} CSV file(s)")
            
            # Show sample data
            import pandas as pd
            df = pd.read_csv(csv_files[0])
            print("\n📊 Sample data:")
            print(df.head())
            print(f"\nDataset shape: {df.shape}")
        else:
            print("CSV conversion failed!")
    else:
        print("Download failed!")

In [24]:
def extract_and_convert_to_csv(file_path, output_dir=None):
    """
    Extract precipitation data from downloaded file (ZIP or GRIB) and convert to CSV
    
    Args:
        file_path (str): Path to downloaded file
        output_dir (str): Directory for CSV output (default: same as input file)
    
    Returns:
        list: Paths to created CSV files
    """
    import zipfile
    import pandas as pd
    
    if output_dir is None:
        output_dir = os.path.dirname(file_path)
    
    csv_files = []
    
    print(f"📦 Processing file: {os.path.basename(file_path)}")
    
    # Check actual file format by reading first bytes
    try:
        with open(file_path, 'rb') as f:
            first_bytes = f.read(16)
            print(f"🔍 File signature: {first_bytes}")
        
        # Check if it's a ZIP file
        if first_bytes.startswith(b'PK\x03\x04'):
            print("✅ Detected format: ZIP archive")
            return process_zip_file(file_path, output_dir)
        
        # Check if it's a GRIB file
        elif first_bytes.startswith(b'GRIB'):
            print("✅ Detected format: GRIB")
            return process_grib_file(file_path, output_dir)
        
        # Check if it's NetCDF (even with .zip extension)
        elif b'CDF' in first_bytes or file_path.endswith('.nc'):
            print("✅ Detected format: NetCDF")
            return process_netcdf_file(file_path, output_dir)
        
        else:
            print(f"❌ Unknown file format. First bytes: {first_bytes}")
            # Try to process as NetCDF anyway (sometimes NetCDF files have different signatures)
            print("🔄 Attempting to process as NetCDF...")
            return process_netcdf_file(file_path, output_dir)
            
    except Exception as e:
        print(f"❌ Error analyzing file format: {e}")
        return []

def process_zip_file(zip_filepath, output_dir):
    """Process a ZIP file containing NetCDF/GRIB files"""
    import zipfile
    
    csv_files = []
    
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            file_list = zip_ref.namelist()
            print(f"📄 Files in archive: {len(file_list)}")
            for file_name in file_list:
                print(f"  - {file_name}")
            
            # Process each data file
            for file_name in file_list:
                if file_name.endswith(('.nc', '.grib', '.grib2', '.grb')):
                    print(f"🔄 Extracting and processing: {file_name}")
                    
                    # Extract to temporary location
                    temp_path = os.path.join(output_dir, file_name)
                    zip_ref.extract(file_name, output_dir)
                    
                    # Process the extracted file
                    if file_name.endswith('.nc'):
                        csv_result = process_netcdf_file(temp_path, output_dir)
                    else:
                        csv_result = process_grib_file(temp_path, output_dir)
                    
                    csv_files.extend(csv_result)
                    
                    # Clean up temporary file
                    os.remove(temp_path)
    
    except Exception as e:
        print(f"❌ Error processing ZIP file: {e}")
        import traceback
        traceback.print_exc()
    
    return csv_files

def process_grib_file(grib_filepath, output_dir):
    """Process a GRIB file"""
    csv_files = []
    
    try:
        # Try with pygrib first
        try:
            import pygrib
            print("🔄 Processing GRIB with pygrib...")
            
            grbs = pygrib.open(grib_filepath)
            print(f"📊 Number of GRIB messages: {grbs.messages}")
            
            # Collect all precipitation data
            df_list = []
            
            for grb in grbs:
                if 'precipitation' in grb.name.lower() or grb.shortName == 'tp':
                    print(f"📋 Processing: {grb.name}, Date: {grb.validDate}")
                    
                    # Get data and coordinates
                    data, lats, lons = grb.data()
                    
                    # Convert to DataFrame format
                    for i in range(data.shape[0]):
                        for j in range(data.shape[1]):
                            if not np.isnan(data[i, j]):
                                df_list.append({
                                    'datetime': grb.validDate,
                                    'latitude': lats[i, j],
                                    'longitude': lons[i, j],
                                    'precipitation_mm': data[i, j] * 1000  # Convert m to mm
                                })
            
            grbs.close()
            
        except ImportError:
            print("⚠️ pygrib not available, trying with xarray...")
            import xarray as xr
            ds = xr.open_dataset(grib_filepath, engine='cfgrib')
            return process_xarray_dataset(ds, output_dir)
        
        # Create DataFrame and CSV
        if df_list:
            import pandas as pd
            df = pd.DataFrame(df_list)
            df = df.sort_values(['datetime', 'latitude', 'longitude'])
            
            # Create filename
            lat_center = df['latitude'].mean()
            lon_center = df['longitude'].mean()
            years = sorted(list(set(df['datetime'].dt.year)))
            
            csv_filename = create_csv_filename(lat_center, lon_center, years)
            csv_filepath = os.path.join(output_dir, csv_filename)
            
            df.to_csv(csv_filepath, index=False)
            print(f"✅ Created CSV: {csv_filename}")
            print(f"📊 Records: {len(df)}")
            
            csv_files.append(csv_filepath)
    
    except Exception as e:
        print(f"❌ Error processing GRIB file: {e}")
        import traceback
        traceback.print_exc()
    
    return csv_files

def process_netcdf_file(nc_filepath, output_dir):
    """Process a NetCDF file"""
    try:
        import xarray as xr
        print("🔄 Processing NetCDF with xarray...")
        
        ds = xr.open_dataset(nc_filepath)
        return process_xarray_dataset(ds, output_dir)
    
    except Exception as e:
        print(f"❌ Error processing NetCDF file: {e}")
        import traceback
        traceback.print_exc()
        return []

def process_xarray_dataset(ds, output_dir):
    """Process an xarray dataset (NetCDF or GRIB)"""
    import pandas as pd
    import numpy as np
    
    csv_files = []
    
    try:
        print(f"📊 Dataset info:")
        print(f"  Variables: {list(ds.data_vars.keys())}")
        print(f"  Dimensions: {dict(ds.dims)}")
        
        # Find precipitation variable
        precip_var = None
        for var in ds.data_vars:
            if 'tp' in var or 'precip' in var.lower() or 'rain' in var.lower():
                precip_var = var
                break
        
        if not precip_var:
            print("❌ No precipitation variable found")
            return []
        
        print(f"🌧️ Using precipitation variable: {precip_var}")
        
        # Get coordinates and time info
        lat_vals = ds.latitude.values if 'latitude' in ds.dims else ds.lat.values
        lon_vals = ds.longitude.values if 'longitude' in ds.dims else ds.lon.values
        lat_center = float(lat_vals.mean())
        lon_center = float(lon_vals.mean())
        
        # Get years from time dimension
        time_vals = pd.to_datetime(ds.time.values)
        years = sorted(list(set(time_vals.year)))
        
        print(f"📅 Time range: {time_vals.min()} to {time_vals.max()}")
        
        # Create CSV filename
        csv_filename = create_csv_filename(lat_center, lon_center, years)
        csv_filepath = os.path.join(output_dir, csv_filename)
        
        # Convert to DataFrame
        df_list = []
        precip_data = ds[precip_var]
        
        print("🔄 Converting to DataFrame...")
        for time_idx, time_val in enumerate(ds.time.values):
            timestamp = pd.to_datetime(time_val)
            
            for lat_idx, lat_val in enumerate(lat_vals):
                for lon_idx, lon_val in enumerate(lon_vals):
                    precip_value = float(precip_data.isel(
                        time=time_idx, 
                        latitude=lat_idx if 'latitude' in ds.dims else lat_idx, 
                        longitude=lon_idx if 'longitude' in ds.dims else lon_idx
                    ).values)
                    
                    if not np.isnan(precip_value):
                        df_list.append({
                            'datetime': timestamp,
                            'latitude': lat_val,
                            'longitude': lon_val,
                            'precipitation_mm': precip_value * 1000  # Convert m to mm
                        })
        
        # Create DataFrame and save to CSV
        df = pd.DataFrame(df_list)
        df = df.sort_values(['datetime', 'latitude', 'longitude'])
        df.to_csv(csv_filepath, index=False)
        
        print(f"✅ Created CSV: {csv_filename}")
        print(f"📊 Records: {len(df)}")
        print(f"📅 Date range: {df['datetime'].min()} to {df['datetime'].max()}")
        print(f"🌧️ Precipitation range: {df['precipitation_mm'].min():.3f} to {df['precipitation_mm'].max():.3f} mm")
        
        csv_files.append(csv_filepath)
        ds.close()
    
    except Exception as e:
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
    
    return csv_files

def create_csv_filename(lat_center, lon_center, years):
    """Create standardized CSV filename"""
    # Format coordinates for filename (replace dots with underscores)
    lat_str = f"{lat_center:.2f}".replace('.', '_').replace('-', 'neg')
    lon_str = f"{lon_center:.2f}".replace('.', '_').replace('-', 'neg')
    years_str = "_".join(map(str, years))
    
    return f"{lat_str}_{lon_str}_{years_str}.csv"

In [26]:
csv_files = extract_and_convert_to_csv(result)

📦 Processing file: era5_precip_51_5074_neg0_1278_2023_2024.zip
🔍 File signature: b'\x89HDF\r\n\x1a\n\x02\x08\x08\x00\x00\x00\x00\x00'
❌ Unknown file format. First bytes: b'\x89HDF\r\n\x1a\n\x02\x08\x08\x00\x00\x00\x00\x00'
🔄 Attempting to process as NetCDF...
🔄 Processing NetCDF with xarray...
📊 Dataset info:
  Variables: ['tp']
  Dimensions: {'valid_time': 17544, 'latitude': 1, 'longitude': 1}
🌧️ Using precipitation variable: tp
❌ Error processing dataset: 'Dataset' object has no attribute 'time'


  print(f"  Dimensions: {dict(ds.dims)}")
Traceback (most recent call last):
  File "C:\Users\pacor\AppData\Local\Temp\ipykernel_3512\2297161511.py", line 209, in process_xarray_dataset
    time_vals = pd.to_datetime(ds.time.values)
                               ^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\xarray\core\common.py", line 306, in __getattr__
    raise AttributeError(
AttributeError: 'Dataset' object has no attribute 'time'


In [None]:


def download_and_convert(lat, lon, years, output_dir="data"):
    """
    Complete workflow: download ERA5 data and convert to CSV
    
    Args:
        lat (float): Latitude
        lon (float): Longitude  
        years (list): List of years
        output_dir (str): Output directory
    
    Returns:
        list: Paths to created CSV files
    """
    print("🚀 Starting ERA5 precipitation download and conversion...")
    
    # Step 1: Download data
    zip_file = download_era5_precip(lat, lon, years, output_dir)
    if not zip_file:
        return []
    
    # Step 2: Convert to CSV
    csv_files = extract_and_convert_to_csv(zip_file, output_dir)
    
    if csv_files:
        print(f"🎉 Complete! Created {len(csv_files)} CSV file(s)")
        for csv_file in csv_files:
            print(f"📄 {os.path.basename(csv_file)}")
    
    return csv_files

# Example usage
if __name__ == "__main__":
    # Example: Download data for London coordinates for 2023-2024
    lat, lon = 51.5074, -0.1278  # London
    years = [2023, 2024]
    
    csv_files = download_and_convert(lat, lon, years)
    
    # Display sample data
    if csv_files:
        print("\n📊 Sample data from first CSV:")
        df = pd.read_csv(csv_files[0])
        print(df.head())
        print(f"\nTotal records: {len(df)}")

In [15]:
def extract_and_convert_to_csv(zip_filepath, output_dir=None):
    """
    Extract precipitation data from ZIP file and convert to CSV
    
    Args:
        zip_filepath (str): Path to downloaded ZIP file
        output_dir (str): Directory for CSV output (default: same as ZIP file)
    
    Returns:
        list: Paths to created CSV files
    """
    if output_dir is None:
        output_dir = os.path.dirname(zip_filepath)
    
    csv_files = []
    
    print(f"📦 Processing ZIP file: {os.path.basename(zip_filepath)}")
    
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            # List files in archive
            file_list = zip_ref.namelist()
            print(f"📄 Files in archive: {len(file_list)}")
            
            # Process each NetCDF file
            for file_name in file_list:
                if file_name.endswith('.nc'):
                    print(f"🔄 Processing: {file_name}")
                    
                    # Extract to temporary location
                    temp_path = os.path.join(output_dir, file_name)
                    zip_ref.extract(file_name, output_dir)
                    
                    # Open with xarray
                    ds = xr.open_dataset(temp_path)
                    
                    # Get coordinates info for filename
                    lat_vals = ds.latitude.values
                    lon_vals = ds.longitude.values
                    lat_center = float(lat_vals.mean())
                    lon_center = float(lon_vals.mean())
                    
                    # Get years from time dimension
                    years = sorted(list(set(pd.to_datetime(ds.time.values).year)))
                    years_str = "_".join(map(str, years))
                    
                    # Create CSV filename
                    csv_filename = f"{lat_center:.2f}_{lon_center:.2f}_{years_str}.csv"
                    csv_filepath = os.path.join(output_dir, csv_filename)
                    
                    # Convert to DataFrame
                    df_list = []
                    
                    # Extract precipitation data
                    precip_data = ds['tp']  # total precipitation variable
                    
                    for time_idx, time_val in enumerate(ds.time.values):
                        timestamp = pd.to_datetime(time_val)
                        
                        for lat_idx, lat_val in enumerate(ds.latitude.values):
                            for lon_idx, lon_val in enumerate(ds.longitude.values):
                                precip_value = float(precip_data.isel(
                                    time=time_idx, 
                                    latitude=lat_idx, 
                                    longitude=lon_idx
                                ).values)
                                
                                df_list.append({
                                    'datetime': timestamp,
                                    'latitude': lat_val,
                                    'longitude': lon_val,
                                    'precipitation_mm': precip_value * 1000  # Convert m to mm
                                })
                    
                    # Create DataFrame and save to CSV
                    df = pd.DataFrame(df_list)
                    df = df.sort_values(['datetime', 'latitude', 'longitude'])
                    df.to_csv(csv_filepath, index=False)
                    
                    print(f"✅ Created CSV: {csv_filename}")
                    print(f"📊 Records: {len(df)}")
                    print(f"📅 Date range: {df['datetime'].min()} to {df['datetime'].max()}")
                    
                    csv_files.append(csv_filepath)
                    
                    # Clean up temporary NetCDF file
                    os.remove(temp_path)
                    ds.close()
    
    except Exception as e:
        print(f"❌ Error processing ZIP file: {e}")
        return []
    
    return csv_files

In [17]:
extract_and_convert_to_csv('./data', 'test')

📦 Processing ZIP file: data
❌ Error processing ZIP file: [Errno 13] Permission denied: './data'


[]

In [None]:
def analyze_file(file_path):
    """Analyze the downloaded file to determine its format and contents"""
    print("🔍 Analyzing downloaded file...")
    
    try:
        # Read first few bytes to identify file type
        with open(file_path, 'rb') as f:
            first_bytes = f.read(16)
            print(f"File signature: {first_bytes}")
        
        # Check if it's a ZIP file
        if first_bytes.startswith(b'PK\x03\x04'):
            print("✅ File format: ZIP archive")
            try:
                import zipfile
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    file_list = zip_ref.namelist()
                    print(f"📦 Files in archive: {len(file_list)}")
                    for file_name in file_list[:5]:  # Show first 5 files
                        file_info = zip_ref.getinfo(file_name)
                        size_mb = file_info.file_size / (1024 * 1024)
                        print(f"  📄 {file_name} ({size_mb:.2f} MB)")
                    
                    if len(file_list) > 5:
                        print(f"  ... and {len(file_list) - 5} more files")
                    
                    # Extract and analyze the first GRIB or NetCDF file
                    for file_name in file_list:
                        if file_name.endswith(('.grib', '.grib2', '.grb', '.nc')):
                            print(f"🔄 Extracting and analyzing: {file_name}")
                            extract_path = os.path.join(os.path.dirname(file_path), file_name)
                            zip_ref.extract(file_name, os.path.dirname(file_path))
                            analyze_extracted_file(extract_path)
                            break
                    
            except Exception as e:
                print(f"⚠️  Error reading ZIP file: {e}")
        
        # Check if it's a GRIB file
        elif first_bytes.startswith(b'GRIB'):
            print("✅ File format: GRIB")
            analyze_grib_file(file_path)
        
        # Check if it's a NetCDF file
        elif file_path.endswith('.nc') or b'CDF' in first_bytes:
            print("✅ File format: NetCDF")
            analyze_netcdf_file(file_path)
        
        else:
            print("❌ Unknown file format. First bytes:", first_bytes)
            
    except Exception as e:
        print(f"❌ Error analyzing file: {e}")
    
    # Show file info
    file_size = os.path.getsize(file_path) / (1024 * 1024)  # Convert to MB
    print(f"📏 File size: {file_size:.2f} MB")

def analyze_grib_file(file_path):
    """Analyze GRIB file contents"""
    try:
        import pygrib
        grbs = pygrib.open(file_path)
        print(f"📊 Number of messages: {grbs.messages}")
        
        # List first few messages
        for i, grb in enumerate(grbs):
            if i >= 3:  # Show first 3 messages
                break
            print(f"  Message {i+1}: {grb.name}, Level: {grb.level}, Date: {grb.validDate}")
        grbs.close()
        
    except ImportError:
        print("⚠️  pygrib not installed - cannot analyze GRIB contents")
    except Exception as e:
        print(f"⚠️  Error reading GRIB file: {e}")

def analyze_netcdf_file(file_path):
    """Analyze NetCDF file contents"""
    try:
        import xarray as xr
        ds = xr.open_dataset(file_path)
        print(f"📊 Variables: {list(ds.data_vars.keys())}")
        print(f"📊 Dimensions: {dict(ds.dims)}")
        ds.close()
    except ImportError:
        print("⚠️  xarray not installed - cannot analyze NetCDF contents")
    except Exception as e:
        print(f"⚠️  Error reading NetCDF file: {e}")

def analyze_extracted_file(file_path):
    """Analyze an extracted file"""
    if file_path.endswith(('.grib', '.grib2', '.grb')):
        analyze_grib_file(file_path)
    elif file_path.endswith('.nc'):
        analyze_netcdf_file(file_path)
    else:
        print(f"📄 Extracted file: {os.path.basename(file_path)}")
        file_size = os.path.getsize(file_path) / (1024 * 1024)
        print(f"📏 Size: {file_size:.2f} MB")

In [2]:
import cdsapi
import zipfile
import xarray as xr
import pandas as pd
import os
from pathlib import Path
from datetime import datetime

def download_era5_precipitation_to_dataframe(
    area=[51.7, -0.5, 51.3, 0.2],  # London area: [N, W, S, E]
    year='2024',
    month='01',
    days=['01', '02', '03'],  # First 3 days of January by default
    output_csv='london_precipitation.csv',
    return_format='time_series',  # 'long_format', 'time_series', or 'spatial'
    cleanup_files=True
):
    """
    Complete function to download ERA5 precipitation data and return as pandas DataFrame
    
    Parameters:
    -----------
    area : list
        Bounding box [North, West, South, East] in decimal degrees
    year : str
        Year to download (e.g., '2024')
    month : str  
        Month to download (e.g., '01' for January)
    days : list
        List of days to download (e.g., ['01', '02', '03'])
    output_csv : str
        Name of the output CSV file
    return_format : str
        Format of returned DataFrame:
        - 'long_format': Every data point as a row
        - 'time_series': Time series with spatial average
        - 'spatial': Spatial data with time average
    cleanup_files : bool
        Whether to delete intermediate files after processing
    
    Returns:
    --------
    pandas.DataFrame
        Precipitation data in the specified format
    """
    
    print("🌧️  ERA5 Precipitation Data Processor")
    print("=" * 50)
    
    # Step 1: Download data from CDS API
    print(f"📥 Downloading ERA5 data for {year}-{month}, days {days}")
    print(f"📍 Area: {area} (N, W, S, E)")
    
    temp_grib_file = f"era5_precip_{year}{month}_temp.grib"
    
    try:
        c = cdsapi.Client()
        
        request = {
            'variable': 'total_precipitation',
            'year': year,
            'month': month,
            'day': days,
            'time': [
                '00:00', '01:00', '02:00', '03:00', '04:00', '05:00',
                '06:00', '07:00', '08:00', '09:00', '10:00', '11:00', 
                '12:00', '13:00', '14:00', '15:00', '16:00', '17:00',
                '18:00', '19:00', '20:00', '21:00', '22:00', '23:00'
            ],
            'area': area,
            'format': 'grib',
        }
        
        c.retrieve('reanalysis-era5-land', request, temp_grib_file)
        print(f"✅ Download complete: {temp_grib_file}")
        
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return None
    
    # Step 2: Check if downloaded file is ZIP or GRIB
    print(f"🔍 Analyzing downloaded file...")
    
    with open(temp_grib_file, 'rb') as f:
        first_bytes = f.read(4)
    
    if first_bytes == b'PK\x03':  # ZIP file
        print("📦 Downloaded file is ZIP format, extracting...")
        grib_file = extract_grib_from_zip(temp_grib_file)
        if grib_file is None:
            print("❌ Failed to extract GRIB from ZIP")
            return None
    elif first_bytes == b'GRIB':  # Already GRIB
        print("📄 Downloaded file is already GRIB format")
        grib_file = temp_grib_file
    else:
        print(f"❌ Unknown file format. First bytes: {first_bytes}")
        return None
    
    # Step 3: Load GRIB data
    print(f"📊 Loading GRIB data from: {grib_file}")
    
    try:
        # Try multiple methods to load GRIB
        ds = None
        methods = [
            ('cfgrib', {}),
            ('cfgrib with error ignore', {'errors': 'ignore'}),
            ('cfgrib with filter', {'filter_by_keys': {'paramId': 228}})
        ]
        
        for method_name, backend_kwargs in methods:
            try:
                print(f"🔄 Trying {method_name}...")
                ds = xr.open_dataset(grib_file, engine='cfgrib', backend_kwargs=backend_kwargs)
                print(f"✅ Success with {method_name}!")
                break
            except Exception as e:
                print(f"❌ {method_name} failed: {str(e)[:100]}...")
                continue
        
        if ds is None:
            print("❌ All GRIB loading methods failed")
            return None
            
    except Exception as e:
        print(f"❌ Error loading GRIB: {e}")
        return None
    
    # Step 4: Convert to DataFrame
    print(f"🔄 Converting to DataFrame (format: {return_format})...")
    
    try:
        if return_format == 'long_format':
            df = convert_to_long_format(ds)
        elif return_format == 'time_series':
            df = convert_to_time_series(ds)
        elif return_format == 'spatial':
            df = convert_to_spatial(ds)
        else:
            print(f"⚠️  Unknown format '{return_format}', using time_series")
            df = convert_to_time_series(ds)
        
        print(f"✅ DataFrame created: {df.shape[0]} rows, {df.shape[1]} columns")
        
    except Exception as e:
        print(f"❌ Error converting to DataFrame: {e}")
        return None
    
    # Step 5: Save to CSV
    print(f"💾 Saving to CSV: {output_csv}")
    
    try:
        # Add metadata header
        metadata = [
            f"# ERA5 Land Reanalysis - Total Precipitation",
            f"# Downloaded: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"# Period: {year}-{month}, days {days}",
            f"# Area: {area} (North, West, South, East)",
            f"# Format: {return_format}",
            f"# Rows: {len(df)}, Columns: {len(df.columns)}",
            f"# Variables: {', '.join(df.columns)}",
            "#"
        ]
        
        with open(output_csv, 'w') as f:
            for line in metadata:
                f.write(line + '\n')
            df.to_csv(f, index=False)
        
        file_size = Path(output_csv).stat().st_size / (1024*1024)
        print(f"✅ CSV saved successfully! Size: {file_size:.2f} MB")
        
    except Exception as e:
        print(f"❌ Error saving CSV: {e}")
    
    # Step 6: Cleanup intermediate files
    if cleanup_files:
        print("🧹 Cleaning up intermediate files...")
        files_to_remove = [temp_grib_file]
        if grib_file != temp_grib_file:
            files_to_remove.append(grib_file)
        
        for file_path in files_to_remove:
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
                    print(f"   🗑️  Removed: {file_path}")
            except Exception as e:
                print(f"   ⚠️  Could not remove {file_path}: {e}")
    
    # Step 7: Show summary
    print(f"\n📊 Final Dataset Summary:")
    print(f"   Shape: {df.shape}")
    print(f"   Columns: {list(df.columns)}")
    print(f"   Date range: {df['time'].min()} to {df['time'].max()}" if 'time' in df.columns else "   No time column")
    print(f"   Precipitation range: {df.filter(regex='tp|precip').min().min():.6f} to {df.filter(regex='tp|precip').max().max():.6f}")
    
    print(f"\n🎉 Process complete! Data ready for analysis.")
    
    # Close the xarray dataset
    ds.close()
    
    return df

def extract_grib_from_zip(zip_file_path):
    """Extract GRIB file from ZIP archive"""
    try:
        extract_dir = Path(zip_file_path).parent / "extracted"
        extract_dir.mkdir(exist_ok=True)
        
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
            
            # Find the extracted file
            extracted_files = list(extract_dir.glob('*'))
            if extracted_files:
                return str(extracted_files[0])
        
        return None
    except Exception as e:
        print(f"Error extracting ZIP: {e}")
        return None

def convert_to_long_format(ds):
    """Convert xarray dataset to long format DataFrame"""
    df = ds.to_dataframe().reset_index()
    return df.dropna()

def convert_to_time_series(ds):
    """Convert to time series by averaging spatial dimensions"""
    spatial_dims = [dim for dim in ['latitude', 'longitude'] if dim in ds.dims]
    if spatial_dims:
        ds_avg = ds.mean(dim=spatial_dims)
    else:
        ds_avg = ds
    
    df = ds_avg.to_dataframe().reset_index()
    return df.dropna()

def convert_to_spatial(ds):
    """Convert to spatial format by averaging time dimension"""
    if 'time' in ds.dims:
        ds_avg = ds.mean(dim='time')
    else:
        ds_avg = ds
    
    df = ds_avg.to_dataframe().reset_index()
    return df.dropna()

# Quick preset functions for common use cases
def download_london_precipitation_last_week(output_csv='london_last_week.csv'):
    """Download last week of precipitation data for London"""
    from datetime import datetime, timedelta
    
    # Get last week's dates
    today = datetime.now()
    week_ago = today - timedelta(days=7)
    
    days = [(week_ago + timedelta(days=i)).strftime('%d') for i in range(7)]
    
    return download_era5_precipitation_to_dataframe(
        area=[51.7, -0.5, 51.3, 0.2],  # London
        year=week_ago.strftime('%Y'),
        month=week_ago.strftime('%m'),
        days=days,
        output_csv=output_csv,
        return_format='time_series'
    )

def download_uk_precipitation_month(year='2024', month='01', output_csv='uk_precipitation.csv'):
    """Download a full month of precipitation data for the UK"""
    # UK bounding box
    uk_area = [60.0, -8.0, 49.0, 2.0]  # [N, W, S, E]
    
    # All days in month
    days = [f"{i:02d}" for i in range(1, 32)]  # Will auto-adjust for month length
    
    return download_era5_precipitation_to_dataframe(
        area=uk_area,
        year=year,
        month=month,
        days=days,
        output_csv=output_csv,
        return_format='time_series'
    )

# # Example usage and main execution
# if __name__ == "__main__":
#     print("🚀 ERA5 Precipitation Data Processor")
#     print("=" * 50)
    
#     # Example 1: Download London data for first 3 days of January 2024
#     print("\n📊 Example 1: London precipitation (3 days)")
#     df = download_era5_precipitation_to_dataframe(
#         area=[51.7, -0.5, 51.3, 0.2],  # London
#         year='2024',
#         month='01', 
#         days=['01', '02', '03'],
#         output_csv='london_3days.csv',
#         return_format='time_series'
#     )
    
#     if df is not None:
#         print(f"✅ Success! DataFrame shape: {df.shape}")
#         print(f"📊 Sample data:\n{df.head()}")
    
#     print("\n" + "="*50)
#     print("🎯 Usage Examples:")
#     print("1. df = download_era5_precipitation_to_dataframe()")
#     print("2. df = download_london_precipitation_last_week()")
#     print("3. df = download_uk_precipitation_month('2024', '01')")

In [3]:
# Custom area and time period
df = download_era5_precipitation_to_dataframe(
    area=[51.7, -0.5, 51.3, 0.2],  # London bounding box
    year='2024',
    month='01',
    days=['01', '02', '03', '04', '05'],  # First 5 days
    output_csv='my_precipitation_data.csv',
    return_format='time_series'  # or 'long_format' or 'spatial'
)

🌧️  ERA5 Precipitation Data Processor
📥 Downloading ERA5 data for 2024-01, days ['01', '02', '03', '04', '05']
📍 Area: [51.7, -0.5, 51.3, 0.2] (N, W, S, E)


2025-07-02 13:03:38,281 INFO [2025-06-10T00:00:00] To improve our C3S service, we need to hear from you! Please complete this very short [survey](https://confluence.ecmwf.int/x/E7uBEQ/). Thank you.
2025-07-02 13:03:38,283 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-07-02 13:03:38,706 INFO Request ID is c225dcd2-7d8f-4b9e-afd9-93a88ba7d7de
2025-07-02 13:03:38,843 INFO status has been updated to accepted
2025-07-02 13:03:47,742 INFO status has been updated to running
2025-07-02 13:04:30,017 INFO status has been updated to successful


75bf13f192a81bbbce5196fb932c9bd8.zip:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

✅ Download complete: era5_precip_202401_temp.grib
🔍 Analyzing downloaded file...
❌ Unknown file format. First bytes: b'PK\x03\x04'


------------

In [28]:
# 14:36 test

import cdsapi
import xarray as xr
import pandas as pd
import numpy as np
import os
import warnings
from pathlib import Path

def download_and_process_era5_precip(output_file="london_era5land_hourly_precip"):
    """
    Enhanced function to download and process ERA5-Land precipitation data with multiple fallback options
    """
    
    # Initialize the CDS API client
    c = cdsapi.Client()
    
    # Define the request parameters
    request_params = {
        'variable': 'total_precipitation',
        'year': ['2022'],
        'month': ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'],
        'day': [f'{i:02d}' for i in range(1, 32)],
        'time': [f'{i:02d}:00' for i in range(24)],
        'area': [51.6, -0.2, 51.4, 0.1],  # London area: North, West, South, East
        'format': 'grib',
    }
    
    grib_file = f"{output_file}.grib"
    netcdf_file = f"{output_file}.nc"
    
    print("Downloading ERA5-Land hourly precipitation data...")
    
    try:
        # Try downloading as GRIB first
        c.retrieve('reanalysis-era5-land', request_params, grib_file)
        print(f"Data downloaded to {grib_file}")
        print(f"File size: {os.path.getsize(grib_file) / (1024*1024):.2f} MB")
        
        # Method 1: Try opening with cfgrib (original approach)
        print("\nMethod 1: Attempting to open with cfgrib engine...")
        try:
            ds = xr.open_dataset(grib_file, engine='cfgrib')
            print("✓ Successfully opened with cfgrib")
            return process_dataset(ds, "cfgrib")
            
        except Exception as e1:
            print(f"✗ cfgrib failed: {e1}")
            
            # Method 2: Try with error handling options
            print("\nMethod 2: Trying cfgrib with error handling...")
            try:
                ds = xr.open_dataset(grib_file, engine='cfgrib', 
                                   backend_kwargs={'errors': 'ignore'})
                print("✓ Successfully opened with cfgrib (ignoring errors)")
                return process_dataset(ds, "cfgrib_ignore_errors")
                
            except Exception as e2:
                print(f"✗ cfgrib with error handling failed: {e2}")
                
                # Method 3: Try pygrib if available
                print("\nMethod 3: Trying pygrib...")
                try:
                    import pygrib
                    return process_with_pygrib(grib_file)
                    
                except ImportError:
                    print("✗ pygrib not available. Install with: pip install pygrib")
                except Exception as e3:
                    print(f"✗ pygrib failed: {e3}")
                
                # Method 4: Download as NetCDF instead
                print("\nMethod 4: Downloading as NetCDF format...")
                try:
                    request_params['format'] = 'netcdf'
                    c.retrieve('reanalysis-era5-land', request_params, netcdf_file)
                    print(f"NetCDF data downloaded to {netcdf_file}")
                    
                    ds = xr.open_dataset(netcdf_file)
                    print("✓ Successfully opened NetCDF file")
                    return process_dataset(ds, "netcdf")
                    
                except Exception as e4:
                    print(f"✗ NetCDF download/processing failed: {e4}")
                    
                    # Method 5: Manual GRIB inspection
                    print("\nMethod 5: Manual GRIB file inspection...")
                    return inspect_grib_file(grib_file)
    
    except Exception as download_error:
        print(f"Download failed: {download_error}")
        return None

def process_dataset(ds, method_name):
    """Process the successfully opened dataset"""
    print(f"\n=== Dataset Info ({method_name}) ===")
    print(f"Variables: {list(ds.variables.keys())}")
    print(f"Dimensions: {ds.dims}")
    print(f"Coordinates: {list(ds.coords.keys())}")
    
    # Find precipitation variable (different names possible)
    precip_vars = [var for var in ds.variables.keys() 
                   if any(keyword in var.lower() for keyword in ['precip', 'tp', 'rain'])]
    
    if precip_vars:
        precip_var = precip_vars[0]
        print(f"Found precipitation variable: {precip_var}")
        
        # Get basic statistics
        precip_data = ds[precip_var]
        print(f"Shape: {precip_data.shape}")
        print(f"Data range: {float(precip_data.min()):.6f} to {float(precip_data.max()):.6f}")
        
        # Create time series if possible
        if 'time' in precip_data.dims:
            ts = precip_data.mean(dim=[d for d in precip_data.dims if d != 'time'])
            print(f"Time series created with {len(ts)} points")
            
            # Convert to pandas for easier handling
            df = ts.to_dataframe(name='precipitation')
            print(f"Sample data:\n{df.head()}")
            
            return ds, df
    else:
        print("No precipitation variable found in dataset")
        print("Available variables:", list(ds.variables.keys()))
    
    return ds, None

def process_with_pygrib(grib_file):
    """Process GRIB file using pygrib library"""
    try:
        import pygrib
        
        print(f"Opening {grib_file} with pygrib...")
        grbs = pygrib.open(grib_file)
        
        print(f"Total messages in file: {grbs.messages}")
        
        # List all messages
        grbs.rewind()
        messages = []
        for grb in grbs:
            messages.append({
                'name': grb.name,
                'shortName': grb.shortName,
                'level': grb.level,
                'validDate': grb.validDate,
                'shape': grb.values.shape
            })
        
        print(f"Found {len(messages)} messages")
        if messages:
            print("Sample messages:")
            for i, msg in enumerate(messages[:3]):
                print(f"  {i+1}: {msg}")
        
        # Try to extract precipitation data
        grbs.rewind()
        precip_messages = grbs.select(shortName='tp')  # total precipitation
        
        if precip_messages:
            print(f"Found {len(precip_messages)} precipitation messages")
            
            # Extract first message as example
            first_msg = precip_messages[0]
            print(f"First message: {first_msg.name}")
            print(f"Valid date: {first_msg.validDate}")
            print(f"Data shape: {first_msg.values.shape}")
            print(f"Data range: {first_msg.values.min():.6f} to {first_msg.values.max():.6f}")
            
            return precip_messages, None
        else:
            print("No precipitation messages found")
            return messages, None
            
    except Exception as e:
        print(f"pygrib processing failed: {e}")
        return None, None

def inspect_grib_file(grib_file):
    """Manually inspect GRIB file properties"""
    try:
        file_size = os.path.getsize(grib_file)
        print(f"\n=== Manual GRIB File Inspection ===")
        print(f"File: {grib_file}")
        print(f"Size: {file_size} bytes ({file_size/(1024*1024):.2f} MB)")
        
        # Check if file is too small
        if file_size < 1000:
            print("⚠️  File is very small, likely corrupted or incomplete")
            
        # Try to read first few bytes
        with open(grib_file, 'rb') as f:
            header = f.read(100)
            print(f"File header (first 20 bytes): {header[:20]}")
            
            # Check for GRIB magic bytes
            if header.startswith(b'GRIB'):
                print("✓ File starts with GRIB magic bytes")
                
                # Try to get GRIB edition
                if len(header) >= 8:
                    edition = header[7]
                    print(f"GRIB Edition: {edition}")
                    
                    if edition not in [1, 2]:
                        print(f"⚠️  Unusual GRIB edition: {edition}")
            else:
                print("✗ File does not start with GRIB magic bytes")
                print("This may not be a valid GRIB file")
        
        return None, None
        
    except Exception as e:
        print(f"File inspection failed: {e}")
        return None, None

def suggest_alternatives():
    """Suggest alternative approaches"""
    print("\n=== Alternative Solutions ===")
    print("1. Update libraries:")
    print("   conda update eccodes cfgrib xarray")
    print("   # or")
    print("   pip install --upgrade eccodes cfgrib xarray pygrib")
    
    print("\n2. Try different data format:")
    print("   - Download as NetCDF instead of GRIB")
    print("   - Use different CDS API parameters")
    
    print("\n3. Use different tools:")
    print("   - CDO (Climate Data Operators)")
    print("   - NCO (NetCDF Operators)")
    print("   - GDAL for GRIB reading")
    
    print("\n4. Check file integrity:")
    print("   - Re-download the file")
    print("   - Check CDS API status")
    print("   - Verify your CDS API credentials")

# Run the enhanced function
if __name__ == "__main__":
    print("Enhanced ERA5-Land Precipitation Data Processor")
    print("=" * 50)
    
    result = download_and_process_era5_precip()
    
    if result is None:
        suggest_alternatives()
    else:
        ds, df = result
        print("\n✓ Processing completed successfully!")
        
        if df is not None:
            print(f"Final dataframe shape: {df.shape}")
        else:
            print("Dataset available for further processing")

Enhanced ERA5-Land Precipitation Data Processor


2025-07-02 14:43:48,423 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-07-02 14:43:48,624 INFO Request ID is 52d7c0ac-3063-46be-aa64-715bb092a612


Downloading ERA5-Land hourly precipitation data...


2025-07-02 14:43:48,703 INFO status has been updated to accepted
2025-07-02 14:43:57,238 INFO status has been updated to running
2025-07-02 15:12:10,884 INFO status has been updated to successful


680d5d694ff0c8393127335063713180.zip:   0%|          | 0.00/193k [00:00<?, ?B/s]

Data downloaded to london_era5land_hourly_precip.grib
File size: 0.19 MB

Method 1: Attempting to open with cfgrib engine...


Can't create file 'london_era5land_hourly_precip.grib.5b7b6.idx'
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 274, in itervalues
    yield self.filestream.message_from_file(file, errors=errors)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 341, in message_from_file
    return Message.from_file(file, offset, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 105, in from_file
    raise EOFError("End of file: %r" % file)
EOFError: End of file: <_io.BufferedReader name='london_era5land_hourly_precip.grib'>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 539, in from_indexpath_or_filestream
    self = cls.from_

✗ cfgrib failed: No valid message found: 'london_era5land_hourly_precip.grib'

Method 2: Trying cfgrib with error handling...


Can't create file 'london_era5land_hourly_precip.grib.5b7b6.idx'
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 274, in itervalues
    yield self.filestream.message_from_file(file, errors=errors)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 341, in message_from_file
    return Message.from_file(file, offset, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 105, in from_file
    raise EOFError("End of file: %r" % file)
EOFError: End of file: <_io.BufferedReader name='london_era5land_hourly_precip.grib'>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\pacor\AppData\Local\Temp\ipykernel_3512\1757161118.py", line 44, in download_and_process_era5_precip
    ds = xr.open

✗ cfgrib with error handling failed: No valid message found: 'london_era5land_hourly_precip.grib'

Method 3: Trying pygrib...
Opening london_era5land_hourly_precip.grib with pygrib...
Total messages in file: 0
Found 0 messages
pygrib processing failed: no matches found

✓ Processing completed successfully!
Dataset available for further processing


In [5]:
df

In [29]:
import os
import xarray as xr
import requests
from pathlib import Path
import shutil

def diagnose_grib_file(grib_file):
    """Comprehensive diagnosis of GRIB file issues"""
    print(f"Diagnosing GRIB file: {grib_file}")
    print("=" * 50)
    
    # Check if file exists
    if not os.path.exists(grib_file):
        print("❌ File does not exist!")
        return False
    
    # Check file size
    file_size = os.path.getsize(grib_file)
    print(f"📏 File size: {file_size:,} bytes ({file_size/1024/1024:.2f} MB)")
    
    if file_size == 0:
        print("❌ File is empty!")
        return False
    
    # Check file permissions
    readable = os.access(grib_file, os.R_OK)
    print(f"🔐 File readable: {'✅' if readable else '❌'}")
    
    # Try to read first few bytes
    try:
        with open(grib_file, 'rb') as f:
            first_bytes = f.read(16)
            print(f"🔍 First 16 bytes: {first_bytes}")
            
            # Check for GRIB magic bytes
            if first_bytes.startswith(b'GRIB'):
                print("✅ File starts with GRIB magic bytes")
            else:
                print("❌ File does not start with GRIB magic bytes")
                print("   This might be a corrupted or different file format")
    except Exception as e:
        print(f"❌ Error reading file: {e}")
        return False
    
    # Clean up any existing index files
    cleanup_index_files(grib_file)
    
    return True

def cleanup_index_files(grib_file):
    """Remove any existing cfgrib index files that might be corrupted"""
    grib_path = Path(grib_file)
    parent_dir = grib_path.parent
    base_name = grib_path.name
    
    # Look for index files
    index_patterns = [
        f"{base_name}.*.idx",
        f".{base_name}.*.idx"
    ]
    
    removed_files = []
    for pattern in index_patterns:
        for idx_file in parent_dir.glob(pattern):
            try:
                idx_file.unlink()
                removed_files.append(str(idx_file))
            except Exception as e:
                print(f"Warning: Could not remove {idx_file}: {e}")
    
    if removed_files:
        print(f"🧹 Cleaned up index files: {removed_files}")
    else:
        print("🧹 No index files found to clean up")

def try_multiple_grib_readers(grib_file):
    """Try different methods to read the GRIB file"""
    methods = []
    
    # Method 1: xarray with cfgrib (default)
    try:
        print("🔄 Trying xarray with cfgrib engine...")
        ds = xr.open_dataset(grib_file, engine='cfgrib')
        print("✅ Success with cfgrib!")
        methods.append(("cfgrib", ds))
        ds.close()
    except Exception as e:
        print(f"❌ cfgrib failed: {str(e)[:100]}...")
    
    # Method 2: xarray with cfgrib and filter_by_keys
    try:
        print("🔄 Trying cfgrib with filter_by_keys...")
        ds = xr.open_dataset(grib_file, engine='cfgrib', 
                           backend_kwargs={'filter_by_keys': {'paramId': 228}})
        print("✅ Success with cfgrib + filter!")
        methods.append(("cfgrib_filtered", ds))
        ds.close()
    except Exception as e:
        print(f"❌ cfgrib with filter failed: {str(e)[:100]}...")
    
    # Method 3: Try with different error handling
    try:
        print("🔄 Trying cfgrib with errors='ignore'...")
        ds = xr.open_dataset(grib_file, engine='cfgrib', 
                           backend_kwargs={'errors': 'ignore'})
        print("✅ Success with error ignoring!")
        methods.append(("cfgrib_ignore_errors", ds))
        ds.close()
    except Exception as e:
        print(f"❌ cfgrib with errors='ignore' failed: {str(e)[:100]}...")
    
    # Method 4: Try pygrib if available
    try:
        import pygrib
        print("🔄 Trying pygrib...")
        grbs = pygrib.open(grib_file)
        messages = list(grbs)
        print(f"✅ pygrib found {len(messages)} messages!")
        methods.append(("pygrib", messages))
        grbs.close()
    except ImportError:
        print("⚠️  pygrib not available")
    except Exception as e:
        print(f"❌ pygrib failed: {str(e)[:100]}...")
    
    return methods

def redownload_era5_data():
    """Function to re-download ERA5 data properly"""
    import cdsapi
    
    print("🔄 Attempting to re-download ERA5 data...")
    
    # Example request for London precipitation data
    c = cdsapi.Client()
    
    request = {
        'variable': 'total_precipitation',
        'year': '2024',
        'month': '01',
        'day': ['01', '02', '03'],  # Just a few days for testing
        'time': [
            '00:00', '01:00', '02:00', '03:00',
            '04:00', '05:00', '06:00', '07:00',
            '08:00', '09:00', '10:00', '11:00',
            '12:00', '13:00', '14:00', '15:00',
            '16:00', '17:00', '18:00', '19:00',
            '20:00', '21:00', '22:00', '23:00'
        ],
        'area': [51.7, -0.5, 51.3, 0.2],  # London area: N, W, S, E
        'format': 'grib',
    }
    
    output_file = 'london_era5land_hourly_precip_fixed.grib'
    
    try:
        c.retrieve('reanalysis-era5-land', request, output_file)
        print(f"✅ Successfully downloaded to {output_file}")
        return output_file
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return None

def main_diagnosis(grib_file='london_era5land_hourly_precip.grib'):
    """Main function to diagnose and fix GRIB file issues"""
    print("GRIB File Diagnosis Tool")
    print("=" * 60)
    
    # Step 1: Basic file diagnosis
    if not diagnose_grib_file(grib_file):
        print("\n💡 Recommendations:")
        print("1. Re-download the file")
        print("2. Check your internet connection during download")
        print("3. Verify the CDS API request parameters")
        return
    
    # Step 2: Try different reading methods
    print(f"\n🔄 Attempting to read {grib_file} with different methods...")
    print("=" * 50)
    
    successful_methods = try_multiple_grib_readers(grib_file)
    
    if successful_methods:
        print(f"\n✅ Found {len(successful_methods)} working method(s)!")
        for method_name, data in successful_methods:
            print(f"   - {method_name}")
            if hasattr(data, 'dims'):  # xarray dataset
                print(f"     Dimensions: {dict(data.dims)}")
                print(f"     Variables: {list(data.data_vars)}")
    else:
        print("\n❌ No methods worked. The file appears to be corrupted.")
        print("\n💡 Next steps:")
        print("1. Delete the current file")
        print("2. Re-download using the redownload function")
        print("3. Check your CDS API setup")

# Helper function to use a working method
def load_grib_data(grib_file, method='cfgrib'):
    """Load GRIB data using the specified method"""
    cleanup_index_files(grib_file)
    
    if method == 'cfgrib':
        return xr.open_dataset(grib_file, engine='cfgrib')
    elif method == 'cfgrib_filtered':
        return xr.open_dataset(grib_file, engine='cfgrib', 
                             backend_kwargs={'filter_by_keys': {'paramId': 228}})
    elif method == 'cfgrib_ignore_errors':
        return xr.open_dataset(grib_file, engine='cfgrib', 
                             backend_kwargs={'errors': 'ignore'})
    else:
        raise ValueError(f"Unknown method: {method}")

if __name__ == "__main__":
    # Run the diagnosis
    main_diagnosis()
    
    # Example of how to use after diagnosis:
    # ds = load_grib_data('london_era5land_hourly_precip.grib', method='cfgrib_ignore_errors')
    # print(ds)

GRIB File Diagnosis Tool
Diagnosing GRIB file: london_era5land_hourly_precip.grib
📏 File size: 197,688 bytes (0.19 MB)
🔐 File readable: ✅
🔍 First 16 bytes: b'PK\x03\x04\x14\x00\x00\x00\x08\x00eq\xe2Z\xa4\x95'
❌ File does not start with GRIB magic bytes
   This might be a corrupted or different file format
🧹 No index files found to clean up

🔄 Attempting to read london_era5land_hourly_precip.grib with different methods...
🔄 Trying xarray with cfgrib engine...


Can't create file 'london_era5land_hourly_precip.grib.5b7b6.idx'
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 274, in itervalues
    yield self.filestream.message_from_file(file, errors=errors)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 341, in message_from_file
    return Message.from_file(file, offset, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 105, in from_file
    raise EOFError("End of file: %r" % file)
EOFError: End of file: <_io.BufferedReader name='london_era5land_hourly_precip.grib'>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 539, in from_indexpath_or_filestream
    self = cls.from_

❌ cfgrib failed: No valid message found: 'london_era5land_hourly_precip.grib'...
🔄 Trying cfgrib with filter_by_keys...


Can't create file 'london_era5land_hourly_precip.grib.5b7b6.idx'
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 274, in itervalues
    yield self.filestream.message_from_file(file, errors=errors)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 341, in message_from_file
    return Message.from_file(file, offset, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 105, in from_file
    raise EOFError("End of file: %r" % file)
EOFError: End of file: <_io.BufferedReader name='london_era5land_hourly_precip.grib'>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 539, in from_indexpath_or_filestream
    self = cls.from_

❌ cfgrib with filter failed: No valid message found: 'london_era5land_hourly_precip.grib'...
🔄 Trying cfgrib with errors='ignore'...


Can't create file 'london_era5land_hourly_precip.grib.5b7b6.idx'
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 274, in itervalues
    yield self.filestream.message_from_file(file, errors=errors)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 341, in message_from_file
    return Message.from_file(file, offset, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 105, in from_file
    raise EOFError("End of file: %r" % file)
EOFError: End of file: <_io.BufferedReader name='london_era5land_hourly_precip.grib'>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\cfgrib\messages.py", line 539, in from_indexpath_or_filestream
    self = cls.from_

❌ cfgrib with errors='ignore' failed: No valid message found: 'london_era5land_hourly_precip.grib'...
🔄 Trying pygrib...
✅ pygrib found 0 messages!

✅ Found 1 working method(s)!
   - pygrib


In [37]:
# possibly working?


import zipfile
import os
import xarray as xr
from pathlib import Path

def extract_grib_from_zip(zip_file_path, extract_to=None):
    """
    Extract GRIB file from ZIP archive downloaded from CDS API
    
    Parameters:
    zip_file_path (str): Path to the ZIP file (mistakenly named .grib)
    extract_to (str): Directory to extract to (default: same directory as ZIP)
    
    Returns:
    str: Path to the extracted GRIB file
    """
    
    zip_path = Path(zip_file_path)
    
    if extract_to is None:
        extract_to = zip_path.parent
    else:
        extract_to = Path(extract_to)
        extract_to.mkdir(exist_ok=True)
    
    print(f"📦 Extracting ZIP file: {zip_file_path}")
    
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            # List contents
            file_list = zip_ref.namelist()
            print(f"📋 Files in archive: {file_list}")
            
            # Extract all files
            zip_ref.extractall(extract_to)
            print(f"✅ Extracted to: {extract_to}")
            
            # Find the GRIB file
            grib_files = []
            for filename in file_list:
                extracted_path = extract_to / filename
                if extracted_path.exists():
                    # Check if it's a GRIB file by reading first few bytes
                    with open(extracted_path, 'rb') as f:
                        first_bytes = f.read(4)
                        if first_bytes == b'GRIB':
                            grib_files.append(str(extracted_path))
                            print(f"🎯 Found GRIB file: {filename}")
            
            if grib_files:
                return grib_files[0]  # Return the first GRIB file found
            else:
                # If no GRIB magic bytes found, return the first file (might still be GRIB)
                first_file = str(extract_to / file_list[0])
                print(f"⚠️  No GRIB magic bytes found, returning first file: {first_file}")
                return first_file
                
    except zipfile.BadZipFile:
        print(f"❌ Error: {zip_file_path} is not a valid ZIP file")
        return None
    except Exception as e:
        print(f"❌ Error extracting ZIP file: {e}")
        return None

def load_extracted_grib(grib_file_path):
    """
    Load the extracted GRIB file using xarray
    
    Parameters:
    grib_file_path (str): Path to the extracted GRIB file
    
    Returns:
    xarray.Dataset: The loaded dataset
    """
    
    print(f"📊 Loading GRIB file: {grib_file_path}")
    
    # Try different methods to load the GRIB file
    methods = [
        ('cfgrib', {}),
        ('cfgrib with error handling', {'errors': 'ignore'}),
        ('cfgrib with filter', {'filter_by_keys': {'paramId': 228}}),
    ]
    
    for method_name, backend_kwargs in methods:
        try:
            print(f"🔄 Trying {method_name}...")
            ds = xr.open_dataset(grib_file_path, engine='cfgrib', backend_kwargs=backend_kwargs)
            print(f"✅ Success with {method_name}!")
            print(f"📏 Dataset dimensions: {dict(ds.dims)}")
            print(f"📊 Variables: {list(ds.data_vars)}")
            return ds
        except Exception as e:
            print(f"❌ {method_name} failed: {str(e)[:100]}...")
            continue
    
    print("❌ All methods failed to load the GRIB file")
    return None

def process_london_precipitation_data(zip_file_path):
    """
    Complete workflow to extract and process London precipitation data
    
    Parameters:
    zip_file_path (str): Path to the ZIP file downloaded from CDS
    
    Returns:
    xarray.Dataset: Processed precipitation dataset
    """
    
    print("🌧️  Processing London Precipitation Data")
    print("=" * 50)
    
    # Step 1: Extract the GRIB file from ZIP
    grib_file_path = extract_grib_from_zip(zip_file_path)
    
    if grib_file_path is None:
        print("❌ Failed to extract GRIB file")
        return None
    
    # Step 2: Load the GRIB file
    ds = load_extracted_grib(grib_file_path)
    
    if ds is None:
        print("❌ Failed to load GRIB file")
        return None
    
    # Step 3: Basic data exploration
    print("\n📈 Data Summary:")
    print("=" * 30)
    
    # Print coordinate information
    print("🗺️  Coordinates:")
    for coord in ds.coords:
        coord_data = ds.coords[coord]
        if coord_data.size > 1:
            print(f"   {coord}: {coord_data.min().values} to {coord_data.max().values} ({coord_data.size} points)")
        else:
            print(f"   {coord}: {coord_data.values}")
    
    # Print variable information
    print("\n📊 Data Variables:")
    for var in ds.data_vars:
        var_data = ds[var]
        print(f"   {var}: {var_data.dims} - {var_data.long_name if 'long_name' in var_data.attrs else 'No description'}")
        if hasattr(var_data, 'units'):
            print(f"      Units: {var_data.units}")
    
    return ds

def quick_visualization(ds, variable_name=None):
    """
    Create a quick visualization of the precipitation data
    
    Parameters:
    ds (xarray.Dataset): The precipitation dataset
    variable_name (str): Name of the variable to plot (auto-detect if None)
    """
    
    if variable_name is None:
        # Auto-detect precipitation variable
        possible_names = ['tp', 'total_precipitation', 'precip', 'precipitation']
        for name in possible_names:
            if name in ds.data_vars:
                variable_name = name
                break
        
        if variable_name is None:
            variable_name = list(ds.data_vars)[0]  # Use first variable
    
    print(f"📊 Creating visualization for variable: {variable_name}")
    
    try:
        import matplotlib.pyplot as plt
        
        # Get the variable
        var = ds[variable_name]
        
        # If time dimension exists, plot time series
        if 'time' in var.dims:
            # Average over spatial dimensions if they exist
            if 'latitude' in var.dims and 'longitude' in var.dims:
                var_avg = var.mean(dim=['latitude', 'longitude'])
            else:
                var_avg = var
            
            plt.figure(figsize=(12, 6))
            var_avg.plot()
            plt.title(f'Time Series: {variable_name}')
            plt.xlabel('Time')
            plt.ylabel(f'{variable_name} ({var.units if "units" in var.attrs else ""})')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()
        
        # If spatial dimensions exist, plot map for first time step
        elif 'latitude' in var.dims and 'longitude' in var.dims:
            plt.figure(figsize=(10, 8))
            var.isel(time=0).plot()
            plt.title(f'Spatial Map: {variable_name} (First Time Step)')
            plt.show()
        
        print("✅ Visualization created successfully!")
        
    except ImportError:
        print("⚠️  Matplotlib not available for visualization")
    except Exception as e:
        print(f"❌ Visualization failed: {e}")

# Main execution
if __name__ == "__main__":
    # Process the London precipitation file
    zip_file = "london_era5land_hourly_precip.grib"  # This is actually a ZIP file
    
    print("🚀 Starting GRIB ZIP extraction and processing...")
    
    # Extract and load the data
    ds = process_london_precipitation_data(zip_file)
    
    if ds is not None:
        print("\n🎉 Success! Dataset loaded successfully.")
        print("\nTo work with your data:")
        print("1. Use ds[variable_name] to access variables")
        print("2. Use ds.sel() or ds.isel() to select data")
        print("3. Use ds.plot() for quick visualizations")
        
        # Optional: Create a quick visualization
        # quick_visualization(ds)
        
        # Keep dataset open for further analysis
        globals()['precipitation_data'] = ds
        print("\n💡 Dataset stored in variable 'precipitation_data' for further use")
    
    else:
        print("\n❌ Failed to process the data. Please check the file and try again.")

🚀 Starting GRIB ZIP extraction and processing...
🌧️  Processing London Precipitation Data
📦 Extracting ZIP file: london_era5land_hourly_precip.grib
📋 Files in archive: ['data.grib']
✅ Extracted to: .
🎯 Found GRIB file: data.grib
📊 Loading GRIB file: data.grib
🔄 Trying cfgrib...
✅ Success with cfgrib!
📏 Dataset dimensions: {'time': 366, 'step': 24, 'latitude': 3, 'longitude': 4}
📊 Variables: ['tp']

📈 Data Summary:
🗺️  Coordinates:
   number: 0
   time: 2021-12-31T00:00:00.000000000 to 2022-12-31T00:00:00.000000000 (366 points)
   step: 3600000000000 nanoseconds to 86400000000000 nanoseconds (24 points)
   surface: 0.0
   latitude: 51.4 to 51.6 (3 points)
   longitude: -0.2 to 0.1 (4 points)
   valid_time: 2021-12-31T01:00:00.000000000 to 2023-01-01T00:00:00.000000000 (8784 points)

📊 Data Variables:
   tp: ('time', 'step', 'latitude', 'longitude') - Total precipitation
      Units: m

🎉 Success! Dataset loaded successfully.

To work with your data:
1. Use ds[variable_name] to access va

  vars, attrs, coord_names = xr.conventions.decode_cf_variables(
  print(f"📏 Dataset dimensions: {dict(ds.dims)}")


xarray.core.dataset.Dataset

In [10]:
# precipitation_data

In [39]:
import pandas as pd
import numpy as np
from pathlib import Path

def explore_dataset_structure(ds):
    """Explore the structure of your dataset to understand what we're working with"""
    print("🔍 Dataset Structure Analysis")
    print("=" * 40)
    
    print(f"📏 Dimensions: {dict(ds.dims)}")
    print(f"📊 Variables: {list(ds.data_vars)}")
    print(f"🗺️  Coordinates: {list(ds.coords)}")
    
    # Show sample of each coordinate
    print("\n📍 Coordinate Details:")
    for coord_name, coord in ds.coords.items():
        if coord.size <= 10:
            print(f"   {coord_name}: {coord.values}")
        else:
            print(f"   {coord_name}: {coord.values[0]} to {coord.values[-1]} ({coord.size} points)")
    
    # Show variable details
    print("\n📊 Variable Details:")
    for var_name in ds.data_vars:
        var = ds[var_name]
        print(f"   {var_name}:")
        print(f"      Shape: {var.shape}")
        print(f"      Dimensions: {var.dims}")
        if hasattr(var, 'units'):
            print(f"      Units: {var.units}")
        if hasattr(var, 'long_name'):
            print(f"      Description: {var.long_name}")
        print(f"      Data range: {float(var.min())} to {float(var.max())}")

def convert_to_long_format_dataframe(ds, variables=None):
    """
    Convert xarray dataset to long-format pandas DataFrame
    Each row represents one observation with all coordinates as columns
    
    Parameters:
    ds: xarray Dataset
    variables: list of variable names to include (None = all variables)
    
    Returns:
    pandas DataFrame in long format
    """
    
    print("📊 Converting to Long Format DataFrame...")
    
    if variables is None:
        variables = list(ds.data_vars)
    
    # Convert to DataFrame - this creates a long format automatically
    df = ds.to_dataframe()
    
    # Reset index to make all coordinates into columns
    df = df.reset_index()
    
    # Remove any NaN values if present
    df = df.dropna()
    
    print(f"✅ Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
    print(f"📊 Columns: {list(df.columns)}")
    
    return df

def convert_to_time_series_dataframe(ds, lat_lon_method='mean', variables=None):
    """
    Convert to time series DataFrame by aggregating spatial dimensions
    
    Parameters:
    ds: xarray Dataset
    lat_lon_method: How to handle lat/lon ('mean', 'median', 'sum', or specific lat/lon values)
    variables: list of variable names to include
    
    Returns:
    pandas DataFrame with time as index
    """
    
    print(f"📈 Converting to Time Series DataFrame (spatial aggregation: {lat_lon_method})...")
    
    if variables is None:
        variables = list(ds.data_vars)
    
    # Check if we have spatial dimensions
    spatial_dims = [dim for dim in ['latitude', 'longitude', 'lat', 'lon'] if dim in ds.dims]
    
    if spatial_dims and lat_lon_method in ['mean', 'median', 'sum']:
        # Aggregate spatial dimensions
        if lat_lon_method == 'mean':
            ds_agg = ds.mean(dim=spatial_dims)
        elif lat_lon_method == 'median':
            ds_agg = ds.median(dim=spatial_dims)
        elif lat_lon_method == 'sum':
            ds_agg = ds.sum(dim=spatial_dims)
        
        print(f"   Aggregated {len(spatial_dims)} spatial dimensions using {lat_lon_method}")
    else:
        ds_agg = ds
    
    # Convert to DataFrame
    df = ds_agg.to_dataframe()
    
    # If time is in the index, keep it there, otherwise reset index
    if 'time' in df.index.names:
        df = df.reset_index(level=[name for name in df.index.names if name != 'time'])
    else:
        df = df.reset_index()
    
    # Remove NaN values
    df = df.dropna()
    
    print(f"✅ Created time series DataFrame with {len(df)} rows and {len(df.columns)} columns")
    
    return df

def convert_to_spatial_dataframe(ds, time_method='mean', variables=None):
    """
    Convert to spatial DataFrame by aggregating time dimension
    
    Parameters:
    ds: xarray Dataset
    time_method: How to handle time ('mean', 'sum', 'max', or specific time index)
    variables: list of variable names to include
    
    Returns:
    pandas DataFrame with lat/lon as columns
    """
    
    print(f"🗺️  Converting to Spatial DataFrame (time aggregation: {time_method})...")
    
    if variables is None:
        variables = list(ds.data_vars)
    
    # Check if we have time dimension
    if 'time' in ds.dims:
        if time_method == 'mean':
            ds_agg = ds.mean(dim='time')
        elif time_method == 'sum':
            ds_agg = ds.sum(dim='time')
        elif time_method == 'max':
            ds_agg = ds.max(dim='time')
        elif isinstance(time_method, int):
            ds_agg = ds.isel(time=time_method)
        else:
            ds_agg = ds.mean(dim='time')  # default
        
        print(f"   Aggregated time dimension using {time_method}")
    else:
        ds_agg = ds
    
    # Convert to DataFrame
    df = ds_agg.to_dataframe().reset_index()
    
    # Remove NaN values
    df = df.dropna()
    
    print(f"✅ Created spatial DataFrame with {len(df)} rows and {len(df.columns)} columns")
    
    return df

def save_dataframe_to_csv(df, filename, add_metadata=True):
    """
    Save DataFrame to CSV with optional metadata
    
    Parameters:
    df: pandas DataFrame
    filename: output filename
    add_metadata: whether to add metadata header
    """
    
    filepath = Path(filename)
    
    print(f"💾 Saving DataFrame to: {filepath}")
    
    if add_metadata:
        # Create metadata header
        metadata_lines = [
            f"# Generated from ERA5 GRIB data",
            f"# Rows: {len(df)}",
            f"# Columns: {len(df.columns)}",
            f"# Column names: {', '.join(df.columns)}",
            f"# Generated on: {pd.Timestamp.now()}",
            "#"
        ]
        
        # Write metadata and data
        with open(filepath, 'w', newline='') as f:
            # Write metadata
            for line in metadata_lines:
                f.write(line + '\n')
            
            # Write CSV data
            df.to_csv(f, index=False)
    else:
        # Simple CSV save
        df.to_csv(filepath, index=False)
    
    file_size = filepath.stat().st_size / (1024*1024)  # MB
    print(f"✅ Saved successfully! File size: {file_size:.2f} MB")
    
    return str(filepath)

def create_multiple_csv_formats(ds, base_filename="london_precipitation"):
    """
    Create CSV files in multiple formats for different use cases
    
    Parameters:
    ds: xarray Dataset
    base_filename: base name for output files
    
    Returns:
    dict: mapping of format names to file paths
    """
    
    print("🚀 Creating multiple CSV formats...")
    print("=" * 40)
    
    output_files = {}
    
    # 1. Long format (all data points)
    try:
        df_long = convert_to_long_format_dataframe(ds)
        file_long = save_dataframe_to_csv(df_long, f"{base_filename}_long_format.csv")
        output_files['long_format'] = file_long
        print(f"📊 Long format sample:\n{df_long.head()}\n")
    except Exception as e:
        print(f"❌ Long format failed: {e}")
    
    # 2. Time series (spatial average)
    try:
        df_time = convert_to_time_series_dataframe(ds, lat_lon_method='mean')
        file_time = save_dataframe_to_csv(df_time, f"{base_filename}_time_series.csv")
        output_files['time_series'] = file_time
        print(f"📈 Time series sample:\n{df_time.head()}\n")
    except Exception as e:
        print(f"❌ Time series failed: {e}")
    
    # 3. Spatial average (time averaged)
    try:
        df_spatial = convert_to_spatial_dataframe(ds, time_method='mean')
        file_spatial = save_dataframe_to_csv(df_spatial, f"{base_filename}_spatial_average.csv")
        output_files['spatial_average'] = file_spatial
        print(f"🗺️  Spatial average sample:\n{df_spatial.head()}\n")
    except Exception as e:
        print(f"❌ Spatial average failed: {e}")
    
    return output_files

def quick_data_summary(df):
    """Generate a quick summary of the DataFrame"""
    print("📊 Data Summary")
    print("=" * 20)
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print("\nColumn types:")
    print(df.dtypes)
    print("\nFirst few rows:")
    print(df.head())
    print("\nBasic statistics:")
    print(df.describe())

# Main function to use with your dataset
def process_my_data(ds):
    """
    Main function to process your specific dataset
    Call this with your loaded dataset: process_my_data(ds)
    """
    
    print("🌧️  Processing Your London Precipitation Data")
    print("=" * 50)
    
    # First, explore the structure
    explore_dataset_structure(ds)
    
    print("\n" + "="*50)
    
    # Create multiple formats
    output_files = create_multiple_csv_formats(ds, "london_era5_precipitation")
    
    print(f"\n🎉 Processing complete! Created {len(output_files)} files:")
    for format_name, filepath in output_files.items():
        print(f"   📄 {format_name}: {filepath}")
    
    # Return the long format DataFrame for immediate use
    if 'long_format' in output_files:
        df = pd.read_csv(output_files['long_format'], comment='#')
        print(f"\n💡 Returning long format DataFrame with {len(df)} rows")
        return df
    else:
        print("⚠️  Returning time series DataFrame as fallback")
        return convert_to_time_series_dataframe(ds)

# # Example usage:
# if __name__ == "__main__":
#     # Assuming you have your dataset in variable 'ds'
#     # df = process_my_data(ds)
    
#     print("🔧 Usage Instructions:")
#     print("1. Run: df = process_my_data(ds)")
#     print("2. This will create multiple CSV files and return a DataFrame")
#     print("3. Use the DataFrame for further analysis in pandas")
    
#     # Or create specific formats:
#     print("\n🎯 For specific formats:")
#     print("- Long format: df = convert_to_long_format_dataframe(ds)")
#     print("- Time series: df = convert_to_time_series_dataframe(ds)")
#     print("- Spatial map: df = convert_to_spatial_dataframe(ds)")

In [40]:
df = process_my_data(ds)

🌧️  Processing Your London Precipitation Data
🔍 Dataset Structure Analysis
📏 Dimensions: {'time': 366, 'step': 24, 'latitude': 3, 'longitude': 4}
📊 Variables: ['tp']
🗺️  Coordinates: ['number', 'time', 'step', 'surface', 'latitude', 'longitude', 'valid_time']

📍 Coordinate Details:
   number: 0
   time: 2021-12-31T00:00:00.000000000 to 2022-12-31T00:00:00.000000000 (366 points)
   step: 3600000000000 nanoseconds to 86400000000000 nanoseconds (24 points)
   surface: 0.0
   latitude: [51.6 51.5 51.4]
   longitude: [-2.00000000e-01 -1.00000000e-01  2.77555756e-17  1.00000000e-01]
   valid_time: ['2021-12-31T01:00:00.000000000' '2021-12-31T02:00:00.000000000'
 '2021-12-31T03:00:00.000000000' '2021-12-31T04:00:00.000000000'
 '2021-12-31T05:00:00.000000000' '2021-12-31T06:00:00.000000000'
 '2021-12-31T07:00:00.000000000' '2021-12-31T08:00:00.000000000'
 '2021-12-31T09:00:00.000000000' '2021-12-31T10:00:00.000000000'
 '2021-12-31T11:00:00.000000000' '2021-12-31T12:00:00.000000000'
 '2021-12-3

  print(f"📏 Dimensions: {dict(ds.dims)}")


      Data range: 0.0 to 0.0299998726695776

🚀 Creating multiple CSV formats...
📊 Converting to Long Format DataFrame...
✅ Created DataFrame with 105120 rows and 8 columns
📊 Columns: ['time', 'step', 'latitude', 'longitude', 'number', 'surface', 'valid_time', 'tp']
💾 Saving DataFrame to: london_era5_precipitation_long_format.csv
✅ Saved successfully! File size: 8.38 MB
📊 Long format sample:
          time   step  latitude     longitude  number  surface valid_time  \
276 2021-12-31 1 days      51.6 -2.000000e-01       0      0.0 2022-01-01   
277 2021-12-31 1 days      51.6 -1.000000e-01       0      0.0 2022-01-01   
278 2021-12-31 1 days      51.6  2.775558e-17       0      0.0 2022-01-01   
279 2021-12-31 1 days      51.6  1.000000e-01       0      0.0 2022-01-01   
280 2021-12-31 1 days      51.5 -2.000000e-01       0      0.0 2022-01-01   

           tp  
276  0.002712  
277  0.002751  
278  0.002790  
279  0.002478  
280  0.002195  

📈 Converting to Time Series DataFrame (spatial

In [36]:
df

Unnamed: 0,time,step,latitude,longitude,number,surface,valid_time,tp
0,2021-12-31,1 days 00:00:00,51.6,-2.000000e-01,0,0.0,2022-01-01 00:00:00,0.002712
1,2021-12-31,1 days 00:00:00,51.6,-1.000000e-01,0,0.0,2022-01-01 00:00:00,0.002751
2,2021-12-31,1 days 00:00:00,51.6,2.775558e-17,0,0.0,2022-01-01 00:00:00,0.002790
3,2021-12-31,1 days 00:00:00,51.6,1.000000e-01,0,0.0,2022-01-01 00:00:00,0.002478
4,2021-12-31,1 days 00:00:00,51.5,-2.000000e-01,0,0.0,2022-01-01 00:00:00,0.002195
...,...,...,...,...,...,...,...,...
105115,2022-12-31,0 days 23:00:00,51.5,1.000000e-01,0,0.0,2022-12-31 23:00:00,0.008670
105116,2022-12-31,0 days 23:00:00,51.4,-2.000000e-01,0,0.0,2022-12-31 23:00:00,0.009620
105117,2022-12-31,0 days 23:00:00,51.4,-1.000000e-01,0,0.0,2022-12-31 23:00:00,0.010117
105118,2022-12-31,0 days 23:00:00,51.4,2.775558e-17,0,0.0,2022-12-31 23:00:00,0.010687


In [None]:
df[df['valid_time'] == '2023-01-01 00:00:00']

In [None]:
df_51_01 = df[(df['latitude'] == 51.5) & (df['longitude'] == 0.1)]

In [None]:
df_51_01_tp = (df_51_01
               .groupby('time')['tp']
               .sum().reset_index().sort_values('time')
               )

In [None]:
df_51_01_tp['tp_mm'] = 1000* df_51_01_tp['tp']

In [None]:
df_51_01_tp.head(3)

In [None]:
df_51_01_2023 = df_51_01.copy()

In [None]:
df_51_01_2023['year']  = pd.to_numeric(df_51_01_2023['valid_time'].str[:4])
df_51_01_2023['month']  = pd.to_numeric(df_51_01_2023['valid_time'].str[5:7])
df_51_01_2023['day']  = pd.to_numeric(df_51_01_2023['valid_time'].str[8:10])
df_51_01_2023['hour']  =pd.to_numeric(df_51_01_2023['valid_time'].str[11:13])

In [None]:
df_test = (df_51_01
           .groupby('time')['tp']
           .max()
           ).reset_index()[1:]

In [None]:
df_test['month'] = pd.to_numeric(df_test['time'].str[5:7])

In [None]:
df_test['tp_mm'] = 1000 * df_test['tp']
df_test.head()

In [None]:
df_month = (df_test
 .groupby('month')['tp_mm']
 .sum()
 ).reset_index()

In [None]:
df_month['tp_mm'].sum()

In [None]:
df_51_01_tp['year']  = pd.to_numeric(df_51_01_tp['time'].str[:4])
df_51_01_tp['month'] = pd.to_numeric(df_51_01_tp['time'].str[5:7])

In [None]:
df_51_01_tp_2023 =  df_51_01_tp[df_51_01_tp['year'] >2022] 

In [None]:
(df_51_01_tp_2023
 .groupby('month')['tp']
 .sum()
 )

In [None]:
df_51_01[df_51_01['time'] == '2023-01-10']

In [None]:
import cdsapi
import xarray as xr
import pandas as pd
import os

def download_era5_land_precipitation(years, months, area=None, output_file='era5_land_precip.nc'):
    """
    Download ERA5-Land total precipitation data
    
    Parameters:
    years: list of years as strings (e.g., ['2023', '2024'])
    months: list of months as strings (e.g., ['01', '02', '03'])
    area: [North, West, South, East] bounding box for London: [51.7, -0.5, 51.3, 0.2]
    output_file: output NetCDF filename
    """
    
    # Option 1: Use .cdsapirc file (recommended)
    # Option 1: Use .cdsapirc file (recommended)
    c = cdsapi.Client()
    
    # Option 2: Set credentials programmatically (if .cdsapirc doesn't work)
    # c = cdsapi.Client(url='https://cds.climate.copernicus.eu/api/v2',
    #                   key='YOUR_UID:YOUR_API_KEY')
    
    # Option 2: Set credentials programmatically (if .cdsapirc doesn't work)
    # c = cdsapi.Client(url='https://cds.climate.copernicus.eu/api/v2',
    #                   key='YOUR_UID:YOUR_API_KEY')
    
    # Default to London area if not specified
    if area is None:
        area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
    # For hourly data (original temporal resolution)
    c.retrieve(
        'reanalysis-era5-land',
        {
            'variable': 'total_precipitation',
            'year': years,
            'month': months,
            'day': [
                '01', '02', '03', '04', '05', '06',
                '07', '08', '09', '10', '11', '12',
                '13', '14', '15', '16', '17', '18',
                '19', '20', '21', '22', '23', '24',
                '25', '26', '27', '28', '29', '30', '31'
            ],
            'time': [
                '00:00', '01:00', '02:00', '03:00', '04:00', '05:00',
                '06:00', '07:00', '08:00', '09:00', '10:00', '11:00',
                '12:00', '13:00', '14:00', '15:00', '16:00', '17:00',
                '18:00', '19:00', '20:00', '21:00', '22:00', '23:00'
            ],
            'area': area,  # [North, West, South, East]
            'format': 'netcdf',
        },
        output_file
    )
    
    print(f"Data downloaded to {output_file}")
    return output_file

def download_era5_land_daily_stats(years, months, daily_statistic='daily_sum', area=None, output_file='era5_land_daily.nc'):
    """
    Download ERA5-Land daily statistics (recommended for precipitation sums)
    
    Parameters:
    daily_statistic: 'daily_sum', 'daily_mean', 'daily_maximum', 'daily_minimum'
    """
    
    c = cdsapi.Client()
    
    if area is None:
        area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
    # For daily statistics - using the correct dataset name and parameters
    c.retrieve(
        'derived-era5-land-daily-statistics',
        {
            'variable': 'total_precipitation',
            'daily_statistic': daily_statistic,
            'year': years,
            'month': months,
            'day': [
                '01', '02', '03', '04', '05', '06',
                '07', '08', '09', '10', '11', '12',
                '13', '14', '15', '16', '17', '18',
                '19', '20', '21', '22', '23', '24',
                '25', '26', '27', '28', '29', '30', '31'
            ],
            'area': area,
            'format': 'netcdf',
        },
        output_file
    )
    
    print(f"Daily statistics data downloaded to {output_file}")
    return output_file

def process_precipitation_data(grib_file, threshold_mm=0.1):
    """
    Process ERA5-Land precipitation data
    
    Parameters:
    grib_file: path to downloaded GRIB file
    threshold_mm: minimum precipitation threshold (to match gauge detection limits)
    """
    
    import os
    
    # Check if file exists and get info
    if not os.path.exists(grib_file):
        raise FileNotFoundError(f"File not found: {grib_file}")
    
    file_size = os.path.getsize(grib_file)
    print(f"File size: {file_size / 1024 / 1024:.2f} MB")
    
    # Check if file is very small (likely an error file)
    if file_size < 1000:  # Less than 1KB suggests an error
        try:
            with open(grib_file, 'r') as f:
                content = f.read()
                print(f"File content (first 500 chars): {content[:500]}")
        except:
            print("Cannot read file as text - might be binary but corrupted")
        raise ValueError("Downloaded file appears to be corrupted or contains an error message")
    
    # Try to load the data - GRIB files should use cfgrib
    try:
        ds = xr.open_dataset(grib_file, engine='cfgrib')
        print("Successfully opened GRIB file with cfgrib engine")
    except Exception as e1:
        print(f"Failed with cfgrib: {e1}")
        # Fallback to other engines
        try:
            ds = xr.open_dataset(grib_file, engine='netcdf4')
            print("Successfully opened with netcdf4 engine")
        except Exception as e2:
            print(f"Failed with netcdf4: {e2}")
            raise ValueError(f"Could not open GRIB file. Primary error: {e1}")
    
    print(f"Dataset variables: {list(ds.variables.keys())}")
    print(f"Dataset dimensions: {ds.dims}")
    print(f"Dataset coordinates: {list(ds.coords.keys())}")
    
    # Get the precipitation variable (check common names)
    precip_var = None
    possible_names = ['tp', 'total_precipitation', 'precipitation', 'precip']
    
    for var_name in possible_names:
        if var_name in ds.variables:
            precip_var = var_name
            break
    
    if precip_var is None:
        raise ValueError(f"Could not find precipitation variable. Available variables: {list(ds.variables.keys())}")
    
    print(f"Using precipitation variable: {precip_var}")
    
    # Convert from m/hour to mm/hour (ERA5-Land precip is in meters)
    precip_mm = ds[precip_var] * 1000  # Convert m to mm
    
    print(f"Original data shape: {precip_mm.shape}")
    print(f"Data range: {precip_mm.min().values:.6f} to {precip_mm.max().values:.6f} mm")
    
    # Apply threshold if specified
    if threshold_mm > 0:
        precip_mm = precip_mm.where(precip_mm >= threshold_mm, 0)
        print(f"Applied {threshold_mm} mm threshold")
    
    # Calculate daily, monthly, and yearly sums
    daily_sum = precip_mm.resample(time='D').sum()
    monthly_sum = precip_mm.resample(time='M').sum()
    yearly_sum = precip_mm.resample(time='Y').sum()
    
    # Get London area average (if multiple grid points)
    if 'latitude' in precip_mm.dims and 'longitude' in precip_mm.dims:
        daily_avg = daily_sum.mean(dim=['latitude', 'longitude'])
        monthly_avg = monthly_sum.mean(dim=['latitude', 'longitude'])
        yearly_avg = yearly_sum.mean(dim=['latitude', 'longitude'])
        print("Averaged over spatial dimensions")
    else:
        daily_avg = daily_sum
        monthly_avg = monthly_sum  
        yearly_avg = yearly_sum
        print("No spatial averaging needed")
    
    return {
        'daily': daily_avg,
        'monthly': monthly_avg,
        'yearly': yearly_avg,
        'raw_data': precip_mm
    }

# Example usage
if __name__ == "__main__":
    # Start with a smaller test - just one month
    years = ['2023']
    months = ['01']  # Start with January only
    
    print("Downloading ERA5-Land hourly precipitation data in GRIB format...")
    
    # Method 1: Download hourly data in GRIB format (more reliable)
    hourly_file = download_era5_land_precipitation(
        years=years, 
        months=months, 
        output_file='london_era5land_hourly_precip.grib'
    )
    
    # Process the data
    results = process_precipitation_data(hourly_file, threshold_mm=0.1)
    
    # Display some statistics
    print("\n=== Precipitation Statistics for London ===")
    print(f"Daily mean: {results['daily'].mean().values:.2f} mm/day")
    print(f"Monthly mean: {results['monthly'].mean().values:.2f} mm/month") 
    print(f"Yearly total: {results['yearly'].sum().values:.2f} mm/year")

In [None]:
# def download_era5_land_precipitation(years, months, area=None, output_file='era5_land_precip.nc'):
#     """
#     Download ERA5-Land total precipitation data
    
#     Parameters:
#     years: list of years as strings (e.g., ['2023', '2024'])
#     months: list of months as strings (e.g., ['01', '02', '03'])
#     area: [North, West, South, East] bounding box for London: [51.7, -0.5, 51.3, 0.2]
#     output_file: output NetCDF filename
#     """
    
#     c = cdsapi.Client()
    
#     # Default to London area if not specified
#     if area is None:
#         area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
#     # For hourly data (original temporal resolution)
#     c.retrieve(
#         'reanalysis-era5-land',
#         {
#             'variable': 'total_precipitation',
#             'year': years,
#             'month': months,
#             'day': [
#                 '01', '02', '03', '04', '05', '06',
#                 '07', '08', '09', '10', '11', '12',
#                 '13', '14', '15', '16', '17', '18',
#                 '19', '20', '21', '22', '23', '24',
#                 '25', '26', '27', '28', '29', '30', '31'
#             ],
#             'time': [
#                 '00:00', '01:00', '02:00', '03:00', '04:00', '05:00',
#                 '06:00', '07:00', '08:00', '09:00', '10:00', '11:00',
#                 '12:00', '13:00', '14:00', '15:00', '16:00', '17:00',
#                 '18:00', '19:00', '20:00', '21:00', '22:00', '23:00'
#             ],
#             'area': area,  # [North, West, South, East]
#             'format': 'netcdf',
#         },
#         output_file
#     )
    
#     print(f"Data downloaded to {output_file}")
#     return output_file

# def download_era5_land_daily_stats(years, months, statistic='daily_sum', area=None, output_file='era5_land_daily.nc'):
#     """
#     Download ERA5-Land daily statistics (recommended for precipitation sums)
    
#     Parameters:
#     statistic: 'daily_sum', 'daily_mean', 'daily_maximum', 'daily_minimum'
#     """
    
#     c = cdsapi.Client()
    
#     if area is None:
#         area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
#     # For daily statistics (more efficient for precipitation sums)
#     c.retrieve(
#         'derived-era5-land-daily-statistics',
#         {
#             'variable': 'total_precipitation',
#             'statistic': statistic,
#             'year': years,
#             'month': months,
#             'day': [
#                 '01', '02', '03', '04', '05', '06',
#                 '07', '08', '09', '10', '11', '12',
#                 '13', '14', '15', '16', '17', '18',
#                 '19', '20', '21', '22', '23', '24',
#                 '25', '26', '27', '28', '29', '30', '31'
#             ],
#             'area': area,
#             'format': 'netcdf',
#         },
#         output_file
#     )
    
#     print(f"Daily statistics data downloaded to {output_file}")
#     return output_file

# def process_precipitation_data(netcdf_file, threshold_mm=0.1):
#     """
#     Process ERA5-Land precipitation data
    
#     Parameters:
#     netcdf_file: path to downloaded NetCDF file
#     threshold_mm: minimum precipitation threshold (to match gauge detection limits)
#     """
    
#     # Load the data
#     ds = xr.open_dataset(netcdf_file)
    
#     # Convert from m/hour to mm/hour (ERA5-Land precip is in meters)
#     precip_mm = ds['tp'] * 1000  # Convert m to mm
    
#     # Apply threshold if specified
#     if threshold_mm > 0:
#         precip_mm = precip_mm.where(precip_mm >= threshold_mm, 0)
    
#     # Calculate daily, monthly, and yearly sums
#     daily_sum = precip_mm.resample(time='D').sum()
#     monthly_sum = precip_mm.resample(time='M').sum()
#     yearly_sum = precip_mm.resample(time='Y').sum()
    
#     # Get London area average (if multiple grid points)
#     if len(precip_mm.dims) > 1:
#         daily_avg = daily_sum.mean(dim=['latitude', 'longitude'])
#         monthly_avg = monthly_sum.mean(dim=['latitude', 'longitude'])
#         yearly_avg = yearly_sum.mean(dim=['latitude', 'longitude'])
#     else:
#         daily_avg = daily_sum
#         monthly_avg = monthly_sum  
#         yearly_avg = yearly_sum
    
#     return {
#         'daily': daily_avg,
#         'monthly': monthly_avg,
#         'yearly': yearly_avg,
#         'raw_data': precip_mm
#     }

# # Example usage
# if __name__ == "__main__":
#     # Download data for London, 2023-2024
#     years = ['2023']
#     months = ['01', '02', '03', '04', '05', '06', 
#               '07', '08', '09', '10', '11', '12']
    
#     # Method 1: Download daily sums (recommended for precipitation)
#     daily_file = download_era5_land_daily_stats(
#         years=years, 
#         months=months, 
#         statistic='daily_sum',
#         output_file='london_era5land_daily_precip.nc'
#     )
    
#     # Method 2: Download hourly data (if you need hourly resolution)
#     # hourly_file = download_era5_land_precipitation(
#     #     years=['2023'], 
#     #     months=['01', '02'], 
#     #     output_file='london_era5land_hourly_precip.nc'
#     # )
    
#     # Process the data
#     results = process_precipitation_data(daily_file, threshold_mm=0.1)
    
#     # Display some statistics
#     print("\n=== Precipitation Statistics for London ===")
#     print(f"Daily mean: {results['daily'].mean().values:.2f} mm/day")
#     print(f"Monthly mean: {results['monthly'].mean().values:.2f} mm/month") 
#     print(f"Yearly total: {results['yearly'].sum().values:.2f} mm/year")

In [None]:
# import cdsapi
# import xarray as xr
# import pandas as pd
# import os

# def download_era5_land_precipitation(years, months, area=None, output_file='era5_land_precip.nc'):
#     """
#     Download ERA5-Land total precipitation data
    
#     Parameters:
#     years: list of years as strings (e.g., ['2023', '2024'])
#     months: list of months as strings (e.g., ['01', '02', '03'])
#     area: [North, West, South, East] bounding box for London: [51.7, -0.5, 51.3, 0.2]
#     output_file: output NetCDF filename
#     """
    
#     # Option 1: Use .cdsapirc file (recommended)
#     # Option 1: Use .cdsapirc file (recommended)
#     c = cdsapi.Client()
    
#     # Option 2: Set credentials programmatically (if .cdsapirc doesn't work)
#     # c = cdsapi.Client(url='https://cds.climate.copernicus.eu/api/v2',
#     #                   key='YOUR_UID:YOUR_API_KEY')
    
#     # Option 2: Set credentials programmatically (if .cdsapirc doesn't work)
#     # c = cdsapi.Client(url='https://cds.climate.copernicus.eu/api/v2',
#     #                   key='YOUR_UID:YOUR_API_KEY')
    
#     # Default to London area if not specified
#     if area is None:
#         area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
#     # For hourly data (original temporal resolution)
#     c.retrieve(
#         'reanalysis-era5-land',
#         {
#             'variable': 'total_precipitation',
#             'year': years,
#             'month': months,
#             'day': [
#                 '01', '02', '03', '04', '05', '06',
#                 '07', '08', '09', '10', '11', '12',
#                 '13', '14', '15', '16', '17', '18',
#                 '19', '20', '21', '22', '23', '24',
#                 '25', '26', '27', '28', '29', '30', '31'
#             ],
#             'time': [
#                 '00:00', '01:00', '02:00', '03:00', '04:00', '05:00',
#                 '06:00', '07:00', '08:00', '09:00', '10:00', '11:00',
#                 '12:00', '13:00', '14:00', '15:00', '16:00', '17:00',
#                 '18:00', '19:00', '20:00', '21:00', '22:00', '23:00'
#             ],
#             'area': area,  # [North, West, South, East]
#             'format': 'netcdf',
#         },
#         output_file
#     )
    
#     print(f"Data downloaded to {output_file}")
#     return output_file

# def download_era5_land_daily_stats(years, months, daily_statistic='daily_sum', area=None, output_file='era5_land_daily.nc'):
#     """
#     Download ERA5-Land daily statistics (recommended for precipitation sums)
    
#     Parameters:
#     daily_statistic: 'daily_sum', 'daily_mean', 'daily_maximum', 'daily_minimum'
#     """
    
#     c = cdsapi.Client()
    
#     if area is None:
#         area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
#     # For daily statistics - using the correct dataset name and parameters
#     c.retrieve(
#         'derived-era5-land-daily-statistics',
#         {
#             'variable': 'total_precipitation',
#             'daily_statistic': daily_statistic,
#             'year': years,
#             'month': months,
#             'day': [
#                 '01', '02', '03', '04', '05', '06',
#                 '07', '08', '09', '10', '11', '12',
#                 '13', '14', '15', '16', '17', '18',
#                 '19', '20', '21', '22', '23', '24',
#                 '25', '26', '27', '28', '29', '30', '31'
#             ],
#             'area': area,
#             'format': 'netcdf',
#         },
#         output_file
#     )
    
#     print(f"Daily statistics data downloaded to {output_file}")
#     return output_file

# def process_precipitation_data(netcdf_file, threshold_mm=0.1):
#     """
#     Process ERA5-Land precipitation data
    
#     Parameters:
#     netcdf_file: path to downloaded NetCDF file
#     threshold_mm: minimum precipitation threshold (to match gauge detection limits)
#     """
    
#     # Load the data
#     ds = xr.open_dataset(netcdf_file)
    
#     # Convert from m/hour to mm/hour (ERA5-Land precip is in meters)
#     precip_mm = ds['tp'] * 1000  # Convert m to mm
    
#     # Apply threshold if specified
#     if threshold_mm > 0:
#         precip_mm = precip_mm.where(precip_mm >= threshold_mm, 0)
    
#     # Calculate daily, monthly, and yearly sums
#     daily_sum = precip_mm.resample(time='D').sum()
#     monthly_sum = precip_mm.resample(time='M').sum()
#     yearly_sum = precip_mm.resample(time='Y').sum()
    
#     # Get London area average (if multiple grid points)
#     if len(precip_mm.dims) > 1:
#         daily_avg = daily_sum.mean(dim=['latitude', 'longitude'])
#         monthly_avg = monthly_sum.mean(dim=['latitude', 'longitude'])
#         yearly_avg = yearly_sum.mean(dim=['latitude', 'longitude'])
#     else:
#         daily_avg = daily_sum
#         monthly_avg = monthly_sum  
#         yearly_avg = yearly_sum
    
#     return {
#         'daily': daily_avg,
#         'monthly': monthly_avg,
#         'yearly': yearly_avg,
#         'raw_data': precip_mm
#     }

# # Example usage
# if __name__ == "__main__":
#     # Download data for London, 2023-2024
#     years = ['2023']
#     months = ['01', '02', '03', '04', '05', '06', 
#               '07', '08', '09', '10', '11', '12']
    
#     # Method 1: Download daily sums (recommended for precipitation)
#     daily_file = download_era5_land_daily_stats(
#         years=years, 
#         months=months, 
#         daily_statistic='daily_sum',
#         output_file='london_era5land_daily_precip.nc'
#     )
    
#     # Method 2: Download hourly data (if you need hourly resolution)
#     # hourly_file = download_era5_land_precipitation(
#     #     years=['2023'], 
#     #     months=['01', '02'], 
#     #     output_file='london_era5land_hourly_precip.nc'
#     # )
    
#     # Process the data
#     results = process_precipitation_data(daily_file, threshold_mm=0.1)
    
#     # Display some statistics
#     print("\n=== Precipitation Statistics for London ===")
#     print(f"Daily mean: {results['daily'].mean().values:.2f} mm/day")
#     print(f"Monthly mean: {results['monthly'].mean().values:.2f} mm/month") 
#     print(f"Yearly total: {results['yearly'].sum().values:.2f} mm/year")

In [None]:
import cdsapi
import xarray as xr
import pandas as pd
import os

def download_era5_land_precipitation(years, months, area=None, output_file='era5_land_precip.nc'):
    """
    Download ERA5-Land total precipitation data
    
    Parameters:
    years: list of years as strings (e.g., ['2023', '2024'])
    months: list of months as strings (e.g., ['01', '02', '03'])
    area: [North, West, South, East] bounding box for London: [51.7, -0.5, 51.3, 0.2]
    output_file: output NetCDF filename
    """
    
    # Option 1: Use .cdsapirc file (recommended)
    # Option 1: Use .cdsapirc file (recommended)
    c = cdsapi.Client()
    
    # Option 2: Set credentials programmatically (if .cdsapirc doesn't work)
    # c = cdsapi.Client(url='https://cds.climate.copernicus.eu/api/v2',
    #                   key='YOUR_UID:YOUR_API_KEY')
    
    # Option 2: Set credentials programmatically (if .cdsapirc doesn't work)
    # c = cdsapi.Client(url='https://cds.climate.copernicus.eu/api/v2',
    #                   key='YOUR_UID:YOUR_API_KEY')
    
    # Default to London area if not specified
    if area is None:
        area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
    # For hourly data (original temporal resolution)
    c.retrieve(
        'reanalysis-era5-land',
        {
            'variable': 'total_precipitation',
            'year': years,
            'month': months,
            'day': [
                '01', '02', '03', '04', '05', '06',
                '07', '08', '09', '10', '11', '12',
                '13', '14', '15', '16', '17', '18',
                '19', '20', '21', '22', '23', '24',
                '25', '26', '27', '28', '29', '30', '31'
            ],
            'time': [
                '00:00', '01:00', '02:00', '03:00', '04:00', '05:00',
                '06:00', '07:00', '08:00', '09:00', '10:00', '11:00',
                '12:00', '13:00', '14:00', '15:00', '16:00', '17:00',
                '18:00', '19:00', '20:00', '21:00', '22:00', '23:00'
            ],
            'area': area,  # [North, West, South, East]
            'format': 'netcdf',
        },
        output_file
    )
    
    print(f"Data downloaded to {output_file}")
    return output_file

def download_era5_land_daily_stats(years, months, daily_statistic='daily_sum', area=None, output_file='era5_land_daily.nc'):
    """
    Download ERA5-Land daily statistics (recommended for precipitation sums)
    
    Parameters:
    daily_statistic: 'daily_sum', 'daily_mean', 'daily_maximum', 'daily_minimum'
    """
    
    c = cdsapi.Client()
    
    if area is None:
        area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
    # For daily statistics - using the correct dataset name and parameters
    c.retrieve(
        'derived-era5-land-daily-statistics',
        {
            'variable': 'total_precipitation',
            'daily_statistic': daily_statistic,
            'year': years,
            'month': months,
            'day': [
                '01', '02', '03', '04', '05', '06',
                '07', '08', '09', '10', '11', '12',
                '13', '14', '15', '16', '17', '18',
                '19', '20', '21', '22', '23', '24',
                '25', '26', '27', '28', '29', '30', '31'
            ],
            'area': area,
            'format': 'netcdf',
        },
        output_file
    )
    
    print(f"Daily statistics data downloaded to {output_file}")
    return output_file

def process_precipitation_data(netcdf_file, threshold_mm=0.1):
    """
    Process ERA5-Land precipitation data
    
    Parameters:
    netcdf_file: path to downloaded NetCDF file
    threshold_mm: minimum precipitation threshold (to match gauge detection limits)
    """
    
    # Load the data
    ds = xr.open_dataset(netcdf_file)
    
    # Convert from m/hour to mm/hour (ERA5-Land precip is in meters)
    precip_mm = ds['tp'] * 1000  # Convert m to mm
    
    # Apply threshold if specified
    if threshold_mm > 0:
        precip_mm = precip_mm.where(precip_mm >= threshold_mm, 0)
    
    # Calculate daily, monthly, and yearly sums
    daily_sum = precip_mm.resample(time='D').sum()
    monthly_sum = precip_mm.resample(time='M').sum()
    yearly_sum = precip_mm.resample(time='Y').sum()
    
    # Get London area average (if multiple grid points)
    if len(precip_mm.dims) > 1:
        daily_avg = daily_sum.mean(dim=['latitude', 'longitude'])
        monthly_avg = monthly_sum.mean(dim=['latitude', 'longitude'])
        yearly_avg = yearly_sum.mean(dim=['latitude', 'longitude'])
    else:
        daily_avg = daily_sum
        monthly_avg = monthly_sum  
        yearly_avg = yearly_sum
    
    return {
        'daily': daily_avg,
        'monthly': monthly_avg,
        'yearly': yearly_avg,
        'raw_data': precip_mm
    }

# Example usage
if __name__ == "__main__":
    # Start with a smaller test - just one month
    years = ['2023']
    months = ['01']  # Start with January only
    
    print("Downloading ERA5-Land hourly precipitation data...")
    
    # Method 1: Download hourly data (more reliable)
    hourly_file = download_era5_land_precipitation(
        years=years, 
        months=months, 
        output_file='london_era5land_hourly_precip.nc'
    )
    
    # Method 2: Try daily statistics (if hourly works)
    # daily_file = download_era5_land_daily_stats(
    #     years=years, 
    #     months=months, 
    #     daily_statistic='daily_sum',
    #     output_file='london_era5land_daily_precip.nc'
    # )
    
    # Process the data
    results = process_precipitation_data(hourly_file, threshold_mm=0.1)
    
    # Display some statistics
    print("\n=== Precipitation Statistics for London ===")
    print(f"Daily mean: {results['daily'].mean().values:.2f} mm/day")
    print(f"Monthly mean: {results['monthly'].mean().values:.2f} mm/month") 
    print(f"Yearly total: {results['yearly'].sum().values:.2f} mm/year")

In [None]:
import cdsapi
import xarray as xr
import pandas as pd
import os

def download_era5_land_precipitation(years, months, area=None, output_file='era5_land_precip.nc'):
    """
    Download ERA5-Land total precipitation data
    
    Parameters:
    years: list of years as strings (e.g., ['2023', '2024'])
    months: list of months as strings (e.g., ['01', '02', '03'])
    area: [North, West, South, East] bounding box for London: [51.7, -0.5, 51.3, 0.2]
    output_file: output NetCDF filename
    """
    
    # Option 1: Use .cdsapirc file (recommended)
    # Option 1: Use .cdsapirc file (recommended)
    c = cdsapi.Client()
    
    # Option 2: Set credentials programmatically (if .cdsapirc doesn't work)
    # c = cdsapi.Client(url='https://cds.climate.copernicus.eu/api/v2',
    #                   key='YOUR_UID:YOUR_API_KEY')
    
    # Option 2: Set credentials programmatically (if .cdsapirc doesn't work)
    # c = cdsapi.Client(url='https://cds.climate.copernicus.eu/api/v2',
    #                   key='YOUR_UID:YOUR_API_KEY')
    
    # Default to London area if not specified
    if area is None:
        area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
    # For hourly data (original temporal resolution)
    c.retrieve(
        'reanalysis-era5-land',
        {
            'variable': 'total_precipitation',
            'year': years,
            'month': months,
            'day': [
                '01', '02', '03', '04', '05', '06',
                '07', '08', '09', '10', '11', '12',
                '13', '14', '15', '16', '17', '18',
                '19', '20', '21', '22', '23', '24',
                '25', '26', '27', '28', '29', '30', '31'
            ],
            'time': [
                '00:00', '01:00', '02:00', '03:00', '04:00', '05:00',
                '06:00', '07:00', '08:00', '09:00', '10:00', '11:00',
                '12:00', '13:00', '14:00', '15:00', '16:00', '17:00',
                '18:00', '19:00', '20:00', '21:00', '22:00', '23:00'
            ],
            'area': area,  # [North, West, South, East]
            'format': 'netcdf',
        },
        output_file
    )
    
    print(f"Data downloaded to {output_file}")
    return output_file

def download_era5_land_daily_stats(years, months, daily_statistic='daily_sum', area=None, output_file='era5_land_daily.nc'):
    """
    Download ERA5-Land daily statistics (recommended for precipitation sums)
    
    Parameters:
    daily_statistic: 'daily_sum', 'daily_mean', 'daily_maximum', 'daily_minimum'
    """
    
    c = cdsapi.Client()
    
    if area is None:
        area = [51.7, -0.5, 51.3, 0.2]  # London bounding box
    
    # For daily statistics - using the correct dataset name and parameters
    c.retrieve(
        'derived-era5-land-daily-statistics',
        {
            'variable': 'total_precipitation',
            'daily_statistic': daily_statistic,
            'year': years,
            'month': months,
            'day': [
                '01', '02', '03', '04', '05', '06',
                '07', '08', '09', '10', '11', '12',
                '13', '14', '15', '16', '17', '18',
                '19', '20', '21', '22', '23', '24',
                '25', '26', '27', '28', '29', '30', '31'
            ],
            'area': area,
            'format': 'netcdf',
        },
        output_file
    )
    
    print(f"Daily statistics data downloaded to {output_file}")
    return output_file

def process_precipitation_data(netcdf_file, threshold_mm=0.1):
    """
    Process ERA5-Land precipitation data
    
    Parameters:
    netcdf_file: path to downloaded NetCDF file
    threshold_mm: minimum precipitation threshold (to match gauge detection limits)
    """
    
    import os
    
    # Check if file exists and get info
    if not os.path.exists(netcdf_file):
        raise FileNotFoundError(f"File not found: {netcdf_file}")
    
    file_size = os.path.getsize(netcdf_file)
    print(f"File size: {file_size / 1024 / 1024:.2f} MB")
    
    # Check if file is very small (likely an error file)
    if file_size < 1000:  # Less than 1KB suggests an error
        with open(netcdf_file, 'r') as f:
            content = f.read()
            print(f"File content (first 500 chars): {content[:500]}")
        raise ValueError("Downloaded file appears to be corrupted or contains an error message")
    
    # Try to load the data with explicit engine
    try:
        ds = xr.open_dataset(netcdf_file, engine='netcdf4')
        print("Successfully opened with netcdf4 engine")
    except Exception as e1:
        print(f"Failed with netcdf4: {e1}")
        try:
            ds = xr.open_dataset(netcdf_file, engine='scipy')
            print("Successfully opened with scipy engine")
        except Exception as e2:
            print(f"Failed with scipy: {e2}")
            try:
                # Try cfgrib for GRIB format
                ds = xr.open_dataset(netcdf_file, engine='cfgrib')
                print("Successfully opened with cfgrib engine (GRIB format)")
            except Exception as e3:
                print(f"Failed with cfgrib: {e3}")
                raise ValueError(f"Could not open file with any engine. Engines tried: netcdf4, scipy, cfgrib")
    
    print(f"Dataset variables: {list(ds.variables.keys())}")
    print(f"Dataset dimensions: {ds.dims}")
    print(f"Dataset coordinates: {list(ds.coords.keys())}")
    
    # Get the precipitation variable (check common names)
    precip_var = None
    possible_names = ['tp', 'total_precipitation', 'precipitation', 'precip']
    
    for var_name in possible_names:
        if var_name in ds.variables:
            precip_var = var_name
            break
    
    if precip_var is None:
        raise ValueError(f"Could not find precipitation variable. Available variables: {list(ds.variables.keys())}")
    
    print(f"Using precipitation variable: {precip_var}")
    
    # Convert from m/hour to mm/hour (ERA5-Land precip is in meters)
    precip_mm = ds[precip_var] * 1000  # Convert m to mm
    
    print(f"Original data shape: {precip_mm.shape}")
    print(f"Data range: {precip_mm.min().values:.6f} to {precip_mm.max().values:.6f} mm")
    
    # Apply threshold if specified
    if threshold_mm > 0:
        precip_mm = precip_mm.where(precip_mm >= threshold_mm, 0)
        print(f"Applied {threshold_mm} mm threshold")
    
    # Calculate daily, monthly, and yearly sums
    daily_sum = precip_mm.resample(time='D').sum()
    monthly_sum = precip_mm.resample(time='M').sum()
    yearly_sum = precip_mm.resample(time='Y').sum()
    
    # Get London area average (if multiple grid points)
    if 'latitude' in precip_mm.dims and 'longitude' in precip_mm.dims:
        daily_avg = daily_sum.mean(dim=['latitude', 'longitude'])
        monthly_avg = monthly_sum.mean(dim=['latitude', 'longitude'])
        yearly_avg = yearly_sum.mean(dim=['latitude', 'longitude'])
        print("Averaged over spatial dimensions")
    else:
        daily_avg = daily_sum
        monthly_avg = monthly_sum  
        yearly_avg = yearly_sum
        print("No spatial averaging needed")
    
    return {
        'daily': daily_avg,
        'monthly': monthly_avg,
        'yearly': yearly_avg,
        'raw_data': precip_mm
    }

# Example usage
if __name__ == "__main__":
    # Start with a smaller test - just one month
    years = ['2023']
    months = ['01']  # Start with January only
    
    print("Downloading ERA5-Land hourly precipitation data...")
    
    # Method 1: Download hourly data (more reliable)
    hourly_file = download_era5_land_precipitation(
        years=years, 
        months=months, 
        output_file='london_era5land_hourly_precip.nc'
    )
    
    # Method 2: Try daily statistics (if hourly works)
    # daily_file = download_era5_land_daily_stats(
    #     years=years, 
    #     months=months, 
    #     daily_statistic='daily_sum',
    #     output_file='london_era5land_daily_precip.nc'
    # )
    
    # Process the data
    results = process_precipitation_data(hourly_file, threshold_mm=0.1)
    
    # Display some statistics
    print("\n=== Precipitation Statistics for London ===")
    print(f"Daily mean: {results['daily'].mean().values:.2f} mm/day")
    print(f"Monthly mean: {results['monthly'].mean().values:.2f} mm/month") 
    print(f"Yearly total: {results['yearly'].sum().values:.2f} mm/year")