# Load & Normalise ERA5 using Climatology

In [None]:
import os
import glob
import xarray as xr
import pandas as pd
import numpy as np

def process_era5_normalise_file(file_path, output_base_folder, mean_era5, sigma_era5,
                                lat_min=47, lat_max=63, lon_min=-15, lon_max=5,
                                year_min=1984, year_max=1988):
    """
    Normalise one ERA5 Zarr file and export each variable as a regional CSV.

    Steps:
    - Opens the Zarr store at ``file_path`` (closed again on exit, including
      the early return for out-of-range years).
    - Skips the file when its timestamp's year is outside
      ``[year_min, year_max]``.
    - For every data variable, computes ``(value - mean) / sigma`` using the
      first entry of ``mean_era5`` / ``sigma_era5``.
    - Keeps only rows inside the latitude/longitude box.
    - Writes ``<output_base_folder>/<year>/<var>_<YYYYMMDD>.csv``.

    Args:
        file_path: Path to a single ERA5 Zarr store.
        output_base_folder: Root folder; one sub-folder per year is created.
        mean_era5: Climatological mean, indexed as ``mean_era5[0, ...]``.
        sigma_era5: Climatological standard deviation, indexed the same way.
        lat_min, lat_max: Inclusive latitude bounds.
        lon_min, lon_max: Inclusive longitude bounds. Negative values are
            shifted by +360. If the shifted lower bound is greater than the
            upper bound, the box is treated as wrapping across 0 degrees.
        year_min, year_max: Inclusive range of years to process.

    Returns:
        None. Output is written to disk and each saved path is printed.

    Assumes dataset longitudes are in [0, 360] and that the ``time``
    coordinate holds exactly one timestamp (``.item()`` requires that).
    """
    # Work on local copies so the caller-visible arguments are never rebound.
    lon_lo = lon_min + 360 if lon_min < 0 else lon_min
    lon_hi = lon_max + 360 if lon_max < 0 else lon_max
    # e.g. 345..5 wraps across the prime meridian; 10..20 does not.
    wraps_meridian = lon_lo > lon_hi

    # The context manager guarantees the store is closed on every exit path.
    with xr.open_zarr(file_path, consolidated=True) as ds:
        # Extract the time coordinate (assuming it's a scalar timestamp)
        time_val = pd.to_datetime(ds.time.values.item())

        if not (year_min <= time_val.year <= year_max):
            return

        year_folder = os.path.join(output_base_folder, str(time_val.year))
        os.makedirs(year_folder, exist_ok=True)

        # Loop-invariant climatology slices.
        # NOTE(review): index 0 is applied to every variable in the file;
        # adjust if the climatology holds one entry per variable.
        mean_slice = mean_era5[0, ...]
        sigma_slice = sigma_era5[0, ...]

        for var in ds.data_vars:
            normalized_data = (ds[var] - mean_slice) / sigma_slice

            # Flatten to a table with one row per coordinate combination.
            df = normalized_data.to_dataframe().reset_index()

            in_lat = (df['latitude'] >= lat_min) & (df['latitude'] <= lat_max)
            if wraps_meridian:
                # Box spans 0 degrees: keep either side of the seam.
                in_lon = (df['longitude'] >= lon_lo) | (df['longitude'] <= lon_hi)
            else:
                # Ordinary box: both bounds must hold, otherwise nothing
                # would be filtered out.
                in_lon = (df['longitude'] >= lon_lo) & (df['longitude'] <= lon_hi)

            # Explicit copy so the column edits below act on an independent
            # frame rather than a slice of the unfiltered one.
            df = df.loc[in_lat & in_lon].copy()

            # Create a single 'date' column from the time coordinate if it exists
            if 'time' in df.columns:
                df['date'] = pd.to_datetime(df['time']).dt.date
                df = df.drop(columns=['time'])

            # Name the CSV file using the variable name and date
            csv_filename = f"{var}_{time_val.strftime('%Y%m%d')}.csv"
            csv_path = os.path.join(year_folder, csv_filename)

            df.to_csv(csv_path, index=False)
            print(f"Saved {csv_path}")


# Driver: normalise every ERA5 Zarr file found in `zarr_folder` and write the
# results as per-year CSV files under `output_base_folder`.

zarr_folder = r"/Users/faresg/Desktop/chaosbench/data/era5"
# This pattern should match files like "era5_full_1.5deg_YYYYMMDD.zarr"
zarr_pattern = os.path.join(zarr_folder, "era5_full_1.5deg_*.zarr")

# Output folder for CSVs
output_base_folder = r"/Users/faresg/Desktop/chaosbench/data/normalised_era5_csv"

# Gather all matching Zarr files, ordered by filename
zarr_files = sorted(glob.glob(zarr_pattern))

# Open the climatology store once and read both statistics from it.
# `.values` loads the data into NumPy arrays, so they stay valid after the
# store is closed. The two trailing axes of length 1 let the arrays broadcast
# against the gridded data inside process_era5_normalise_file.
climatology_path = r"/Users/faresg/Desktop/chaosbench/data/climatology/climatology_era5.zarr"
with xr.open_dataset(climatology_path, engine="zarr") as climatology:
    mean_era5 = climatology["mean"].values[:, np.newaxis, np.newaxis]
    sigma_era5 = climatology["sigma"].values[:, np.newaxis, np.newaxis]

# Process each file
for file_path in zarr_files:
    process_era5_normalise_file(file_path, output_base_folder, mean_era5=mean_era5, sigma_era5=sigma_era5)