In [1]:
# MEJNw2
import sys
import glob
import os
sys.path.append("./")

import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
# import seaborn as sns
# import cartopy.crs as ccrs
# import cartopy.feature as cfeature

In [11]:
def unzip_files_and_rename(directory: str):
  zip_files = glob.glob(f"{directory}/*.zip")
  print(f"Found {len(zip_files)} zip files")

  for file in zip_files:
    print("PROCESSING:", file)

    # Unzip the file:
    with zipfile.ZipFile(file, 'r') as zip_ref:
      zip_ref.extractall(directory)

    # All files uzip to data_0.nc, so we need to rename it to the original filename.
    filename = f"{directory}/data_0.nc"

    # Rename to match the original filename
    new_filename = f"{directory}/{file.split('/')[-1]}".replace(".zip", ".nc")
    os.rename(filename, new_filename)
    print(f"Renamed {filename} to {new_filename}")


def rename_nc_files_month_year(directory: str):
  for file in glob.glob(f"{directory}/*.nc"):
    print(file)

    ds = xr.open_dataset(file)
    month: int = ds.valid_time.dt.month.values[0]
    year: int = ds.valid_time.dt.year.values[0]
    print(f"This file is for {month}/{year}")

    # Rename to to the month and year
    new_filename = f"{directory}/../{year}_{month:02d}.nc"

    # If the file already exists, skip it
    if os.path.exists(new_filename):
      print(f"File {new_filename} already exists, skipping")
      continue

    # Move the file to the new filename
    os.rename(file, new_filename)
    print(f"Renamed {file} to {new_filename}")

# Unzip all files in the directory
# unzip_files_and_rename("./data/era5/zips")
rename_nc_files_month_year("./data/era5/zips")

./data/era5/zips/8a89f97797076b239276ef8b1789b6eb.nc
This file is for 4/2023
Renamed ./data/era5/zips/8a89f97797076b239276ef8b1789b6eb.nc to ./data/era5/zips/../2023_04.nc
./data/era5/zips/eb930c850e96fff215f8e509f0079ebd.nc
This file is for 5/2023
Renamed ./data/era5/zips/eb930c850e96fff215f8e509f0079ebd.nc to ./data/era5/zips/../2023_05.nc
./data/era5/zips/d7a29a606662c91dffe6e3f798c83f21.nc
This file is for 9/2023
File ./data/era5/zips/../2023_09.nc already exists, skipping
./data/era5/zips/dccee8aaa0279dc3962c45d10fc3d2d9.nc
This file is for 8/2023
File ./data/era5/zips/../2023_08.nc already exists, skipping
./data/era5/zips/cf150857ebe4aab08baf73c37dd0747.nc
This file is for 11/2023
Renamed ./data/era5/zips/cf150857ebe4aab08baf73c37dd0747.nc to ./data/era5/zips/../2023_11.nc
./data/era5/zips/9aded7367a299f6a4ad8c43cb7f951bb.nc
This file is for 1/2023
Renamed ./data/era5/zips/9aded7367a299f6a4ad8c43cb7f951bb.nc to ./data/era5/zips/../2023_01.nc
./data/era5/zips/4c34c5057affa8d605d5

In [None]:
def calculate_monthly_hourly_means(filename: str):
  """Compress the data into a smaller file by calculating the mean of the data for each month and hour."""
  ds = xr.open_dataset(
    filename,
    # chunks={'valid_time': -1, 'latitude': 100, 'longitude': 100}
  )
  print("Opened dataset")

  # Get the month that this data represents
  month = ds.valid_time.dt.month.values[0]
  year = ds.valid_time.dt.year.values[0]

  # Create new coordinate arrays with round numbers
  new_lats = np.arange(-90, 90, 0.5)  # 0.5° grid from -90 to 90
  new_lons = np.arange(0, 360, 0.5)   # 0.5° grid from 0 to 359.5

  # Interpolate to the new grid (this preserves data better than simple slicing)
  downsampled = ds.interp(latitude=new_lats, longitude=new_lons)

  downsampled = downsampled.assign_coords(
    month=downsampled.valid_time.dt.month,
    hour=downsampled.valid_time.dt.hour
  )

  print("Assigned coords")

  # Group by month and hour and compute the mean
  # This will handle both t2m and ssrd variables
  monthly_hourly_means = downsampled.groupby(['month', 'hour']).mean()
  print(monthly_hourly_means)

  # Change the t2m variable to be float32
  monthly_hourly_means = monthly_hourly_means.astype({'t2m': 'float32', 'ssrd': 'float32'})

  print("Grouped by month and hour")

  # Save the results to a new, much smaller NetCDF file
  monthly_hourly_means.to_netcdf(f'outputs/month_hour_averages_{year}_{month}.nc')

  print("Saved results")

  # Close the dataset
  ds.close()


netcdf_files = sorted(glob.glob("./data/era5/*.nc"))
for file in netcdf_files:
  print(file)
  calculate_monthly_hourly_means(file)
  print("DONE")

./data/era5/2023_01.nc
Opened dataset
Assigned coords
<xarray.Dataset> Size: 100MB
Dimensions:    (month: 1, hour: 24, latitude: 362, longitude: 720)
Coordinates:
  * month      (month) int64 8B 1
  * hour       (hour) int64 192B 0 1 2 3 4 5 6 7 8 ... 16 17 18 19 20 21 22 23
    number     int64 8B 0
  * latitude   (latitude) float64 3kB -90.0 -89.5 -89.0 -88.5 ... 89.5 90.0 90.5
  * longitude  (longitude) float64 6kB 0.0 0.5 1.0 1.5 ... 358.5 359.0 359.5
Data variables:
    t2m        (latitude, longitude, month, hour) float64 50MB 242.6 ... nan
    ssrd       (latitude, longitude, month, hour) float64 50MB 3.327e+07 ... nan
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2025-02-21T10:42 GRIB to CDM+CF via cfgrib-0.9.1...
Grouped by m