In [14]:
import ee
import geemap
import re
import pandas as pd
from scipy.signal import savgol_filter

In [15]:
# Google Cloud project that Earth Engine requests are made under.
cloud_project = 'heat-index-forecasting'

# Try to initialise Earth Engine directly; if that fails, run the
# authentication flow and initialise again.
# `except Exception` (rather than a bare `except`) lets KeyboardInterrupt and
# SystemExit propagate, so interrupting the cell does not start authentication.
try:
  ee.Initialize(project=cloud_project)
except Exception:
  ee.Authenticate()
  ee.Initialize(project=cloud_project)

# Station name -> point coordinates passed to ee.Geometry.Point
# (Earth Engine points are given as [longitude, latitude]).
station_coordinates = {
  "Sinait": [120.459762, 17.89019],
  "Tayabas": [121.596575, 14.018428],
  "Tanay": [121.36927, 14.581167],
  "Tuguegarao": [121.758469, 17.647678],
  "Virac": [124.209834, 13.576558],
  "Calapan": [121.1896667, 13.409775],
  "CLSU": [120.9368, 15.73586],
  "Clark": [120.5616667, 15.1717],
  "Daet": [122.982559, 14.128689],
  "Dagupan": [120.352045, 16.086784]
}

# Each station is represented by its point buffered by 5000 (same radius for
# every station); this buffered geometry is what gets reduced over later.
stations = {
  name: ee.Geometry.Point(coords).buffer(5000)
  for name, coords in station_coordinates.items()
}

# Extraction period handed to ImageCollection.filterDate.
start_date = "2014-01-01"
end_date   = "2024-01-01"

In [16]:
# Ordered unit-conversion rules:
#   (regex searched in the lower-cased band name, converter,
#    original unit, converted unit, suffix appended to the column name)
# The FIRST matching rule wins, so the energy rule must come before the generic
# "sum" rule: every accumulated-energy band name also contains "sum" and would
# otherwise be scaled by 1000 and labelled as millimetres.
# `\w*` lets the energy rule match names where "radiation" and "_sum" are not
# adjacent (e.g. "surface_solar_radiation_downwards_sum").
_ERA5_UNIT_CONVERSIONS = [
    (r"temperature", lambda x: x - 273.15, "K", "°C", "_C"),
    (r"pressure", lambda x: x / 100, "Pa", "hPa", "_hPa"),
    (r"(u_component|v_component|wind)", lambda x: x * 3.6, "m/s", "km/h", "_kmh"),
    (r"snow_depth", lambda x: x * 100, "m", "cm", "_cm"),
    (r"(radiation|flux|heat)\w*_sum", lambda x: x / 86400, "J/m²/day", "W/m²", "_Wm2"),
    (r"(sum|precipitation|evaporation|runoff)(_min|_max)?", lambda x: x * 1000, "m", "mm/day", "_mm"),
]


def _aggregate_band_values(fc, var_names, chunk_size):
    """Fetch the ``date`` property and every property in ``var_names`` from ``fc``.

    Properties are requested ``chunk_size`` at a time so that each ``getInfo()``
    call stays small. A chunk whose request fails is reported and skipped, so
    its bands are simply absent from the result.

    Returns a dict mapping ``"date"`` and each fetched band name to a list.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    data_dict = {'date': fc.aggregate_array('date').getInfo()}
    for start in range(0, len(var_names), chunk_size):
        subset = var_names[start:start + chunk_size]
        try:
            data_dict.update(
                ee.Dictionary.fromLists(
                    subset, [fc.aggregate_array(v) for v in subset]
                ).getInfo()
            )
        except Exception as e:
            # Deliberate best-effort: keep the bands that did download.
            print(f"⚠️ Skipping bands {subset} due to memory issue: {e}")

    # Pad shorter lists with None so every column has one entry per date.
    # NOTE(review): if aggregate_array omits features whose property is null,
    # padding at the tail shifts the remaining values against their dates.
    # Confirm against the Earth Engine docs before relying on partial bands.
    n_rows = len(data_dict['date'])
    for key, values in list(data_dict.items()):
        if len(values) < n_rows:
            data_dict[key] = values + [None] * (n_rows - len(values))
    return data_dict


def _convert_era5_units(df):
    """Apply the first matching rule of ``_ERA5_UNIT_CONVERSIONS`` to each column.

    The ``date`` column and all-null columns are left untouched. Converted
    columns are renamed by appending the rule's suffix. Returns the DataFrame
    with converted values and renamed columns.
    """
    rename_map = {}
    for col in df.columns:
        if col == "date" or df[col].isnull().all():
            continue
        band = col.lower()
        for pattern, convert, _orig_unit, _conv_unit, suffix in _ERA5_UNIT_CONVERSIONS:
            if re.search(pattern, band):
                df[col] = convert(df[col])
                rename_map[col] = col + suffix
                break  # first matching rule wins

    return df.rename(columns=rename_map) if rename_map else df


def extract_and_process_era5(point, dataset, start_date, end_date, station_name, chunk_size=5):
    """
    Extract per-image mean band values over ``point`` from ``dataset``,
    convert units, save the result as CSV and return it.

    Parameters
    ----------
    point : geometry passed to ``filterBounds`` and ``reduceRegion``.
    dataset : image collection to read from.
    start_date, end_date : bounds passed to ``filterDate``.
    station_name : used to build the output file name ``{station_name}_ERA5.csv``.
    chunk_size : number of bands downloaded per request (must be >= 1).

    Returns
    -------
    pandas.DataFrame with a ``date`` column plus one column per downloaded
    band. Converted columns carry a unit suffix:
    ``_C`` (K -> °C), ``_hPa`` (Pa -> hPa), ``_kmh`` (m/s -> km/h),
    ``_cm`` (m -> cm), ``_Wm2`` (J/m² per day -> W/m²), ``_mm`` (m -> mm).

    Side effects: writes ``{station_name}_ERA5.csv`` to the current working
    directory and prints a one-line summary.

    Raises
    ------
    ValueError if ``chunk_size`` is smaller than 1.
    """

    # Filter dataset by date and region before extraction
    dataset_filtered = dataset.filterDate(start_date, end_date).filterBounds(point)

    def extract(img):
        # One feature per image: mean of every band over the geometry, tagged
        # with the image date.
        vals = img.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=point,
            scale=11132,      # ERA5-Land native resolution (~11 km)
            maxPixels=1e13,
            bestEffort=True
        )
        return ee.Feature(None, vals).set("date", img.date().format("YYYY-MM-dd"))

    fc = dataset_filtered.map(extract)

    # Band names are read from the first image of the filtered collection.
    var_names = dataset_filtered.first().bandNames().getInfo()

    data_dict = _aggregate_band_values(fc, var_names, chunk_size)

    df = pd.DataFrame(data_dict)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    df = _convert_era5_units(df)

    # Save the processed DataFrame to CSV
    out_file = f"{station_name}_ERA5.csv"
    df.to_csv(out_file, index=False)

    print(f"Saved {out_file} ({df.shape[0]} rows)")

    return df

In [17]:
# Image collection every station is extracted from.
dataset = ee.ImageCollection('ECMWF/ERA5_LAND/DAILY_AGGR')

# Station name -> processed DataFrame; stations that fail are left out.
station_dataframes = {}
for station_name, station_point in stations.items():
    print(f"Extracting and processing data for {station_name}...")
    try:
        df_station = extract_and_process_era5(station_point, dataset, start_date, end_date, station_name)
    except Exception as e:
        # Report the failure and move on so one station cannot abort the run.
        print(f"Error processing data for {station_name}: {e}")
        continue

    station_dataframes[station_name] = df_station
    print(f"Successfully extracted and processed data for {station_name}.")
    print(f"DataFrame shape: {df_station.shape}")

Extracting and processing data for Sinait...
Saved Sinait_ERA5.csv (3652 rows)
Successfully extracted and processed data for Sinait.
DataFrame shape: (3652, 151)
Extracting and processing data for Tayabas...
Saved Tayabas_ERA5.csv (3652 rows)
Successfully extracted and processed data for Tayabas.
DataFrame shape: (3652, 151)
Extracting and processing data for Tanay...
Saved Tanay_ERA5.csv (3652 rows)
Successfully extracted and processed data for Tanay.
DataFrame shape: (3652, 151)
Extracting and processing data for Tuguegarao...
Saved Tuguegarao_ERA5.csv (3652 rows)
Successfully extracted and processed data for Tuguegarao.
DataFrame shape: (3652, 151)
Extracting and processing data for Virac...
Saved Virac_ERA5.csv (3652 rows)
Successfully extracted and processed data for Virac.
DataFrame shape: (3652, 151)
Extracting and processing data for Calapan...




Saved Calapan_ERA5.csv (3652 rows)
Successfully extracted and processed data for Calapan.
DataFrame shape: (3652, 151)
Extracting and processing data for CLSU...
Saved CLSU_ERA5.csv (3652 rows)
Successfully extracted and processed data for CLSU.
DataFrame shape: (3652, 151)
Extracting and processing data for Clark...
Saved Clark_ERA5.csv (3652 rows)
Successfully extracted and processed data for Clark.
DataFrame shape: (3652, 151)
Extracting and processing data for Daet...
Saved Daet_ERA5.csv (3652 rows)
Successfully extracted and processed data for Daet.
DataFrame shape: (3652, 151)
Extracting and processing data for Dagupan...
Saved Dagupan_ERA5.csv (3652 rows)
Successfully extracted and processed data for Dagupan.
DataFrame shape: (3652, 151)


In [18]:
# from google.colab import drive
# import os

# # Mount Google Drive
# drive.mount('/content/drive')

# # Define the base path in Google Drive
# drive_base_path = '/content/drive/MyDrive/ERA5Datasets'

# # Ensure the directory exists, create if not
# if not os.path.exists(drive_base_path):
#     try:
#         os.makedirs(drive_base_path)
#         print(f"Created directory: {drive_base_path}")
#     except Exception as e:
#         print(f"Error creating directory {drive_base_path}: {e}")
#         # If directory creation fails, proceed with local save only

# # Iterate through the station_dataframes dictionary and save each DataFrame
# for station_name, df_station in station_dataframes.items():
#     # Construct the filename
#     filename = f"{station_name.replace(' ', '_')}_ERA5_Data.csv"
#     drive_path = os.path.join(drive_base_path, filename)

#     try:
#         # Save to Google Drive
#         df_station.to_csv(drive_path, index=False)
#         print(f"File saved to Google Drive: {drive_path}")
#     except Exception as e:
#         print(f"Error saving file to Google Drive for {station_name}: {e}")
#         # Save locally as backup if Drive save fails
#         local_path = filename
#         df_station.to_csv(local_path, index=False)
#         print(f"File saved locally as backup: {local_path}")