<a href="https://colab.research.google.com/github/machiwao/CCTHESS1-CCTHESS2-Dev-and-Docs/blob/jessy/ERA5_data_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import ee
import geemap
import pandas as pd

In [None]:
# Daily-sum bands holding accumulated energy; converted J/m²/day -> W/m².
_ERA5_ENERGY_SUM_BANDS = frozenset({
    'surface_latent_heat_flux_sum',
    'surface_net_solar_radiation_sum',
    'surface_net_thermal_radiation_sum',
    'surface_sensible_heat_flux_sum',
    'surface_solar_radiation_downwards_sum',
    'surface_thermal_radiation_downwards_sum',
})

# Daily-sum bands holding an accumulated water depth; converted m -> mm/day.
_ERA5_WATER_SUM_BANDS = frozenset({
    'snowfall_sum',
    'snowmelt_sum',
    'evaporation_from_bare_soil_sum',
    'evaporation_from_open_water_surfaces_excluding_oceans_sum',
    'evaporation_from_the_top_of_canopy_sum',
    'evaporation_from_vegetation_transpiration_sum',
    'potential_evaporation_sum',
    'runoff_sum',
    'snow_evaporation_sum',
    'sub_surface_runoff_sum',
    'surface_runoff_sum',
    'total_evaporation_sum',
    'total_precipitation_sum',
})


def _era5_unit_conversion(band):
    """Choose the unit conversion that applies to one band name.

    Args:
        band: Band / column name as reported by the image collection.

    Returns:
        ``None`` when the band is left untouched, otherwise a tuple
        ``(transform, suffix, original_unit, converted_unit)`` where
        ``transform`` maps a pandas Series to the converted Series and
        ``suffix`` is appended to the column name.
    """
    lowered = band.lower()

    # Kelvin -> Celsius
    if "temperature" in lowered:
        return (lambda s: s - 273.15), "_C", "K", "°C"

    # A trailing _min/_max is stripped so "<band>_sum_min" / "<band>_sum_max"
    # are treated like their "<band>_sum" base band.
    base = band[:-4] if band.endswith(("_min", "_max")) else band

    # Energy sums MUST be tested before the generic "_sum" rule below:
    # previously they were caught by it, multiplied by 1000 and labelled "mm".
    if base in _ERA5_ENERGY_SUM_BANDS:
        return (lambda s: s / 86400), "_Wm2", "J/m²/day", "W/m²"

    # Precipitation / runoff / evaporation / snow sums (m -> mm/day)
    if base in _ERA5_WATER_SUM_BANDS:
        return (lambda s: s * 1000), "_mm", "m", "mm/day"

    # Any other sum or hydrological band (e.g. hourly min/max of a flow
    # variable) is deliberately left unconverted and must not fall through
    # to the keyword rules below.
    if band.endswith("_sum") or any(
        key in lowered for key in ("precipitation", "evaporation", "runoff")
    ):
        return None

    # Pressure (Pa -> hPa)
    if "pressure" in lowered:
        return (lambda s: s / 100), "_hPa", "Pa", "hPa"

    # Wind (m/s -> km/h)
    if "u_component" in lowered or "v_component" in lowered or "wind" in lowered:
        return (lambda s: s * 3.6), "_kmh", "m/s", "km/h"

    # Snow depth (m -> cm)
    if "snow_depth" in lowered:
        return (lambda s: s * 100), "_cm", "m", "cm"

    return None


def extract_and_process_era5(point, dataset, return_metadata=False):
    """
    Extracts and processes ERA5-Land daily aggregated data for a given geometry.

    Every image in ``dataset`` is reduced to its mean over ``point``; the
    per-band time series are gathered into one DataFrame and converted to
    friendlier units (K -> °C, m -> mm, Pa -> hPa, m/s -> km/h, m -> cm,
    J/m²/day -> W/m²). Converted columns are renamed with a unit suffix.

    Args:
        point: An Earth Engine Geometry object representing the location.
        dataset: An Earth Engine ImageCollection (ECMWF/ERA5_LAND/DAILY_AGGR).
        return_metadata: When True, also return a DataFrame describing the
            conversion applied to each band. Defaults to False, which keeps
            the original single-DataFrame return value.

    Returns:
        A pandas DataFrame with a ``date`` column plus one column per band
        that yielded data. If ``return_metadata`` is True, a tuple
        ``(data_frame, metadata_frame)`` is returned instead.
    """

    def extract(img):
        # Mean of every band over the geometry, tagged with the image date.
        vals = img.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=point,
            scale=11132,  # ERA5-Land native resolution is ~11km
            maxPixels=1e13,
            bestEffort=True
        )
        date_val = img.date().format("YYYY-MM-dd")
        return ee.Feature(None, vals).set("date", date_val)

    fc = dataset.map(extract)

    dates = fc.aggregate_array("date").getInfo()
    var_names = dataset.first().bandNames().getInfo()

    # Collect every usable band first and build the DataFrame in one go.
    # Inserting ~150 columns one by one triggers pandas' "highly fragmented"
    # PerformanceWarning for each insert.
    columns = {"date": dates}
    for band in var_names:
        try:
            values = fc.aggregate_array(band).getInfo()
            has_data = not all(x is None for x in values)
        except Exception as e:
            print(f"Error extracting {band}: {e}")
            # Keep an all-null placeholder so the failed band stays visible.
            values = [None] * len(dates)
            has_data = True

        if not has_data:
            print(f"Warning: No valid values extracted for {band}. Skipping.")
            continue

        # NOTE(review): a shorter list presumably means Earth Engine dropped
        # null entries for this band; such bands cannot be aligned with the
        # dates and are skipped — confirm against aggregate_array semantics.
        if len(values) != len(dates):
            print(f"Warning: Length mismatch for band {band}. Skipping.")
            continue

        columns[band] = values

    df = pd.DataFrame(columns)

    metadata = []
    renames = {}
    for col in list(df.columns):
        if col == "date":
            continue

        # Nothing to convert in an all-null column (e.g. a failed extraction).
        if df[col].isnull().all():
            print(f"Skipping conversion for {col} - all values are null")
            continue

        original_unit = converted_unit = "varies"  # default when untouched
        new_name = col

        conversion = _era5_unit_conversion(col)
        if conversion is not None:
            transform, suffix, original_unit, converted_unit = conversion
            df[col] = transform(df[col])
            new_name = col + suffix
            renames[col] = new_name

        metadata.append({
            "Band": col,
            "Converted Column": new_name,
            "Original Unit": original_unit,
            "Converted Unit": converted_unit
        })

    # Apply all renames at once instead of mutating the frame inside the loop.
    df = df.rename(columns=renames)

    if return_metadata:
        return df, pd.DataFrame(metadata)
    return df

In [None]:
import ee
import pandas as pd

cloud_project = 'heat-index-forecasting'

# Try the cached Earth Engine credentials first and only fall back to the
# interactive authentication flow when initialization fails. `Exception` is
# caught (rather than a bare `except:`) so that KeyboardInterrupt / SystemExit
# still stop the cell.
try:
    ee.Initialize(project=cloud_project)
except Exception:
    ee.Authenticate()
    ee.Initialize(project=cloud_project)

# Radius of the area around each station that is averaged during extraction.
STATION_BUFFER_M = 25000

# 1. Define the geometry for each of the five stations. Despite the `_point`
#    names these are buffered areas around the [longitude, latitude] point,
#    not single points.
sinait_point = ee.Geometry.Point([120.459762, 17.89019]).buffer(STATION_BUFFER_M)
tayabas_point = ee.Geometry.Point([121.596575, 14.018428]).buffer(STATION_BUFFER_M)
tanay_point = ee.Geometry.Point([121.36927, 14.581167]).buffer(STATION_BUFFER_M)
tuguegarao_point = ee.Geometry.Point([121.758469, 17.647678]).buffer(STATION_BUFFER_M)
virac_point = ee.Geometry.Point([124.209834, 13.576558]).buffer(STATION_BUFFER_M)

# 2. Map each station name to its geometry
stations = {
    "Sinait": sinait_point,
    "Tanay": tanay_point,
    "Tayabas": tayabas_point,
    "Tuguegarao": tuguegarao_point,
    "Virac Synop": virac_point,
}

# 3. and 4. Extract and process the data station by station. A failure for one
# station is reported and the loop moves on, so `station_dataframes` only
# contains the stations that succeeded.
# NOTE(review): `dataset` is not defined in the visible cells; it must already
# exist in the kernel — presumably the ECMWF/ERA5_LAND/DAILY_AGGR
# ImageCollection filtered to the study period. Confirm and define it in this
# notebook so a fresh "Restart & Run All" works.
station_dataframes = {}
for station_name, station_point in stations.items():
    print(f"Extracting and processing data for {station_name}...")
    try:
        df_station = extract_and_process_era5(station_point, dataset)
        station_dataframes[station_name] = df_station
        print(f"Successfully extracted and processed data for {station_name}.")
        print(f"DataFrame shape for {station_name}: {df_station.shape}")
    except Exception as e:
        print(f"Error processing data for {station_name}: {e}")

Extracting and processing data for Sinait...


  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]


Successfully extracted and processed data for Sinait.
DataFrame shape for Sinait: (3652, 151)
Extracting and processing data for Tanay...


  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]


Successfully extracted and processed data for Tanay.
DataFrame shape for Tanay: (3652, 151)
Extracting and processing data for Tayabas...


  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]


Successfully extracted and processed data for Tayabas.
DataFrame shape for Tayabas: (3652, 151)
Extracting and processing data for Tuguegarao...


  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]


Successfully extracted and processed data for Tuguegarao.
DataFrame shape for Tuguegarao: (3652, 151)
Extracting and processing data for Virac Synop...
Successfully extracted and processed data for Virac Synop.
DataFrame shape for Virac Synop: (3652, 151)


  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]
  df[v] = data[v]


**Reasoning**:
The error indicates that the `ee` module is not defined, even though the initialization code is present. This suggests that the `ee` module was not imported in the current session. I need to add the import statement for the `ee` module before attempting to initialize it.



In [None]:
from google.colab import drive
import os

# Attach the user's Google Drive to the Colab filesystem.
drive.mount('/content/drive')

# Folder on Drive that receives one CSV per station.
drive_base_path = '/content/drive/MyDrive/ERA5Datasets'

# Create the folder on first use. A failure is only reported here: every save
# below falls back to a local file when writing to Drive does not work.
if not os.path.exists(drive_base_path):
    try:
        os.makedirs(drive_base_path)
    except Exception as e:
        print(f"Error creating directory {drive_base_path}: {e}")
    else:
        print(f"Created directory: {drive_base_path}")

# Write each station's DataFrame to "<Station_Name>_ERA5_Data.csv".
for station_name, df_station in station_dataframes.items():
    # Spaces in the station name become underscores in the file name.
    filename = station_name.replace(' ', '_') + "_ERA5_Data.csv"
    drive_path = os.path.join(drive_base_path, filename)

    try:
        df_station.to_csv(drive_path, index=False)
    except Exception as e:
        print(f"Error saving file to Google Drive for {station_name}: {e}")
        # Backup copy in the current working directory.
        local_path = filename
        df_station.to_csv(local_path, index=False)
        print(f"File saved locally as backup: {local_path}")
    else:
        print(f"File saved to Google Drive: {drive_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File saved to Google Drive: /content/drive/MyDrive/ERA5Datasets/Sinait_ERA5_Data.csv
File saved to Google Drive: /content/drive/MyDrive/ERA5Datasets/Tanay_ERA5_Data.csv
File saved to Google Drive: /content/drive/MyDrive/ERA5Datasets/Tayabas_ERA5_Data.csv
File saved to Google Drive: /content/drive/MyDrive/ERA5Datasets/Tuguegarao_ERA5_Data.csv
File saved to Google Drive: /content/drive/MyDrive/ERA5Datasets/Virac_Synop_ERA5_Data.csv
