# Settings for Download: ERA5 Hourly

[Dataset](https://developers.google.com/earth-engine/datasets/catalog/ECMWF_ERA5_LAND_HOURLY#description)

In [1]:
# USER SETTINGS - this is the only cell that should need editing.

# --- Debugging ---
# Resume point for the site loops: sites with an index below this value are
# skipped. Leave at 0 to process every site.
skip_to_i = 0
# When True, the site table is cut down to its first rows for a quick test run.
first_5000_sites = False

# --- Time window per site ---
# Number of years to add after the first visit
my_plus_years = 5
# Number of years to add before the first visit
my_minus_years = 2
# Month-day suffix ("-MM-DD") for the first date of the first year
my_first_date = "-01-01"
# Month-day suffix ("-MM-DD") for the last date of the last year
my_last_date = "-12-31"

# --- Earth Engine product ---
product = "ECMWF/ERA5_LAND/HOURLY"
product_start_date = "1950-01-01"
product_end_date = "2023-10-23"
# Passed on as the `scale` argument of the request (presumably metres - confirm)
product_scale = 11132

# --- Output ---
output_scale = product_scale
output_folder = "gee-raw-data/era5-hourly"

# Bands to request. Commented-out entries are not requested; uncomment a
# line to include that band.
my_bands = [
    "dewpoint_temperature_2m",  # Needed to back-calculate humidity
    "temperature_2m",           # Needed to back-calculate humidity
    "skin_temperature",
    # "soil_temperature_level_1",
    # "soil_temperature_level_2",
    # "soil_temperature_level_3",
    # "soil_temperature_level_4",
    # "lake_bottom_temperature",
    # "lake_ice_depth",
    # "lake_ice_temperature",
    # "lake_mix_layer_depth",
    # "lake_mix_layer_temperature",
    # "lake_shape_factor",
    # "lake_total_layer_temperature",
    # "snow_albedo",
    # "snow_cover",
    # "snow_density",
    # "snow_depth",
    # "snow_depth_water_equivalent",
    "snowfall",
    # "snowmelt",
    # "temperature_of_snow_layer",
    # "skin_reservoir_content",
    "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4",
    # "forecast_albedo",
    # "surface_latent_heat_flux",
    # "surface_net_solar_radiation",
    # "surface_net_thermal_radiation",
    # "surface_sensible_heat_flux",
    # "surface_solar_radiation_downwards",
    # "surface_thermal_radiation_downwards",
    # "evaporation_from_bare_soil",
    # "evaporation_from_open_water_surfaces_excluding_oceans",
    # "evaporation_from_the_top_of_canopy",
    # "evaporation_from_vegetation_transpiration",
    "potential_evaporation",
    # "runoff",
    # "snow_evaporation",
    # "sub_surface_runoff",
    # "surface_runoff",
    "total_evaporation",
    "u_component_of_wind_10m",
    "v_component_of_wind_10m",
    "surface_pressure",         # Needed to back-calculate humidity
    "total_precipitation",
    "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation",

    # Hourly variables below are created by GEE people and are not used here
    # "snowfall_hourly",
    # "snowmelt_hourly",
    # "surface_latent_heat_flux_hourly",
    # "surface_net_solar_radiation_hourly",
    # "surface_net_thermal_radiation_hourly",
    # "surface_sensible_heat_flux_hourly",
    # "surface_solar_radiation_downwards_hourly",
    # "surface_thermal_radiation_downwards_hourly",
    # "evaporation_from_bare_soil_hourly",
    # "evaporation_from_open_water_surfaces_excluding_oceans_hourly",
    # "evaporation_from_the_top_of_canopy_hourly",
    # "evaporation_from_vegetation_transpiration_hourly",
    # "potential_evaporation_hourly",
    # "runoff_hourly",
    # "snow_evaporation_hourly",
    # "sub_surface_runoff_hourly",
    # "surface_runoff_hourly",
    # "total_evaporation_hourly",
    # "total_precipitation_hourly",
]

# Download Routine Below

In [2]:
%run -i gee_functions.py

In [3]:
# Import Libraries
import ee
# ee.Authenticate()
ee.Initialize()

import os, re
import pandas as pd
from gee_subset import gee_subset
import geopandas as gpd
from datetime import datetime

In [4]:
# Load the site table and hand it, together with the time-window settings,
# to adjust_first_last_date. Both helpers are defined outside this notebook;
# presumably the second one derives each site's first/last download date.
data = get_location_site_df()
data_clean = adjust_first_last_date(
    df=data,
    plus_years=my_plus_years,
    minus_years=my_minus_years,
    first_date=my_first_date,
    last_date=my_last_date,
)

# Debug switch: continue with a small subset of the table only.
# NOTE(review): the flag is named first_5000_sites but only the first 10 rows
# are kept - confirm which of the two is intended.
if first_5000_sites:
    data_clean = data_clean.iloc[:10]

# Positional row indices 0 .. cnt-1; the download loops use them with .iloc
cnt = len(data_clean)
siteSet = list(range(cnt))

In [None]:
# Download ERA5-Land hourly data for one calendar year, one request per band
# and site. Splitting the work per year and band is needed to avoid reaching
# the user memory limit.
current_year = 2013
output_folder_yr = output_folder + '/' + str(current_year)
current_start = str(current_year) + '-01-01'
current_end   = str(current_year + 1) + '-01-01'

# Create the year folder if it doesn't exist
os.makedirs(output_folder_yr, exist_ok=True)

# The request window is identical for every band and site, so resolve it once.
# larger_date / smaller_date are defined outside this notebook; presumably
# they keep the window inside the period covered by the product.
request_start = larger_date(current_start, product_start_date)
request_end   = smaller_date(current_end, product_end_date)

# Sites to download: at or past the resume index, and with a first/last date
# (columns 3 and 4, "%Y-%m-%d" strings) whose years enclose current_year.
# This does not depend on the band, so it is evaluated once up front.
sites_in_year = [
    i for i in siteSet
    if i >= skip_to_i
    and (
        datetime.strptime(data_clean.iloc[i, 3], "%Y-%m-%d").year
        <= current_year
        <= datetime.strptime(data_clean.iloc[i, 4], "%Y-%m-%d").year
    )
]

# Start loop over bands
for band in my_bands:

    # Start loop over the sites that need data from the current year
    for i in sites_in_year:

        df_loop = gee_subset.gee_subset(
            product    = product,
            bands      = [band],
            start_date = request_start,
            end_date   = request_end,
            latitude   = data_clean.iloc[i, 2],
            longitude  = data_clean.iloc[i, 1],
            scale      = max(product_scale, output_scale)
        )

        # Attach the site id (column 0) and drop the columns that are not kept
        sid = str(data_clean.iloc[i, 0])
        df_loop["SiteID"] = sid
        df_loop = df_loop.drop(columns=['id', 'longitude', 'latitude', 'product'])

        # One CSV per band and site inside the year folder.
        # NOTE(review): unlike the per-site export further down, the index
        # column is written here - confirm that downstream readers expect it.
        df_loop.to_csv(output_folder_yr + '/' + band + "_site_" + sid + ".csv")

In [5]:
# data_clean.head(10)

In [6]:
# print("Access via [0, 1]: ", data_clean.columns[1], "\t = ", data_clean.iloc[0, 1])
# print("Access via [0, 2]: ", data_clean.columns[2], "\t = ", data_clean.iloc[0, 2])
# print("Access via [0, 3]: ", data_clean.columns[3], "\t = ", data_clean.iloc[0, 3])
# print("Access via [0, 4]: ", data_clean.columns[4], "\t = ", data_clean.iloc[0, 4])

In [None]:
# Download every year from 2008 through 2021, one call per year.
# download_data_for_year is not defined in this notebook (presumably it is
# provided by gee_functions.py). To resume after an interruption, narrow the
# range below to the years that are still missing.
first_year, last_year = 2008, 2021
for year in range(first_year, last_year + 1):
    download_data_for_year(year, data_clean, siteSet, output_folder, product, my_bands, product_scale, output_scale)

---

In [None]:
# Make sure the output folder exists before any file is written into it.
# exist_ok=True makes the call a no-op when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

In [None]:
%%timeit

# Benchmark: fetch all selected bands per site and store the result as CSV.
# The Earth Engine request sits inside the timed loop, so the measured time
# covers the download as well as the file write.
for i in siteSet:

    # Only sites at or past the resume index are processed
    if i >= skip_to_i:

        # Site id (column 0), kept as text for joining later and for the file name
        site_id = str(data_clean.iloc[i, 0])

        # Request window built from the site's dates (columns 3 and 4) and the
        # product dates; presumably this limits it to the product's coverage
        window_start = larger_date(data_clean.iloc[i, 3], product_start_date)
        window_end = smaller_date(data_clean.iloc[i, 4], product_end_date)

        df_loop = gee_subset.gee_subset(
            product=product,
            bands=my_bands,
            start_date=window_start,
            end_date=window_end,
            latitude=data_clean.iloc[i, 2],
            longitude=data_clean.iloc[i, 1],
            scale=max(product_scale, output_scale),
        )

        # Tag the rows with the site id, then drop the columns that are not
        # needed to save space and time
        df_loop["SiteID"] = site_id
        df_loop = df_loop.drop(columns=['id', 'longitude', 'latitude', 'product'])

        # Write to csv file (without the index column)
        df_loop.to_csv(output_folder + "/site_" + site_id + ".csv", index=False)

15.9 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit

# Benchmark: fetch all selected bands per site and store the result as pickle.
# The Earth Engine request sits inside the timed loop, so the measured time
# covers the download as well as the file write.
for i in siteSet:

    # Only sites at or past the resume index are processed
    if i >= skip_to_i:

        # Site id (column 0), kept as text for joining later and for the file name
        site_id = str(data_clean.iloc[i, 0])

        # Request window built from the site's dates (columns 3 and 4) and the
        # product dates; presumably this limits it to the product's coverage
        window_start = larger_date(data_clean.iloc[i, 3], product_start_date)
        window_end = smaller_date(data_clean.iloc[i, 4], product_end_date)

        df_loop = gee_subset.gee_subset(
            product=product,
            bands=my_bands,
            start_date=window_start,
            end_date=window_end,
            latitude=data_clean.iloc[i, 2],
            longitude=data_clean.iloc[i, 1],
            scale=max(product_scale, output_scale),
        )

        # Tag the rows with the site id, then drop the columns that are not
        # needed to save space and time
        df_loop["SiteID"] = site_id
        df_loop = df_loop.drop(columns=['id', 'longitude', 'latitude', 'product'])

        # Write to pickle file (the name carries no separator or extension)
        df_loop.to_pickle(output_folder + "/site_" + site_id + "topickle")

15.9 s ± 608 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit

# Benchmark: fetch all selected bands per site and store the result as parquet.
# The Earth Engine request sits inside the timed loop, so the measured time
# covers the download as well as the file write.
for i in siteSet:

    # Only sites at or past the resume index are processed
    if i >= skip_to_i:

        # Site id (column 0), kept as text for joining later and for the file name
        site_id = str(data_clean.iloc[i, 0])

        # Request window built from the site's dates (columns 3 and 4) and the
        # product dates; presumably this limits it to the product's coverage
        window_start = larger_date(data_clean.iloc[i, 3], product_start_date)
        window_end = smaller_date(data_clean.iloc[i, 4], product_end_date)

        df_loop = gee_subset.gee_subset(
            product=product,
            bands=my_bands,
            start_date=window_start,
            end_date=window_end,
            latitude=data_clean.iloc[i, 2],
            longitude=data_clean.iloc[i, 1],
            scale=max(product_scale, output_scale),
        )

        # Tag the rows with the site id, then drop the columns that are not
        # needed to save space and time
        df_loop["SiteID"] = site_id
        df_loop = df_loop.drop(columns=['id', 'longitude', 'latitude', 'product'])

        # Write to parquet file
        df_loop.to_parquet(output_folder + "/site_" + site_id + "toparquet.parquet")

15.4 s ± 758 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit

# Benchmark: fetch all selected bands per site and store the result as feather.
# The Earth Engine request sits inside the timed loop, so the measured time
# covers the download as well as the file write.
for i in siteSet:

    # Only sites at or past the resume index are processed
    if i >= skip_to_i:

        # Site id (column 0), kept as text for joining later and for the file name
        site_id = str(data_clean.iloc[i, 0])

        # Request window built from the site's dates (columns 3 and 4) and the
        # product dates; presumably this limits it to the product's coverage
        window_start = larger_date(data_clean.iloc[i, 3], product_start_date)
        window_end = smaller_date(data_clean.iloc[i, 4], product_end_date)

        df_loop = gee_subset.gee_subset(
            product=product,
            bands=my_bands,
            start_date=window_start,
            end_date=window_end,
            latitude=data_clean.iloc[i, 2],
            longitude=data_clean.iloc[i, 1],
            scale=max(product_scale, output_scale),
        )

        # Tag the rows with the site id, then drop the columns that are not
        # needed to save space and time
        df_loop["SiteID"] = site_id
        df_loop = df_loop.drop(columns=['id', 'longitude', 'latitude', 'product'])

        # Write to feather file
        df_loop.to_feather(output_folder + "/site_" + site_id + "tofeather.feather")

16.1 s ± 280 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
