# Process ERA5 Daily

In [1]:
# Global debug switch read by later cells: when True, only the first 9 sites
# are kept after loading, the heat-wave helpers print progress messages, and
# extract_heatwave_metrics() returns a per-day diagnostic DataFrame instead of
# the dictionary of metrics.
debug = True

In [2]:
# Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd

import datetime
import re
import glob

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from scipy.ndimage import binary_closing

# Import Magic
%matplotlib inline

## Merge variables per site

In [3]:
# Read the table of all sites; in debug mode keep only the first nine rows
all_sites = pd.read_csv("../01_download_raw_gee_data/sites_years.csv")
if debug:
    all_sites = all_sites.head(9)

print(all_sites)

   first_visit  id         x          y
0         2011   1 -2.842824  48.337505
1         2012   2  3.349757  46.198025
2         2012   3  3.361577  46.827747
3         2012   4  0.402182  48.201563
4         2012   5  6.461081  43.281648
5         2012   6  3.562693  46.447864
6         2012   7 -1.081578  43.288368
7         2012   8  2.635621  48.289030
8         2012   9  4.660504  44.414816


In [4]:
# Row position in `all_sites` of the site to process
i = 0

# Folder pattern (the trailing * matches every sub-folder of era5-daily) and
# the per-site file name that is looked up inside each of those sub-folders.
# The id is read by column name so a reordered CSV cannot silently pick
# another column.
folder_path = '../01_download_raw_gee_data/gee-raw-data/era5-daily/*'
file_extension = 'site_' + str(all_sites['id'].iloc[i]) + '.feather'

# Collect the matching files. glob returns paths in arbitrary order, so sort
# them to keep the column order of the merged frame reproducible.
file_paths = sorted(glob.glob(f"{folder_path}/{file_extension}"))
if not file_paths:
    # Fail with a readable message instead of an IndexError on dfs[0] below
    raise FileNotFoundError(
        f"No files found for pattern {folder_path}/{file_extension}"
    )

# Read all files into a list of DataFrames
dfs = [pd.read_feather(file) for file in file_paths]

# Merge dataframes into one (pd.merge default: inner join on the given keys)
merged_df = dfs[0]  # Start with the first DataFrame
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on=['date', "SiteID"])

# Correct temperature scales: shift by -273.15
# (presumably Kelvin -> degrees Celsius; confirm against the download step)
temperature_cols = [
    'mean_2m_air_temperature',
    'maximum_2m_air_temperature',
    'minimum_2m_air_temperature',
    'dewpoint_2m_temperature'
    ]

merged_df[temperature_cols] = merged_df[temperature_cols] - 273.15

# Display the merged DataFrame
merged_df.head(10)

Unnamed: 0,date,minimum_2m_air_temperature,SiteID,total_precipitation,v_component_of_wind_10m,surface_pressure,maximum_2m_air_temperature,mean_sea_level_pressure,u_component_of_wind_10m,dewpoint_2m_temperature,mean_2m_air_temperature
0,2009-01-01,-1.060431,1,0.0001040399,1.371742,100676.179688,1.438013,102575.5625,-3.758306,-0.580023,0.032831
1,2009-01-02,-0.585303,1,5.018897e-05,-0.692971,100692.257812,2.209131,102592.710938,-4.337219,-1.657294,0.201166
2,2009-01-03,-2.310156,1,1.286715e-05,-1.309379,100696.992188,1.359674,102602.914062,-4.201357,-2.589575,-0.48725
3,2009-01-04,-4.834204,1,5.215406e-07,-0.335249,100580.476562,1.545923,102482.375,-0.628976,-3.94361,-1.814337
4,2009-01-05,-2.399146,1,0.001691923,-2.330764,100121.492188,4.182947,101995.65625,-0.294047,0.001215,1.528436
5,2009-01-06,-1.737585,1,0.0005177241,-2.667571,100476.132812,0.940027,102377.054688,-3.802512,-4.187201,-0.1854
6,2009-01-07,-4.084662,1,0.0002399813,-2.46666,100752.804688,2.985101,102650.039062,-1.72893,-3.416144,0.034265
7,2009-01-08,-2.567938,1,0.0005161576,-1.43904,100841.6875,5.348962,102729.46875,-2.891483,-0.758368,2.102472
8,2009-01-09,-5.962653,1,0.0,1.019977,100797.46875,2.250146,102708.585938,-1.91598,-5.524084,-2.79151
9,2009-01-10,-4.980505,1,0.0,1.80784,100885.710938,5.346155,102767.492188,-0.408663,-3.979742,-0.7284


## Statistics

In [5]:
# Print dtypes and non-null counts (info must be *called*; the bare attribute
# does nothing), then show the summary statistics with one row per column.
merged_df.info()
merged_df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,2921.0,2012-12-31 00:00:00,2009-01-01 00:00:00,2011-01-01 00:00:00,2012-12-31 00:00:00,2014-12-31 00:00:00,2016-12-30 00:00:00,
minimum_2m_air_temperature,2921.0,7.841589,-5.962653,4.67254,8.077936,11.22616,19.750604,4.43146
total_precipitation,2921.0,0.002219,0.0,0.00004,0.000539,0.002779,0.036359,0.003791
v_component_of_wind_10m,2921.0,0.380882,-7.617627,-1.912436,0.221982,2.690741,9.626294,2.93776
surface_pressure,2921.0,99827.701082,95372.054688,99290.9375,99931.367188,100473.757812,102307.960938,960.467539
maximum_2m_air_temperature,2921.0,14.537996,0.346155,10.539758,14.442194,18.546198,33.060999,5.462773
mean_sea_level_pressure,2921.0,101640.878547,97121.539062,101084.867188,101741.101562,102289.679688,104216.101562,976.079813
u_component_of_wind_10m,2921.0,1.073985,-6.223584,-0.932736,1.304392,3.097574,8.994781,2.725749
dewpoint_2m_temperature,2921.0,8.077909,-6.560706,4.87655,8.422693,11.516534,18.263239,4.531972
mean_2m_air_temperature,2921.0,11.129403,-2.79151,7.746698,11.302362,14.833551,26.238824,4.739269


In [6]:
# True if at least one cell of the merged frame is missing
merged_df.isna().to_numpy().any()

False

## Visuals

In [7]:
# Titled so the figure stands on its own; markers show the individual days
px.line(merged_df, x="date", y="mean_2m_air_temperature", markers=True,
        title="Mean 2 m air temperature")

In [8]:
# Titled so the figure stands on its own
px.line(merged_df, x="date", y="maximum_2m_air_temperature",
        title="Maximum 2 m air temperature")

In [9]:
# Titled so the figure stands on its own
px.line(merged_df, x="date", y="minimum_2m_air_temperature",
        title="Minimum 2 m air temperature")

In [10]:
# Titled so the figure stands on its own
px.line(merged_df, x="date", y="dewpoint_2m_temperature",
        title="2 m dewpoint temperature")

In [11]:
# Titled so the figure stands on its own
px.line(merged_df, x="date", y="total_precipitation",
        title="Total precipitation")

In [12]:
# Titled so the figure stands on its own
px.line(merged_df, x="date", y="surface_pressure",
        title="Surface pressure")

In [13]:
# Titled so the figure stands on its own
px.line(merged_df, x="date", y="mean_sea_level_pressure",
        title="Mean sea level pressure")

In [14]:
# Titled so the figure stands on its own
px.line(merged_df, x="date", y="u_component_of_wind_10m",
        title="10 m wind, u component")

In [15]:
# Titled so the figure stands on its own
px.line(merged_df, x="date", y="v_component_of_wind_10m",
        title="10 m wind, v component")

## Heatwave Detection

### Heat-wave Metrics

**Frequency**
- Count: the number of heat waves that occurred.

**Duration**
- Longest: the duration of the longest heat wave
- Average: the average duration of heat waves
- Total: the total number of heat-wave days

**Season length**
- Length: the number of days between the first heat wave of the year and the last.
- ~~Earliest: the DOY of the earliest heat wave~~
- ~~Latest: the DOY of the latest heat wave~~

**Intensity** 
- Hottest: the maximum temperature during the heat wave
- Average: the average temperature during the heat wave

---

**Notes**
- Note that these metrics are extracted from the mean air temperature.
- Note that these metrics can be extracted for the time within the census interval or for the 1-2 years before the first census.
- Note that there are more elaborate methods to extract heatwaves. For simplicity we are using the 3-days-above-30 approach.
    -   Alternatives: [hotspell python](https://hotspell.readthedocs.io/en/latest/user_guide/tutorial.html), [heatwaveR](https://robwschlegel.github.io/heatwaveR/index.html)

---
**TODO:**

- Set the date filter so that the first summer of the sampling year is fully included and the series runs until the winter of the last year.

### Functions

In [84]:
def extract_min_days_between_extremes(time_series, df_tmp, threshold_days):

    '''
    Return the smallest number of event-free days between two consecutive events.

    IMPORTANT: This expects the *cleaned* running count of event days, i.e.
    0 on days outside an event and 1, 2, 3, ... on the days of an event. In
    that series the end of an event is simply "count >= threshold_days
    followed by 0" and the start of the next event is "0 followed by 1".

    Parameters
    ----------
    time_series : pandas.Series, numpy.ndarray or list of int
        Cleaned running count of consecutive event days.
    df_tmp : pandas.DataFrame
        Frame with a 'date' column whose rows line up, by position, with
        `time_series`. Subtracting two dates must give an object with a
        `.days` attribute (e.g. pandas Timestamps).
    threshold_days : int
        Running count that the last day of an event must have reached.

    Returns
    -------
    int or float
        Smallest positive gap in days between the end of one event and the
        start of the next one, or np.nan if no such gap exists.

    Notes
    -----
    Rows are addressed by position, not by index label, so the function also
    works on frames whose index is not 0..n-1 (e.g. after filtering by date).
    Progress messages are printed when the module-level `debug` flag is set.
    '''
    if debug: print("> Calling extract_min_days_between_extremes:")

    counts = np.asarray(time_series)
    dates = df_tmp['date']

    all_days_between_heatwaves = []
    last_day = None  # Date on which the most recent event ended

    for pos in range(len(counts) - 1):
        today, tomorrow = counts[pos], counts[pos + 1]

        # Detect last day of heat wave
        if today >= threshold_days and tomorrow == 0:
            last_day = dates.iloc[pos]
            if debug: print(f"Last event ended on: {last_day} (i = {pos}) ")

        # Detect first day of the next heat wave. This can never fire in the
        # same step as the branch above (tomorrow is either 0 or 1), and it
        # only counts once a previous event has ended.
        elif last_day is not None and today == 0 and tomorrow == 1:
            first_day = dates.iloc[pos + 1]

            # Days strictly between the last and the first event day
            days_between = (first_day - last_day).days - 1
            all_days_between_heatwaves.append(days_between)
            if debug: print(f"Next event started on: {first_day} (i = {pos}) ")
            if debug: print(f"Time between these events: {days_between}")
            if debug: print("\n")

    # Drop negative values and 0s (can only arise from unordered or duplicated dates)
    all_days_between_heatwaves = [num for num in all_days_between_heatwaves if num > 0]

    # Find the minimum positive value
    if not all_days_between_heatwaves:
        return np.nan

    return min(all_days_between_heatwaves)

In [85]:
def extract_heatwave_metrics(df_in, threshold_temperature, threshold_days, variable_of_interest):
    '''
    Detect heat waves in a daily series and summarise them.

    A heat wave is an unbroken run of at least `threshold_days` rows whose
    value in `variable_of_interest` is strictly above `threshold_temperature`.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame with a 'date' column and the column named by
        `variable_of_interest`. It is not modified. Rows are assumed to be
        consecutive days in chronological order (TODO confirm for callers
        other than the test run).
    threshold_temperature : float
        Value that must be exceeded for a day to count as hot.
    threshold_days : int
        Minimum number of consecutive hot days that form a heat wave.
    variable_of_interest : str
        Name of the column to analyse.

    Returns
    -------
    dict or pandas.DataFrame
        If the module-level `debug` flag is falsy, a dict with the keys
        'hw_counts' (number of heat waves), 'hw_dur_max' (longest heat wave
        in days), 'hw_dur_mean' (heat-wave days divided by heat-wave count),
        'hw_day_sum' (total heat-wave days), 'hw_days_between' (smallest gap
        between two heat waves), 'hw_mean_temp' and 'hw_max_temp' (mean and
        maximum of the variable over all heat-wave days). All metrics except
        'hw_counts' are np.nan when no heat wave is found.
        If `debug` is truthy, a per-day diagnostic DataFrame with the columns
        'org_cumsum', 'clean_cumsum', 'org_01_mask', 'clean_01_mask',
        'temperature_cols' and 'date' is returned instead.

    Notes
    -----
    The metrics are always printed, regardless of `debug`.
    '''

    # -------------------------------------------------------------------------------------------
    # Masks

    temperature = df_in[variable_of_interest]

    # Boolean and 0/1 mask of days above the threshold
    above_threshold_mask = temperature > threshold_temperature
    above_threshold_mask10 = above_threshold_mask.astype(int)

    # Every day that is NOT above the threshold opens a new group, so each
    # group is one such day followed by the unbroken run of hot days after it.
    run_id = (~above_threshold_mask).cumsum()

    # Running count of consecutive hot days (0 on days below the threshold)
    consecutive_segments = above_threshold_mask10.groupby(run_id).cumsum()

    # A run reaches the value `threshold_days` exactly once if it is long
    # enough, so counting those days counts the heat waves.
    hw_counts = (consecutive_segments == threshold_days).sum()

    # -------------------------------------------------------------------------------------------
    # Remove hot days that are not part of a heat wave

    # The largest running count inside a group is the length of its hot run;
    # keep only hot days that belong to runs of at least `threshold_days`.
    run_length = consecutive_segments.groupby(run_id).transform('max')
    above_threshold_mask_clean = above_threshold_mask & (run_length >= threshold_days)
    above_threshold_mask10_clean = above_threshold_mask_clean.astype(int)
    consecutive_segments_clean = consecutive_segments.where(above_threshold_mask_clean, 0)

    # -------------------------------------------------------------------------------------------
    # Extract metrics

    if hw_counts > 0:
        # Longest duration of a heat wave
        hw_dur_max = consecutive_segments_clean.max()

        # Sum of heat-wave days
        hw_day_sum = above_threshold_mask_clean.sum()

        # Mean duration of the heat waves
        hw_dur_mean = hw_day_sum / hw_counts

        # Smallest duration between two events
        hw_days_between = extract_min_days_between_extremes(
            consecutive_segments_clean,
            df_in,
            threshold_days
            )

        # Mean and max temperature across all heat-wave days
        heatwave_temperatures = temperature[above_threshold_mask_clean]
        hw_mean_temp = heatwave_temperatures.mean()
        hw_max_temp = heatwave_temperatures.max()

    else:
        hw_dur_max = np.nan
        hw_dur_mean = np.nan
        hw_day_sum = np.nan
        hw_days_between = np.nan
        hw_mean_temp = np.nan
        hw_max_temp = np.nan

    # -------------------------------------------------------------------------------------------
    # Return metrics
    out = {
    'hw_counts': hw_counts,
    'hw_dur_max': hw_dur_max,
    'hw_dur_mean': hw_dur_mean,
    'hw_day_sum': hw_day_sum,
    'hw_days_between': hw_days_between,
    'hw_mean_temp': hw_mean_temp,
    'hw_max_temp': hw_max_temp
    }

    # Print results
    print(f'hw_counts \t= {hw_counts}')
    print(f'hw_dur_max \t= {hw_dur_max}')
    print(f'hw_dur_mean \t= {hw_dur_mean}')
    print(f'hw_day_sum \t= {hw_day_sum}')
    print(f'hw_days_between = {hw_days_between}')
    print(f'hw_mean_temp\t= {hw_mean_temp}')
    print(f'hw_max_temp \t= {hw_max_temp }')

    # Debug return: per-day diagnostics instead of the metrics. All pieces
    # share the index of df_in, so they line up row by row.
    if debug:
        out = pd.concat(
            [consecutive_segments,
            consecutive_segments_clean,
            above_threshold_mask10,
            above_threshold_mask10_clean,
            temperature,
            df_in['date']],
            keys=['org_cumsum',
                'clean_cumsum',
                'org_01_mask',
                'clean_01_mask',
                'temperature_cols',
                'date'],
            axis=1)

    return out

### Test Run

In [None]:
# Trial run on the merged site data: a heat wave here is at least 3
# consecutive days with a mean 2 m air temperature above 19
test = extract_heatwave_metrics(
    df_in=merged_df,
    threshold_temperature=19,
    threshold_days=3,
    variable_of_interest='mean_2m_air_temperature',
)