# Process ERA5 Daily

## Todos

- Create filter routine that removes extreme outliers from the climate data. For example, see wind speeds for id = 856.

---

## Setup

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd

import datetime as dt
import re
import glob

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from scipy.ndimage import binary_closing

from utils import *
from run_mp import run_mp

# Import Magic
%matplotlib inline

# Exploration of one site

In [2]:
# Load all sites. Ending the cell on the bare expression (instead of print)
# gives the rich DataFrame display, consistent with the later load cells.
all_sites = pd.read_csv("../01_download_raw_gee_data/sites_years.csv")
all_sites

       first_visit     id         x          y
0             2011      1 -2.842824  48.337505
1             2012      2  3.349757  46.198025
2             2012      3  3.361577  46.827747
3             2012      4  0.402182  48.201563
4             2012      5  6.461081  43.281648
...            ...    ...       ...        ...
38297         2016  38298  2.156438  42.686748
38298         2016  38299  7.424664  47.899971
38299         2016  38300 -3.221877  48.851908
38300         2016  38301  3.757202  44.924973
38301         2016  38302  3.417427  46.998179

[38302 rows x 4 columns]


In [3]:
# Test for one site: just type in the id of interest.
i = 765
i = i - 1  # Subtract 1 because Python positions start at 0 (assumes ids run 1..N in row order).

# The site table shown in the load cell's output names the year column
# "first_visit", while this cell originally indexed "first_year". Accept
# either name so the lookup does not fail with a KeyError.
year_col = "first_year" if "first_year" in all_sites.columns else "first_visit"

test_df = load_and_merge_files(
    SiteID=all_sites["id"].iloc[i],
    first_year=all_sites[year_col].iloc[i],
    subdir="era5-daily",
    verbose=False,
)

test_df.head(10)

Unnamed: 0,date,minimum_2m_air_temperature,SiteID,total_precipitation,v_component_of_wind_10m,surface_pressure,maximum_2m_air_temperature,mean_sea_level_pressure,u_component_of_wind_10m,dewpoint_2m_temperature,mean_2m_air_temperature,first_year
0,2008-01-01,269.621857,765,0.0,-0.875533,92972.09375,275.822906,102265.9375,-0.845897,268.22937,272.278259,2010
1,2008-01-02,269.760162,765,0.0,0.074294,92302.898438,276.186279,101480.226562,-1.168203,266.365631,272.373779,2010
2,2008-01-03,271.698425,765,1.9e-05,0.545303,91420.476562,277.380615,100473.507812,-1.726469,267.181549,273.36026,2010
3,2008-01-04,272.826508,765,0.000265,0.843249,92073.210938,278.443176,101115.03125,-0.961461,270.371704,275.343231,2010
4,2008-01-05,274.798767,765,0.022994,2.357031,92408.757812,279.828827,101456.617188,0.053011,274.009308,277.551208,2010
5,2008-01-06,278.589111,765,0.016159,2.016394,92631.0,280.944214,101672.5,0.619354,278.022675,279.531952,2010
6,2008-01-07,273.435608,765,0.015911,1.609923,93234.0,282.017365,102308.71875,0.779413,276.735687,278.936615,2010
7,2008-01-08,270.554504,765,0.000149,0.436659,93302.945312,278.579681,102512.726562,-0.649612,270.750458,273.748016,2010
8,2008-01-09,274.312378,765,0.003682,1.861838,93117.523438,278.29248,102231.898438,0.110479,273.534393,276.566528,2010
9,2008-01-10,272.688141,765,5.1e-05,1.812051,93058.054688,282.032257,102125.5,-0.176805,273.304901,276.556549,2010


In [4]:
# Run fix_and_attach_variables on the merged frame, then display a random
# sample of the "date" and "season" columns.
# NOTE(review): test_df is overwritten with its own transformed version, so
# re-running this cell applies the function a second time. Presumably the
# function is not idempotent (e.g. unit conversion) — reload test_df before
# re-running. TODO confirm.
test_df = fix_and_attach_variables(test_df)
test_df[["date", "season"]].sample(10)

Unnamed: 0,date,season
342,2008-12-08,winter
2445,2014-09-11,fall
1866,2013-02-09,winter
2802,2015-09-03,fall
2446,2014-09-12,fall
2852,2015-10-23,fall
1928,2013-04-12,spring
1910,2013-03-25,spring
2336,2014-05-25,spring
2016,2013-07-09,summer


## Statistics

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2921 entries, 0 to 2920
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   date                        2921 non-null   datetime64[ns]
 1   minimum_2m_air_temperature  2921 non-null   float64       
 2   SiteID                      2921 non-null   int64         
 3   total_precipitation         2921 non-null   float64       
 4   v_component_of_wind_10m     2921 non-null   float64       
 5   surface_pressure            2921 non-null   float64       
 6   maximum_2m_air_temperature  2921 non-null   float64       
 7   mean_sea_level_pressure     2921 non-null   float64       
 8   u_component_of_wind_10m     2921 non-null   float64       
 9   dewpoint_2m_temperature     2921 non-null   float64       
 10  mean_2m_air_temperature     2921 non-null   float64       
 11  first_year                  2921 non-null   int64       

In [6]:
test_df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,2921.0,2011-12-31 00:00:00,2008-01-01 00:00:00,2009-12-31 00:00:00,2011-12-31 00:00:00,2013-12-30 00:00:00,2015-12-30 00:00:00,
minimum_2m_air_temperature,2921.0,5.223668,-20.616721,0.270349,5.439142,10.580927,20.631372,6.600747
SiteID,2921.0,765.0,765.0,765.0,765.0,765.0,765.0,0.0
total_precipitation,2921.0,0.003995,0.0,0.000011,0.000553,0.005324,0.058735,0.006695
v_component_of_wind_10m,2921.0,0.006671,-5.521392,-0.928013,0.073503,1.009756,4.066734,1.371553
surface_pressure,2921.0,92738.739149,89419.90625,92382.882812,92833.78125,93187.601562,94586.179688,712.665749
maximum_2m_air_temperature,2921.0,13.887459,-9.733862,7.620569,14.335474,19.96908,36.404565,8.020677
mean_sea_level_pressure,2921.0,101696.788145,98182.0,101296.414062,101718.210938,102155.710938,103861.382812,752.952294
u_component_of_wind_10m,2921.0,-0.020505,-2.945692,-0.60731,-0.00428,0.564058,2.711296,0.820757
dewpoint_2m_temperature,2921.0,5.175883,-17.596518,0.29165,5.546625,10.570306,17.375482,6.51973


In [7]:
test_df.isnull().values.any()

False

## Visuals

In [8]:
px.line(test_df, x="date", y="mean_2m_air_temperature", markers=True)

  v = v.dt.to_pydatetime()


In [9]:
px.line(test_df, x="date", y="maximum_2m_air_temperature")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [10]:
px.line(test_df, x="date", y="minimum_2m_air_temperature")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [11]:
px.line(test_df, x="date", y="dewpoint_2m_temperature")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [12]:
px.line(test_df, x="date", y="total_precipitation")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [13]:
px.line(test_df, x="date", y="surface_pressure")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [14]:
px.line(test_df, x="date", y="mean_sea_level_pressure")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [15]:
px.line(test_df, x="date", y="u_component_of_wind_10m")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [16]:
px.line(test_df, x="date", y="v_component_of_wind_10m")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



## General Temporal Aggregates

In [17]:
# NOTE(review): get_seasonal_aggregates is probably already in scope through
# `from utils import *` in the setup cell — confirm, then move or drop this
# mid-notebook import.
from utils import get_seasonal_aggregates

# Aggregate the daily series in test_df with the "mean" and "std" functions.
# The displayed result has columns named like "mean_of_<variable>_in_<season>".
df_general_aggregates = get_seasonal_aggregates(
    df_in=test_df,
    timescale_days_to_months="fall cut-off",
    fcts_to_apply=["mean", "std"],
    debug=False,
    verbose=False,
)

# Show the first rows of the aggregate table.
df_general_aggregates.head(4)

Unnamed: 0,mean_of_minimum_2m_air_temperature_in_summer,mean_of_minimum_2m_air_temperature_in_spring,mean_of_minimum_2m_air_temperature_in_fall,mean_of_minimum_2m_air_temperature_in_winter,mean_of_total_precipitation_in_summer,mean_of_total_precipitation_in_spring,mean_of_total_precipitation_in_fall,mean_of_total_precipitation_in_winter,mean_of_v_component_of_wind_10m_in_summer,mean_of_v_component_of_wind_10m_in_spring,...,std_of_u_component_of_wind_10m_in_fall,std_of_u_component_of_wind_10m_in_winter,std_of_dewpoint_2m_temperature_in_summer,std_of_dewpoint_2m_temperature_in_spring,std_of_dewpoint_2m_temperature_in_fall,std_of_dewpoint_2m_temperature_in_winter,std_of_mean_2m_air_temperature_in_summer,std_of_mean_2m_air_temperature_in_spring,std_of_mean_2m_air_temperature_in_fall,std_of_mean_2m_air_temperature_in_winter
0,12.577023,4.391066,6.298001,-2.331363,0.004134,0.004051,0.003969,0.004346,-0.075531,-0.198187,...,0.759555,0.769941,2.633856,4.061419,4.695206,4.092594,3.387329,4.312077,4.985153,3.771338


## Heatwave Detection


### Heat-wave Metrics

**Frequency**
- Count: the number of heat waves that occurred.

**Duration**
- Longest: the duration of the longest heat wave
- Average: the average duration of heat waves
- Total: the total number of heat-wave days

**Season length**
- Length: the number of days between the first heat wave of the year and the last.
- ~~Earliest: the DOY of the earliest heat wave~~
- ~~Latest: the DOY of the latest heat wave~~

**Intensity** 
- Hottest: the maximum temperature during the heat wave
- Average: the average temperature during the heat wave

---

**Notes**
- Note that these metrics are extracted from the mean air temperature.
- Note that these metrics can be extracted either for the time within the census interval or for the 1-2 years before the first census.
- Note that there are more elaborate methods to extract heatwaves. For simplicity we are using the 3-days-above-30 approach.
    -   Alternatives: [hotspell python](https://hotspell.readthedocs.io/en/latest/user_guide/tutorial.html), [heatwaveR](https://robwschlegel.github.io/heatwaveR/index.html)

---
**TODO:**

- Set the date filter so that the window fully includes the first summer of the year of sampling and extends until the winter of the last year.

### Test Run

In [18]:
# Extract heat-wave metrics from the daily mean air temperature of the test site.
# NOTE(review): the notes above describe a "3-days-above-30" rule, but this
# test run passes a temperature threshold of 24 — presumably lowered so the
# test site yields events. Confirm the intended threshold.
df_hw = extract_heatwave_metrics(
    df_in=test_df,
    threshold_temperature=24,
    threshold_days=3,
    variable_of_interest="mean_2m_air_temperature",
)

# Display the resulting metrics table.
df_hw

Unnamed: 0,hw_counts,hw_dur_max,hw_dur_mean,hw_day_sum,hw_days_between,hw_mean_temp,hw_max_temp
0,2,7,5.0,10,7,26.023846,27.370966


## Frost Spell Detection

### Metrics

**Late-Spring Frost Events**

- Metric: Cumulative Growing Degree Days (GDD) before last frost in spring:
- Definitions: 
    -   GDD = Day on which the daily mean temperature is above a certain threshold
    -   Frost Event: Daily minimum temperature is below threshold (0 degC)

**Early-Fall Frost Event**
- Metric: DOY of first frost event after summer.

**Resources**
- See here: https://www.pnas.org/doi/full/10.1073/pnas.1920816117

### Spring and Fall Frost

In [19]:
detect_frost_events(df_in=test_df)

Unnamed: 0,max_gdd_before_spring_frost,min_doy_of_fall_frost
0,70.0,290.0


## Drought Detection



TODO: I think CWD can only be calculated from the hourly values.

---

## Storm Detection


### Metrics

-  

# Workflow to process all sites

### 1 Site

In [2]:
# Load all sites
all_sites = pd.read_csv("../01_download_raw_gee_data/sites_years.csv")
all_sites

Unnamed: 0,first_visit,id,x,y
0,2011,1,-2.842824,48.337505
1,2012,2,3.349757,46.198025
2,2012,3,3.361577,46.827747
3,2012,4,0.402182,48.201563
4,2012,5,6.461081,43.281648
...,...,...,...,...
38297,2016,38298,2.156438,42.686748
38298,2016,38299,7.424664,47.899971
38299,2016,38300,-3.221877,48.851908
38300,2016,38301,3.757202,44.924973


In [6]:
# Test for one site: just type in the id of interest.
i = 765
i = i - 1  # Subtract 1 because Python positions start at 0 (assumes ids run 1..N in row order).

# The site table shown in the load cell's output names the year column
# "first_visit", while this cell originally indexed "first_year". Accept
# either name so the lookup does not fail with a KeyError.
year_col = "first_year" if "first_year" in all_sites.columns else "first_visit"

test_df = load_and_merge_files(
    SiteID=all_sites["id"].iloc[i],
    first_year=all_sites[year_col].iloc[i],
    subdir="era5-daily",
    verbose=False,
)

test_df

Unnamed: 0,date,minimum_2m_air_temperature,SiteID,total_precipitation,v_component_of_wind_10m,surface_pressure,maximum_2m_air_temperature,mean_sea_level_pressure,u_component_of_wind_10m,dewpoint_2m_temperature,mean_2m_air_temperature,first_year
0,2008-01-01,269.621857,765,0.000000,-0.875533,92972.093750,275.822906,102265.937500,-0.845897,268.229370,272.278259,2010
1,2008-01-02,269.760162,765,0.000000,0.074294,92302.898438,276.186279,101480.226562,-1.168203,266.365631,272.373779,2010
2,2008-01-03,271.698425,765,0.000019,0.545303,91420.476562,277.380615,100473.507812,-1.726469,267.181549,273.360260,2010
3,2008-01-04,272.826508,765,0.000265,0.843249,92073.210938,278.443176,101115.031250,-0.961461,270.371704,275.343231,2010
4,2008-01-05,274.798767,765,0.022994,2.357031,92408.757812,279.828827,101456.617188,0.053011,274.009308,277.551208,2010
...,...,...,...,...,...,...,...,...,...,...,...,...
2916,2015-12-26,271.498535,765,0.000000,1.128221,94156.882812,283.052460,103251.914062,-0.420447,272.497101,275.722839,2010
2917,2015-12-27,270.975494,765,0.000000,0.699852,94119.601562,283.087677,103180.882812,-0.591978,271.121796,274.917633,2010
2918,2015-12-28,270.547516,765,0.000001,0.826686,93787.992188,282.643158,102801.781250,-0.730592,270.747803,274.794952,2010
2919,2015-12-29,271.171753,765,0.003211,0.988237,93810.093750,279.669861,102948.031250,-0.203482,272.208191,275.217590,2010


In [None]:
perform_wrangling_on_all_sites(my_group=test_df)

### 1000 Sites

In [10]:
# Row positions delimiting the subset of sites to load.
i_1 = 0  # start position (inclusive)
i_0 = 1000  # end position (exclusive)

sites_subset = all_sites.iloc[i_1:i_0, :]

# The site table shown in the load cell's output names the year column
# "first_visit", while this cell originally indexed "first_year". Accept
# either name so the lookup does not fail with a KeyError.
year_col = "first_year" if "first_year" in sites_subset.columns else "first_visit"

# Collect the per-site frames in a list and concatenate once at the end;
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration (quadratic cost).
site_frames = []

for i in range(len(sites_subset)):
    print(f"\014 Working on site: {i}")
    df_tmp = load_and_merge_files(
        sites_subset["id"].iloc[i],
        sites_subset[year_col].iloc[i],
        subdir="era5-daily",
        verbose=False,
    )

    site_frames.append(df_tmp)

# pd.concat raises on an empty list, so fall back to an empty frame.
subset_all_data = pd.concat(site_frames, axis=0) if site_frames else pd.DataFrame()

subset_all_data

 Working on site: 0


In [22]:
# Run the wrangling once per site and stack the per-site results.
grouped = subset_all_data.groupby("SiteID")

# Collect the per-site results in a list and concatenate once at the end;
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration (quadratic cost).
site_results = []

for name, group in grouped:
    print(f"\014 Working on site: {name}")
    out = perform_wrangling_on_all_sites(group)
    out["SiteID"] = name  # tag the result with the site it belongs to
    site_results.append(out)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_out = pd.concat(site_results, axis=0) if site_results else pd.DataFrame()

 Working on site: 1
 Working on site: 2
 Working on site: 3
 Working on site: 4
 Working on site: 5
 Working on site: 6
 Working on site: 7
 Working on site: 8
 Working on site: 9
 Working on site: 10
 Working on site: 11
 Working on site: 12
 Working on site: 13
 Working on site: 14
 Working on site: 15
 Working on site: 16
 Working on site: 17
 Working on site: 18
 Working on site: 19
 Working on site: 20
 Working on site: 21
 Working on site: 22
 Working on site: 23
 Working on site: 24
 Working on site: 25
 Working on site: 26
 Working on site: 27
 Working on site: 28
 Working on site: 29
 Working on site: 30
 Working on site: 31
 Working on site: 32
 Working on site: 33
 Working on site: 34
 Working on site: 35
 Working on site: 36
 Working on site: 37
 Working on site: 38
 Working on site: 39
 Working on site: 40
 Working on site: 41
 Working on site: 42
 Working on site: 43
 Working on site: 44
 Working on site: 45
 Working on site: 

In [None]:
df_out

Unnamed: 0,SiteID,first_year,max_gdd_before_spring_frost,min_doy_of_fall_frost,hw_counts,hw_dur_max,hw_dur_mean,hw_day_sum,hw_days_between,hw_mean_temp,...,std_of_u_component_of_wind_10m_in_fall,std_of_u_component_of_wind_10m_in_winter,std_of_dewpoint_2m_temperature_in_summer,std_of_dewpoint_2m_temperature_in_spring,std_of_dewpoint_2m_temperature_in_fall,std_of_dewpoint_2m_temperature_in_winter,std_of_mean_2m_air_temperature_in_summer,std_of_mean_2m_air_temperature_in_spring,std_of_mean_2m_air_temperature_in_fall,std_of_mean_2m_air_temperature_in_winter
0,1,2011,71.0,,0,,,,,,...,2.619938,2.802670,2.226777,3.194698,3.257241,3.455200,2.278503,3.048724,3.254536,3.022379
0,2,2012,85.0,284.0,0,,,,,,...,1.635715,1.790203,2.643025,3.899520,4.153596,3.503203,3.458285,4.101337,5.068428,3.734379
0,3,2012,88.0,302.0,0,,,,,,...,1.856179,2.052064,2.640743,4.043389,4.104312,3.617783,3.405807,4.052677,4.868852,3.544068
0,4,2012,91.0,308.0,0,,,,,,...,2.348274,2.409330,2.607817,3.862305,3.775565,3.719428,3.063142,3.770576,4.167096,3.366450
0,5,2012,58.0,314.0,0,,,,,,...,2.448248,2.317300,3.092643,4.002670,4.915545,4.124628,2.402515,3.521551,4.623864,2.647756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,996,2010,62.0,294.0,0,,,,,,...,2.466466,2.697261,2.957163,4.284772,4.146851,4.387694,3.486642,4.362452,4.904833,3.852004
0,997,2010,64.0,303.0,0,,,,,,...,1.444291,1.728571,2.670356,3.730016,4.448066,3.749816,3.062309,3.766424,4.847789,3.302101
0,998,2010,64.0,303.0,0,,,,,,...,3.291618,3.139028,2.865181,3.577041,4.477945,3.974889,3.080216,3.604486,5.083651,3.521260
0,999,2010,29.0,,0,,,,,,...,2.932396,3.531742,2.361712,3.105515,3.766668,3.657007,2.227554,2.981601,3.776402,3.200272


### Parallel

In [3]:
# Load all sites
all_sites = pd.read_csv("../01_download_raw_gee_data/sites_years.csv")
all_sites

Unnamed: 0,first_visit,id,x,y
0,2011,1,-2.842824,48.337505
1,2012,2,3.349757,46.198025
2,2012,3,3.361577,46.827747
3,2012,4,0.402182,48.201563
4,2012,5,6.461081,43.281648
...,...,...,...,...
38297,2016,38298,2.156438,42.686748
38298,2016,38299,7.424664,47.899971
38299,2016,38300,-3.221877,48.851908
38300,2016,38301,3.757202,44.924973


In [6]:
# Work on a copy so that all_sites itself is left untouched.
sites_subset = all_sites.copy()
# Optional (currently disabled): restrict the run to the first 12979 rows.
# sites_subset = sites_subset.iloc[0:12979, :]

# Sanity check: print (rows, columns) and show the first row.
print(sites_subset.shape)
sites_subset.head(1)

(12979, 4)


Unnamed: 0,first_visit,id,x,y
0,2011,1,-2.842824,48.337505


In [7]:
# Split the site table into one DataFrame per site id; the group key itself
# is not needed, only the rows.
grouped = sites_subset.groupby("id", as_index=False)
df_list = [site_rows for _, site_rows in grouped]

# NOTE(review): run_mp presumably applies load_and_wrangle_PARALLEL to every
# element of df_list across num_cores workers and merges the results with
# combine_func — confirm against run_mp.py.
df_out = run_mp(
    load_and_wrangle_PARALLEL,
    df_list,
    combine_func=pd.concat,
    progress_bar=True,
    num_cores=10,
    subdir="era5-daily",
    verbose=False,
)

100%|██████████| 12979/12979 [01:32<00:00, 140.78it/s]


In [11]:
df_out

Unnamed: 0,SiteID,first_year,max_gdd_before_spring_frost,min_doy_of_fall_frost,hw_counts,hw_dur_max,hw_dur_mean,hw_day_sum,hw_days_between,hw_mean_temp,...,std_of_u_component_of_wind_10m_in_fall,std_of_u_component_of_wind_10m_in_winter,std_of_dewpoint_2m_temperature_in_summer,std_of_dewpoint_2m_temperature_in_spring,std_of_dewpoint_2m_temperature_in_fall,std_of_dewpoint_2m_temperature_in_winter,std_of_mean_2m_air_temperature_in_summer,std_of_mean_2m_air_temperature_in_spring,std_of_mean_2m_air_temperature_in_fall,std_of_mean_2m_air_temperature_in_winter
0,1,2011,71.0,,0,,,,,,...,2.619938,2.802670,2.226777,3.194698,3.257241,3.455200,2.278503,3.048724,3.254536,3.022379
0,2,2012,85.0,284.0,0,,,,,,...,1.635715,1.790203,2.643025,3.899520,4.153596,3.503203,3.458285,4.101337,5.068428,3.734379
0,3,2012,88.0,302.0,0,,,,,,...,1.856179,2.052064,2.640743,4.043389,4.104312,3.617783,3.405807,4.052677,4.868852,3.544068
0,4,2012,91.0,308.0,0,,,,,,...,2.348274,2.409330,2.607817,3.862305,3.775565,3.719428,3.063142,3.770576,4.167096,3.366450
0,5,2012,58.0,314.0,0,,,,,,...,2.448248,2.317300,3.092643,4.002670,4.915545,4.124628,2.402515,3.521551,4.623864,2.647756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,12975,2011,72.0,288.0,0,,,,,,...,0.749352,0.961371,2.548637,4.039474,4.487204,3.822723,3.036904,3.950721,4.690910,3.437288
0,12976,2011,60.0,284.0,0,,,,,,...,1.908012,2.236785,2.503212,4.055288,4.324468,4.436239,3.397147,3.965016,4.666734,3.908839
0,12977,2011,58.0,295.0,0,,,,,,...,1.249327,1.534438,2.899555,4.423644,4.194499,4.240465,3.535718,4.661139,4.908843,4.006709
0,12978,2011,68.0,289.0,0,,,,,,...,2.706190,2.638805,2.586029,3.707108,4.305299,4.460985,3.002345,3.690993,4.776864,3.699087
