# Seasonal Aggregation of EDO Data


# Import and Functions


In [1]:
# Import and Magic

# Magic
# %load_ext autoreload
# %autoreload 2
%matplotlib inline

# Imports

from tqdm import tqdm
import rasterio
import pandas as pd
import glob
import numpy as np
import os
import folium
import matplotlib.pyplot as plt
import chime

# Use the "mario" chime theme (chime.success() is called once the data has been saved)
chime.theme("mario")

# Import Functions
import sys

# Put ../../src first on the module search path so the project modules below can be imported
sys.path.insert(0, "../../src")
# NOTE(review): these wildcard imports hide where names come from. The cells below call
# run_mp, put_df_into_10lists_by_keeping_idp_together, seasonal_aggregation_per_site and
# extract_extreme_events_per_idp, none of which is defined in this notebook -- presumably
# they are provided by these modules; confirm and consider importing them by name.
from run_mp import *
from utilities import *
from gee_data_wrangling import *

from ydata_profiling import ProfileReport

# 👉 User Input

- Currently, each dataset has to be processed individually.
- Set `my_variable` (and `take_subset`) in the cell below, then run the whole notebook.


In [2]:
# Name of the variable to process; it selects the input file
# edo_qc_files/data_edo_<my_variable>_raw.feather (or ..._raw_subset.feather)
my_variable = "fapar"
# True: read the "_raw_subset" input and write "_subset" outputs; False: use the full files
take_subset = False

In [5]:
# List all available extracted data
pattern = "edo_qc_files/data_edo*"

# Match all files with pattern in working directory
files = glob.glob(pattern)


def variable_name_from_path(path, suffix):
    """Return the variable name encoded in a raw-data file path.

    Parameters
    ----------
    path : str
        Path as returned by ``glob.glob``, e.g.
        ``"edo_qc_files/data_edo_fapar_raw.feather"``.
    suffix : str
        Trailing part of the file name to remove, e.g. ``"_raw.feather"``.

    Returns
    -------
    str
        The file name without its directory, ``"data_edo_"`` and ``suffix``.
    """
    # basename() drops the directory regardless of the separator glob returned,
    # so no hard-coded "edo_qc_files/" prefix has to be stripped
    return os.path.basename(path).replace("data_edo_", "").replace(suffix, "")


# Ignore files that have "aggregated" in them because they are the output of this notebook
raw_files = [f for f in files if "aggregated" not in f]

# Split by whether the filename contains 'subset', keep only the variable name,
# and sort alphabetically
files_full = sorted(
    variable_name_from_path(f, "_raw.feather") for f in raw_files if "subset" not in f
)
files_sub = sorted(
    variable_name_from_path(f, "_raw_subset.feather")
    for f in raw_files
    if "subset" in f
)

print(
    f"List 'files_sub'\tholds {len(files_sub)} datasets for subsetted sites for:\t {files_sub}"
)
print(
    f"List 'files_full'\tholds {len(files_full)} datasets for all sites for: \t\t {files_full}"
)

List 'files_sub'	holds 0 datasets for subsetted sites for:	 []
List 'files_full'	holds 2 datasets for all sites for: 		 ['cdi', 'fapar']


In [6]:
# Variables that skip the standard seasonal aggregation further down
separately_processed = ["heatw", "cdi"]
print("Datasets that are processed separately: " + str(separately_processed))

Datasets that are processed separately: ['heatw', 'cdi']


# Load File


In [7]:
# Build the path of the raw file for the selected variable
if take_subset:
    my_file = f"edo_qc_files/data_edo_{my_variable}_raw_subset.feather"
else:
    my_file = f"edo_qc_files/data_edo_{my_variable}_raw.feather"

# Stop here if the file is missing. Only printing a message would let the following
# cells run with an undefined `df`, or with a `df` left over from a previous run
# that belongs to a different variable.
if not os.path.isfile(my_file):
    print(f"❌ File '{my_file}' not found.")
    raise FileNotFoundError(my_file)

print(f"✅ File '{my_file}' found.")

# Load file
df = pd.read_feather(my_file)
display(df.shape)
display(df.head())

✅ File 'edo_qc_files/data_edo_cdi_raw.feather' found.


(9868176, 10)

Unnamed: 0,idp,date,cdi,first_year,x,y,start_year,end_year,season,before_first_year
0,500002,2012-01-01,0.0,2010,4126250.0,2796779.0,2005,2015,winter,False
1,641925,2012-01-01,0.0,2011,3941989.0,2682758.0,2006,2016,winter,False
2,641916,2012-01-01,0.0,2011,3922468.0,2708860.0,2006,2016,winter,False
3,1105437,2012-01-01,0.0,2016,4064066.0,2777903.0,2011,2021,winter,True
4,641908,2012-01-01,0.0,2011,3893755.0,2922824.0,2006,2016,winter,False


# Standard Seasonal Aggregation


In [23]:
# Names of the statistics requested from the seasonal aggregation
# (handed to the aggregation call as `fcts_to_apply`)
agg_functions = [
    "mean",
    "std",
    "range",
    "median",
    "iqr",
]

In [24]:
# Run seasonal aggregation
def aggregate_period(df_period, suffix):
    """Run the seasonal aggregation for one period and tag the result columns.

    Parameters
    ----------
    df_period : pandas.DataFrame
        Rows of ``df`` belonging to one period, without the
        ``before_first_year`` column.
    suffix : str
        Suffix appended to every resulting column except ``idp``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(aggregated, suffixed)``: the concatenated output of ``run_mp`` and
        the same data with suffixed column names and a fresh 0..n-1 index.
    """
    # Multiprocessing: split into chunks first, then aggregate the chunks in parallel
    chunks = put_df_into_10lists_by_keeping_idp_together(df_period)
    aggregated = pd.concat(
        run_mp(
            seasonal_aggregation_per_site,
            chunks,
            num_cores=10,
            progress_bar=True,
            current_var=my_variable,
            fcts_to_apply=agg_functions,
        )
    )

    # Attach the suffix to all variables except idp, which stays the merge key
    suffixed = (
        aggregated.add_suffix(suffix)
        .rename(columns={f"idp{suffix}": "idp"})
        .reset_index(drop=True)
    )
    return aggregated, suffixed


if my_variable not in separately_processed:
    # AFTER
    # After first visit. Time between first_year and first_year + 5
    # No need to adjust dates
    print("Processing data after first visit")
    rows_after = df[df["before_first_year"].eq(False)].drop(
        columns=["before_first_year"]
    )
    df_after, df_after_sf = aggregate_period(rows_after, "_tpls5")

    # BEFORE
    # Before first visit. Time between first_year - 5 and first_year
    # Need to subtract 5 from first_year to match function format
    print("Processing data before first visit")
    rows_before = df[df["before_first_year"].eq(True)].drop(
        columns=["before_first_year"]
    )
    rows_before = rows_before.assign(first_year=rows_before["first_year"] - 5)
    df_before, df_before_sf = aggregate_period(rows_before, "_tmin5")

    # Merge variables into one dataframe. After suffixing, idp is the only column
    # the two frames share, so it is named explicitly as the key.
    df_final = pd.merge(
        df_before_sf,
        df_after_sf,
        on="idp",
        how="left",
        validate="one_to_one",
    )

Processing data after first visit


100%|██████████| 11/11 [00:55<00:00,  5.05s/it]


Processing data after before visit


100%|██████████| 11/11 [01:00<00:00,  5.49s/it]


# Extraction of Heat- and Coldwaves


## Merge heatwave and temperature data


In [25]:
# Combine all needed raw data
if my_variable == "heatw":
    # To extract max and min temperatures during waves, we need these files too
    raw_suffix = "_raw_subset" if take_subset else "_raw"
    df_tmin = pd.read_feather(f"edo_qc_files/data_edo_mintmp{raw_suffix}.feather")
    df_tmax = pd.read_feather(f"edo_qc_files/data_edo_maxtmp{raw_suffix}.feather")

    # Site and day identify a record in all three tables
    merge_keys = ["idp", "date"]
    df_tmin = df_tmin[merge_keys + ["mintmp"]]
    df_tmax = df_tmax[merge_keys + ["maxtmp"]]

    # Attach to heatwave dataset; the keys are spelled out instead of relying on
    # pandas picking whatever columns the frames happen to share
    df_t = pd.merge(df_tmin, df_tmax, on=merge_keys, how="left", validate="one_to_one")
    df_full = pd.merge(df, df_t, on=merge_keys, how="left", validate="one_to_one")

    # Attach information whether a record falls into the heat- or coldwave months
    heat_months = [4, 5, 6, 7, 8, 9]
    in_heat_months = df_full["date"].dt.month.isin(heat_months)
    df_full["heat_or_cold"] = in_heat_months.map({True: "heatwave", False: "coldwave"})

    # Remove data from before 1. September of first year and after 31. August of last year
    first_day = pd.to_datetime(f"{df_full.start_year.min()}-09-01")
    last_day = pd.to_datetime(f"{df_full.end_year.max()}-08-31")

    df_full = df_full[(df_full.date >= first_day) & (df_full.date <= last_day)]

    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the preview has to be displayed explicitly
    display(df_full.head(3))

## Example for Data Structure


In [26]:
# Example of data structure
if my_variable == "heatw":
    # Draw one record reproducibly; its site and first-visit year define the example
    sample_number = 3  # Good example is site 957543
    sample_row = df.sample(1, random_state=sample_number)
    sample_site = sample_row["idp"].values[0]
    first_visit_year = sample_row["first_year"].values[0]

    # Years first_visit_year - 3 ... first_visit_year + 2 (np.arange excludes the stop).
    # The leading first_visit_year repeats an entry of that range, which does not
    # matter for the membership filter below.
    sample_years = [first_visit_year] + np.arange(
        first_visit_year - 3, first_visit_year + 3
    ).tolist()

    sample_df = df_full.query("idp == @sample_site and date.dt.year in @sample_years")

    # Define marker styles for different seasons
    marker_styles = {"winter": "o", "spring": "^", "summer": "s", "autumn": "x"}

    def scatter_by_group(axes, extra_mask=None, labelled=False):
        """Scatter sample_df on `axes`, one call per (heat_or_cold, season) pair.

        `extra_mask` optionally narrows the rows further; `labelled` controls
        whether each scatter call gets a legend label.
        """
        for wave_type in sample_df["heat_or_cold"].unique():
            wave_colour = "orange" if wave_type == "heatwave" else "cornflowerblue"
            for season_name in sample_df["season"].unique():
                # Rows of the current combination
                mask = (sample_df["heat_or_cold"] == wave_type) & (
                    sample_df["season"] == season_name
                )
                if extra_mask is not None:
                    mask = mask & extra_mask
                points = sample_df[mask]

                axes.scatter(
                    points["date"],
                    points[my_variable],
                    label=f"{wave_type}, {season_name}" if labelled else None,
                    color=wave_colour,
                    marker=marker_styles.get(season_name, "x"),
                )  # default to 'x' if season not in marker_styles

    # Main panel: the whole sampled time window
    plt.figure(figsize=(15, 5))
    main_ax = plt.gca()
    scatter_by_group(main_ax, labelled=True)

    ymax = np.nanmax(sample_df[my_variable]) * 1.1
    main_ax.set_ylim(0, ymax)
    main_ax.set_title(f"Timeseries for site {sample_site}", loc="left")
    main_ax.set_xlabel("Date")
    main_ax.set_ylabel("Total Duration of x-Wave in Days")
    main_ax.legend(bbox_to_anchor=(0.55, 1.5))

    # Inset for the year 2015
    # The values are [left, bottom, width, height], all in fractions of figure width and height
    inset_ax = plt.axes(
        [0.58, 0.95, 0.3, 0.31]
    )  # Modify these values as needed for your figure layout
    scatter_by_group(inset_ax, extra_mask=sample_df["date"].dt.year == 2015)

    inset_ax.set_title("Example for 2015", fontsize=10, loc="left")
    inset_ax.set_xlabel("")
    inset_ax.set_ylabel("")
    inset_ax.set_ylim(0, ymax)

    plt.show()

## Parallel processing of all sites


In [27]:
# Process extreme events parallel
if my_variable == "heatw":
    # Split df_full into one dataframe per idp
    df_list = [site_df for _, site_df in df_full.groupby("idp")]

    # Run the per-site extraction in parallel; results are combined with pd.concat
    df_final = run_mp(
        extract_extreme_events_per_idp,
        df_list,
        num_cores=10,
        progress_bar=True,
        combine_func=pd.concat,
    )

# Quality Control and Reporting


In [28]:
# Quick check
display(f"Shape of full dataset: {df_final.shape}")
display("Glimpse over full dataset:", df_final)
display("Description of full dataset:", df_final.describe())
display("Info of full dataset:")
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() would add a stray "None" line after the summary
df_final.info(memory_usage="deep")

'Shape of full dataset: (40022, 41)'

'Glimpse over full dataset:'

Unnamed: 0,idp,mean_of_mintmp_in_winter_tmin5,mean_of_mintmp_in_spring_tmin5,mean_of_mintmp_in_summer_tmin5,mean_of_mintmp_in_fall_tmin5,std_of_mintmp_in_winter_tmin5,std_of_mintmp_in_spring_tmin5,std_of_mintmp_in_summer_tmin5,std_of_mintmp_in_fall_tmin5,range_of_mintmp_in_winter_tmin5,...,range_of_mintmp_in_winter_tpls5,range_of_mintmp_in_spring_tpls5,median_of_mintmp_in_summer_tpls5,median_of_mintmp_in_fall_tpls5,median_of_mintmp_in_winter_tpls5,median_of_mintmp_in_spring_tpls5,iqr_of_mintmp_in_summer_tpls5,iqr_of_mintmp_in_fall_tpls5,iqr_of_mintmp_in_winter_tpls5,iqr_of_mintmp_in_spring_tpls5
0,500002,-1.167567,5.082934,13.056312,6.491511,4.469240,4.980752,2.994602,4.954806,28.0,...,25.0,24.0,13.3025,6.3025,-0.6975,5.3025,4.000000,7.0,5.0,7.0
1,500008,-1.050294,4.629457,12.607533,6.305687,4.943270,5.081294,3.174573,5.107635,28.0,...,25.0,25.0,12.6925,7.6925,0.6925,5.6925,5.000000,7.0,6.0,6.0
2,500012,0.198922,5.766174,13.414000,7.638175,4.354854,4.728569,2.953793,4.761523,26.0,...,25.0,25.0,13.4140,8.4140,1.4140,6.4140,4.000000,7.0,5.0,6.0
3,500013,1.854845,6.130652,13.154281,8.667418,4.452852,4.240083,2.685948,4.640242,24.0,...,25.0,25.0,13.2850,9.2850,3.2850,6.2850,4.000000,6.0,6.0,5.0
4,500042,1.660330,6.287261,13.542224,8.577682,4.451806,4.308534,2.734696,4.717759,25.0,...,23.0,25.0,13.2590,9.2590,2.2590,6.2590,4.000000,7.0,6.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40017,1131409,-4.967814,-0.624913,7.756625,2.534714,3.439935,3.417463,2.622843,4.110494,19.0,...,21.0,19.0,8.2490,2.2490,-3.7510,0.2490,4.000000,6.0,4.0,5.0
40018,1131410,1.304925,6.124413,14.883834,8.576874,4.325898,4.556230,2.944909,4.389536,27.0,...,25.0,26.0,15.5505,7.5505,0.5505,6.5505,3.000001,6.0,6.0,7.0
40019,1131419,6.525177,7.575870,13.736024,11.429834,3.238585,2.816441,2.086202,2.945704,18.0,...,16.0,14.0,14.0650,10.0650,5.0650,8.0650,3.000001,3.5,6.0,3.0
40020,1131424,-2.419960,2.030848,10.317677,4.877599,4.020809,3.735842,2.587091,4.502508,24.0,...,22.0,20.0,11.3765,4.3765,-2.6235,2.3765,3.000000,6.0,5.0,6.0


'Description of full dataset:'

Unnamed: 0,idp,mean_of_mintmp_in_winter_tmin5,mean_of_mintmp_in_spring_tmin5,mean_of_mintmp_in_summer_tmin5,mean_of_mintmp_in_fall_tmin5,std_of_mintmp_in_winter_tmin5,std_of_mintmp_in_spring_tmin5,std_of_mintmp_in_summer_tmin5,std_of_mintmp_in_fall_tmin5,range_of_mintmp_in_winter_tmin5,...,range_of_mintmp_in_winter_tpls5,range_of_mintmp_in_spring_tpls5,median_of_mintmp_in_summer_tpls5,median_of_mintmp_in_fall_tpls5,median_of_mintmp_in_winter_tpls5,median_of_mintmp_in_spring_tpls5,iqr_of_mintmp_in_summer_tpls5,iqr_of_mintmp_in_fall_tpls5,iqr_of_mintmp_in_winter_tpls5,iqr_of_mintmp_in_spring_tpls5
count,40022.0,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0,...,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0,39881.0
mean,819326.0,0.460025,5.492171,13.086027,7.630863,4.180585,4.082815,2.790611,4.510898,23.056593,...,21.503046,21.073996,13.70365,8.251944,1.49182,5.505573,3.887666,6.02787,5.348838,5.669034
std,191038.2,2.605654,2.050266,1.922903,2.238308,0.449134,0.397443,0.262327,0.350942,2.864393,...,2.603051,2.353771,1.944173,2.23165,2.605982,2.096488,0.558557,0.672046,0.809407,0.742957
min,500002.0,-15.78938,-7.657826,0.679542,-6.635275,2.387192,2.524908,1.648726,2.818453,14.0,...,13.0,14.0,1.9125,-5.51,-12.51,-7.51,1.999999,3.5,2.0,3.0
25%,650634.2,-0.712297,4.853631,12.547084,6.731973,3.905266,3.856753,2.648578,4.349582,21.0,...,20.0,19.0,13.01,7.399,0.399,4.843,3.999999,6.0,5.0,5.0
50%,819978.5,0.833681,5.767978,13.238967,7.865593,4.303505,4.079128,2.847617,4.551745,24.0,...,22.0,21.0,13.8755,8.479,1.8095,5.8225,4.0,6.0,5.0,6.0
75%,1001783.0,2.103,6.605978,13.956728,8.912489,4.494506,4.360791,2.9721,4.719116,25.0,...,23.0,23.0,14.674,9.5125,3.181,6.649,4.0,6.0,6.0,6.0
max,1131437.0,8.403823,12.439218,20.779337,16.007517,5.43615,5.258202,3.950966,5.625982,35.0,...,35.0,29.0,21.323999,16.323999,9.324,12.324,6.0,8.0,9.0,8.0


'Info of full dataset:'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40022 entries, 0 to 40021
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   idp                               40022 non-null  int64  
 1   mean_of_mintmp_in_winter_tmin5    39881 non-null  float32
 2   mean_of_mintmp_in_spring_tmin5    39881 non-null  float32
 3   mean_of_mintmp_in_summer_tmin5    39881 non-null  float32
 4   mean_of_mintmp_in_fall_tmin5      39881 non-null  float32
 5   std_of_mintmp_in_winter_tmin5     39881 non-null  float64
 6   std_of_mintmp_in_spring_tmin5     39881 non-null  float64
 7   std_of_mintmp_in_summer_tmin5     39881 non-null  float64
 8   std_of_mintmp_in_fall_tmin5       39881 non-null  float64
 9   range_of_mintmp_in_winter_tmin5   39881 non-null  float32
 10  range_of_mintmp_in_spring_tmin5   39881 non-null  float32
 11  range_of_mintmp_in_summer_tmin5   39881 non-null  float32
 12  rang

In [29]:
# Build the profiling report for the aggregated data
report = ProfileReport(df_final, title=f"Report for {my_variable}", minimal=True)

# Write it next to the data; subset runs get their own file name
report_path = (
    f"edo_qc_files/report_{my_variable}_subset.html"
    if take_subset
    else f"edo_qc_files/report_{my_variable}.html"
)
report.to_file(report_path)

# Show report (must stay the last expression of the cell to be rendered)
report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]



# Save Data


In [30]:
# Save dataframe
# Output name carries "aggregated" so the file-listing cell can tell it apart from raw inputs
savedfeather = (
    f"edo_qc_files/data_edo_{my_variable}_subset_aggregated.feather"
    if take_subset
    else f"edo_qc_files/data_edo_{my_variable}_aggregated.feather"
)

# Write with a fresh 0..n-1 index
df_final.reset_index(drop=True).to_feather(savedfeather)

# pd.Timestamp.now() is used for the time stamp because `datetime` is not imported
# by name in the cells shown; it prints in the same "YYYY-MM-DD HH:MM:SS.ffffff" form
print(f"✅ Successfully saved dataframe:\t{savedfeather}\t at {pd.Timestamp.now()}")

# Notify that the notebook has finished
chime.success()

✅ Successfully saved dataframe:	data_edo_mintmp_aggregated.feather	 at 2023-12-11 19:52:52.381432
