# European Drought Observatory Data


In [1]:
# Magic
%load_ext autoreload
%autoreload 2

# Imports

from tqdm import tqdm
import rasterio
import pandas as pd
import glob
import numpy as np
import os
import folium

import chime

chime.theme("mario")

# Import Functions
import sys

sys.path.insert(0, "../../src")
from run_mp import *
from extract_raster_values import *
from gee_data_wrangling import *

## Load and Transform Coordinates

**Important:** The \*.tif are in EPSG:3035, so the coordinates need to be transformed to this CRS before extracting the data from the .tif files.


In [2]:
# Load latest coordinates for nfi sites
site_coordinates = pd.read_csv("../00_process_nfi_data/nfi_final_sites_with_idp.csv")

# Turn into geodataframe and set crs to 3035
df_sites = site_coordinates.copy()[["idp", "first_year", "x_fr", "y_fr"]]
df_sites = gpd.GeoDataFrame(
    df_sites,
    geometry=gpd.points_from_xy(df_sites.x_fr, df_sites.y_fr),
    crs="EPSG:2154",
)

# Turn crs from 2154 to 3035
df_sites_eu = df_sites.to_crs("EPSG:3035")

# From geodataframe, extract x and y locations
x_eu = df_sites_eu.geometry.x
y_eu = df_sites_eu.geometry.y

# Overwrite df_sites again and attach European coordinates
df_sites = site_coordinates.copy()[["idp", "first_year"]]
df_sites["x_eu"] = x_eu
df_sites["y_eu"] = y_eu

# Attach start and end year for each site for easier extraction
df_sites["start_year"] = df_sites.first_year - 5
df_sites["end_year"] = df_sites.first_year + 5

# Print info
print(df_sites.shape)
df_sites.head(3)

(40022, 6)


Unnamed: 0,idp,first_year,x_eu,y_eu,start_year,end_year
0,632691,2011,3373961.0,2885391.0,2006,2016
1,702597,2012,3807871.0,2587960.0,2007,2017
2,706240,2012,3814795.0,2657587.0,2007,2017


In [3]:
# Create folium map to see if nfi_plots still fall within France

df_map = df_sites_eu[:1000]

m = folium.Map(location=[46.5, 2], zoom_start=6)
folium.GeoJson(
    df_map,
    name="NFI Sites",
    tooltip=folium.features.GeoJsonTooltip(
        fields=["idp"], aliases=["NFI Plot"], localize=True
    ),
    marker=folium.CircleMarker(fill=True, color="black"),
).add_to(m)

m

# Load Raster Files


In [4]:
subfolder = "smi"

In [5]:
# Get folderpath for selected subfolder
folderpath = f"../../data/raw/edo/{subfolder}/"

# Get list of all files in folder
files = glob.glob(folderpath + "*/*.tif", recursive=True)

# Extract date from filename
dates = [x.split("/")[-1].split("_")[3] for x in files]

# Merge files and dates into dataframe
files_variables = pd.DataFrame({"file": files, "date": dates})

# Format date as datetime
files_variables["date"] = pd.to_datetime(files_variables.date, format="%Y%m%d")
files_variables = files_variables.sort_values("date")

# Attach variable information
files_variables["variable"] = subfolder

print(files_variables.shape)
display(files_variables[:3])

(504, 3)


Unnamed: 0,file,date,variable
429,../../data/raw/edo/smi/sminx_m_euu_20040101_20...,2004-01-01,smi
408,../../data/raw/edo/smi/sminx_m_euu_20040101_20...,2004-01-11,smi
423,../../data/raw/edo/smi/sminx_m_euu_20040101_20...,2004-01-21,smi


In [22]:
# Attach grouping variable for multiprocessing and create groups
files_variables["group"] = np.arange(len(files_variables)) % 10 + 1
grouped = files_variables.groupby("group")
df_list = [group for name, group in grouped]

display(df_list[0][:3])
display(df_list[-1][:3])

Unnamed: 0,file,date,variable,group
429,../../data/raw/edo/smi/sminx_m_euu_20040101_20...,2004-01-01,smi,1
414,../../data/raw/edo/smi/sminx_m_euu_20040101_20...,2004-04-11,smi,1
407,../../data/raw/edo/smi/sminx_m_euu_20040101_20...,2004-07-21,smi,1


Unnamed: 0,file,date,variable,group


## Extract Data


In [30]:
subset_of_coors = True

if subset_of_coors:
    df_sites_in = df_sites.sample(1000)
else:
    df_sites_in = df_sites

# Run in parallel
df_mp = run_mp(
    parallel_edo_extraction,
    df_list[:2],
    progress_bar=True,
    num_cores=10,
    df_sites=df_sites_in,
    debug=False,
)
chime.success()

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:11<00:00,  5.67s/it]


In [57]:
# Unlist results by concatenating them
df_unlisted = pd.concat(df_mp)

# Clean structure a bit
df_unlisted = df_unlisted.sort_values(["idp", "date"]).reset_index(drop=True)
df_unlisted.insert(0, "idp", df_unlisted.pop("idp"))
df_unlisted.insert(1, "date", df_unlisted.pop("date"))

# Set -9999 to NA
df_unlisted = df_unlisted.replace(-9999, np.nan)

# Attach season information
df_unlisted = match_season_to_month(df_unlisted)

# Attach information whether observation is from before or after first_year
for i in tqdm(range(len(df_unlisted))):
    iyr = df_unlisted.at[i, "date"].year
    df_unlisted.at[i, "file_date"] = iyr
    df_unlisted.at[i, "before_first_year"] = iyr < df_unlisted.iloc[i]["first_year"]

# Separate into before and after first_year
df_before = df_unlisted[df_unlisted.before_first_year == True]
df_after = df_unlisted[df_unlisted.before_first_year == False]

100%|██████████| 69584/69584 [00:04<00:00, 16474.16it/s]


In [40]:
def seasonal_aggregation_per_site(df_in, current_var):
    grouped = df_in.groupby("idp")  # Group by idp
    df_list = [group for name, group in grouped]  # Create list
    df_out = pd.DataFrame()  # Create empty dataframe for output

    for i in tqdm(range(len(df_list))):
        # for i in range(len(df_list)):
        current_group = df_list[i].copy()[
            ["idp", "date", "first_year", "season", current_var]
        ]
        current_idp = df_list[i].idp.unique()[0]

        df_i = get_seasonal_aggregates(
            df_in=current_group,
            timescale_days_to_months="fall cut-off",
            fcts_to_apply=["mean", "std"],
            debug=False,
            verbose=False,
        )

        df_i["idp"] = current_idp
        df_i.insert(0, "idp", df_i.pop("idp"))

        df_out = pd.concat([df_out, df_i])

    return df_out

In [58]:
df_after = df_after.drop(columns=["before_first_year"])
df_after = seasonal_aggregation_per_site(df_after, current_var=subfolder)

df_before = df_before.drop(columns=["before_first_year"])
df_before = seasonal_aggregation_per_site(df_before, current_var=subfolder)

100%|██████████| 1000/1000 [00:02<00:00, 412.97it/s]
100%|██████████| 1000/1000 [00:02<00:00, 406.30it/s]


In [68]:
# Attach suffix _5yrafter to all variables in df_after except idp
df_after_sf = (
    df_after.add_suffix("_Tpls5")
    .rename(columns={"idp_Tpls5": "idp"})
    .reset_index(drop=True)
)
df_before_sf = (
    df_before.add_suffix("_tmin5")
    .rename(columns={"idp_tmin5": "idp"})
    .reset_index(drop=True)
)

# Merge variables into one dataframe
df_full = pd.merge(
    df_before_sf,
    df_after_sf,
    how="outer",
    validate="one_to_one",
)

Unnamed: 0,idp,mean_of_smi_in_winter_tmin5,mean_of_smi_in_spring_tmin5,mean_of_smi_in_summer_tmin5,mean_of_smi_in_fall_tmin5,std_of_smi_in_winter_tmin5,std_of_smi_in_spring_tmin5,std_of_smi_in_summer_tmin5,std_of_smi_in_fall_tmin5,mean_of_smi_in_winter_Tpls5,mean_of_smi_in_spring_Tpls5,mean_of_smi_in_summer_Tpls5,mean_of_smi_in_fall_Tpls5,std_of_smi_in_winter_Tpls5,std_of_smi_in_spring_Tpls5,std_of_smi_in_summer_Tpls5,std_of_smi_in_fall_Tpls5
524,851796,0.98377,,,0.822120,,,,,0.932608,0.956291,0.805370,0.769162,0.058515,0.016912,0.113475,0.104863
28,514225,0.97995,,,0.229560,,,,0.019078,0.970144,0.950516,0.546369,0.586931,0.010856,0.029105,0.294386,0.258285
91,541470,0.97345,0.959317,0.43983,0.342210,0.003111,0.005954,0.10106,0.020500,0.958480,0.951517,0.760063,0.671077,0.017337,0.014425,0.199160,0.227895
566,873001,0.97286,,,0.856275,,,,0.036480,0.947746,0.890426,0.606511,0.847153,0.029803,0.086315,0.211991,0.182068
608,909470,0.97192,,,0.906305,,,,0.088268,0.914067,0.937490,0.816066,0.755010,0.053969,0.022951,0.147933,0.152435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1129547,,,,,,,,,0.850063,0.693260,0.402910,0.582733,0.038715,0.031396,0.015500,0.164906
996,1130436,,,,,,,,,0.805753,0.792535,0.519080,0.641643,0.026036,0.004759,0.014114,0.095710
997,1130693,,,,,,,,,0.838087,0.872050,0.503430,0.706170,0.026804,0.000948,0.040432,0.055449
998,1130847,,,,,,,,,0.726930,0.668265,0.425955,0.556903,0.010847,0.031360,0.014644,0.119556


In [72]:
# Quick check
display(df_full)
if not df_full["idp"].nunique() == site_coordinates.idp.nunique():
    print("❌❌❌ Not all sites are included in the final dataframe ❌❌❌")

Unnamed: 0,idp,mean_of_smi_in_winter_tmin5,mean_of_smi_in_spring_tmin5,mean_of_smi_in_summer_tmin5,mean_of_smi_in_fall_tmin5,std_of_smi_in_winter_tmin5,std_of_smi_in_spring_tmin5,std_of_smi_in_summer_tmin5,std_of_smi_in_fall_tmin5,mean_of_smi_in_winter_Tpls5,mean_of_smi_in_spring_Tpls5,mean_of_smi_in_summer_Tpls5,mean_of_smi_in_fall_Tpls5,std_of_smi_in_winter_Tpls5,std_of_smi_in_spring_Tpls5,std_of_smi_in_summer_Tpls5,std_of_smi_in_fall_Tpls5
0,500191,,,,,,,,,0.627663,0.504056,0.258111,0.346293,0.058900,0.207822,0.065008,0.123289
1,500324,,,,,,,,,0.838879,0.852053,0.639411,0.646310,0.064247,0.016927,0.165212,0.144650
2,501588,,,,,,,,,0.859907,0.704399,0.487584,0.628767,0.010990,0.141296,0.096918,0.149759
3,501931,,,,,,,,,0.869707,0.884111,0.733277,0.685771,0.031901,0.029089,0.133217,0.171267
4,503275,,,,,,,,,0.923491,0.877476,0.728933,0.793832,0.008896,0.028966,0.067713,0.089087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1129547,,,,,,,,,0.850063,0.693260,0.402910,0.582733,0.038715,0.031396,0.015500,0.164906
996,1130436,,,,,,,,,0.805753,0.792535,0.519080,0.641643,0.026036,0.004759,0.014114,0.095710
997,1130693,,,,,,,,,0.838087,0.872050,0.503430,0.706170,0.026804,0.000948,0.040432,0.055449
998,1130847,,,,,,,,,0.726930,0.668265,0.425955,0.556903,0.010847,0.031360,0.014644,0.119556


❌❌❌ Not all sites are included in the final dataframe ❌❌❌


In [73]:
# Save dataframe
df_full.to_feather(f"edo_{subfolder}_data.feather")
df_full.to_csv(f"edo_{subfolder}_data.csv", index=False)
chime.success()

## Quality Control


- Plot points onto folium map and color them binary, whether the index is above or below 0.5. This should give me a quick overview whether dry sites are shown as such.
- Plot the distribution of the index values. This should give me a quick overview whether the index is distributed as expected.
