# Merge all EDO data into one file


In [14]:
import pandas as pd
import numpy as np
import os
import glob

In [15]:
# List all edo data files
files = glob.glob("data_edo_*_aggregated.feather")
# Remove those that are subsets
files = [f for f in files if "subset" not in f]
files_ = [f.replace("data_edo_", "").replace("_aggregated.feather", "") for f in files]
files_ = sorted(files_)

print("Available aggregated datasets ready for final merge:")

for i, f in enumerate(files_):
    print(f" {i+1}.\t{f}")

Available aggregated datasets ready for final merge:
 1.	fapar
 2.	fapar_anom
 3.	heatw
 4.	maxtmp
 5.	maxtmp_anpm
 6.	mintmp
 7.	smi
 8.	smi_anom


In [16]:
# Load every aggregated dataset and join them column-wise on the shared "idp" key.
df = pd.DataFrame()  # remains an empty frame if no files were found

# enumerate keeps the log numbering in step with the loop; the previous manual
# counter was only incremented for the first file, so every later file was
# reported as "1.".
for counter, f in enumerate(files):
    df_i = pd.read_feather(f)
    print(f"{counter:<1}. Adding df of shape {df_i.shape}\t from file: {f}")
    # The first frame seeds the result; each later frame is merged onto it.
    # merge() defaults to an inner join, so only idp values present in every
    # file are kept.
    df = df_i if counter == 0 else df.merge(df_i, on="idp")

print(f"\n\nFinal shape: {df.shape} and the columns are:")
for c in df.columns:
    print(f" - {c}")

0. Adding df of shape (40022, 41)	 from file: data_edo_smi_aggregated.feather
1. Adding df of shape (40022, 41)	 from file: data_edo_maxtmp_anpm_aggregated.feather
1. Adding df of shape (40022, 41)	 from file: data_edo_maxtmp_aggregated.feather
1. Adding df of shape (40022, 41)	 from file: data_edo_fapar_anom_aggregated.feather
1. Adding df of shape (40022, 41)	 from file: data_edo_smi_anom_aggregated.feather
1. Adding df of shape (40022, 41)	 from file: data_edo_fapar_aggregated.feather
1. Adding df of shape (40022, 37)	 from file: data_edo_heatw_aggregated.feather
1. Adding df of shape (40022, 41)	 from file: data_edo_mintmp_aggregated.feather


Final shape: (40022, 317) and the columns are:
 - idp
 - mean_of_smi_in_winter_tmin5
 - mean_of_smi_in_spring_tmin5
 - mean_of_smi_in_summer_tmin5
 - mean_of_smi_in_fall_tmin5
 - std_of_smi_in_winter_tmin5
 - std_of_smi_in_spring_tmin5
 - std_of_smi_in_summer_tmin5
 - std_of_smi_in_fall_tmin5
 - range_of_smi_in_winter_tmin5
 - range_of_smi_in

In [17]:
# Quick check: show the merged DataFrame as the cell output to eyeball the
# result of the join (pandas truncates the rendered rows/columns on its own).
df

Unnamed: 0,idp,mean_of_smi_in_winter_tmin5,mean_of_smi_in_spring_tmin5,mean_of_smi_in_summer_tmin5,mean_of_smi_in_fall_tmin5,std_of_smi_in_winter_tmin5,std_of_smi_in_spring_tmin5,std_of_smi_in_summer_tmin5,std_of_smi_in_fall_tmin5,range_of_smi_in_winter_tmin5,...,range_of_mintmp_in_winter_tpls5,range_of_mintmp_in_spring_tpls5,median_of_mintmp_in_summer_tpls5,median_of_mintmp_in_fall_tpls5,median_of_mintmp_in_winter_tpls5,median_of_mintmp_in_spring_tpls5,iqr_of_mintmp_in_summer_tpls5,iqr_of_mintmp_in_fall_tpls5,iqr_of_mintmp_in_winter_tpls5,iqr_of_mintmp_in_spring_tpls5
0,500002,0.889683,0.893675,0.742380,0.748853,0.036233,0.035247,0.096277,0.131406,0.16480,...,25.0,24.0,13.3025,6.3025,-0.6975,5.3025,4.000000,7.0,5.0,7.0
1,500008,0.847330,0.836865,0.691111,0.721948,0.023519,0.028433,0.093574,0.102557,0.09896,...,25.0,25.0,12.6925,7.6925,0.6925,5.6925,5.000000,7.0,6.0,6.0
2,500012,0.816472,0.740892,0.475539,0.593273,0.041666,0.108548,0.103070,0.126953,0.18647,...,25.0,25.0,13.4140,8.4140,1.4140,6.4140,4.000000,7.0,5.0,6.0
3,500013,0.858204,0.853182,0.605594,0.585933,0.041863,0.054748,0.135282,0.156822,0.16604,...,25.0,25.0,13.2850,9.2850,3.2850,6.2850,4.000000,6.0,6.0,5.0
4,500042,0.830601,0.871834,0.637613,0.580602,0.092289,0.040880,0.116718,0.148331,0.34380,...,23.0,25.0,13.2590,9.2590,2.2590,6.2590,4.000000,7.0,6.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40017,1131409,0.805106,0.854273,0.673793,0.568677,0.095189,0.054391,0.165903,0.177145,0.38139,...,21.0,19.0,8.2490,2.2490,-3.7510,0.2490,4.000000,6.0,4.0,5.0
40018,1131410,0.927724,0.835862,0.553618,0.784724,0.044619,0.109515,0.220300,0.164880,0.20780,...,25.0,26.0,15.5505,7.5505,0.5505,6.5505,3.000001,6.0,6.0,7.0
40019,1131419,0.909482,0.711762,0.407145,0.651205,0.036252,0.174656,0.113768,0.211517,0.13127,...,16.0,14.0,14.0650,10.0650,5.0650,8.0650,3.000001,3.5,6.0,3.0
40020,1131424,0.740195,0.762485,0.525753,0.522868,0.068635,0.055347,0.112243,0.109789,0.22566,...,22.0,20.0,11.3765,4.3765,-2.6235,2.3765,3.000000,6.0,5.0,6.0


In [18]:
# Write the merged predictor table to a single feather file.
# NOTE(review): `here` is not imported in any visible cell — presumably it is
# pyprojroot's `here` (project-root-relative path); confirm and add the import
# to the import cell so the notebook survives Restart & Run All.
edo_output_path = here("data/final/predictor_datasets/edo.feather")
df.to_feather(edo_output_path)