In [1]:
import pandas as pd
import sys

sys.path.insert(0, "../../src")
from utils import *
from run_mp import *
from gee_data_wrangling import *

In [2]:
# Debug switch: when True, only the first 10,000 rows of the site table are processed.
TAKE_ONLY_10k_SITES = False

# Datasets to wrangle; each name is handed to run_mp as `subdir`.
my_datasets = ["era5-daily", "modis-vi", "sentinel", "landsat", "modis-lst-terra"]

# Start from an empty frame that only carries the merge keys, so every dataset
# (including the first one) is attached with the same outer merge below.
df_all = pd.DataFrame(columns=["SiteID", "first_year"])

# Load all sites ONCE: the site table does not depend on the dataset, so reading
# and copying it inside the loop just repeated the same disk read five times.
all_sites = pd.read_csv("../01_download_raw_gee_data/sites_years.csv")
sites_subset = all_sites.copy()

if TAKE_ONLY_10k_SITES:
    # NOTE: this keeps the first 10,000 *rows* of the table; since the frames
    # are later grouped by "id", that is not necessarily 10,000 distinct ids.
    sites_subset = all_sites.iloc[:10000, :].copy()

# Loop to get all data and merge it
for my_dataset in my_datasets:
    print(f"\n> Working on {my_dataset}")

    # One frame per "id". Rebuilt for every dataset so each run_mp call
    # receives freshly created inputs, exactly as before.
    grouped = sites_subset.groupby("id", as_index=False)
    df_list = [group for _, group in grouped]

    # run_mp and load_and_wrangle_PARALLEL come from the project modules
    # star-imported at the top of the notebook.
    # NOTE(review): presumably run_mp applies the function to every element of
    # df_list on `num_cores` workers and combines the results with pd.concat —
    # confirm against src/run_mp.
    out = run_mp(
        load_and_wrangle_PARALLEL,
        df_list,
        combine_func=pd.concat,
        progress_bar=True,
        num_cores=10,
        subdir=my_dataset,
        verbose=False,
        return_which="wrangled",
    )

    # Outer merge keeps every (SiteID, first_year) pair seen in any dataset;
    # pairs missing from a dataset get NaN in that dataset's columns.
    df_all = pd.merge(df_all, out, how="outer", on=["SiteID", "first_year"])


> Working on era5-daily


100%|██████████| 38302/38302 [05:31<00:00, 115.57it/s]



> Working on modis-vi


100%|██████████| 38302/38302 [00:52<00:00, 733.27it/s]



> Working on sentinel


100%|██████████| 38302/38302 [01:02<00:00, 615.39it/s] 



> Working on landsat


100%|██████████| 38302/38302 [01:04<00:00, 596.17it/s]



> Working on modis-lst-terra


100%|██████████| 38302/38302 [00:18<00:00, 2048.81it/s]


In [9]:
# Persist the merged predictor table twice under the same base name:
# once as Feather and once as CSV (written without the index column).
df_all.to_feather(path="final_gee_predictor_dataset.feather")
df_all.to_csv(path_or_buf="final_gee_predictor_dataset.csv", index=False)

In [6]:
# Read the Feather export back and display it as the cell output.
pd.read_feather(path="final_gee_predictor_dataset.feather")

Unnamed: 0,SiteID,first_year,max_gdd_before_spring_frost,min_doy_of_fall_frost,hw_counts,hw_dur_max,hw_dur_mean,hw_day_sum,hw_days_between,hw_mean_temp,...,mean_of_landsat_EVI_in_fall,mean_of_landsat_EVI_in_winter,std_of_landsat_NDVI_in_spring,std_of_landsat_NDVI_in_summer,std_of_landsat_NDVI_in_fall,std_of_landsat_NDVI_in_winter,std_of_landsat_EVI_in_spring,std_of_landsat_EVI_in_summer,std_of_landsat_EVI_in_fall,std_of_landsat_EVI_in_winter
0,1,2011,71.0,,0,,,,,,...,0.409092,0.243736,0.042461,0.019333,0.020443,0.044926,0.066791,0.038358,0.034453,0.061198
1,2,2012,85.0,284.0,0,,,,,,...,0.470416,0.261726,0.115492,0.058408,0.089923,0.026370,0.193583,0.121745,0.157663,0.040017
2,3,2012,88.0,302.0,0,,,,,,...,0.456291,0.318117,0.094763,0.078777,0.052318,0.098002,0.155626,0.128103,0.084604,0.139906
3,4,2012,91.0,308.0,0,,,,,,...,0.444941,0.437292,0.064986,0.048748,0.034503,0.061557,0.115424,0.089904,0.054856,0.093915
4,5,2012,58.0,314.0,0,,,,,,...,0.383762,0.370150,0.035876,0.019348,0.042045,0.050371,0.059088,0.031746,0.056875,0.062398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38297,38298,2016,51.0,300.0,0,,,,,,...,0.320209,0.257313,0.066885,0.040937,0.063039,0.042583,0.106770,0.076283,0.092620,0.063518
38298,38299,2016,68.0,304.0,0,,,,,,...,0.371641,0.175064,0.109530,0.043192,0.066637,0.013540,0.180242,0.087778,0.107406,0.021669
38299,38300,2016,51.0,,0,,,,,,...,0.423844,0.418582,0.023636,0.032380,0.071105,0.031718,0.039081,0.051603,0.110863,0.081013
38300,38301,2016,67.0,284.0,0,,,,,,...,0.397496,0.303379,0.069576,0.034570,0.020568,0.087935,0.110179,0.061193,0.039139,0.088243


In [2]:
# Read the CSV export back and display it as the cell output.
pd.read_csv(filepath_or_buffer="final_gee_predictor_dataset.csv")

Unnamed: 0,SiteID,first_year,max_gdd_before_spring_frost,min_doy_of_fall_frost,hw_counts,hw_dur_max,hw_dur_mean,hw_day_sum,hw_days_between,hw_mean_temp,...,mean_of_landsat_EVI_in_fall,mean_of_landsat_EVI_in_winter,std_of_landsat_NDVI_in_spring,std_of_landsat_NDVI_in_summer,std_of_landsat_NDVI_in_fall,std_of_landsat_NDVI_in_winter,std_of_landsat_EVI_in_spring,std_of_landsat_EVI_in_summer,std_of_landsat_EVI_in_fall,std_of_landsat_EVI_in_winter
0,1,2011,71.0,,0,,,,,,...,0.409092,0.243736,0.042461,0.019333,0.020443,0.044926,0.066791,0.038358,0.034453,0.061198
1,2,2012,85.0,284.0,0,,,,,,...,0.470416,0.261726,0.115492,0.058408,0.089923,0.026370,0.193583,0.121745,0.157663,0.040017
2,3,2012,88.0,302.0,0,,,,,,...,0.456291,0.318117,0.094763,0.078777,0.052318,0.098002,0.155626,0.128103,0.084604,0.139906
3,4,2012,91.0,308.0,0,,,,,,...,0.444941,0.437292,0.064986,0.048748,0.034503,0.061557,0.115424,0.089904,0.054856,0.093915
4,5,2012,58.0,314.0,0,,,,,,...,0.383762,0.370150,0.035876,0.019348,0.042045,0.050371,0.059088,0.031746,0.056875,0.062398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38297,38298,2016,51.0,300.0,0,,,,,,...,0.320209,0.257313,0.066885,0.040937,0.063039,0.042583,0.106770,0.076283,0.092620,0.063518
38298,38299,2016,68.0,304.0,0,,,,,,...,0.371641,0.175064,0.109530,0.043192,0.066637,0.013540,0.180242,0.087778,0.107406,0.021669
38299,38300,2016,51.0,,0,,,,,,...,0.423844,0.418582,0.023636,0.032380,0.071105,0.031718,0.039081,0.051603,0.110863,0.081013
38300,38301,2016,67.0,284.0,0,,,,,,...,0.397496,0.303379,0.069576,0.034570,0.020568,0.087935,0.110179,0.061193,0.039139,0.088243


In [3]:
# Reload the merged predictor table from the CSV export, so the cells below
# can run without repeating the wrangling loop.
df_all = pd.read_csv(filepath_or_buffer="final_gee_predictor_dataset.csv")

In [6]:
# Load the processed NFI site table; index_col=0 makes the first CSV column the row index.
df_sites = pd.read_csv(
    filepath_or_buffer="../00_process_nfi_data/nfi_final_sites.csv",
    index_col=0,
)