# Silvae / Agroparistech Data

- Data source (last accessed on 2023-12-08): https://silvae.agroparistech.fr/home/?page_id=2683


## Libraries


In [2]:
# Third-party and standard-library dependencies of this notebook.
from tqdm import tqdm
import rasterio
import pandas as pd
import glob
import numpy as np
import os
import chime

# Select the sound theme chime uses for its notifications (see chime.success() further down).
chime.theme("mario")

# Import Functions
import sys

# Put the project's src folder first on the import path so the two local modules below resolve.
sys.path.insert(0, "../../src")
# NOTE(review): wildcard imports hide where names come from. `run_mp` and
# `parallel_raster_extraction`, used later in this notebook, are presumably defined
# in these two modules -- confirm and consider importing them explicitly.
from run_mp import *
from extract_raster_values import *

## Load Coordinates and Data

- Note that the site coordinates and the .tif files share the same CRS (EPSG:2154), so the coordinates can be used directly to extract values from the .tif files.


In [2]:
# Load the site table (one row per site, with its idp and coordinates) and preview it
nfi_sites_csv = "../00_process_nfi_data/nfi_final_sites_with_idp.csv"
site_coordinates = pd.read_csv(nfi_sites_csv)
site_coordinates.head(3)

Unnamed: 0,first_year,SiteID,x,y,idp,x_fr,y_fr
0,2011,1,-2.842824,48.337505,632691,267415.027897,6820144.0
1,2012,2,3.349757,46.198025,702597,726971.216676,6566524.0
2,2012,3,3.361577,46.827747,706240,727562.508709,6636462.0


In [3]:
# Collect every .tif in the agroparistech folder, sorted alphabetically by path
tif_pattern = "../../data/raw/agroparistech/all_files/*.tif"
files = glob.glob(tif_pattern)
files.sort()
files[:3]

['../../data/raw/agroparistech/all_files/abal_distrib_v2016.tif',
 '../../data/raw/agroparistech/all_files/abal_mortalite_v2018.tif',
 '../../data/raw/agroparistech/all_files/acca_distrib_v2016.tif']

In [4]:
# Variable name of each raster = its file name (text after the last "/") up to the first dot
variables = [path.rsplit("/", 1)[-1].partition(".")[0] for path in files]
variables[:3]

['abal_distrib_v2016', 'abal_mortalite_v2018', 'acca_distrib_v2016']

In [5]:
# Pair every raster path with its variable name in a two-column lookup table
file_variable_columns = {"files": files, "variables": variables}
files_variables = pd.DataFrame(data=file_variable_columns)
files_variables.head(3)

Unnamed: 0,files,variables
0,../../data/raw/agroparistech/all_files/abal_di...,abal_distrib_v2016
1,../../data/raw/agroparistech/all_files/abal_mo...,abal_mortalite_v2018
2,../../data/raw/agroparistech/all_files/acca_di...,acca_distrib_v2016


In [6]:
# Keep only the site id and its coordinates; this frame is what gets passed to the
# raster extraction and serves as the key columns of all extracted tables
coordinate_columns = ["idp", "x_fr", "y_fr"]
df_coords = site_coordinates[coordinate_columns]
display(df_coords.head(3))
df_coords.shape

Unnamed: 0,idp,x_fr,y_fr
0,632691,267415.027897,6820144.0
1,702597,726971.216676,6566524.0
2,706240,727562.508709,6636462.0


(40022, 3)

## Extract raster values


### Split fast and slow files


In [7]:
# Split the raster list into "quick" files and "slow" files.

# TODO: These files are too big and take very long to extract
# So skipping them for now and will extract them separately
skip_these_files = [
    "tmoy_6190_et_v1",
    "tmoy_6190_hi_v1",
    "tmoy_6190_an_v1",
    "tmin_6190_hi_v1",
    "tmax_6190_et_v1",
    "rad_6190_et_v1",
    "rad_6190_an_v1",
    "etp_6190_et_v1",
    "etp_6190_an_v1",
    "bhctu_6190_et_v1",
    "bhctu_6190_an_v1",
    "ru_6190_et_v1",
    "ru_6190_an_v1",
]

# Boolean mask marking the rows whose variable is on the skip list
is_slow_file = files_variables["variables"].isin(skip_these_files)

# Take explicit copies: both subsets get a "group" column added later, and adding a
# column to a plain boolean-mask slice raises pandas' SettingWithCopyWarning.
files_variables_quick = files_variables[~is_slow_file].copy()
files_variables_slow = files_variables[is_slow_file].copy()

print(f"Shape of files_variables_quick: {files_variables_quick.shape}")
print(f"Shape of files_variables_slow: {files_variables_slow.shape}")

Shape of files_variables_quick: (80, 2)
Shape of files_variables_slow: (13, 2)


### Fast Files


In [1]:
# Assign the fast files round-robin to groups 1..10; each group becomes one task for
# the parallel extraction (10 matches the num_cores=10 passed to run_mp).
# `assign` rebinds the name to a new frame instead of writing a column into a slice
# of files_variables, which avoids pandas' SettingWithCopyWarning and keeps this cell
# safe to re-run.
files_variables_quick = files_variables_quick.assign(
    group=np.arange(len(files_variables_quick)) % 10 + 1
)

# Create list of df to be passed to multiprocessing (one frame per group)
grouped = files_variables_quick.groupby("group")
df_list = [group_df for _, group_df in grouped]

NameError: name 'np' is not defined

In [10]:
# Test on one group
# df_test = parallel_raster_extraction(df_list[0], df_coords[:50], progress_bar=True)
# df_test
# df_test.reset_index(drop=True).to_feather("data_agroparistech_before_qc.feather")

In [3]:
# Hand every file group in df_list to parallel_raster_extraction through run_mp,
# then play the chime success sound once the call has returned
quick_run_options = dict(
    progress_bar=True,
    num_cores=10,
    df_coords=df_coords,
    verbose=False,
)
df_quick = run_mp(parallel_raster_extraction, df_list, **quick_run_options)
chime.success()

NameError: name 'df_list' is not defined

In [None]:
# Fold the per-group result frames into one wide table: start from the first frame
# and left-join every following one on the site keys
quick_merge_keys = ["idp", "y_fr", "x_fr"]
df_quick_merged = df_quick[0]
for position in range(1, len(df_quick)):
    df_quick_merged = df_quick_merged.merge(
        df_quick[position], on=quick_merge_keys, how="left"
    )

In [None]:
# Quick Visual Check
# NOTE(review): the stored output contains -9999 and -3.402823e+38 in several
# columns; these look like raster no-data placeholders rather than measurements --
# confirm and handle them in the quality-control step.
df_quick_merged

Unnamed: 0,idp,x_fr,y_fr,abal_distrib_v2016,bepe_mortalite_v2018,fasy_mortalite_v2018,piha_distrib_v2016,prob_mort_epi,quro_distrib_v2016,saca_mortalite_v2018,...,tipl_distrib_v2016,ulgl_distrib_v2016,bepe_distrib_v2016,fasy_if_v2018,piab_mortalite_v2018,prec_6190_et_v1,qupy_mortalite_v2018,saca_distrib_v2016,tm61858610_13,ulmi_distrib_v2016
0,632691,2.674150e+05,6.820144e+06,0.116299,-0.006745,0.003381,1.317830e-10,,0.576204,0.013197,...,0.000827,5.844794e-06,0.697052,2.523884e+01,-9999.000000,149.581284,-9999.0,0.231549,0.805501,0.001437
1,702597,7.269712e+05,6.566524e+06,0.085622,-9999.000000,-9999.000000,2.808699e-08,,0.713204,-9999.000000,...,0.010973,7.097719e-04,0.546788,2.783585e+01,-9999.000000,204.346466,-9999.0,0.184128,1.065304,0.012006
2,706240,7.275625e+05,6.636462e+06,0.051618,-9999.000000,0.003246,1.679225e-06,,0.767968,-9999.000000,...,0.026102,8.237094e-04,0.336591,2.882230e+01,-9999.000000,184.735458,-9999.0,0.167884,1.021843,0.064354
3,708321,5.070276e+05,6.792198e+06,0.040013,-0.006780,0.003302,5.390015e-06,,0.670307,0.021719,...,0.019704,4.144382e-04,0.236383,2.709109e+01,-9999.000000,138.204391,-9999.0,0.096158,0.913146,0.035805
4,708369,9.810095e+05,6.248657e+06,0.000747,-9999.000000,-9999.000000,1.169433e-02,,0.008103,-9999.000000,...,0.001383,5.346998e-08,0.000447,-3.402823e+38,-9999.000000,111.139496,-9999.0,0.004683,0.931766,0.042788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40017,1131396,7.284673e+05,6.743375e+06,0.045600,-0.006766,-9999.000000,2.963565e-06,,0.775838,0.023715,...,0.020294,4.261766e-04,0.312127,2.901571e+01,-9999.000000,173.473053,-9999.0,0.183173,0.945098,0.087864
40018,1131409,6.307829e+05,6.176717e+06,0.473966,-9999.000000,0.004199,2.519738e-13,,0.002722,0.036059,...,0.001820,9.983832e-04,0.151881,-3.402823e+38,-9999.000000,216.782501,-9999.0,0.246417,0.843088,0.000300
40019,1131410,1.030439e+06,6.764769e+06,0.049305,-9999.000000,-9999.000000,5.670998e-06,-3.402823e+38,0.554224,-9999.000000,...,0.029674,1.838178e-03,0.183773,-3.402823e+38,-9999.000000,199.467590,-9999.0,0.109920,0.942872,0.051256
40020,1131424,7.597461e+05,6.425373e+06,0.607617,-9999.000000,0.002793,4.454063e-13,,0.021926,-9999.000000,...,0.002897,2.257393e-03,0.284154,2.129047e+01,0.007716,222.490387,-9999.0,0.252481,0.986910,0.000227


In [None]:
# Persist the merged fast-file extraction twice: feather for re-loading in this
# notebook, CSV (without the index column) as a plain-text copy
fast_feather_path = "data_agroparistech_fastfiles_before_qc.feather"
fast_csv_path = "data_agroparistech_fastfiles_before_qc.csv"
df_quick_merged.to_feather(fast_feather_path)
df_quick_merged.to_csv(fast_csv_path, index=False)

### Slow Files (large rasters)

- Parallelization strategy for the large raster files: split the site locations into 10 groups, then process one .tif file at a time, running the extraction function on the 10 location groups in parallel.


In [14]:
# DISABLED (kept for reference): splits the site coordinates round-robin into 10
# groups, one frame per group, for the per-file strategy described above.
# NOTE: re-enabling this would overwrite `df_list`, which the active cells use for
# the groups of raster files.
# df_coords_groups = df_coords.copy()
# df_coords_groups["group"] = np.arange(len(df_coords_groups)) % 10 + 1
# grouped = df_coords_groups.groupby("group")
# df_list = [group.reset_index(drop=True) for name, group in grouped]

In [15]:
# # Testing one group for one file
# my_file = files_variables_slow.iloc[0]["files"]
# my_var = files_variables_slow.iloc[0]["variables"]
# my_df = df_list[0]

# wrapper_for_large_files(my_df, my_file, my_var, progress_bar=True)

In [16]:
# %%time
# # Test on one group for all files

# df_slow_i = run_mp(
#     wrapper_for_large_files,
#     df_list,
#     # combine_func=pd.concat,
#     num_cores=10,
#     tif_in = files_variables_slow.iloc[0]["files"],
#     var_in = files_variables_slow.iloc[0]["variables"],
#     progress_bar=False
# )
# # Combine the list of dataframes using list comprehension
# df_slow = df_slow_i
# df_slow_merged = df_slow[0]
# for i in range(1, len(df_slow)):
#     df_slow_merged = pd.concat([df_slow_merged, df_slow[i]], axis=0)

# df_slow_merged

In [17]:
# DISABLED (kept for reference): per-file strategy for the slow rasters. For each
# slow file it runs wrapper_for_large_files over the coordinate groups in df_list,
# stacks the group results row-wise, and left-joins the new column onto df_slow_fin.
# The output stored with this cell shows the first file (bhctu_6190_an_v1) with the
# progress bar still at 0/10 after 32:36.
# # Run in parallel for all groups and all files

# df_slow_fin = pd.DataFrame(
#     {"idp": df_coords["idp"], "x_fr": df_coords["x_fr"], "y_fr": df_coords["y_fr"]}
# )

# for f, v in zip(files_variables_slow["files"], files_variables_slow["variables"]):
#     print(f"Extracting {v}")

#     df_slow_i = run_mp(
#         wrapper_for_large_files,
#         df_list,
#         # combine_func=pd.concat,
#         progress_bar=True,
#         num_cores=10,
#         tif_in=f,
#         var_in=v,
#     )

#     df_slow = df_slow_i
#     df_slow_merged = df_slow[0]
#     for i in range(1, len(df_slow)):
#         df_slow_merged = pd.concat([df_slow_merged, df_slow[i]], axis=0).drop(
#             columns=["group"]
#         )

#     # display(df_slow_merged)

#     df_slow_fin = pd.merge(
#         df_slow_fin, df_slow_merged, on=["idp", "y_fr", "x_fr"], how="left"
#     )

# chime.success()

Extracting bhctu_6190_an_v1


  0%|          | 0/10 [32:36<?, ?it/s]


In [None]:
# # Quick Visual Check
# df_slow_fin

In [None]:
# # Save it
# df_slow_fin.to_feather("data_agroparistech_slowfiles_before_qc.feather")
# df_slow_fin.to_csv("data_agroparistech_slowfiles_before_qc.csv", index=False)

---


In [8]:
# Assign the slow files round-robin to groups 1..10; each group becomes one task for
# the parallel extraction. With more than 10 slow files some groups hold two rasters
# (the recorded run had 13), so not every slow file gets a core of its own.
# `assign` rebinds the name to a new frame instead of writing a column into a slice
# of files_variables, which avoids the SettingWithCopyWarning this cell used to emit
# and keeps the cell safe to re-run.
files_variables_slow = files_variables_slow.assign(
    group=np.arange(len(files_variables_slow)) % 10 + 1
)

# Create list of df to be passed to multiprocessing (one frame per group)
grouped = files_variables_slow.groupby("group")
df_list = [group_df for _, group_df in grouped]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  files_variables_slow["group"] = np.arange(len(files_variables_slow)) % 10 + 1


In [9]:
# Hand every slow-file group in df_list to parallel_raster_extraction through run_mp
# (the stored progress bar shows this call running for more than six hours)
slow_run_options = dict(
    progress_bar=True,
    num_cores=10,
    df_coords=df_coords,
    verbose=False,
)
df_slow = run_mp(parallel_raster_extraction, df_list, **slow_run_options)
# chime.success()

  0%|          | 0/10 [6:21:03<?, ?it/s]


In [None]:
# Fold the per-group result frames into one wide table: start from the first frame
# and left-join every following one on the site keys
slow_merge_keys = ["idp", "y_fr", "x_fr"]
df_slow_merged = df_slow[0]
for position in range(1, len(df_slow)):
    df_slow_merged = df_slow_merged.merge(
        df_slow[position], on=slow_merge_keys, how="left"
    )

In [None]:
# Quick Visual Check
# NOTE(review): the output stored with this cell lists fast-file columns (e.g.
# abal_distrib_v2016) rather than the slow-file variables, so it appears to be stale
# -- re-run this cell before relying on it.
df_slow_merged

Unnamed: 0,idp,x_fr,y_fr,abal_distrib_v2016,bepe_mortalite_v2018,fasy_mortalite_v2018,piha_distrib_v2016,prob_mort_epi,quro_distrib_v2016,saca_mortalite_v2018,...,tipl_distrib_v2016,ulgl_distrib_v2016,bepe_distrib_v2016,fasy_if_v2018,piab_mortalite_v2018,prec_6190_et_v1,qupy_mortalite_v2018,saca_distrib_v2016,tm61858610_13,ulmi_distrib_v2016
0,632691,2.674150e+05,6.820144e+06,0.116299,-0.006745,0.003381,1.317830e-10,,0.576204,0.013197,...,0.000827,5.844794e-06,0.697052,2.523884e+01,-9999.000000,149.581284,-9999.0,0.231549,0.805501,0.001437
1,702597,7.269712e+05,6.566524e+06,0.085622,-9999.000000,-9999.000000,2.808699e-08,,0.713204,-9999.000000,...,0.010973,7.097719e-04,0.546788,2.783585e+01,-9999.000000,204.346466,-9999.0,0.184128,1.065304,0.012006
2,706240,7.275625e+05,6.636462e+06,0.051618,-9999.000000,0.003246,1.679225e-06,,0.767968,-9999.000000,...,0.026102,8.237094e-04,0.336591,2.882230e+01,-9999.000000,184.735458,-9999.0,0.167884,1.021843,0.064354
3,708321,5.070276e+05,6.792198e+06,0.040013,-0.006780,0.003302,5.390015e-06,,0.670307,0.021719,...,0.019704,4.144382e-04,0.236383,2.709109e+01,-9999.000000,138.204391,-9999.0,0.096158,0.913146,0.035805
4,708369,9.810095e+05,6.248657e+06,0.000747,-9999.000000,-9999.000000,1.169433e-02,,0.008103,-9999.000000,...,0.001383,5.346998e-08,0.000447,-3.402823e+38,-9999.000000,111.139496,-9999.0,0.004683,0.931766,0.042788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40017,1131396,7.284673e+05,6.743375e+06,0.045600,-0.006766,-9999.000000,2.963565e-06,,0.775838,0.023715,...,0.020294,4.261766e-04,0.312127,2.901571e+01,-9999.000000,173.473053,-9999.0,0.183173,0.945098,0.087864
40018,1131409,6.307829e+05,6.176717e+06,0.473966,-9999.000000,0.004199,2.519738e-13,,0.002722,0.036059,...,0.001820,9.983832e-04,0.151881,-3.402823e+38,-9999.000000,216.782501,-9999.0,0.246417,0.843088,0.000300
40019,1131410,1.030439e+06,6.764769e+06,0.049305,-9999.000000,-9999.000000,5.670998e-06,-3.402823e+38,0.554224,-9999.000000,...,0.029674,1.838178e-03,0.183773,-3.402823e+38,-9999.000000,199.467590,-9999.0,0.109920,0.942872,0.051256
40020,1131424,7.597461e+05,6.425373e+06,0.607617,-9999.000000,0.002793,4.454063e-13,,0.021926,-9999.000000,...,0.002897,2.257393e-03,0.284154,2.129047e+01,0.007716,222.490387,-9999.0,0.252481,0.986910,0.000227


In [None]:
# Persist the merged slow-file extraction twice: feather for re-loading in this
# notebook, CSV (without the index column) as a plain-text copy
slow_feather_path = "data_agroparistech_slowfiles_before_qc.feather"
slow_csv_path = "data_agroparistech_slowfiles_before_qc.csv"
df_slow_merged.to_feather(slow_feather_path)
df_slow_merged.to_csv(slow_csv_path, index=False)

## Quality Control For Outliers / NA Values


In [None]:
# Reload both intermediate extractions from disk and join them on the site keys.
# how="right" keeps every row of the fast-file table; the slow-file columns come
# first in the result.
slow_feather = "data_agroparistech_slowfiles_before_qc.feather"
fast_feather = "data_agroparistech_fastfiles_before_qc.feather"
df_slow_files = pd.read_feather(slow_feather)
df_fast_files = pd.read_feather(fast_feather)

site_keys = ["idp", "y_fr", "x_fr"]
df_agroparistech = df_slow_files.merge(df_fast_files, on=site_keys, how="right")

df_agroparistech.head()

Unnamed: 0,idp,x_fr,y_fr,bhctu_6190_an_v1,bhctu_6190_et_v1,etp_6190_an_v1,etp_6190_et_v1,rad_6190_an_v1,abal_distrib_v2016,bepe_mortalite_v2018,...,tipl_distrib_v2016,ulgl_distrib_v2016,bepe_distrib_v2016,fasy_if_v2018,piab_mortalite_v2018,prec_6190_et_v1,qupy_mortalite_v2018,saca_distrib_v2016,tm61858610_13,ulmi_distrib_v2016
0,632691,267415.027897,6820144.0,19.240328,-56.831585,54.696297,105.875206,34664.03125,0.116299,-0.006745,...,0.000827,5.844794e-06,0.697052,25.23884,-9999.0,149.581284,-9999.0,0.231549,0.805501,0.001437
1,702597,726971.216676,6566524.0,2.66782,-53.173649,59.746429,121.191681,36479.175781,0.085622,-9999.0,...,0.010973,0.0007097719,0.546788,27.83585,-9999.0,204.346466,-9999.0,0.184128,1.065304,0.012006
2,706240,727562.508709,6636462.0,5.251337,-58.659328,59.844738,120.26944,35390.4375,0.051618,-9999.0,...,0.026102,0.0008237094,0.336591,28.8223,-9999.0,184.735458,-9999.0,0.167884,1.021843,0.064354
3,708321,507027.573495,6792198.0,-0.896232,-65.469238,55.652546,111.108696,32837.777344,0.040013,-0.00678,...,0.019704,0.0004144382,0.236383,27.09109,-9999.0,138.204391,-9999.0,0.096158,0.913146,0.035805
4,708369,981009.515199,6248657.0,-5.091545,-106.926773,77.560204,142.603653,45841.847656,0.000747,-9999.0,...,0.001383,5.346998e-08,0.000447,-3.4028230000000003e+38,-9999.0,111.139496,-9999.0,0.004683,0.931766,0.042788
