# Silvae / Agroparistech Data

- Accessed latest on 2023-12-08: https://silvae.agroparistech.fr/home/?page_id=2683


## Libraries


In [65]:
# Magic
%load_ext autoreload
%autoreload 2

from tqdm import tqdm
import rasterio
from rasterio import windows

import pandas as pd
import glob
import numpy as np
import os
import chime
import matplotlib.pyplot as plt

chime.theme("mario")

# Import Functions
import sys

sys.path.insert(0, "../../src")
from run_mp import *
from extract_raster_values import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [66]:
# Clean where NA value is present, based on files' NA value
def clean_na_in_agroparistech(df_mp, df_vars):
    for col in df_mp.drop(columns=["idp", "x_fr", "y_fr"]).columns:
        # print(col)

        current_na = df_vars.query("variables == @col")["na_value"].iloc[0]
        # print("For column: ", df_mp.columns[i], "NA value is: ", current_na)

        for j in range(df_mp.shape[0]):
            if df_mp.at[j, col] == current_na:
                df_mp.at[j, col] = np.nan
    return df_mp

In [67]:
def get_file_size(filename):
    """Get file size in megabytes."""
    return os.path.getsize(filename) / (1024 * 1024)


def calculate_splits(file_size, max_mb):
    """Calculate the number of splits based on file size and maximum MB per split."""
    return max(1, int(file_size / max_mb))


def get_splits(height, num_splits):
    """Calculate window positions for splitting the raster with overlap."""
    split_height = height // num_splits
    overlap = int(split_height * 0.50)  # 10% overlap

    for i in range(num_splits):
        start = max(0, i * split_height - overlap)
        end = (
            min(height, (i + 1) * split_height + overlap)
            if i < num_splits - 1
            else height
        )
        yield (start, end - start)


def split_raster(input_raster, subfolder, max_mb):
    """Split a raster into subfiles based on maximum MB per file."""

    # Check file size and calculate number of splits
    file_size = get_file_size(input_raster)
    num_splits = calculate_splits(file_size, max_mb)

    # Create subfolder if it doesn't exist
    ffolder = f"{subfolder}/subrasters"
    if not os.path.exists(ffolder):
        os.makedirs(ffolder)

    # Get filename in short
    my_filename = input_raster.split("/")[-1].split(".")[0]

    print(f"Splitting {my_filename} into {num_splits} subfiles:")

    with rasterio.open(input_raster) as dataset:
        width = dataset.width
        height = dataset.height

        # Get list of subrasters
        f_subrasters = []

        for i, (start, split_height) in enumerate(get_splits(height, num_splits)):
            window = windows.Window(0, start, width, split_height)
            transform = dataset.window_transform(window)

            profile = dataset.profile
            profile.update(
                {"height": window.height, "width": window.width, "transform": transform}
            )

            # Get filename
            f_subraster = f"{ffolder}/{my_filename}_subraster_{i}.tif"

            # Write raster
            with rasterio.open(
                f_subraster,
                "w",
                **profile,
            ) as dst:
                dst.write(dataset.read(window=window))

            # Attach filename to list
            f_subrasters.append(f_subraster)

    return f_subrasters

# 👉 User Input


In [68]:
# For subsetting
take_subset = False
subset_size = 1000

# For file selection
run_fast_files = False
run_slow_files = True

# For raster checks:
skip_raster_checks = True  # Directly load the raster info from the csv files
make_figures = False  # Takes 4 mins with plotting and 1.30 mins without

## Load Coordinates and Data

- Note that both, the coordinates and the .tif files are in the same CRS (EPSG:2154), so we can use the coordinates directly for extracting the data from the .tif files.


In [69]:
# Get site coordinates
site_coordinates = pd.read_csv("../00_process_nfi_data/nfi_final_sites_with_idp.csv")

# Take subset if needed
if take_subset:
    site_coordinates = site_coordinates.sample(subset_size, random_state=1)

# Display site coordinates
site_coordinates[:3]

Unnamed: 0,first_year,SiteID,x,y,idp,x_fr,y_fr
34234,2015,34235,-1.716403,48.444659,1006445,351411.774387,6826456.0
11014,2013,11015,3.796458,46.427107,836253,761160.804953,6592213.0
15467,2010,15468,0.894015,44.987642,500357,534032.770425,6434260.0


In [70]:
# List all tiff files in agroparistech folder
files = sorted(glob.glob("../../data/raw/agroparistech/all_files/*.tif"))
files[:3]

['../../data/raw/agroparistech/all_files/abal_distrib_v2016.tif',
 '../../data/raw/agroparistech/all_files/abal_mortalite_v2018.tif',
 '../../data/raw/agroparistech/all_files/acca_distrib_v2016.tif']

In [71]:
# Extract variables names from files
variables = [file.split("/")[-1].split(".")[0] for file in files]
variables[:3]

['abal_distrib_v2016', 'abal_mortalite_v2018', 'acca_distrib_v2016']

In [72]:
# Merge files and variables
files_variables = pd.DataFrame({"files": files, "variables": variables})
files_variables[:3]

Unnamed: 0,files,variables
0,../../data/raw/agroparistech/all_files/abal_di...,abal_distrib_v2016
1,../../data/raw/agroparistech/all_files/abal_mo...,abal_mortalite_v2018
2,../../data/raw/agroparistech/all_files/acca_di...,acca_distrib_v2016


In [73]:
# Define final df that will hold all data
df_coords = site_coordinates[["idp", "x_fr", "y_fr"]]
display(df_coords[:3])
df_coords.shape

Unnamed: 0,idp,x_fr,y_fr
34234,1006445,351411.774387,6826456.0
11014,836253,761160.804953,6592213.0
15467,500357,534032.770425,6434260.0


(1000, 3)

In [None]:
# import geopandas as gpd
# import pandas as pd
# from shapely.geometry import Point

# # Create a GeoDataFrame from the df_coords
# gdf = gpd.GeoDataFrame(
#     df_coords,
#     geometry=[Point(xy) for xy in zip(df_coords.x_fr, df_coords.y_fr)],
#     crs="EPSG:2154",  # Set the coordinate reference system to RGF93 / Lambert-93 projection
# )

# # Export to Shapefile
# if take_subset:
#     gdf.to_file("agroparistech_qc_files/shapefile_for_subset.shp")
# else:
#     gdf.to_file("agroparistech_qc_files/shapefile.shp")

## Attach Raster Information


In [74]:
# Loop through all files, open the raster, save an image of it, and attach the crs to the df

if not skip_raster_checks:
    for i in tqdm(range(files_variables.shape[0]), disable=False):
        # print(f" {i}. Working on {files_variables['variables'].iloc[i]}...")

        # Open raster
        raster = rasterio.open(files_variables["files"].iloc[i])

        # Load Image
        image = raster.read(1)

        # Turn into float to avoid error when putting values to NA
        image = image.astype(float)  # Convert to float

        # Get min value and CRS
        my_crs = raster.crs.to_string()
        my_na = np.min(image)

        # Attach CRS and min value to df
        files_variables = files_variables.copy()
        files_variables.at[i, "crs"] = my_crs
        files_variables.at[i, "na_value"] = my_na

        if make_figures:
            # Plot
            # Set minimum value to na
            image[image == np.min(image)] = np.nan
            # Add caption with the CRS and min value
            plt.figure(figsize=(8, 8))
            plt.imshow(image)
            plt.colorbar(fraction=0.046, pad=0.04)
            plt.title(f"{files_variables['variables'].iloc[i]}")
            plt.text(
                0,
                1300,
                f"Min Value: {my_na}\nCRS: {my_crs[:50]}...",
            )
            plt.savefig(
                f"agroparistech_qc_files/raster_{files_variables['variables'].iloc[i]}.png"
            )
            # plt.show()
            plt.close()

    # Check df
    display(files_variables.head(3))

    # Replace CRS with EPSG code
    for i in tqdm(range(files_variables.shape[0]), disable=False):
        if "RGF_1993_Lambert_Conformal" in files_variables["crs"].iloc[i]:
            files_variables.at[i, "crs"] = "EPSG:2154"

    display(files_variables.value_counts("crs"))

    # Save file
    files_variables.to_csv("agroparistech_qc_files/files_variables.csv", index=False)
    display(files_variables.head(3))
else:
    files_variables = pd.read_csv("agroparistech_qc_files/files_variables.csv")
    display(files_variables.head(3))

Unnamed: 0,files,variables,crs,na_value
0,../../data/raw/agroparistech/all_files/abal_di...,abal_distrib_v2016,EPSG:2154,-3.4028230000000003e+38
1,../../data/raw/agroparistech/all_files/abal_mo...,abal_mortalite_v2018,EPSG:2154,-9999.0
2,../../data/raw/agroparistech/all_files/acca_di...,acca_distrib_v2016,EPSG:2154,-3.4028230000000003e+38


### Split fast and slow files


In [75]:
# These files are too big and take very long to extract
# So skipping them for now and will extract them separately
skip_these_files = [
    "tmoy_6190_et_v1",
    "tmoy_6190_hi_v1",
    "tmoy_6190_an_v1",
    "tmin_6190_hi_v1",
    "tmax_6190_et_v1",
    "rad_6190_et_v1",
    "rad_6190_an_v1",
    "etp_6190_et_v1",
    "etp_6190_an_v1",
    "bhctu_6190_et_v1",
    "bhctu_6190_an_v1",
    "ru_6190_et_v1",
    "ru_6190_an_v1",
]

files_variables_quick = files_variables[
    ~files_variables["variables"].isin(skip_these_files)
].reset_index(drop=True)

files_variables_slow = files_variables[
    files_variables["variables"].isin(skip_these_files)
].reset_index(drop=True)

print(f"Shape of files_variables_quick: {files_variables_quick.shape}")
for i in range(files_variables_quick.shape[0]):
    print(
        f" {i}. {files_variables_quick['variables'].iloc[i]:<20}\t{files_variables_quick['files'].iloc[i]}"
    )

display(f"Shape of files_variables_slow: {files_variables_slow.shape}")
for i in range(files_variables_slow.shape[0]):
    print(
        f" {i}. {files_variables_slow['variables'].iloc[i]:<20}\t{files_variables_slow['files'].iloc[i]}"
    )

Shape of files_variables_quick: (80, 4)
 0. abal_distrib_v2016  	../../data/raw/agroparistech/all_files/abal_distrib_v2016.tif
 1. abal_mortalite_v2018	../../data/raw/agroparistech/all_files/abal_mortalite_v2018.tif
 2. acca_distrib_v2016  	../../data/raw/agroparistech/all_files/acca_distrib_v2016.tif
 3. acca_mortalite_v2018	../../data/raw/agroparistech/all_files/acca_mortalite_v2018.tif
 4. acmo_distrib_v2016  	../../data/raw/agroparistech/all_files/acmo_distrib_v2016.tif
 5. acop_distrib_v2016  	../../data/raw/agroparistech/all_files/acop_distrib_v2016.tif
 6. acpl_distrib_v2016  	../../data/raw/agroparistech/all_files/acpl_distrib_v2016.tif
 7. acps_distrib_v2016  	../../data/raw/agroparistech/all_files/acps_distrib_v2016.tif
 8. algl_distrib_v2016  	../../data/raw/agroparistech/all_files/algl_distrib_v2016.tif
 9. bepe_distrib_v2016  	../../data/raw/agroparistech/all_files/bepe_distrib_v2016.tif
 10. bepe_mortalite_v2018	../../data/raw/agroparistech/all_files/bepe_mortalite_v2018.

'Shape of files_variables_slow: (13, 4)'

 0. bhctu_6190_an_v1    	../../data/raw/agroparistech/all_files/bhctu_6190_an_v1.tif
 1. bhctu_6190_et_v1    	../../data/raw/agroparistech/all_files/bhctu_6190_et_v1.tif
 2. etp_6190_an_v1      	../../data/raw/agroparistech/all_files/etp_6190_an_v1.tif
 3. etp_6190_et_v1      	../../data/raw/agroparistech/all_files/etp_6190_et_v1.tif
 4. rad_6190_an_v1      	../../data/raw/agroparistech/all_files/rad_6190_an_v1.tif
 5. rad_6190_et_v1      	../../data/raw/agroparistech/all_files/rad_6190_et_v1.tif
 6. ru_6190_an_v1       	../../data/raw/agroparistech/all_files/ru_6190_an_v1.tif
 7. ru_6190_et_v1       	../../data/raw/agroparistech/all_files/ru_6190_et_v1.tif
 8. tmax_6190_et_v1     	../../data/raw/agroparistech/all_files/tmax_6190_et_v1.tif
 9. tmin_6190_hi_v1     	../../data/raw/agroparistech/all_files/tmin_6190_hi_v1.tif
 10. tmoy_6190_an_v1     	../../data/raw/agroparistech/all_files/tmoy_6190_an_v1.tif
 11. tmoy_6190_et_v1     	../../data/raw/agroparistech/all_files/tmoy_6190_et_v1.

### Fast Files


In [76]:
if run_fast_files:
    # Add groupings, make sure that slow variables are split to own cores
    files_variables_quick = files_variables_quick.copy()
    files_variables_quick["group"] = np.arange(len(files_variables_quick)) % 10 + 1

    # Create list of df to be passed to multiprocessing
    grouped = files_variables_quick.groupby("group")
    df_list = [group for name, group in grouped]

    # Display one group
    df_list[0].head(3)

In [77]:
if run_fast_files:
    pass
    # Test on one group
    # df_test = parallel_agroparistech_extraction(df_list[0], df_coords, progress_bar=False)
    # df_test = df_test.copy()

    # # Print shape
    # df_test.shape

    # # Replace NA values
    # df_test = clean_na_in_agroparistech(df_test, files_variables_quick)

    # # Final display
    # display(df_test.head(3))
    # display(df_test.tail(3))

In [78]:
if run_fast_files:
    # Run in parallel
    df_quick = run_mp(
        parallel_agroparistech_extraction,
        df_list,
        progress_bar=True,
        num_cores=10,
        df_coords=df_coords,
        verbose=False,
    )

    # Combine the list of dataframes using list comprehension
    df_quick_merged = df_quick[0]
    for i in range(1, len(df_quick)):
        df_quick_merged = pd.merge(
            df_quick_merged, df_quick[i], on=["idp", "y_fr", "x_fr"], how="left"
        )

    # Replace faulty NA values
    df_quick_merged = clean_na_in_agroparistech(df_quick_merged, files_variables_quick)

    # Display final df
    display(df_quick_merged)

    # Notify me when done
    chime.success()

In [79]:
if run_fast_files:
    # Cleaning df
    df_quick_merged = df_quick_merged.drop(
        columns=["x_fr", "y_fr"]
    )  # remove coordinates
    df_quick_merged = df_quick_merged.reindex(
        sorted(df_quick_merged.columns), axis=1
    )  # sort alphabetically
    df_quick_merged.insert(
        0, "idp", df_quick_merged.pop("idp")
    )  # move idp to first again

    print(f"Final columns in df_quick_merged:")
    for i in range(df_quick_merged.shape[1]):
        print(f" {i}. {df_quick_merged.columns[i]}")

In [80]:
if run_fast_files:
    # Save it
    if take_subset:
        df_quick_merged.to_feather(
            "data_agroparistech_fastfiles_before_qc_subset.feather"
        )
    else:
        df_quick_merged.to_feather("data_agroparistech_fastfiles_before_qc.feather")

❌ TODO: Subset needs filtering of NA values, probably setting just to zero but not really sure...


### Slow Files (large rasters)

- The parallelization for the large raster files is to split the locations into 10 groups and then run the function on one tif file in parallel.


In [81]:
for slow_file in files_variables_slow["files"]:
    slow_var = slow_file.split("/")[-1].split(".")[0]
    print(slow_file, slow_var)

../../data/raw/agroparistech/all_files/bhctu_6190_an_v1.tif bhctu_6190_an_v1
../../data/raw/agroparistech/all_files/bhctu_6190_et_v1.tif bhctu_6190_et_v1
../../data/raw/agroparistech/all_files/etp_6190_an_v1.tif etp_6190_an_v1
../../data/raw/agroparistech/all_files/etp_6190_et_v1.tif etp_6190_et_v1
../../data/raw/agroparistech/all_files/rad_6190_an_v1.tif rad_6190_an_v1
../../data/raw/agroparistech/all_files/rad_6190_et_v1.tif rad_6190_et_v1
../../data/raw/agroparistech/all_files/ru_6190_an_v1.tif ru_6190_an_v1
../../data/raw/agroparistech/all_files/ru_6190_et_v1.tif ru_6190_et_v1
../../data/raw/agroparistech/all_files/tmax_6190_et_v1.tif tmax_6190_et_v1
../../data/raw/agroparistech/all_files/tmin_6190_hi_v1.tif tmin_6190_hi_v1
../../data/raw/agroparistech/all_files/tmoy_6190_an_v1.tif tmoy_6190_an_v1
../../data/raw/agroparistech/all_files/tmoy_6190_et_v1.tif tmoy_6190_et_v1
../../data/raw/agroparistech/all_files/tmoy_6190_hi_v1.tif tmoy_6190_hi_v1


In [82]:
i = 0
slow_file = files_variables_slow["files"].iloc[i]
slow_var = files_variables_slow["variables"].iloc[i]
slow_file

'../../data/raw/agroparistech/all_files/bhctu_6190_an_v1.tif'

In [83]:
# Split the raster into smaller files
max_mb = 5

# Do
input_raster = slow_file
current_subraster_files = split_raster(
    input_raster, "agroparistech_qc_files", max_mb=max_mb
)
df_subraster = pd.DataFrame({"files": current_subraster_files, "variables": slow_var})
df_subraster

Splitting bhctu_6190_an_v1 into 183 subfiles:


Unnamed: 0,files,variables
0,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1
1,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1
2,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1
3,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1
4,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1
...,...,...
178,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1
179,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1
180,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1
181,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1


In [84]:
df_subraster["crs"] = files_variables.query("variables == @slow_var")["crs"].iloc[0]
df_subraster["na_value"] = files_variables.query("variables == @slow_var")[
    "na_value"
].iloc[0]

In [85]:
# Add groupings, make sure that slow variables are split to own cores
df_subraster = df_subraster.copy()
df_subraster["group"] = np.arange(len(df_subraster)) % 10 + 1

display(df_subraster)

# Create list of df to be passed to multiprocessing
grouped = df_subraster.groupby("group")
df_list = [group for name, group in grouped]

# Display one group
display(df_list[0].head(3))
df_list[0].shape

Unnamed: 0,files,variables,crs,na_value,group
0,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,1
1,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,2
2,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,3
3,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,4
4,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,5
...,...,...,...,...,...
178,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,9
179,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,10
180,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,1
181,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.402823e+38,2


Unnamed: 0,files,variables,crs,na_value,group
0,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.4028230000000003e+38,1
10,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.4028230000000003e+38,1
20,agroparistech_qc_files/subrasters/bhctu_6190_a...,tmoy_6190_hi_v1,EPSG:2154,-3.4028230000000003e+38,1


(19, 5)

In [87]:
# Test on one group
df_test = parallel_agroparistech_extraction(
    df_list[9], df_coords, progress_bar=False, concat_by_axis=0
)
df_test = df_test.copy()

# Print shape
df_test.shape
df_test


Group 10 	 | 1/18 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_9.tif	
Group 10 	 | 2/18 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_19.tif	
Group 10 	 | 3/18 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_29.tif	
Group 10 	 | 4/18 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_39.tif	
Group 10 	 | 5/18 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_49.tif	
Group 10 	 | 6/18 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_59.tif	
Group 10 	 | 7/18 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_69.tif	
Group 10 	 | 8/18 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_79.tif	
Group 10 	 | 9/1

Unnamed: 0,tmoy_6190_hi_v1,y_fr,x_fr,idp
0,-4.924745,6708830.0,558298.454808,1029396.0
1,70.367752,6591881.0,917030.55917,1028340.0
2,-11.155251,6297802.0,841579.530018,536407.0


In [90]:
df_long = pd.DataFrame()
counter = 0
for xxx in df_list:
    # Test on one group
    df_test = parallel_agroparistech_extraction(
        xxx, df_coords, progress_bar=False, concat_by_axis=0
    )

    df_test = df_test.copy()
    df_long = pd.concat([df_long, df_test], axis=0)

    counter = counter + df_test.shape[0]
display(counter)
display(df_long.shape)
display(df_long.drop_duplicates()z.shape)


Group 1 	 | 1/19 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_0.tif	
Group 1 	 | 2/19 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_10.tif	
Group 1 	 | 3/19 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_20.tif	
Group 1 	 | 4/19 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_30.tif	
Group 1 	 | 5/19 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_40.tif	
Group 1 	 | 6/19 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_50.tif	
Group 1 	 | 7/19 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_60.tif	
Group 1 	 | 8/19 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/bhctu_6190_an_v1_subraster_70.tif	
Group 1 	 | 9/19 | tmoy_

50

(50, 4)

(25, 4)

In [42]:
df_long

Unnamed: 0,tmoy_6190_hi_v1,y_fr,x_fr,idp
0,2.64276,6910495.0,701897.537075,503318.0
1,-0.29256,6591881.0,917030.55917,1028340.0
2,4.13903,6396748.0,599657.796498,541626.0
0,2.64276,6910495.0,701897.537075,503318.0
1,1.074437,6520442.0,622672.05332,543583.0
0,4.470612,6902237.0,493946.738191,523317.0
1,4.427735,6708830.0,558298.454808,1029396.0
2,5.407806,6448577.0,494188.651034,507110.0
3,6.832966,6253610.0,392680.783305,549519.0
0,3.871207,6631091.0,653575.130239,776711.0


In [53]:
# Test on one group
df_test = parallel_agroparistech_extraction(
    df_list[9], df_coords, progress_bar=False, concat_by_axis=0
)
df_test = df_test.copy()

# Print shape
df_test.shape
df_test


Group 10 	 | 1/16 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/tmoy_6190_hi_v1_subraster_9.tif	
Group 10 	 | 2/16 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/tmoy_6190_hi_v1_subraster_19.tif	
Group 10 	 | 3/16 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/tmoy_6190_hi_v1_subraster_29.tif	
Group 10 	 | 4/16 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/tmoy_6190_hi_v1_subraster_39.tif	
Group 10 	 | 5/16 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/tmoy_6190_hi_v1_subraster_49.tif	
Group 10 	 | 6/16 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/tmoy_6190_hi_v1_subraster_59.tif	
Group 10 	 | 7/16 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/tmoy_6190_hi_v1_subraster_69.tif	
Group 10 	 | 8/16 | tmoy_6190_hi_v1.tif | EPSG:2154 | agroparistech_qc_files/subrasters/tmoy_6190_hi_v1_subraster_79.tif	
Group 10 	 | 9/16 | tmoy

Unnamed: 0,tmoy_6190_hi_v1,y_fr,x_fr,idp
0,3.415206,6796657.0,451101.757358,558756.0
1,5.676897,6732936.0,422596.273732,549017.0
2,5.061691,6540572.0,484958.940261,530269.0
3,2.9923,6345306.0,893932.645519,548964.0


In [None]:
# Replace NA values
df_test = clean_na_in_agroparistech(df_test, df_subraster)

# Final display
display(df_test.head(3))
display(df_test.tail(3))

In [30]:
# Run in parallel
df_mp = run_mp(
    parallel_agroparistech_extraction,
    df_list,
    progress_bar=True,
    num_cores=10,
    df_coords=df_coords,
    verbose=False,
    concat_by_axis=0,
)

# Combine the list of dataframes using list comprehension
df_mp_merged = df_mp[0]
for i in range(1, len(df_mp)):
    df_mp_merged = pd.merge(
        df_mp_merged, df_mp[i], on=["idp", "y_fr", "x_fr"], how="left"
    )

# Replace faulty NA values
df_mp_merged = clean_na_in_agroparistech(df_mp_merged, files_variables_quick)

# Display final df
display(df_mp_merged)

# Notify me when done
chime.success()

100%|██████████| 10/10 [00:08<00:00,  1.25it/s]


: 

---


In [14]:
# df_coords_groups = df_coords.copy()
# df_coords_groups["group"] = np.arange(len(df_coords_groups)) % 10 + 1
# grouped = df_coords_groups.groupby("group")
# df_list = [group.reset_index(drop=True) for name, group in grouped]

In [15]:
# # Testing one group for one file
# my_file = files_variables_slow.iloc[0]["files"]
# my_var = files_variables_slow.iloc[0]["variables"]
# my_df = df_list[0]

# wrapper_for_large_files(my_df, my_file, my_var, progress_bar=True)

In [16]:
# %%time
# # Test on one group for all files

# df_slow_i = run_mp(
#     wrapper_for_large_files,
#     df_list,
#     # combine_func=pd.concat,
#     num_cores=10,
#     tif_in = files_variables_slow.iloc[0]["files"],
#     var_in = files_variables_slow.iloc[0]["variables"],
#     progress_bar=False
# )
# # Combine the list of dataframes using list comprehension
# df_slow = df_slow_i
# df_slow_merged = df_slow[0]
# for i in range(1, len(df_slow)):
#     df_slow_merged = pd.concat([df_slow_merged, df_slow[i]], axis=0)

# df_slow_merged

In [17]:
# # Run in parallel for all groups and all files

# df_slow_fin = pd.DataFrame(
#     {"idp": df_coords["idp"], "x_fr": df_coords["x_fr"], "y_fr": df_coords["y_fr"]}
# )

# for f, v in zip(files_variables_slow["files"], files_variables_slow["variables"]):
#     print(f"Extracting {v}")

#     df_slow_i = run_mp(
#         wrapper_for_large_files,
#         df_list,
#         # combine_func=pd.concat,
#         progress_bar=True,
#         num_cores=10,
#         tif_in=f,
#         var_in=v,
#     )

#     df_slow = df_slow_i
#     df_slow_merged = df_slow[0]
#     for i in range(1, len(df_slow)):
#         df_slow_merged = pd.concat([df_slow_merged, df_slow[i]], axis=0).drop(
#             columns=["group"]
#         )

#     # display(df_slow_merged)

#     df_slow_fin = pd.merge(
#         df_slow_0, width
# chime.success()

Extracting bhctu_6190_an_v1


  0%|          | 0/10 [32:36<?, ?it/s]


In [None]:
# # Quick Visual Check
# df_slow_fin

In [None]:
# # Save it
# df_slow_fin.to_feather("data_agroparistech_slowfiles_before_qc.feather")
# df_slow_fin.to_csv("data_agroparistech_slowfiles_before_qc.csv", index=False)

---


In [8]:
# Add groupings, make sure that slow variables are split to own cores
files_variables_slow["group"] = np.arange(len(files_variables_slow)) % 10 + 1

# Create list of df to be passed to multiprocessing
grouped = files_variables_slow.groupby("group")
df_list = [group for name, group in grouped]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  files_variables_slow["group"] = np.arange(len(files_variables_slow)) % 10 + 1


In [9]:
# Run in parallel
df_slow = run_mp(
    parallel_raster_extraction,
    df_list,
    progress_bar=True,
    num_cores=10,
    df_coords=df_coords,
    verbose=False,
)
# chime.success()

  0%|          | 0/10 [6:21:03<?, ?it/s]


In [None]:
# Combine the list of dataframes using list comprehension
df_slow_merged = df_slow[0]
for i in range(1, len(df_slow)):
    df_slow_merged = pd.merge(
        df_slow_merged, df_slow[i], on=["idp", "y_fr", "x_fr"], how="left"
    )

In [None]:
# Quick Visual Check
df_slow_merged

Unnamed: 0,idp,x_fr,y_fr,abal_distrib_v2016,bepe_mortalite_v2018,fasy_mortalite_v2018,piha_distrib_v2016,prob_mort_epi,quro_distrib_v2016,saca_mortalite_v2018,...,tipl_distrib_v2016,ulgl_distrib_v2016,bepe_distrib_v2016,fasy_if_v2018,piab_mortalite_v2018,prec_6190_et_v1,qupy_mortalite_v2018,saca_distrib_v2016,tm61858610_13,ulmi_distrib_v2016
0,632691,2.674150e+05,6.820144e+06,0.116299,-0.006745,0.003381,1.317830e-10,,0.576204,0.013197,...,0.000827,5.844794e-06,0.697052,2.523884e+01,-9999.000000,149.581284,-9999.0,0.231549,0.805501,0.001437
1,702597,7.269712e+05,6.566524e+06,0.085622,-9999.000000,-9999.000000,2.808699e-08,,0.713204,-9999.000000,...,0.010973,7.097719e-04,0.546788,2.783585e+01,-9999.000000,204.346466,-9999.0,0.184128,1.065304,0.012006
2,706240,7.275625e+05,6.636462e+06,0.051618,-9999.000000,0.003246,1.679225e-06,,0.767968,-9999.000000,...,0.026102,8.237094e-04,0.336591,2.882230e+01,-9999.000000,184.735458,-9999.0,0.167884,1.021843,0.064354
3,708321,5.070276e+05,6.792198e+06,0.040013,-0.006780,0.003302,5.390015e-06,,0.670307,0.021719,...,0.019704,4.144382e-04,0.236383,2.709109e+01,-9999.000000,138.204391,-9999.0,0.096158,0.913146,0.035805
4,708369,9.810095e+05,6.248657e+06,0.000747,-9999.000000,-9999.000000,1.169433e-02,,0.008103,-9999.000000,...,0.001383,5.346998e-08,0.000447,-3.402823e+38,-9999.000000,111.139496,-9999.0,0.004683,0.931766,0.042788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40017,1131396,7.284673e+05,6.743375e+06,0.045600,-0.006766,-9999.000000,2.963565e-06,,0.775838,0.023715,...,0.020294,4.261766e-04,0.312127,2.901571e+01,-9999.000000,173.473053,-9999.0,0.183173,0.945098,0.087864
40018,1131409,6.307829e+05,6.176717e+06,0.473966,-9999.000000,0.004199,2.519738e-13,,0.002722,0.036059,...,0.001820,9.983832e-04,0.151881,-3.402823e+38,-9999.000000,216.782501,-9999.0,0.246417,0.843088,0.000300
40019,1131410,1.030439e+06,6.764769e+06,0.049305,-9999.000000,-9999.000000,5.670998e-06,-3.402823e+38,0.554224,-9999.000000,...,0.029674,1.838178e-03,0.183773,-3.402823e+38,-9999.000000,199.467590,-9999.0,0.109920,0.942872,0.051256
40020,1131424,7.597461e+05,6.425373e+06,0.607617,-9999.000000,0.002793,4.454063e-13,,0.021926,-9999.000000,...,0.002897,2.257393e-03,0.284154,2.129047e+01,0.007716,222.490387,-9999.0,0.252481,0.986910,0.000227


In [None]:
# Save it
df_slow_merged.to_feather("data_agroparistech_slowfiles_before_qc.feather")
df_slow_merged.to_csv("data_agroparistech_slowfiles_before_qc.csv", index=False)

## Quality Control For Outliers / NA Values


In [None]:
# Merge slow and fast files into one

df_slow_files = pd.read_feather("data_agroparistech_slowfiles_before_qc.feather")
df_fast_files = pd.read_feather("data_agroparistech_fastfiles_before_qc.feather")

df_agroparistech = pd.merge(
    df_slow_files, df_fast_files, on=["idp", "y_fr", "x_fr"], how="right"
)

df_agroparistech.head()

Unnamed: 0,idp,x_fr,y_fr,bhctu_6190_an_v1,bhctu_6190_et_v1,etp_6190_an_v1,etp_6190_et_v1,rad_6190_an_v1,abal_distrib_v2016,bepe_mortalite_v2018,...,tipl_distrib_v2016,ulgl_distrib_v2016,bepe_distrib_v2016,fasy_if_v2018,piab_mortalite_v2018,prec_6190_et_v1,qupy_mortalite_v2018,saca_distrib_v2016,tm61858610_13,ulmi_distrib_v2016
0,632691,267415.027897,6820144.0,19.240328,-56.831585,54.696297,105.875206,34664.03125,0.116299,-0.006745,...,0.000827,5.844794e-06,0.697052,25.23884,-9999.0,149.581284,-9999.0,0.231549,0.805501,0.001437
1,702597,726971.216676,6566524.0,2.66782,-53.173649,59.746429,121.191681,36479.175781,0.085622,-9999.0,...,0.010973,0.0007097719,0.546788,27.83585,-9999.0,204.346466,-9999.0,0.184128,1.065304,0.012006
2,706240,727562.508709,6636462.0,5.251337,-58.659328,59.844738,120.26944,35390.4375,0.051618,-9999.0,...,0.026102,0.0008237094,0.336591,28.8223,-9999.0,184.735458,-9999.0,0.167884,1.021843,0.064354
3,708321,507027.573495,6792198.0,-0.896232,-65.469238,55.652546,111.108696,32837.777344,0.040013,-0.00678,...,0.019704,0.0004144382,0.236383,27.09109,-9999.0,138.204391,-9999.0,0.096158,0.913146,0.035805
4,708369,981009.515199,6248657.0,-5.091545,-106.926773,77.560204,142.603653,45841.847656,0.000747,-9999.0,...,0.001383,5.346998e-08,0.000447,-3.4028230000000003e+38,-9999.0,111.139496,-9999.0,0.004683,0.931766,0.042788
