# In-Situ Sampling Design

In [1]:
import glob, os
import numpy as np
import pandas as pd
import geopandas as gpd

import rasterstats
from rasterstats import zonal_stats
from pathlib import Path
from IPython.display import display


In [2]:
# General input and output paths
# ==============================

# Settings for the computer room. They are kept inside an inert string
# literal so they are not executed; move them out of the quotes to enable them.
'''
grp_letter   = 'X'
student_name = 'ndeffense'

vector_path = 'X:/data/VECTOR/'
raster_path = 'X:/data/RASTER/'
output_path = f'X:/GROUP_{grp_letter}/TP/{student_name}/DATA/'
'''

# Settings for a personal computer (currently active)
vector_path = '/Users/Nicolas/OneDrive - UCL/LBRAT2104/VECTOR/'
raster_path = '/Volumes/nbdid-sst-lbrat2104/data/RASTER/'
output_path = '/Users/Nicolas/OneDrive - UCL/LBRAT2104/Output/'

# Echo the active configuration, one path per line
print(
    f'Vector input path are set to : {vector_path}',
    f'Raster input path are set to : {raster_path}',
    f'Output path are set to       : {output_path}',
    sep='\n',
)

Vector input path are set to : /Users/Nicolas/OneDrive - UCL/LBRAT2104/VECTOR/
Raster input path are set to : /Volumes/nbdid-sst-lbrat2104/data/RASTER/
Output path are set to       : /Users/Nicolas/OneDrive - UCL/LBRAT2104/Output/


## Add pixel count to in-situ data

In [4]:
# Set up filenames
# ----------------

# In-situ polygons (input)
in_situ_shp = f'{output_path}IN_SITU/WALLONIA_2018_IN_SITU_ROI.shp'

# Raster template with no NaN (before applying SCL)
img_temp_tif = f'{output_path}IM_ROI/T31UFR_20200316T104709_B02_10m_ROI.tif'

# New file holding the pixel count of each polygon (output).
# os.path.splitext strips the extension whatever its length, unlike a fixed
# [:-4] slice, and leaves the path separators untouched.
in_situ_count_shp = f'{os.path.splitext(in_situ_shp)[0]}_pixCount.shp'


# Zonal statistics
# ----------------

# Read the in-situ polygons once and reuse the same GeoDataFrame for both the
# zonal statistics and the final table. zonal_stats returns one result per
# input feature, in input order, so the counts line up with the rows by
# construction instead of relying on two separate reads of the shapefile.
in_situ_gdf = gpd.read_file(in_situ_shp)

# Only the 'count' statistic is requested; -999 is handed to rasterstats as
# the raster nodata value. columns=['count'] keeps the column present even
# when there is no feature at all.
zs_df = pd.DataFrame(zonal_stats(vectors=in_situ_gdf,
                                 raster=img_temp_tif,
                                 nodata=-999,
                                 stats='count'),
                     columns=['count'])

zs_df = zs_df.rename(columns={'count': 'pix_count'})

# Attach the counts positionally (to_numpy drops the index); a length mismatch
# between polygons and statistics raises instead of silently dropping rows.
in_situ_pixCount_gdf = in_situ_gdf.assign(pix_count=zs_df['pix_count'].to_numpy())

display(in_situ_pixCount_gdf)

# Write into a new shapefile
in_situ_pixCount_gdf.to_file(in_situ_count_shp)



Unnamed: 0,ID,CROP,LC,CODE,IRRIGATION,area,geometry,pix_count
0,1877,1,Maize (for livestock),201,0,11372,"POLYGON ((634736.092 5591240.943, 634789.895 5...",254
1,1878,1,Common wheat (winter),311,0,25548,"POLYGON ((635007.971 5591764.526, 635051.831 5...",585
2,1879,1,Maize (for livestock),201,0,79139,"POLYGON ((635206.957 5591341.834, 635209.723 5...",262
3,1880,1,Grassland (temporary),62,0,25873,"POLYGON ((635016.863 5591076.352, 635026.482 5...",166
4,3251,1,Barley (winter),321,0,16585,"POLYGON ((634719.083 5590523.686, 634720.398 5...",625
...,...,...,...,...,...,...,...,...
325,183204,0,Not agriculture,20,0,30502,"POLYGON ((634030.058 5595825.237, 634031.095 5...",50
326,183206,0,Not agriculture,19,0,52986,"POLYGON ((627815.480 5595734.706, 627814.517 5...",128
327,183208,0,Not agriculture,19,0,56815,"POLYGON ((628489.484 5595844.131, 628491.520 5...",66
328,183210,0,Not agriculture,6,0,109914,"POLYGON ((632038.546 5595956.541, 632042.544 5...",136


## Sampling Design


<img src="figures/in_situ_sampling_design.png" width="500">


In [7]:
# Output files
# ------------

# Calibration and validation shapefiles
in_situ_cal_shp = f'{output_path}IN_SITU/WALLONIA_2018_IN_SITU_ROI_cal.shp'
in_situ_val_shp = f'{output_path}IN_SITU/WALLONIA_2018_IN_SITU_ROI_val.shp'

# Per-class summary of the sampling strategy
csv_strategies = f'{output_path}IN_SITU/WALLONIA_2018_IN_SITU_ROI_sampling_design.csv'

# Load the parcels together with their pixel counts
parcels_gdf = gpd.read_file(in_situ_count_shp)

# Sampling parameters
# -------------------

S2pix_min  = 3         # parcels below this pixel count are discarded
S2pix_Best = 10        # parcels at or above this pixel count are "best" parcels
S2pix_CalH = 10000     # class pixel total from which strategy 1 is used
sample_ratioH = 0.25   # calibration share of a class under strategy 1
sample_ratioL = 0.75   # calibration share of a class under strategy 2


# --- Step 1: discard parcels that are too small ---

print(f'-- Remove parcels with less than {S2pix_min} pixels')

has_enough_pixels = parcels_gdf['pix_count'].ge(S2pix_min)
parcels_gdf = parcels_gdf[has_enough_pixels]

# Number of parcels that remain after the filter
nb_remove_small_poly = len(parcels_gdf)

print(f'----> There are {nb_remove_small_poly} polygons with enough pixels')


# --- Step 2: separate the best parcels from the others ---

print(f'-- Move parcels with less than {S2pix_Best} pixels in the validation dataset')

# Best parcels: at least S2pix_Best pixels
parcels_best_gdf = parcels_gdf[parcels_gdf['pix_count'].ge(S2pix_Best)]
# Non-best parcels: fewer than S2pix_Best pixels
parcels_non_best_gdf = parcels_gdf[parcels_gdf['pix_count'].lt(S2pix_Best)]

print(f'----> {len(parcels_best_gdf)} best parcels')
print(f'----> {len(parcels_non_best_gdf)} non-best parcels (moved in the validation dataset)')

# ------------------------------------------------------ #
# Split parcels into cal/val among the selected segments #
# ------------------------------------------------------ #

print('-- Split parcels into cal/val')

# Total number of pixels per class (CODE) among the best parcels
pixel_per_class_df = parcels_best_gdf.groupby('CODE')['pix_count'].agg('sum').to_frame().reset_index()

# Strategy 1: classes with at least S2pix_CalH pixels. Strategy 2: all other classes.
pixel_per_class_df['strategy'] = np.where(pixel_per_class_df['pix_count'] >= S2pix_CalH, 1, 2)


def _with_pixel_targets(classes_df, strategy, cal_ratio):
    """Return the classes of one strategy with their cal/val pixel targets.

    Parameters
    ----------
    classes_df : DataFrame with the columns 'pix_count' and 'strategy'.
    strategy : strategy number whose classes are selected.
    cal_ratio : share of each class's pixels assigned to calibration.

    Returns
    -------
    A new DataFrame (the input is left untouched) restricted to the selected
    classes, with 'pix_cal' = pix_count * cal_ratio and
    'pix_val' = pix_count * (1 - cal_ratio).
    """
    # A boolean mask selects rows whatever the index looks like, and assign()
    # builds a new frame, so no column is ever written onto a filtered slice.
    selected = classes_df.loc[classes_df['strategy'] == strategy]
    return selected.assign(pix_cal=selected['pix_count'] * cal_ratio,
                           pix_val=selected['pix_count'] * (1 - cal_ratio))


# Strategy 1
# ----------

classes_strategy_1 = _with_pixel_targets(pixel_per_class_df, 1, sample_ratioH)

# Strategy 2
# ----------

classes_strategy_2 = _with_pixel_targets(pixel_per_class_df, 2, sample_ratioL)


# Strategy 1 classes first, then strategy 2 classes. reset_index() without
# drop=True keeps the former row labels in an 'index' column, so the layout of
# the exported CSV stays the same as before.
classes_strategy_all = pd.concat([classes_strategy_1, classes_strategy_2]).reset_index()

#print(classes_strategy_all)


parcels_cal_gdfs = []
parcels_val_gdfs = []

# One iteration per class: shuffle the class's parcels, then fill the
# calibration set in shuffled order while the running pixel total stays within
# the class's calibration budget. Every remaining parcel goes to validation.
for code, pix_cal_target in zip(classes_strategy_all['CODE'], classes_strategy_all['pix_cal']):

    # Calibration budget of the class, truncated to whole pixels. The value is
    # read as a scalar; calling int() on a one-element Series is deprecated in
    # recent pandas versions.
    pixel_cal = int(pix_cal_target)

    parcels_by_sub_class = parcels_best_gdf.loc[parcels_best_gdf['CODE'] == code]
    # The fixed random_state makes the shuffle reproducible from run to run
    parcels_by_sub_class_reordered = parcels_by_sub_class.sample(len(parcels_by_sub_class), random_state=10)

    # Running pixel total in shuffled order, computed once and used for both sets
    cumulated_pixels = parcels_by_sub_class_reordered['pix_count'].cumsum()

    parcels_cal_gdf = parcels_by_sub_class_reordered[cumulated_pixels <= pixel_cal]
    parcels_val_gdf = parcels_by_sub_class_reordered[cumulated_pixels > pixel_cal]

    # Sanity check: the calibration parcels must never exceed the budget
    if pixel_cal < int(parcels_cal_gdf['pix_count'].sum()):
        print("error")

    parcels_cal_gdfs.append(parcels_cal_gdf)
    parcels_val_gdfs.append(parcels_val_gdf)


parcels_val_gdfs.append(parcels_non_best_gdf)   # Add to validation non best parcels

parcels_cal_final_gdf = pd.concat(parcels_cal_gdfs, ignore_index=True)
parcels_val_final_gdf = pd.concat(parcels_val_gdfs, ignore_index=True)


# Report the size of both datasets
print(
    f'----> {len(parcels_cal_final_gdf)} parcels are in calibration dataset',
    f'----> {len(parcels_val_final_gdf)} parcels are in validation dataset',
    sep='\n',
)

# Announce, then write, the calibration and validation shapefiles
print(
    '-- Creating shapefiles',
    f'----> {in_situ_cal_shp}',
    f'----> {in_situ_val_shp}',
    sep='\n',
)

parcels_cal_final_gdf.to_file(in_situ_cal_shp)
parcels_val_final_gdf.to_file(in_situ_val_shp)

# Announce, then write, the per-class strategy table (row index not exported)
print(
    '-- Creating csv strategies',
    f'----> {csv_strategies}',
    sep='\n',
)

classes_strategy_all.to_csv(csv_strategies, index=False)



-- Remove parcels with less than 3 pixels
----> There are 330 polygons with enough pixels
-- Move parcels with less than 10 pixels in the validation dataset
----> 330 best parcels
----> 0 non-best parcels (moved in the validation dataset)
-- Split parcels into cal/val
----> 173 parcels are in calibration dataset
----> 157 parcels are in validation dataset
-- Creating shapefiles
----> /Users/Nicolas/OneDrive - UCL/LBRAT2104/Output/IN_SITU/WALLONIA_2018_IN_SITU_ROI_cal.shp
----> /Users/Nicolas/OneDrive - UCL/LBRAT2104/Output/IN_SITU/WALLONIA_2018_IN_SITU_ROI_val.shp
-- Creating csv strategies
----> /Users/Nicolas/OneDrive - UCL/LBRAT2104/Output/IN_SITU/WALLONIA_2018_IN_SITU_ROI_sampling_design.csv
