# In-situ preparation for Classification

In [1]:
import glob, os
import numpy as np
import pandas as pd
import geopandas as gpd

from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon

import rasterstats
from rasterstats import zonal_stats
from pathlib import Path
from IPython.display import display

# Report the versions of the two main libraries (one line each) so a run can be reproduced
print(f'Pandas    : {pd.__version__}',
      f'GeoPandas : {gpd.__version__}',
      sep='\n')

Pandas    : 1.1.5
GeoPandas : 0.8.1


## Set paths for input and output directories

Create the directories if they are missing, using `Path` and `mkdir`

In [4]:
# Root of the data volume. The commented line is an alternative root
# (drive-letter form); enable whichever matches how the volume is mounted.
#computer_path = 'X:/'
computer_path = '/Volumes/nbdid-sst-lbrat2104/'
grp_letter    = 'X'

# Directory for all work files (one WORK folder per group letter)
work_path = f'{computer_path}GROUP_{grp_letter}/WORK/'

# ----- #
# INPUT #
# ----- #

# clipped_path : folder searched later for the *.tif raster template
# in_situ_path : folder holding the original in-situ shapefile
clipped_path = f'{work_path}2_L2A_CLIPPED/'
in_situ_path = f'{work_path}IN_SITU/'

# ------ #
# OUTPUT #
# ------ #

# Folder receiving every file written by this notebook
in_situ_SD_path = f'{work_path}IN_SITU_SD/'

# Create the output folder (and any missing parent); no error if it already exists
Path(in_situ_SD_path).mkdir(parents=True, exist_ok=True)


print(f'In-Situ data path is set to : {in_situ_path}')
print(f'In-Situ data for classification path is set to : {in_situ_SD_path}')


In-Situ data path is set to : /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/IN_SITU/
In-Situ data for classification path is set to : /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/IN_SITU_SD/


## 1. Prepare in-situ dataset

<img src="figures/prepare_in_situ.png" width="10000">

### Set up filenames and parameters

In [14]:
# In-Situ original dataset
# ------------------------

in_situ_name = 'WALLONIA_2018_IN_SITU_ROI'

in_situ_shp       = f'{in_situ_path}{in_situ_name}.shp'
in_situ_count_shp = f'{in_situ_SD_path}{in_situ_name}_pixCount.shp'


gdf = gpd.read_file(in_situ_shp)

# Keep the CRS of the original layer: it is re-applied before the dataset is written.
# NOTE: despite its name, this variable holds whatever `gdf.crs` returns (printed below).
epsg_code = gdf.crs

print(epsg_code)

display(gdf.head())


# Raster template with no NaN (before applying SCL)
# -------------------------------------------------

# glob returns matches in arbitrary order: sort them so the same template is
# selected on every run, and fail with an explicit message (instead of a bare
# IndexError) when the folder contains no GeoTIFF.
clipped_tifs = sorted(glob.glob(f'{clipped_path}*.tif'))

if not clipped_tifs:
    raise FileNotFoundError(f'No .tif raster found in {clipped_path}')

img_temp_tif = clipped_tifs[0]

print(f'Raster template file : {img_temp_tif}')

# Parameters
# ----------

# Buffer distance applied to every polygon (coordinate units of the layer);
# a negative value shrinks the polygons inwards.
buf_size = -10

epsg:32631


Unnamed: 0,ID,CROP,LC,CODE,IRRIGATION,geometry
0,1877,1,Maize (for livestock),201,0,"POLYGON ((634719.076 5591248.019, 634788.109 5..."
1,1878,1,Common wheat (winter),311,0,"POLYGON ((634996.603 5591774.690, 635051.974 5..."
2,1879,1,Maize (for livestock),201,0,"POLYGON ((635196.857 5591331.590, 635199.724 5..."
3,1880,1,Grassland (temporary),62,0,"POLYGON ((635003.339 5591080.041, 635021.807 5..."
4,3251,1,Barley (winter),321,0,"POLYGON ((634708.850 5590509.262, 634710.400 5..."


Raster template file : /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200116T105309_B02_10m_ROI.tif


### 1.1 Apply a negative buffer

This step is useful to avoid border effects

In [6]:
# Shrink every polygon by `buf_size` (a negative distance buffers inwards)
shrunk_geometries = gdf.geometry.buffer(buf_size)
gdf.geometry = shrunk_geometries

# Polygons that do not survive the buffer come back empty: keep only the others
has_geometry = ~gdf.geometry.is_empty
gdf = gdf[has_geometry]

display(gdf.head())

Unnamed: 0,ID,CROP,LC,CODE,IRRIGATION,geometry
0,1877,1,Maize (for livestock),201,0,"POLYGON ((634736.092 5591240.943, 634789.895 5..."
1,1878,1,Common wheat (winter),311,0,"POLYGON ((635007.971 5591764.526, 635051.831 5..."
2,1879,1,Maize (for livestock),201,0,"POLYGON ((635206.957 5591341.834, 635209.723 5..."
3,1880,1,Grassland (temporary),62,0,"POLYGON ((635016.863 5591076.352, 635026.482 5..."
4,3251,1,Barley (winter),321,0,"POLYGON ((634719.083 5590523.686, 634720.398 5..."


### 1.1.1 Explode *MultiPolygon* geometry into individual *Polygon* geometries using `GeoPandas` and `Shapely`

*MultiPolygon* geometries can cause problems later on (e.g. for rasterisation)!

In [7]:
# Create a function to explode MultiPolygons from a GeoDataFrame
def explode(indf):
    """Split every MultiPolygon row of a GeoDataFrame into one row per Polygon.

    Parameters
    ----------
    indf : geopandas.GeoDataFrame
        Input table whose geometries are stored in a column named ``geometry``.

    Returns
    -------
    geopandas.GeoDataFrame
        New table with the same columns and CRS as ``indf`` and a fresh
        0..n-1 index. Polygon rows are copied as they are; each MultiPolygon
        row is repeated once per member polygon, keeping its attributes.
        Rows whose geometry is neither a Polygon nor a MultiPolygon are dropped.
    """
    # Collect all output rows first and build the table once: growing a
    # DataFrame with `.append` is quadratic and `.append` no longer exists
    # in pandas >= 2.0.
    records = []
    for _, row in indf.iterrows():
        geom = row.geometry
        if isinstance(geom, Polygon):
            records.append(row.to_dict())
        elif isinstance(geom, MultiPolygon):
            attributes = row.to_dict()
            # `.geoms` iterates over the member polygons on every Shapely
            # version; len()/indexing on the MultiPolygon itself is not
            # supported by Shapely >= 2.0.
            for part in geom.geoms:
                records.append({**attributes, 'geometry': part})

    # Attribute columns are kept as `object` dtype, matching what the previous
    # append-based implementation produced, so the schema of the files written
    # from this table does not change.
    outdf = pd.DataFrame(records, columns=indf.columns, dtype=object)

    # Carry the CRS of the input over instead of silently dropping it.
    return gpd.GeoDataFrame(outdf, crs=indf.crs)


# Apply function on GeoDataFrame
gdf = explode(gdf)

display(gdf.head())

Unnamed: 0,ID,CROP,LC,CODE,IRRIGATION,geometry
0,1877,1,Maize (for livestock),201,0,"POLYGON ((634736.092 5591240.943, 634789.895 5..."
1,1878,1,Common wheat (winter),311,0,"POLYGON ((635007.971 5591764.526, 635051.831 5..."
2,1879,1,Maize (for livestock),201,0,"POLYGON ((635206.957 5591341.834, 635209.723 5..."
3,1880,1,Grassland (temporary),62,0,"POLYGON ((635016.863 5591076.352, 635026.482 5..."
4,3251,1,Barley (winter),321,0,"POLYGON ((634719.083 5590523.686, 634720.398 5..."


### 1.2 Add a column with the area of each polygon

In [8]:
# Area of each polygon (squared coordinate units of the layer), truncated to an integer
polygon_areas = gdf.geometry.area
gdf['area'] = polygon_areas.astype(int)

display(gdf.head())

Unnamed: 0,ID,CROP,LC,CODE,IRRIGATION,geometry,area
0,1877,1,Maize (for livestock),201,0,"POLYGON ((634736.092 5591240.943, 634789.895 5...",25444
1,1878,1,Common wheat (winter),311,0,"POLYGON ((635007.971 5591764.526, 635051.831 5...",59305
2,1879,1,Maize (for livestock),201,0,"POLYGON ((635206.957 5591341.834, 635209.723 5...",26169
3,1880,1,Grassland (temporary),62,0,"POLYGON ((635016.863 5591076.352, 635026.482 5...",16639
4,3251,1,Barley (winter),321,0,"POLYGON ((634719.083 5590523.686, 634720.398 5...",62352


### 1.3 Add pixel count to in-situ data

#### 1.3.1 Zonal Stats using `rasterstats` and store output in a DataFrame

In [9]:
# Per-polygon 'count' statistic of the template raster (nodata value: -999).
# The statistics come back in the same order as the rows of `gdf`, so they are
# given the index of `gdf`: the join below then pairs each count with its own
# polygon whatever index `gdf` carries, instead of relying on a 0..n-1 index.
zs_df = pd.DataFrame(zonal_stats(vectors=gdf,
                                 raster=img_temp_tif,
                                 nodata=-999,
                                 stats='count'),
                     index=gdf.index)

zs_df = zs_df.rename(columns={'count': 'pix_count'})

display(zs_df.head())

# Join pixel counts with the polygon attributes

pixCount_gdf = pd.concat([gdf, zs_df], axis=1, join="inner")

# An inner join drops unmatched rows silently: make sure no polygon was lost
assert len(pixCount_gdf) == len(gdf), 'Some polygons were lost while joining the pixel counts'

display(pixCount_gdf.head())

Unnamed: 0,pix_count
0,254
1,585
2,262
3,166
4,625


Unnamed: 0,ID,CROP,LC,CODE,IRRIGATION,geometry,area,pix_count
0,1877,1,Maize (for livestock),201,0,"POLYGON ((634736.092 5591240.943, 634789.895 5...",25444,254
1,1878,1,Common wheat (winter),311,0,"POLYGON ((635007.971 5591764.526, 635051.831 5...",59305,585
2,1879,1,Maize (for livestock),201,0,"POLYGON ((635206.957 5591341.834, 635209.723 5...",26169,262
3,1880,1,Grassland (temporary),62,0,"POLYGON ((635016.863 5591076.352, 635026.482 5...",16639,166
4,3251,1,Barley (winter),321,0,"POLYGON ((634719.083 5590523.686, 634720.398 5...",62352,625


### 1.4 Write dataset in shapefiles

In [15]:
# Export the polygons, now carrying 'area' and 'pix_count', to the output shapefile
print(f'Write In-Situ --> {in_situ_count_shp}')

# Assign the CRS captured from the original in-situ layer before writing
pixCount_gdf = pixCount_gdf.set_crs(epsg_code)

pixCount_gdf.to_file(in_situ_count_shp)

Write In-Situ --> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/IN_SITU_SD/WALLONIA_2018_IN_SITU_ROI_pixCount.shp


## 2. Sampling Design


<img src="figures/in_situ_sampling_design.png" width="10000">

### Set up filenames and parameters

In [16]:
# Output shapefiles: calibration parcels and validation parcels
in_situ_cal_shp = f'{in_situ_SD_path}{in_situ_name}_cal.shp'
in_situ_val_shp = f'{in_situ_SD_path}{in_situ_name}_val.shp'

# Output table: per-class pixel totals, strategy and cal/val pixel targets
csv_strategies = f'{in_situ_SD_path}{in_situ_name}_sampling_design.csv'

# Open the in-situ shapefile in a GeoDataFrame
# (this is the file with the 'pix_count' column written in section 1.4)

parcels_gdf = gpd.read_file(in_situ_count_shp)

# Parameters
# ----------

# S2pixMin         : parcels with fewer pixels than this are discarded
# S2pixBest        : parcels with fewer pixels than this go straight to validation
# S2pixThres       : classes with at least this many pixels use strategy 1, the others strategy 2
# SampleRatioCal_1 : share of a class's pixels targeted for calibration under strategy 1
# SampleRatioCal_2 : share of a class's pixels targeted for calibration under strategy 2
S2pixMin         = 3
S2pixBest        = 10
S2pixThres       = 10000
SampleRatioCal_1 = 0.25
SampleRatioCal_2 = 0.75

### 2.1 Remove parcels with not enough pixels

In [17]:
print(f'-- Remove parcels with less than {S2pixMin} pixels')

# Keep only the parcels holding at least `S2pixMin` pixels
has_enough_pixels = parcels_gdf['pix_count'] >= S2pixMin
parcels_gdf = parcels_gdf.loc[has_enough_pixels]

nb_remove_small_poly = len(parcels_gdf)

print(f'----> There are {nb_remove_small_poly} polygons with enough pixels')

-- Remove parcels with less than 3 pixels
----> There are 341 polygons with enough pixels


### 2.2 Select best parcels

In [18]:
print(f'-- Move parcels with less than {S2pixBest} pixels in the validation dataset')

# Split on the `S2pixBest` pixel threshold: parcels below it are set aside for
# validation only, the others remain candidates for the cal/val split.
parcel_pixels = parcels_gdf['pix_count']
below_best = parcel_pixels < S2pixBest
at_least_best = parcel_pixels >= S2pixBest

parcels_non_best_gdf = parcels_gdf.loc[below_best]
parcels_best_gdf = parcels_gdf.loc[at_least_best]

print(f'----> {len(parcels_best_gdf)} best parcels')
print(f'----> {len(parcels_non_best_gdf)} non-best parcels (moved in the validation dataset)')

-- Move parcels with less than 10 pixels in the validation dataset
----> 324 best parcels
----> 17 non-best parcels (moved in the validation dataset)


### 2.3 Split parcels into calibration and validation datasets
#### 2.3.1 Create a table with the number of pixels to get for calibration and validation (depending on the strategy)

In [19]:
print('-- Split parcels into cal/val')

# Total number of pixels available per class (best parcels only)
pixel_per_class_df = parcels_best_gdf.groupby('CODE')['pix_count'].agg('sum').to_frame().reset_index()

# Strategy 1 for classes with at least `S2pixThres` pixels, strategy 2 for the others
pixel_per_class_df['strategy'] = np.where(pixel_per_class_df['pix_count'] >= S2pixThres, 1, 2)

# Select the rows of each strategy with boolean masks (the positions returned by
# np.where are not labels and must not be fed to `.loc`), and take explicit
# copies so the new columns below are not written onto a view of
# `pixel_per_class_df`.
classes_strategy_1 = pixel_per_class_df.loc[pixel_per_class_df['strategy'] == 1].copy()
classes_strategy_2 = pixel_per_class_df.loc[pixel_per_class_df['strategy'] == 2].copy()


# Strategy 1
# ----------

classes_strategy_1['pix_cal'] = classes_strategy_1['pix_count']*SampleRatioCal_1
classes_strategy_1['pix_val'] = classes_strategy_1['pix_count']*(1-SampleRatioCal_1)

# Strategy 2
# ----------

classes_strategy_2['pix_cal'] = classes_strategy_2['pix_count']*SampleRatioCal_2
classes_strategy_2['pix_val'] = classes_strategy_2['pix_count']*(1-SampleRatioCal_2)


# Strategy-1 classes first, then strategy-2 classes. reset_index() keeps the
# former row label in an 'index' column, which is part of the exported CSV.
classes_strategy_all = pd.concat([classes_strategy_1, classes_strategy_2]).reset_index()

display(classes_strategy_all)

-- Split parcels into cal/val


Unnamed: 0,index,CODE,pix_count,strategy,pix_cal,pix_val
0,5,311,25092,1,6273.0,18819.0
1,16,62,13499,1,3374.75,10124.25
2,22,91,10833,1,2708.25,8124.75
3,0,12,71,2,53.25,17.75
4,1,19,1758,2,1318.5,439.5
5,2,20,146,2,109.5,36.5
6,3,201,6293,2,4719.75,1573.25
7,4,21,208,2,156.0,52.0
8,6,321,6200,2,4650.0,1550.0
9,7,341,329,2,246.75,82.25


#### 2.3.2 Randomly select parcels to reach the number of pixels needed to calibrate the model

In [22]:
parcels_cal_gdfs = []
parcels_val_gdfs = []

# One pass per class: shuffle its parcels, then fill the calibration set until
# the pixel target of the class is reached; the remaining parcels go to validation.
for code, pix_cal_target in zip(classes_strategy_all['CODE'], classes_strategy_all['pix_cal']):

    # Calibration pixel target of the class, truncated to a whole number of pixels.
    # Read as a scalar: calling int() on a one-element Series is deprecated in pandas.
    pixel_cal = int(pix_cal_target)

    parcels_by_sub_class = parcels_best_gdf.loc[parcels_best_gdf['CODE'] == code]

    # Reproducible shuffle of all the parcels of the class (fixed random seed)
    parcels_by_sub_class_reordered = parcels_by_sub_class.sample(len(parcels_by_sub_class),random_state=10)

    # Running pixel total along the shuffled parcels, computed once
    cumulated_pixels = parcels_by_sub_class_reordered['pix_count'].cumsum()

    parcels_cal_gdf = parcels_by_sub_class_reordered[cumulated_pixels <= pixel_cal]
    parcels_val_gdf = parcels_by_sub_class_reordered[cumulated_pixels > pixel_cal]

    # Sanity check: the calibration parcels must never exceed the pixel target
    assert int(parcels_cal_gdf['pix_count'].sum()) <= pixel_cal, \
        f'Class {code}: calibration parcels exceed the target of {pixel_cal} pixels'

    parcels_cal_gdfs.append(parcels_cal_gdf)
    parcels_val_gdfs.append(parcels_val_gdf)


# Add non best parcels to validation
parcels_val_gdfs.append(parcels_non_best_gdf)

parcels_cal_final_gdf = pd.concat(parcels_cal_gdfs, ignore_index=True)
parcels_val_final_gdf = pd.concat(parcels_val_gdfs, ignore_index=True)


print(f'----> {len(parcels_cal_final_gdf)} parcels are in calibration dataset')
print(f'----> {len(parcels_val_final_gdf)} parcels are in validation dataset')

----> 167 parcels are in calibration dataset
----> 174 parcels are in validation dataset


### 2.4 Write datasets in shapefiles

In [14]:
# Calibration parcels
print(f'Write calibration In-Situ --> {in_situ_cal_shp}')

parcels_cal_final_gdf.to_file(in_situ_cal_shp)

# Validation parcels (the message now reports the validation path; it used to
# print the calibration path)
print(f'Write validation In-Situ --> {in_situ_val_shp}')

parcels_val_final_gdf.to_file(in_situ_val_shp)

# Per-class sampling table (pixel totals, strategy, cal/val pixel targets), ';'-separated
classes_strategy_all.to_csv(csv_strategies, index=False, sep=';')