### Preprocessing
This notebook creates the image tiles used for labelling. Tiles are selected by stratified random sampling, complemented by additional samples selected adaptively based on visual interpretation of initial model results.

In [1]:
import geopandas as gpd
import numpy as np
import os
import pandas as pd
import rasterio

from exactextract import exact_extract
from PIL import Image

from config import Config
from core.mapsheet import MapSheet
from core.utils import get_mapping_from_csv

In [2]:
# project configuration object; every path, the CRS and the sampling settings
# used in the cells below are read from its attributes
config = Config.Config()

### Create tile catalog

In [3]:
# index of all map sheets; the cells below use its "key" and "geometry" columns
# (and later "legend_type" and "source")
map_sheet_index = gpd.read_file(config.map_sheet_index_geo_path)

In [4]:
# collect one tile table per map sheet
tiles = []
for _, sheet_row in map_sheet_index.iterrows():
    # each map sheet raster is stored as <map_sheet_folder>/<key>.tif
    map_sheet = MapSheet(
        sheet_id=sheet_row.key,
        path=f"{config.map_sheet_folder}/{sheet_row.key}.tif",
        polygon=sheet_row.geometry,
        polygon_crs=map_sheet_index.crs,
    )
    # NOTE(review): the tile size is hard-coded here while the tile-writing
    # step uses config.tile_size - confirm that both values agree
    tile_info = map_sheet.extract_map_tile_info(tile_size=256)
    # expose the per-sheet row index as an explicit "tile_id" column
    map_sheet_tiles = tile_info.rename_axis("tile_id").reset_index()
    tiles.append(map_sheet_tiles)

In [5]:
# reproject every sheet's tiles to the shared map sheet index CRS
# (EPSG:4131 per the original note; the value comes from config) and
# stack them into one tile dataset with a fresh 0..n-1 index
tiles_reprojected = [
    sheet_tiles.to_crs(config.map_sheet_index_crs) for sheet_tiles in tiles
]
df_tiles = pd.concat(tiles_reprojected).reset_index(drop=True)
print(df_tiles.shape)
df_tiles.head(3)

(633527, 7)


Unnamed: 0,tile_id,map_id,col_off,row_off,tile_size,geometry,random_number
0,0,50491,257,265,256,"POLYGON ((100.25973 20.49268, 100.25 20.49275,...",0.23507
1,1,50491,257,521,256,"POLYGON ((100.25973 20.49268, 100.25965 20.483...",0.697796
2,2,50491,257,777,256,"POLYGON ((100.25965 20.48343, 100.25958 20.474...",0.265995


### Define strata
Use GLC_FCS30D (https://doi.org/10.5281/zenodo.8239305) land use cover data for 1985 to define a stratum for each map sheet tile. 

This allows the use of stratified random sampling resulting in a more balanced dataset for labelling and model training.

In [6]:
# the land cover raster is only opened to read its CRS
with rasterio.open(config.luc_fcs30_1985_path) as luc_raster:
    raster_crs = luc_raster.crs

# majority land cover value per tile, computed on the tiles reprojected
# to the raster CRS
tiles_in_raster_crs = df_tiles.to_crs(raster_crs)
df_tiles["luc"] = exact_extract(
    config.luc_fcs30_1985_path,
    tiles_in_raster_crs,
    ["majority"],
    output="pandas",
)

In [7]:
# lookups built from the legend CSV, keyed by its "pixel" column
mapping_class = get_mapping_from_csv(
    config.luc_fcs30_legend_path,
    col_key="pixel",
    col_value="class_detailed",
)
mapping_stratum = get_mapping_from_csv(
    config.luc_fcs30_legend_path,
    col_key="pixel",
    col_value="stratum",
)

# translate each tile's majority land cover value into a detailed class
# name and into the stratum used for sampling
df_tiles["luc_name"] = df_tiles["luc"].map(mapping_class)
df_tiles["stratum"] = df_tiles["luc"].map(mapping_stratum)

In [8]:
# number of tiles per stratum across the whole tile catalog
df_tiles["stratum"].value_counts()

stratum
Forest                    349621
Cropland                  123452
Shrubland or grassland    103984
Water body                 50271
Wetland                     5098
Impervious surface          1101
Name: count, dtype: int64

### QA checks

In [9]:
# no missing strata
assert df_tiles["stratum"].notna().all(), "some tiles have no stratum assigned"

# no empty polygons
assert not df_tiles["geometry"].is_empty.any(), "some tiles have an empty geometry"

# no negative column or row offsets (an offset of 0 is a valid pixel position,
# so only values below zero are rejected)
assert (df_tiles["row_off"] >= 0).all(), "negative row offsets found"
assert (df_tiles["col_off"] >= 0).all(), "negative column offsets found"

In [10]:
def agg_area(x):
    """Return the summed area of all geometries in ``x``."""
    return x.area.sum()

# total tile area per map sheet
areas = df_tiles.groupby("map_id").agg({"geometry": agg_area})

# show the map sheets whose summed tile area falls outside 0.061-0.063
# NOTE(review): areas are in the units of the catalog CRS; the thresholds
# look like square degrees - confirm
too_small = areas["geometry"] < 0.061
too_large = areas["geometry"] > 0.063
areas[too_small | too_large]


  return np.sum(x.area)


Unnamed: 0_level_0,geometry
map_id,Unnamed: 1_level_1
59262,0.054306
61462,0.077083


### Save final tile catalog file

In [11]:
# save the final tile catalog including stratum
# (written as GeoJSON; the next section reads this file back in)
df_tiles.to_file(config.tile_catalog_path, driver="GeoJSON")

## Create training images

In [12]:
# reload the saved tile catalog and the map sheet index from disk so this
# section can be run without re-executing the catalog creation above
df_tiles = gpd.read_file(config.tile_catalog_path)
map_sheet_index = gpd.read_file(config.map_sheet_index_geo_path)

### Stratified random sampling

In [13]:
# random selection per stratum: keep, within every stratum, the
# config.samples_per_stratum tiles with the smallest "random_number"
smallest_per_stratum = (
    df_tiles
    .groupby("stratum")["random_number"]
    .nsmallest(config.samples_per_stratum)
)
# the second index level ("level_1" after reset_index) holds the df_tiles row labels
idx_stratified_sampling = smallest_per_stratum.reset_index()["level_1"]
df_sample = df_tiles.iloc[idx_stratified_sampling].reset_index(drop=True)

### Additional training images
These tiles were added over multiple iterations, based on visual inspection of the model predictions, to improve the final land use cover map.

In [14]:
# batch 1 are extra tiles randomly selected for map sheets with poor model performance
def _lowest_random_tiles(catalog, map_ids, n):
    """Return the row labels of the ``n`` tiles with the smallest ``random_number``.

    Only tiles of ``catalog`` whose ``map_id`` is contained in ``map_ids`` are
    considered. The labels are returned as a Series named "index".
    """
    candidates = catalog.loc[catalog["map_id"].isin(map_ids), "random_number"]
    return candidates.nsmallest(n).reset_index()["index"]


maps_extra_forest_water = [62382, 63383, 61374, 61373, 61371, 61372, 62373, 65354, 65351, 65353, 65352, 64352, 61322, 61402, 61411, 60424, 55454]
forest_water_idx = _lowest_random_tiles(df_tiles, maps_extra_forest_water, 20)

maps_extra_river_footpath = [67321, 67324, 68324, 68323, 67332, 67333, 65412]
river_footpath_idx = _lowest_random_tiles(df_tiles, maps_extra_river_footpath, 15)

maps_extra_plantation_symbols = [65321, 66331, 66383, 66384, 65381, 66394, 61392]
plantation_symbols_idx = _lowest_random_tiles(df_tiles, maps_extra_plantation_symbols, 10)

# two individual map sheets with five extra tiles each
map_53503_idx = _lowest_random_tiles(df_tiles, [53503], 5)
map_62384_idx = _lowest_random_tiles(df_tiles, [62384], 5)

idx_extra_batch1 = [
    *forest_water_idx,
    *river_footpath_idx,
    *plantation_symbols_idx,
    *map_53503_idx,
    *map_62384_idx,
]

In [15]:
# batches 2 and 3: extra tiles chosen from visual interpretation of the
# initial prediction results; each CSV lists them in its "tile_index" column
batch2_table = pd.read_csv(config.adaptive_samples_batch2)
batch3_table = pd.read_csv(config.adaptive_samples_batch3)
idx_extra_batch2 = batch2_table["tile_index"].tolist()
idx_extra_batch3 = batch3_table["tile_index"].tolist()

In [16]:
# all selected tile indices: stratified random sample first, then the extra batches
idx_all_batches = (
    list(idx_stratified_sampling)
    + idx_extra_batch1
    + idx_extra_batch2
    + idx_extra_batch3
)

# a tile can be selected by more than one batch; keep each index once
idx_samples = list(set(idx_all_batches))
len(idx_samples)

746

In [17]:
# pull the sampled rows out of the tile catalog
df_sample = df_tiles.iloc[idx_samples].copy()
# flag the rows that came from the stratified random sample
df_sample["random_sample"] = df_sample.index.isin(idx_stratified_sampling)
# keep the original catalog row label as an "index" column
df_sample = df_sample.reset_index()
df_sample.head(3)

Unnamed: 0,index,tile_id,map_id,col_off,row_off,tile_size,random_number,luc,luc_name,stratum,geometry,random_sample
0,268292,483,60463,4644,2224,256,0.451872,52,Closed evergreen broadleaved forest,Forest,"POLYGON ((105.16509 18.67607, 105.1651 18.6853...",False
1,51221,687,53503,6475,4099,256,0.003091,52,Closed evergreen broadleaved forest,Forest,"POLYGON ((101.73326 20.60236, 101.73343 20.611...",False
2,430102,532,63303,4986,286,256,0.067877,190,Impervious surfaces,Impervious surface,"POLYGON ((106.68722 10.74111, 106.67786 10.741...",True


In [18]:
# attach per-map-sheet attributes to the selected tiles for use during labelling:
# the sheet's legend type and whether the sheet's source is "NARA"
legend_type_mapping = dict(zip(map_sheet_index["key"], map_sheet_index["legend_type"]))
map_source_mapping = dict(zip(map_sheet_index["key"], map_sheet_index["source"]))

df_sample["legend_type"] = df_sample["map_id"].map(legend_type_mapping)
sheet_source = df_sample["map_id"].map(map_source_mapping)
df_sample["nara"] = sheet_source == "NARA"

In [19]:
# tiles per stratum among the stratified random sample
df_sample[df_sample["random_sample"]]["stratum"].value_counts()

stratum
Impervious surface        80
Cropland                  80
Water body                80
Forest                    80
Shrubland or grassland    80
Wetland                   80
Name: count, dtype: int64

In [20]:
# tiles per stratum among the additionally selected (non-random) tiles
df_sample[~df_sample["random_sample"]]["stratum"].value_counts()

stratum
Forest                    158
Water body                 42
Cropland                   41
Shrubland or grassland     21
Wetland                     4
Name: count, dtype: int64

In [21]:
# write image tiles for each selected tile
# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs(config.tile_folder, exist_ok=True)

# look up every map sheet polygon once instead of filtering the whole index
# for each tile; the first entry wins if a key occurs more than once
sheet_polygons = {}
for sheet_key, sheet_polygon in zip(map_sheet_index["key"], map_sheet_index["geometry"]):
    sheet_polygons.setdefault(sheet_key, sheet_polygon)

tile_names = []
tile_paths = []
for index, row in df_sample.iterrows():
    map_sheet = MapSheet(
        sheet_id=row.map_id,
        path=f"{config.map_sheet_folder}/{row.map_id}.tif",
        polygon=sheet_polygons[row.map_id],
        polygon_crs=map_sheet_index.crs
    )

    # load image from raster and convert to PIL
    # NOTE(review): the catalog offsets were created with tile_size=256;
    # confirm config.tile_size matches
    tile = map_sheet.extract_map_tile(row.col_off, row.row_off, tile_size=config.tile_size)
    # move the first axis to the end before handing the array to PIL
    # (presumably bands-first -> bands-last; verify against extract_map_tile)
    image = Image.fromarray(tile.transpose(1, 2, 0))

    # add nara indicator to file path to avoid duplicates as NARA map sheets
    # were added later partly replacing previous versions from other sources
    nara_id = "_1" if row.nara else ""
    tile_name = f"{row.map_id}{nara_id}_{row.col_off}_{row.row_off}.png"
    output_path = f"{config.tile_folder}/{tile_name}"

    # save the image
    image.save(output_path)

    tile_names.append(tile_name)
    tile_paths.append(output_path)

# update the sample catalog with the tile names and paths in one assignment
# (the lists are in the same row order as df_sample)
df_sample["tile_name"] = tile_names
df_sample["tile_path"] = tile_paths

# save the sample tile catalog with added tile paths
df_sample.to_file(config.sample_catalog_path, driver="GeoJSON")