## Generate training tiles
This notebook splits up each KH-9 image raster file into tiles of 256 $\times$ 256 pixels. A random subset of these tiles is selected for each study area to be used for manual labelling and model training.

Words in *italics* below refer to *parameters* and *file paths* specified in the configuration file. Words in **bold** refer to outputs, such as Tables and Figures, used in the paper.

#### Inputs:
* *study_areas -> [study_area] -> rasters*: KH-9 images for each study area (geotiff files)

#### Parameters:
* *study_areas*: Names of the study areas
* *study_areas -> [study_area] -> n_tiles*: Number of random tiles to select for each study area 

#### Outputs:
* *tiles_folder*: Random subset of KH-9 image tiles (256 $\times$ 256 pixels) for each study area (geotiff files)
* *tile_catalog_path*: Tile catalog with metadata and extent of each of the selected image tiles (geojson file)

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import rasterio
import rasterio.features

from rasterio.windows import transform, Window
from shapely.geometry import box
from utils import load_config, create_dir

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def gen_tile_catalog(raster_path, tile_size=256):
    """
    Builds a catalogue of the complete, gap-free tiles of a tiled raster file.

    The raster is traversed block by block (band 1). Blocks that are not
    exactly ``tile_size`` by ``tile_size`` pixels, or that contain at least
    one nodata pixel, are left out of the catalogue.

    Parameters
    ----------
    raster_path : str
        Path of the raster file; opened with ``rasterio.open``.
    tile_size : int, optional
        Expected block width and height in pixels (default 256).

    Returns
    -------
    geopandas.GeoDataFrame
        One row per retained tile with the columns "raster" (the input path),
        "col_off" and "row_off" (pixel offsets of the tile window) and
        "geometry" (bounding box of the tile window), using the raster's CRS.

    Raises
    ------
    AssertionError
        If the raster profile does not report a block size of
        ``tile_size`` x ``tile_size``.
    """
    with rasterio.open(raster_path) as src:
        assert (
            src.profile.get("blockxsize") == tile_size
        ), f"Raster block size should be {tile_size}x{tile_size}"
        assert (
            src.profile.get("blockysize") == tile_size
        ), f"Raster block size should be {tile_size}x{tile_size}"

        nodata = src.nodata
        # NaN never compares equal to anything (itself included), so a NaN
        # nodata value has to be detected with isnan instead of ==
        nodata_is_nan = nodata is not None and np.isnan(nodata)

        rows = list()
        for _, window in src.block_windows(1):
            # skip incomplete blocks (raster edges) before reading any pixels
            if (window.height, window.width) != (tile_size, tile_size):
                continue

            array = src.read(1, window=window)

            # skip tiles that contain missing values
            if nodata_is_nan:
                has_missing = bool(np.isnan(array).any())
            elif nodata is not None:
                has_missing = bool((array == nodata).any())
            else:
                # no nodata value declared: nothing can be flagged as missing
                has_missing = False
            if has_missing:
                continue

            # add polygon with the area of each tile
            window_bounds = rasterio.windows.bounds(window, src.transform)
            window_polygon = box(*window_bounds)

            rows.append((raster_path, window.col_off,
                        window.row_off, window_polygon))

        catalog = gpd.GeoDataFrame(
            rows, columns=["raster", "col_off", "row_off", "geometry"], crs=src.crs
        )

    return catalog


def write_img_tiles(catalog, out_dir, col_name="tile", tile_size=256):
    """
    Writes every tile listed in the catalog to its own raster file and records
    the output file name and path in the catalog.

    Parameters
    ----------
    catalog : DataFrame-like
        Tile catalog with the columns "raster" (source raster path),
        "col_off" and "row_off" (pixel offsets of the tile). If an "index"
        column is present its value is used as the tile id in the file name;
        otherwise the row label is used.
    out_dir : str
        Output folder. It is passed to ``create_dir`` before any tile is
        written (presumably creating it if missing -- see ``utils``).
    col_name : str, optional
        Name of the catalog column that receives the full output path.
    tile_size : int, optional
        Width and height in pixels of the window read for each tile
        (default 256).

    Returns
    -------
    The same ``catalog`` object, modified in place: column "fn" holds the
    file name (``{id}_{col_off}_{row_off}.tif``) and ``col_name`` holds
    ``{out_dir}/{fn}``.
    """
    create_dir(out_dir)
    if "fn" not in catalog.columns:
        catalog["fn"] = ""
    if col_name not in catalog.columns:
        catalog[col_name] = ""

    has_index_col = "index" in catalog.columns

    for label, row in catalog.iterrows():
        # `label` addresses the row inside the catalog; `tile_id` only names
        # the output file. Keeping them apart ensures the paths are written
        # back to the row that was actually read, even when the "index"
        # column and the row labels differ.
        tile_id = int(row["index"]) if has_index_col else label
        row_off = int(row["row_off"])
        col_off = int(row["col_off"])
        win = Window(col_off, row_off, width=tile_size, height=tile_size)

        with rasterio.open(row["raster"]) as src:
            array = src.read(1, window=win)

            # start from the source profile and adapt it to a single tile
            profile = src.profile
            profile["width"] = win.width
            profile["height"] = win.height
            profile["compress"] = None
            # georeference the tile relative to its window in the source
            profile["transform"] = transform(win, src.transform)

        # write image
        fn = f"{tile_id}_{col_off}_{row_off}.tif"
        out_path = f"{out_dir}/{fn}"
        with rasterio.open(out_path, "w", **profile) as dst:
            dst.write(array, 1)

        catalog.at[label, "fn"] = fn
        catalog.at[label, col_name] = out_path

    return catalog

In [3]:
def process_study_area(
        study_area,
        rasters,
        n_tiles,
        tiles_folder,
        tile_catalog_path,
        seed=123):
    """
    Selects a random subset of tiles for one study area, writes them to disk
    and saves the matching tile catalog as GeoJSON.

    Parameters
    ----------
    study_area : str
        Name of the study area; substituted for ``{study_area}`` in
        ``tiles_folder`` and ``tile_catalog_path``.
    rasters : dict
        Mapping whose values are the raster paths of the study area.
    n_tiles : int
        Number of shuffled tiles to keep.
    tiles_folder : str
        Format string for the folder receiving the tile files.
    tile_catalog_path : str
        Format string for the path of the GeoJSON tile catalog.
    seed : int, optional
        Random state used for shuffling the tiles.
    """
    print(study_area)

    per_raster_catalogs = []
    for raster_path in rasters.values():
        print(raster_path)
        per_raster_catalogs.append(gen_tile_catalog(raster_path))

    # merge all rasters, shuffle reproducibly, then keep the first n_tiles rows;
    # the second reset_index() stores the shuffled position in an "index" column
    merged = pd.concat(per_raster_catalogs, ignore_index=True)
    shuffled = merged.sample(frac=1, random_state=seed)
    catalog_sa = shuffled.reset_index(drop=True).reset_index()[:n_tiles]

    # write the tiles used for labelling, then the final tile catalog
    out_dir = tiles_folder.format(study_area=study_area)
    catalog_sa = write_img_tiles(catalog_sa, out_dir)

    catalog_file = tile_catalog_path.format(study_area=study_area)
    catalog_sa.to_file(catalog_file, driver="GeoJSON")

In [4]:
%%time
config = load_config("../config.yaml")
study_area_settings = config.get("study_areas")
study_areas = study_area_settings.keys()

# every study area is processed on its own, with its own rasters and tile count
for study_area in study_areas:
    area_settings = study_area_settings.get(study_area)
    rasters = area_settings.get("rasters")

    process_study_area(
        study_area=study_area,
        rasters=rasters,
        n_tiles=area_settings.get("n_tiles"),
        tiles_folder=config.get("tiles_folder"),
        tile_catalog_path=config.get("tile_catalog_path"),
    )

quang_tri
../data/raw/imagery/quang-tri-aft-1mpp.tif
Directory created: ../data/0_data_processing/quang_tri/tiles
tri_border_area
../data/raw/imagery/D3C1204-200292A077-1mpp.tif
../data/raw/imagery/D3C1204-200292A078-1mpp.tif
../data/raw/imagery/D3C1204-200292A079-1mpp.tif
../data/raw/imagery/D3C1204-200292A080-1mpp.tif
../data/raw/imagery/D3C1204-200292A081-1mpp.tif
../data/raw/imagery/D3C1204-200292A082-1mpp.tif
Directory created: ../data/0_data_processing/tri_border_area/tiles
CPU times: total: 8min 33s
Wall time: 8min 41s
