## Create training data
This notebook takes the selected image tiles (geotiff files) and the corresponding labels (geojson files with crater polygons manually created in QGIS) and transforms them into the training data for the neural network. The geojson files are rasterized and an additional boundary class for instance segmentation is added. The final data, consisting of matching image pairs (256 $\times$ 256 pixels) of KH-9 image and label, are split into training, validation and test sets and saved in an hdf5 file.

#### Inputs:
* *tile_catalog_path*: Tile catalog with metadata and extent of each of the selected image tiles (geojson file)
* *labels_processed_path*: File with labelled craters for each study area (geojson files)

#### Parameters:
* *study_areas*: Names of the study areas
* *crater_ids*: Integers that represent craters in the labelled image tiles
* *boundary_id*: Integer that represents the boundary class in the labelled image tiles
* *n_classes*: Number of classes in the labelled image tiles = len(crater_ids) + 2 (boundary and background classes)
* *min_crater_area*: Minimum area (in pixels) of a crater in the labelled image tiles (smaller craters are removed)
* *study_areas -> [study_area] -> train_prop*: Proportion of image tiles to use for the training data
* *study_areas -> [study_area] -> val_prop*: Proportion of image tiles to use for the validation data
* *study_areas -> [study_area] -> test_prop*: Proportion of image tiles to use for the test data

#### Outputs:
* *tile_labels_folder*: Label tiles created for each of the selected KH-9 image tiles (256 $\times$ 256 pixels) (geotiff files)
* *tile_catalog_labels_path*: Same as *tile_catalog_path* but includes path to the labelled image tile in *tile_labels_folder*
* *data_path*: Matching image pairs (256 $\times$ 256 pixels) of KH-9 image and crater label split into training, validation and test sets (hdf5 file)
* *data_path_sa*: Same as *data_path* but split into the separate study areas (hdf5 files)

#### Created paper content:
* **Crater prevalence**: Prevalence of crater pixels in the labelled data for each study area

In [1]:
import geopandas as gpd
import h5py
import numpy as np
import rasterio
import rasterio.features

from sklearn.model_selection import train_test_split
from skimage import morphology, segmentation
from utils import create_dir, load_config

In [2]:
def get_boundaries(crater_id, boundary_distance=2):
    """
    Derive the boundary and border masks around labelled craters.

    Parameters:
        crater_id (numpy.ndarray): Label image in which 0 is background and every
            other value identifies one crater.
        boundary_distance (int): Distance (in pixels) by which each crater label is
            expanded (default is 2).

    Returns:
        tuple: Two boolean arrays ``(boundaries, border)``. ``border`` marks pixels
        where the expanded craters touch each other; ``boundaries`` marks the
        remaining pixels whose label changed through the expansion.
    """
    expanded = segmentation.expand_labels(
        crater_id, distance=boundary_distance)

    # pixels where the expanded craters meet form the border class;
    # anything flagged on expanded background is discarded again
    touching = segmentation.find_boundaries(
        expanded, mode="outer", connectivity=2, background=0)
    touching[expanded == 0] = False
    # connectivity=2 above can leave tiny holes where craters touch diagonally,
    # so close them
    touching = morphology.remove_small_holes(
        touching, area_threshold=2, connectivity=1)

    # pixels added by the expansion, excluding those already assigned to the border
    ring = expanded != crater_id
    ring[touching] = False

    return ring > 0, touching > 0


def write_label_tiles(catalog, labels_path, out_dir, boundary_id, min_crater_area=25):
    """
    Rasterize the crater polygons onto the grid of every image tile, add the
    boundary class and write one label tile per image tile.

    Parameters:
        catalog (gpd.GeoDataFrame): Tile catalog; the "tile" column holds the image
            tile path and the "fn" column the file name used for the label tile.
        labels_path (str): Path of the file with the crater polygons. It must
            provide the attributes "class_id" and "id".
        out_dir (str): Folder the label tiles are written to.
        boundary_id (int): Value written for boundary and border pixels.
        min_crater_area (int): Craters smaller than this many pixels are set to
            background (default is 25).

    Returns:
        gpd.GeoDataFrame: The catalog with the column "tile_label" containing the
        path of the label tile written for each row.
    """
    crater_polygons = gpd.read_file(labels_path)

    create_dir(out_dir)

    # (geometry, burn value) pairs: once with the class, once with the crater id
    class_shapes = list(zip(crater_polygons.geometry, crater_polygons.class_id))
    instance_shapes = list(zip(crater_polygons.geometry, crater_polygons.id))

    for row_idx, tile in catalog.iterrows():
        with rasterio.open(tile["tile"]) as src:
            # both rasters share the grid of the image tile
            grid = dict(fill=0, out_shape=src.shape, transform=src.transform)
            class_raster = rasterio.features.rasterize(
                class_shapes, dtype="uint8", **grid)
            instance_raster = rasterio.features.rasterize(
                instance_shapes, dtype="uint32", **grid)

            # Remove boundary and border pixels before filtering by size: the
            # boundary class can shrink overlapping craters, and small objects
            # must be removed the same way on labels and predictions.
            for mask in get_boundaries(instance_raster):
                instance_raster[mask] = 0

            keep = morphology.remove_small_objects(
                instance_raster > 0, min_size=min_crater_area)
            too_small = ~keep
            instance_raster[too_small] = 0
            class_raster[too_small] = 0

            # Recompute the boundaries for the remaining craters only, otherwise
            # the boundaries of removed craters would still be present.
            for mask in get_boundaries(instance_raster):
                class_raster[mask] = boundary_id

            out_profile = src.profile
            out_profile.update(dtype="uint8", nodata=255)

            out_path = f"{out_dir}/{tile['fn']}"
            with rasterio.open(out_path, "w", **out_profile) as dst:
                dst.write(class_raster, 1)
            catalog.at[row_idx, "tile_label"] = out_path

    return catalog

In [3]:
def split_data_into_sets(df, train_prop, val_prop, test_prop, random_state=42):
    """
    Split the DataFrame into train, validation, and test sets.

    A proportion of 0 is allowed and yields an empty subset (e.g. a study area
    that is used for testing only). The input DataFrame is modified in place.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        train_prop (float): Proportion of data to be used for training.
        val_prop (float): Proportion of data to be used for validation.
        test_prop (float): Proportion of data to be used for testing.
        random_state (int): Random seed for reproducibility (default is 42).

    Returns:
        pd.DataFrame: The input DataFrame with three new columns "train", "val", and "test",
        each holding 1 for rows that belong to the subset and 0 otherwise.

    Raises:
        ValueError: If a proportion is negative or the proportions do not add up to 1.0.
    """
    if min(train_prop, val_prop, test_prop) < 0:
        raise ValueError("Proportions must not be negative")

    # Check if the proportions add up to 1
    total_proportions = train_prop + val_prop + test_prop
    if abs(total_proportions - 1.0) > 1e-6:
        raise ValueError("Proportions should add up to 1.0")

    # train_test_split rejects sizes of 0.0 and 1.0, so empty subsets are
    # handled explicitly instead of being passed on to it
    no_rows = df.iloc[0:0]
    val_test_prop = val_prop + test_prop

    # Step 1: Split the data into the train set and the combined val/test remainder
    if val_test_prop == 0:
        train_data, val_test_data = df, no_rows
    elif train_prop == 0:
        train_data, val_test_data = no_rows, df
    else:
        train_data, val_test_data = train_test_split(
            df, train_size=train_prop, random_state=random_state)

    # Step 2: Split the remainder into val and test sets; the test share is
    # expressed relative to the remainder, not to the full data
    if test_prop == 0:
        val_data, test_data = val_test_data, no_rows
    elif val_prop == 0:
        val_data, test_data = no_rows, val_test_data
    else:
        val_data, test_data = train_test_split(
            val_test_data, test_size=test_prop / val_test_prop, random_state=random_state)

    # Step 3: Create new columns in the DataFrame to indicate the split
    df["train"] = 0
    df["val"] = 0
    df["test"] = 0

    # Assign 1 to the rows that belong to the corresponding sets
    df.loc[train_data.index, "train"] = 1
    df.loc[val_data.index, "val"] = 1
    df.loc[test_data.index, "test"] = 1

    return df


def create_training_data(catalog, out_file, n_classes, tile_col="tile", label_col="tile_label"):
    """
    Read the image and label tiles listed in the catalog and store them as
    arrays in one HDF5 file.

    Parameters:
        catalog (pd.DataFrame): Catalog with the file paths and the subset membership of each tile.
        out_file (str): Path of the HDF5 file to write.
        n_classes (int): Number of classes; label value ``i`` fills channel ``i`` of the one-hot encoding.
        tile_col (str): Name of the catalog column with the image tile paths (default is "tile").
        label_col (str): Name of the catalog column with the label tile paths (default is "tile_label").

    Notes:
        The catalog needs the columns "train", "val" and "test"; a row belongs to a subset
        when the corresponding column equals 1.

        The file contains the datasets "x_train", "x_val", "x_test" (first band of the image
        tiles with a trailing channel axis) and "y_train", "y_val", "y_test" (one-hot encoded
        labels as float32).
    """
    arrays = {}

    for subset in ("train", "val", "test"):
        members = catalog[catalog[subset] == 1]

        tiles = []
        one_hot = []
        for _, row in members.iterrows():
            with rasterio.open(row[tile_col]) as src:
                tiles.append(src.read(1))
            with rasterio.open(row[label_col]) as src:
                label = src.read(1)

            # one boolean plane per class, stacked along a new last axis
            planes = [label == class_value for class_value in range(n_classes)]
            one_hot.append(np.stack(planes, axis=-1))

        x = np.stack(tiles)[..., np.newaxis]
        y = np.stack(one_hot).astype("float32")

        # a label value outside range(n_classes) would leave a pixel without class
        assert (y.sum(axis=-1) == 1).all(), (
            "Each pixel needs to have exactly one class assigned to it")

        arrays[f"x_{subset}"] = x
        arrays[f"y_{subset}"] = y

    create_dir(out_file, is_file=True)
    with h5py.File(out_file, "w") as hf:
        for name, data in arrays.items():
            hf.create_dataset(name, data=data, compression="gzip")

In [4]:
def read_subset(path, subset):
    """
    Load one dataset of an HDF5 file completely into memory.

    Parameters:
        path (str): Path of the HDF5 file.
        subset (str): Name of the dataset to load (e.g. "x_train").

    Returns:
        numpy.ndarray: The content of the requested dataset.
    """
    with h5py.File(path, "r") as hf:
        dataset = hf[subset]
        # slicing copies the data out of the file before it is closed
        data = dataset[:]
    return data

In [5]:
# Load the project configuration; the keys of its "study_areas" entry are the
# study area names that the cells below iterate over.
config = load_config("../config.yaml")
study_areas = config.get("study_areas").keys()

In [6]:
# For every study area: write the label tiles, assign each tile to a subset,
# save the extended catalog and store the training data of that study area.
for study_area in study_areas:
    area_cfg = config.get("study_areas").get(study_area)

    tile_catalog_file = config["tile_catalog_path"].format(study_area=study_area)
    catalog = gpd.read_file(tile_catalog_file)

    catalog = write_label_tiles(
        catalog=catalog,
        labels_path=config["labels_processed_path"].format(study_area=study_area),
        out_dir=config["tile_labels_folder"].format(study_area=study_area),
        boundary_id=config["boundary_id"],
        min_crater_area=config["min_crater_area"],
    )

    # the split proportions are configured separately for each study area
    catalog = split_data_into_sets(
        catalog,
        train_prop=area_cfg.get("train_prop"),
        val_prop=area_cfg.get("val_prop"),
        test_prop=area_cfg.get("test_prop"),
    )

    catalog.to_file(
        config["tile_catalog_labels_path"].format(study_area=study_area),
        driver="GeoJSON",
    )

    create_training_data(
        catalog,
        out_file=config.get("data_path_sa").format(study_area=study_area),
        n_classes=config.get("n_classes"),
    )

Directory created: ../data/2_training_data/quang_tri/tile_labels
Directory already exists: ../data/2_training_data/quang_tri
Directory created: ../data/2_training_data/tri_border_area/tile_labels
Directory already exists: ../data/2_training_data/tri_border_area


### Combine the study areas

In [7]:
def combine_study_areas_data(study_areas, data_path_sa, data_path):
    """
    Combine data from all study areas and store it in a single HDF5 file.

    Parameters:
        study_areas (iterable of str): The names of all study areas to be combined.
        data_path_sa (str): A string template for the data paths with "{study_area}" as a placeholder.
        data_path (str): The output file path to store the combined data.

    Raises:
        ValueError: If no study area is given or if the output path is identical to
            one of the per-study-area input paths.

    Notes:
        For each of the datasets "x_train", "x_val", "x_test", "y_train", "y_val" and "y_test"
        the arrays of all study areas are concatenated along the first axis, in the order
        of study_areas.

        The datasets are processed one after another so that only a single combined
        dataset is held in memory at a time. The output file is opened before the
        inputs are read, so it is left incomplete if reading an input fails.
    """
    data_paths = [data_path_sa.format(study_area=name) for name in study_areas]
    subsets = ["x_train", "x_val", "x_test", "y_train", "y_val", "y_test"]

    if not data_paths:
        raise ValueError("At least one study area is required")
    # opening the output in "w" mode truncates it, which must not hit an input file
    if data_path in data_paths:
        raise ValueError("data_path must differ from the per-study-area data paths")

    with h5py.File(data_path, "w") as hf:
        for subset in subsets:
            combined = np.concatenate(
                [read_subset(path, subset) for path in data_paths])
            hf.create_dataset(subset, data=combined, compression="gzip")

In [8]:
# Merge the per-study-area files into the single combined data file.
combine_study_areas_data(
    config.get("study_areas").keys(),
    config.get("data_path_sa"),
    config.get("data_path"),
)

In [9]:
# Load the combined data back and report the array shapes as a sanity check.
with h5py.File(config.get("data_path"), "r") as hf:
    print(hf.keys())
    x_train, y_train = hf["x_train"][:], hf["y_train"][:]
    x_val, y_val = hf["x_val"][:], hf["y_val"][:]
    x_test, y_test = hf["x_test"][:], hf["y_test"][:]

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

<KeysViewHDF5 ['x_test', 'x_train', 'x_val', 'y_test', 'y_train', 'y_val']>
(1200, 256, 256, 1) (1200, 256, 256, 7)
(400, 256, 256, 1) (400, 256, 256, 7)
(800, 256, 256, 1) (800, 256, 256, 7)


### Calculate crater prevalence per study area

In [10]:
def calc_prevalence(study_area, data_path_sa, crater_ids):
    """
    Print how prevalent crater pixels are in the labels of one study area.

    Parameters:
        study_area (str): Name of the study area.
        data_path_sa (str): A string template for the data paths with "{study_area}" as a placeholder.
        crater_ids (list): Class indices that count as crater classes.

    Notes:
        The labels of the training, validation and test sets are evaluated together.
        The prevalence is reported as "1 in N pixels", where N is the rounded ratio
        of all pixels to crater pixels.
    """
    area_file = data_path_sa.format(study_area=study_area)

    one_hot = np.concatenate(
        [read_subset(area_file, name) for name in ("y_train", "y_val", "y_test")])
    # collapse the one-hot encoding to one class index per pixel
    class_map = one_hot.argmax(axis=-1)

    crater_mask = np.isin(class_map, crater_ids)
    n_crater_pixels = crater_mask.sum()
    n_pixels = class_map.size
    prev = np.round(n_pixels / n_crater_pixels)

    print(f"Study Area: {study_area}")
    print(f"{n_crater_pixels} crater pixels out of {n_pixels} total pixels.")
    print(f"1 in {prev} pixels are crater pixels")

In [11]:
# Report the crater prevalence of every study area.
for study_area in study_areas:
    calc_prevalence(
        study_area=study_area,
        data_path_sa=config.get("data_path_sa"),
        crater_ids=config.get("crater_ids"),
    )

Study Area: quang_tri
724603 crater pixels out of 65536000 total pixels.
1 in 90.0 pixels are crater pixels
Study Area: tri_border_area
63970 crater pixels out of 91750400 total pixels.
1 in 1434.0 pixels are crater pixels
