# Training Data
This notebook creates the training data by combining image tiles with label tiles created from annotations in Labelbox. The tiles are split into training and validation sets using a stratified split.

In [1]:
import geopandas as gpd
import numpy as np

from PIL import Image
from sklearn.model_selection import train_test_split

from config import Config

In [2]:
def read_images_and_labels(tile_names, tile_dir, label_dir):
    """Read matching image and label tiles and stack each set into one numpy array.

    For every entry in ``tile_names`` a file of that name is opened in both
    ``tile_dir`` and ``label_dir``, so an image tile and its label tile must
    share the same file name.

    Args:
        tile_names: Iterable of tile file names (e.g. a pandas Series).
        tile_dir: Directory that contains the image tiles.
        label_dir: Directory that contains the label tiles.

    Returns:
        A tuple ``(images, labels)`` of numpy arrays built with ``np.stack``;
        the first axis indexes the tiles in the order given by ``tile_names``.

    Raises:
        FileNotFoundError: If an image or label file does not exist.
        ValueError: If ``tile_names`` is empty or the tiles do not all share
            the same shape (both are rejected by ``np.stack``).
    """
    images = []
    labels = []

    for tile_name in tile_names:
        img_path = f"{tile_dir}/{tile_name}"
        label_path = f"{label_dir}/{tile_name}"

        # The context managers close each file as soon as its pixels have been
        # copied into a numpy array, instead of leaving that to the garbage
        # collector.
        with Image.open(img_path) as img:
            images.append(np.array(img))

        with Image.open(label_path) as label:
            labels.append(np.array(label))

    return np.stack(images), np.stack(labels)

In [3]:
# Build the project configuration object; the cells below read file/folder
# locations, the training proportion (`prop_train`) and the random seed (`seed`) from it.
config = Config.Config()

In [4]:
# Load the sample catalog with GeoPandas and preview its first three rows.
sample_catalog = gpd.read_file(config.sample_catalog_path)
sample_catalog.head(3)

Unnamed: 0,index,tile_id,map_id,col_off,row_off,tile_size,random_number,luc,luc_name,stratum,random_sample,legend_type,nara,tile_name,tile_path,geometry
0,268292,483,60463,4644,2224,256,0.451872,52,Closed evergreen broadleaved forest,Forest,False,3,False,60463_4644_2224.png,../data/processed/image_tiles/60463_4644_2224.png,"POLYGON ((105.16509 18.67607, 105.1651 18.6853..."
1,51221,687,53503,6475,4099,256,0.003091,52,Closed evergreen broadleaved forest,Forest,False,6,False,53503_6475_4099.png,../data/processed/image_tiles/53503_6475_4099.png,"POLYGON ((101.73326 20.60236, 101.73343 20.611..."
2,430102,532,63303,4986,286,256,0.067877,190,Impervious surfaces,Impervious surface,True,1,False,63303_4986_286.png,../data/processed/image_tiles/63303_4986_286.png,"POLYGON ((106.68722 10.74111, 106.67786 10.741..."


In [5]:
# Split the catalog into training and validation rows, stratified by the
# "stratum" column; the seed from the config keeps the split reproducible.
train_df, val_df = train_test_split(
    sample_catalog,
    stratify=sample_catalog["stratum"],
    train_size=config.prop_train,
    random_state=config.seed,
)

# Number of rows in each split
(len(train_df), len(val_df))

(559, 187)

In [6]:
# Number of training rows per stratum
train_df["stratum"].value_counts()

stratum
Forest                    178
Water body                 91
Cropland                   91
Shrubland or grassland     76
Wetland                    63
Impervious surface         60
Name: count, dtype: int64

In [7]:
# Number of validation rows per stratum
val_df["stratum"].value_counts()

stratum
Forest                    60
Water body                31
Cropland                  30
Shrubland or grassland    25
Wetland                   21
Impervious surface        20
Name: count, dtype: int64

In [8]:
# Read the image and label tiles of each split (train first, then val) and
# store the four resulting arrays together in an npz file.
res = {
    split_name: read_images_and_labels(
        split_df["tile_name"], config.tile_folder, config.label_folder
    )
    for split_name, split_df in (("train", train_df), ("val", val_df))
}

np.savez(
    config.training_data_path,
    x_train=res["train"][0],
    y_train=res["train"][1],
    x_val=res["val"][0],
    y_val=res["val"][1],
)

In [9]:
# Sanity check: reload the saved arrays and print their shapes.
# np.load returns a lazy NpzFile that keeps the archive open, so it is used as
# a context manager; each array is read into memory inside the block and stays
# usable after the file is closed.
with np.load(config.training_data_path) as data:
    x_train = data["x_train"] / 255.  # presumably maps 8-bit pixel values to [0, 1]
    y_train = data["y_train"]
    x_val = data["x_val"] / 255.
    y_val = data["y_val"]

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

(559, 256, 256, 3) (559, 256, 256)
(187, 256, 256, 3) (187, 256, 256)


In [10]:
# Occurrences of each label value across all training label tiles
train_counts = np.bincount(y_train.ravel())
train_counts

array([ 2079055,   302345,   317012,   313506,   272977,  2972769,
       14152968,  2704830,   323444,  1117247,   334315,  1456809,
        6422167,  3012299,   262017,    84906,   505958])

In [11]:
# Occurrences of each label value across all validation label tiles
val_counts = np.bincount(y_val.ravel())
val_counts

array([ 778299,   53809,  110629,   70634,  108815, 1258294, 4685943,
        711945,  146061,  398510,  178153,  557758, 2100006,  855473,
         31302,   23512,  186089])