# Create Training Dataset for DRC Road Classification

The classifier used in DRC Roads analysis requires a training dataset.

To create the training dataset, we first created the label images. These images were drawn in [GIMP](https://www.gimp.org/) by manually labeling road and forest regions.

In [138]:
import os

import numpy as np
import rasterio

## Define input data

In [139]:
# Inputs: the cropped OrthoTile (source image) plus the forest and road label
# images that were drawn over it in GIMP. All three live in the same directory.
data_dir = 'pre-data'
forest_image_filename, road_image_filename, bands_image_filename = (
    os.path.join(data_dir, name)
    for name in ('mosaic_crop_forest.tif',
                 'mosaic_crop_road.tif',
                 'mosaic_crop.tif')
)

## Prepare feature bands

In [153]:
# load cropped orthotile as bands of pixels
def read_4band(image_filename):
    '''Open the raster at image_filename and return all of its bands.

    The dataset is opened read-only and closed again before returning; the
    result is whatever ``src.read()`` yields for the whole dataset.
    '''
    with rasterio.open(image_filename, 'r') as dataset:
        return dataset.read()

# Load every band of the cropped OrthoTile with the read_4band loader; the
# first axis of the result indexes the band.
bands = read_4band(bands_image_filename)
bands.shape

(4, 1309, 2201)

In [161]:
from skimage import feature, filters

In [165]:
# calculate texture bands. this comes from drc_roads_classification
def get_texture_bands(green_band):
    '''Derive two texture bands from the edges found in the green band.

    The band is first stretched to the full positive range of its integer
    data type, Canny edge detection (sigma=2) is run on the stretched band,
    and the resulting edge image is smoothed with Gaussian filters at two
    scales (sigma=2 and sigma=6).

    Parameters
    ----------
    green_band : numpy.ndarray
        Single image band. Its dtype must be an integer type because the
        stretch uses ``np.iinfo`` on it.

    Returns
    -------
    list
        ``[blurred, blurred2]``: the edge image blurred with sigma=2 and with
        sigma=6, in that order.
    '''
    def scale(band):
        '''Scale band values to band data type max/min'''
        def _normalize(b):
            '''Normalize values to 0-1'''
            lowest = b.min()
            span = b.max() - lowest
            if span == 0:
                # A uniform band has no range to stretch; dividing by zero
                # would fill the result with NaN, which cannot be cast back
                # to an integer dtype meaningfully.
                return np.zeros_like(b, dtype=np.float64)
            return (b - lowest) / span

        scaled_band = _normalize(band) * np.iinfo(band.dtype).max
        return scaled_band.astype(band.dtype)

    green_band_normalized = scale(green_band)

    # Same edge image, smoothed at a fine and a coarse scale.
    edges1 = feature.canny(green_band_normalized, sigma=2)
    blurred = filters.gaussian(edges1, sigma=2)
    blurred2 = filters.gaussian(edges1, sigma=6)
    return [blurred, blurred2]

# Band index 1 is the one treated as green here (assumes the OrthoTile band
# order puts green second -- TODO confirm against the product spec).
green_band = bands[1]
[blurred, blurred2] = get_texture_bands(green_band)
blurred.shape

(1309, 2201)

In [167]:
# Append the two texture bands after the image bands along the band axis.
feature_bands = np.concatenate(
    [bands, blurred[np.newaxis, ...], blurred2[np.newaxis, ...]],
    axis=0,
)
feature_bands.shape

(6, 1309, 2201)

## Apply Label masks to features

In [168]:
# load labeled images as boolean masks

def get_label_mask(image_filename):
    '''Read the first band of a label image and turn it into a boolean mask.

    Pixels whose value is 0 (black) are the labeled, valid region and come
    back as False; every other pixel comes back as True, i.e. masked out.
    '''
    with rasterio.open(image_filename, 'r') as src:
        first_band = src.read(1)
    # non-black pixels are not valid data, so they are the masked ones
    return first_band != 0

def get_unmasked_count(mask):
    '''Return how many entries of mask are not set (the unmasked pixels).'''
    return np.count_nonzero(np.logical_not(mask))

# Build both label masks. Report the total pixel count first, then the number
# of labeled (unmasked) pixels for forest and for road.
forest_mask = get_label_mask(forest_image_filename)
# the size can only be printed once forest_mask has been created
print(forest_mask.size)
print(get_unmasked_count(forest_mask))
road_mask = get_label_mask(road_image_filename)
print(get_unmasked_count(road_mask))

2881109
578330
17757


  s = DatasetReader(fp, driver=driver, **kwargs)


In [171]:
# apply label masks to orthotile bands to get 2D array of band values associated with
# pixels of that label
def get_label_pixels(label_mask, bands):
    '''Collect the band values of every pixel left unmasked by label_mask.

    Each band is masked with label_mask and reduced to its unmasked values.
    The per-band results are stacked and the first two axes swapped, so the
    returned array has one row per pixel and one column per band.
    '''
    per_band_values = []
    for band in bands:
        masked_band = np.ma.array(band, mask=label_mask)
        per_band_values.append(masked_band.compressed())
    # stacked as (band, pixel); callers want (pixel, band)
    return np.swapaxes(np.array(per_band_values), 0, 1)

# One (pixel, band) array per label, taken from the same feature stack.
forest_pixels, road_pixels = (
    get_label_pixels(mask, feature_bands)
    for mask in (forest_mask, road_mask)
)

In [172]:
# for balanced training set, use random sampling to create same-size
# samples of labeled pixels
def make_same_size_samples(list_of_pixels, seed=None):
    '''Randomly down-sample every pixel array to the size of the smallest one.

    Parameters
    ----------
    list_of_pixels : sequence of numpy.ndarray
        Arrays whose first axis indexes pixels.
    seed : int, optional
        Seed for a private ``np.random.RandomState`` so the sampling can be
        reproduced. When None (the default) the global ``np.random`` state is
        used.

    Returns
    -------
    list of numpy.ndarray
        One array per input, in input order, each with as many rows as the
        shortest input. Inputs longer than that are returned as a shuffled,
        truncated copy; inputs already at that length are returned as-is
        (the same object, not a copy). An empty input gives an empty list.
    '''
    if len(list_of_pixels) == 0:
        return []

    sample_len = min(p.shape[0] for p in list_of_pixels)
    # Falling back to the np.random module keeps the global random stream in
    # use when no seed is requested.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def sample_pixels(pixels):
        if pixels.shape[0] <= sample_len:
            return pixels
        # shuffle a copy so the caller's array keeps its order
        shuffled = pixels.copy()
        rng.shuffle(shuffled)
        return shuffled[:sample_len]

    return [sample_pixels(p) for p in list_of_pixels]

# Trim both classes to the size of the smaller one, then show the shapes.
forest_pixels_sample, road_pixels_sample = make_same_size_samples(
    [forest_pixels, road_pixels]
)

print(forest_pixels_sample.shape, road_pixels_sample.shape, sep='\n')

(17757, 6)
(17757, 6)


In [173]:
# Class labels: forest pixels come first in X, road pixels second, and y
# carries one label per row of X in that same order.
forest_label_value = 0
road_label_value = 1
X = np.concatenate([forest_pixels_sample, road_pixels_sample], axis=0)
y = np.array([forest_label_value] * len(forest_pixels_sample)
             + [road_label_value] * len(road_pixels_sample))

print(X.shape, y.shape, sep='\n')

(35514, 6)
(35514,)


In [174]:
# Save features and labels next to the input images. np.savez appends the
# .npz extension to the file name when it is missing.
output_file = os.path.join(data_dir, 'classification_training')
np.savez(output_file, X=X, y=y)