# Cloud and sea tile classifier

It has been identified that ESRGAN, in both pretrain PSNR mode and GAN mode, struggles with super-resolving satellite image tiles that are completely covered by either sea surface or opaque clouds. In addition, both areas of interest, the harbors of Toulon and La Spezia, have lots of sea surface (approaching 50%). Left without intervention, around 50% of the tiles will consist only of sea and opaque clouds. It is assumed that this leads to an unwanted imbalance in what we want the model to be optimized to perform on.

There are several ways to mitigate this imbalance. One way would be to manually draw a sea surface polygon in a GIS software and undersample tiles extracted from within this polygon. A downside of this approach is that interesting features (ships) within the sea surface polygon would also be undersampled.

Another approach would be to train a cloud and sea tile classifier to detect the unwanted tiles and discard all or a significant proportion of these tiles before training. This approach has the benefit of addressing the problem head-on. The main downside of the approach is that it might be time-consuming to label tiles. However, it is hypothesized that relatively little training data is needed to train a modern neural net classifier on such a *simple* classification task.

In [None]:
import pathlib
import pickle

import geopandas
import numpy as np
import pandas as pd
import rasterio
import rasterio.plot
import tensorflow as tf

from modules.tile_generator import *
from modules.helpers import *
from modules.image_utils import *
from modules.tile_input_pipeline import *

In [None]:
# Toggles whether to actually do the generation on this run.
# Be careful with setting these to True if tiles are already labelled!
# Regenerated tiles overwrite the old ones, which makes existing labels useless.
GENERATE_NEW_TILES = False
CONVERT_TO_PNG = False
CREATE_LABEL_CSV = False

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load a metadata pickle that comes from a trusted source.
with open('metadata_df.pickle', 'rb') as file:
    meta = pickle.load(file)

# Path to location where individual satellite images are located
DATA_PATH = 'data/toulon-laspezia/'
# Path to location where the tiles for this classifier are written
DATA_PATH_TILES = 'data/toulon-laspezia-cloud-sea-classifier/'

# Keep only the images taken by the selected sensors over the selected areas
SENSORS = ['WV02', 'GE01', 'WV03_VNIR']
AREAS = ['La_Spezia', 'Toulon']
meta = meta.loc[meta['sensorVehicle'].isin(SENSORS) & meta['area_name'].isin(AREAS)]

N_IMAGES = len(meta.index)

# 96x96, 128x128, 196x196, 384x384 -- All tiles are squares
TILE_SIZES = [96, 128, 196, 384]
# number of tiles to generate at each tile size
N_TILES = {96: 500, 128: 1000, 196: 500, 384: 500}
N_TILES_TOTAL = sum(N_TILES.values())

print(N_IMAGES)
print(N_TILES)
print(N_TILES_TOTAL)

TRAIN_TILE_SIZE_PAN = 128
# Derived from the PAN size (same factor of 4 as used when generating tiles)
# so that the two sizes cannot drift apart.
TRAIN_TILE_SIZE_MS = TRAIN_TILE_SIZE_PAN // 4

## Allocate n_tiles to every image (weighted by size of image)

In [None]:
if GENERATE_NEW_TILES:
    # One allocation column per tile size, named 'n_tiles_<size>'.
    # Driven by TILE_SIZES so that it stays in sync with the generation step,
    # which iterates the same list and reads these columns back.
    for tile_size in TILE_SIZES:
        meta = allocate_tiles(meta, by_partition=False,
                              n_tiles_total=N_TILES[tile_size],
                              new_column_name='n_tiles_' + str(tile_size))

## Generate tiles to disk

In [None]:
if GENERATE_NEW_TILES:
    meta['n_tiles'] = 0
    for tile_size in TILE_SIZES:
        tile_size_dir = pathlib.Path(DATA_PATH_TILES).joinpath(str(tile_size))
        # parents=True: DATA_PATH_TILES itself may not exist on a first run.
        # exist_ok=True: a directory left over from an aborted run must not
        # stop a regeneration.
        tile_size_dir.mkdir(parents=True, exist_ok=True)

        # MS tiles are a factor 4 (sr_factor) smaller than the PAN tiles
        tile_size_ms = tile_size // 4

        # 'n_tiles' is overwritten with the allocation for the current tile
        # size before each generation call.
        meta['n_tiles'] = meta['n_tiles_' + str(tile_size)]
        generate_all_tiles(meta, save_dir=str(tile_size_dir),
                           ms_height_width=(tile_size_ms, tile_size_ms), sr_factor=4,
                           cloud_sea_removal=False)

## Flatten the directory structure after generation

Lots of for loops in order to do one change at a time.

In [None]:
# Flattens the generated tile tree in four passes so that every tile ends up
# directly under DATA_PATH_TILES/<ms|pan>/ with a file name that encodes the
# image int uid and the tile size. The passes depend on each other and must
# run in this order.
if GENERATE_NEW_TILES:
    # Pass 1: remove train/val/test directories.
    # Each image directory is moved up one level, from
    # <tile size>/<partition>/<image> to <tile size>/<image>, after which the
    # emptied partition directory is deleted (rmdir only succeeds when empty).
    # NOTE(review): .stem drops everything after the last dot, so this assumes
    # image directory names contain no dots -- confirm.
    for tilesize_dir in pathlib.Path(DATA_PATH_TILES).iterdir():
        for partition_dir in tilesize_dir.iterdir():
            for image_dir in partition_dir.iterdir():
                dest = tilesize_dir.joinpath(image_dir.stem)
                source = image_dir
                source.rename(dest)
            partition_dir.rmdir()

    # Pass 2: add tile size to filenames.
    # Every tile is renamed in place to '<tile size dir name>-<old name>' so
    # that names stay unique once the tile size directories are gone.
    for tilesize_dir in pathlib.Path(DATA_PATH_TILES).iterdir():
        for image_dir in tilesize_dir.iterdir():
            for ms_pan_dir in image_dir.iterdir():
                for tile in ms_pan_dir.iterdir():
                    new_tile_name = str(tilesize_dir.stem+'-'+tile.name)
                    new_path = ms_pan_dir.joinpath(new_tile_name)
                    tile.rename(new_path)

    # Pass 3: remove the tile size directories.
    # Tiles move from <tile size>/<image>/<ms|pan>/ to
    # DATA_PATH_TILES/<image>/<ms|pan>/; the emptied source directories are
    # deleted from the innermost level outwards.
    for tilesize_dir in pathlib.Path(DATA_PATH_TILES).iterdir():
        for image_dir in tilesize_dir.iterdir():
            for ms_pan_dir in image_dir.iterdir():
                for tile in ms_pan_dir.iterdir():
                    new_dir = pathlib.Path(DATA_PATH_TILES).joinpath(image_dir.stem, ms_pan_dir.name)
                    new_dir.mkdir(parents=True, exist_ok=True)
                    new_path = new_dir.joinpath(tile.name)
                    tile.rename(new_path)
                ms_pan_dir.rmdir()
            image_dir.rmdir()
        tilesize_dir.rmdir()

    # Pass 4: add image_int_uid to filenames and flatten structure completely.
    # Tiles move from DATA_PATH_TILES/<image>/<ms|pan>/ to
    # DATA_PATH_TILES/<ms|pan>/ and get the integer image uid, zero-padded to
    # two digits, as a file name prefix.
    for image_dir in pathlib.Path(DATA_PATH_TILES).iterdir():
        # Skip the flat 'ms' / 'pan' output directories themselves
        if image_dir.stem == 'ms' or image_dir.stem == 'pan':
            continue
        for ms_pan_dir in image_dir.iterdir():
            for tile in ms_pan_dir.iterdir():
                # Look up the integer uid from the image directory name
                int_uid = get_int_uid(meta, image_dir.stem)
                new_tile_name = str(str(int_uid).zfill(2)+'-'+tile.name)
                new_dir = pathlib.Path(DATA_PATH_TILES).joinpath(ms_pan_dir.stem)
                new_dir.mkdir(parents=True, exist_ok=True)
                new_path = new_dir.joinpath(new_tile_name)
                tile.rename(new_path)
            ms_pan_dir.rmdir()
        image_dir.rmdir()

# List all tif files
tif_paths = list(pathlib.Path(DATA_PATH_TILES).glob('**/*.tif'))

# Split on the parent directory name ('ms' / 'pan') instead of a hard-coded
# index: glob gives no ordering guarantee and the number of tiles may change.
# The relative order returned by glob is kept on purpose, so the result is
# the same as the old index-based split whenever that split was correct.
tif_paths_ms = [path for path in tif_paths if path.parent.name == 'ms']
tif_paths_pan = [path for path in tif_paths if path.parent.name == 'pan']

# Divide by 2 because each tile consists of 1 MS + 1 PAN
print('Number of tiles generated and present in flat file structure:', str(len(tif_paths) // 2))

# Convert to png
While the input to the actual cloud/sea classifier is tif files, it is practical to also convert the image tiles to png. This makes labelling easier.

In [None]:
if CONVERT_TO_PNG:
    for tif_path in tif_paths:
        # The sensor type is needed to convert ms tiles to rgb png. It is
        # looked up via the image uid, whose integer form is the first two
        # characters of the tile file name.
        sensor = get_sensor(meta, get_string_uid(meta, int(tif_path.stem[:2])))

        # Writes the png to disk; the parent directory name tells ms from pan
        geotiff_to_png(
            tif_path,
            ms_or_pan=tif_path.parent.stem,
            scale=True,
            stretch_img=True,
            sensor=sensor,
        )

# List all png files
png_paths = list(pathlib.Path(DATA_PATH_TILES).glob('**/*.png'))

# Halve the count because each tile consists of 1 MS + 1 PAN
n_png_tiles = len(png_paths) // 2
print('Number of tiles generated and present in flat file structure:', str(n_png_tiles))

# Create label csv file
Labels are `None` before manual labelling

In [None]:
if CREATE_LABEL_CSV:
    # One row per tile uid (file name without extension), taken from the
    # first N_TILES_TOTAL tif paths.
    # NOTE(review): presumably those are exactly the MS tiles, i.e. one path
    # per tile -- confirm against the listing order.
    tile_uids = [tif_path.stem for tif_path in tif_paths[:N_TILES_TOTAL]]
    label_df = pd.DataFrame(tile_uids, columns=['tile_uid'])

    # The label column stays empty until the tiles are labelled by hand
    label_df['cloud-sea'] = None

    labels_to_be_path = pathlib.Path(DATA_PATH_TILES) / 'labels-to-be.csv'
    label_df.to_csv(labels_to_be_path, index=False)

# Labelling
*... 4 tedious labelling hours later...*

In [None]:
# Load the manually assigned labels (semicolon-separated csv)
labels_path = pathlib.Path(DATA_PATH_TILES) / 'labels.csv'
label_df = pd.read_csv(labels_path, sep=';')

In [None]:
# Preview the label column as a float32 array
label_df['cloud-sea'].astype(np.float32).to_numpy()

# Checking that UIDs match
If the sequence of image tiles and labels in `tif_paths_pan`, `tif_paths_ms` and `label_df` match we can use integer indices instead of string UIDs when training (quicker to code).

In [None]:
def assert_x_y_match(tif_paths_pan, tif_paths_ms, label_df):
    """Check that PAN tiles, MS tiles and labels are listed in the same order.

    The three inputs must have the same length, and at every position the
    'tile_uid' of the label row must equal the file stem of both the MS and
    the PAN path. The outcome is printed; nothing is raised on a mismatch.

    Args:
        tif_paths_pan: Sequence of path objects (with a ``stem``) of PAN tiles.
        tif_paths_ms: Sequence of path objects (with a ``stem``) of MS tiles.
        label_df: DataFrame with a 'tile_uid' column, one row per tile.

    Returns:
        bool: True if everything matches, False otherwise.
    """
    n = len(label_df)

    # Stop here on a length mismatch: comparing position by position would
    # either run past the end of a list or silently skip entries.
    if not (n == len(tif_paths_pan) == len(tif_paths_ms)):
        print('Lengths of tif paths and label dataframe differ!')
        return False

    triples = zip(label_df['tile_uid'], tif_paths_ms, tif_paths_pan)
    for label_tile_uid, tif_path_ms, tif_path_pan in triples:
        tif_tile_ms_uid = tif_path_ms.stem
        tif_tile_pan_uid = tif_path_pan.stem
        if not (label_tile_uid == tif_tile_ms_uid == tif_tile_pan_uid):
            # Report the first offending position and stop
            print('Mismatch between sequence of tile uids!')
            print('label_tile_uid:', label_tile_uid)
            print('tif_tile_ms_uid', tif_tile_ms_uid)
            print('tif_tile_pan_uid', tif_tile_pan_uid)
            return False

    # Only reached when every check above passed
    print('Verification OK. All', n, 'image tile UIDs match.')
    return True
        
# Verify that the PAN paths, MS paths and labels line up before training
assert_x_y_match(
    tif_paths_pan=tif_paths_pan,
    tif_paths_ms=tif_paths_ms,
    label_df=label_df,
)

In [None]:
def prepare_for_training(tif_paths, label_df, tile_size, pan_or_ms_or_both='pan'):
    """Load the labelled tiles into arrays that can be fed to a classifier.

    Args:
        tif_paths: Sequence of tile paths, in the same order as the rows of
            ``label_df``. Only the first ``len(label_df)`` paths are read.
        label_df: DataFrame with a 'cloud-sea' column, one label per tile.
        tile_size: Height and width (in pixels) every tile is resized to.
        pan_or_ms_or_both: Which imagery to load. Only 'pan' is implemented.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a float32 array of shape
        ``(n, tile_size, tile_size, 1)`` and ``y`` is a float32 array with
        the ``n`` labels.

    Raises:
        NotImplementedError: If ``pan_or_ms_or_both`` is not 'pan'.
        ValueError: If there are fewer tile paths than labels.
    """
    # Fail early and clearly; without this X would never be allocated.
    if pan_or_ms_or_both != 'pan':
        raise NotImplementedError(
            "Only pan_or_ms_or_both='pan' is implemented, got "
            + repr(pan_or_ms_or_both))

    n = len(label_df)
    # X is allocated uninitialised, so every one of its n rows must be filled.
    if len(tif_paths) < n:
        raise ValueError(
            'Got ' + str(len(tif_paths)) + ' tile paths for ' + str(n) + ' labels')

    y = label_df['cloud-sea'].to_numpy(dtype=np.float32)
    X = np.empty((n, tile_size, tile_size, 1), dtype=np.float32)

    for i, tif_path in enumerate(tif_paths[:n]):
        img = geotiff_to_ndarray(tif_path)
        img = tf.image.resize(img, [tile_size, tile_size], method=tf.image.ResizeMethod.BILINEAR)
        # NOTE(review): bilinear tf.image.resize presumably already returns
        # float32, in which case this conversion does no rescaling -- confirm
        # that the pixel value range is what the classifier expects.
        X[i, :, :, :] = tf.image.convert_image_dtype(img, tf.float32).numpy()
    return X, y
    

In [None]:
# Build the training arrays from the panchromatic tiles only
X, y = prepare_for_training(
    tif_paths=tif_paths_pan,
    label_df=label_df,
    tile_size=TRAIN_TILE_SIZE_PAN,
    pan_or_ms_or_both='pan',
)