## Imports and setup

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import pathlib
from PIL import Image
import rasterio
from rasterio.plot import reshape_as_image

import tensorflow as tf
AUTOTUNE = tf.data.experimental.AUTOTUNE

from modules.metadata_reader import img_metadata_to_dict, add_names_to_metadata_dict, dict_to_df

# Path to the directory where the individual satellite images are located
DATA_PATH = 'data/toulon-laspezia' 
# Forwarded to img_metadata_to_dict as its `path_is_relative` argument
DATA_PATH_IS_RELATIVE = True
# Directory for the .npy copies of the images (uses 'pan' and 'ms' subdirectories)
DATA_PATH_NPY = 'data/toulon-laspezia-npy' 

# Name of metadata .xml file
METADATA_NAME = 'DeliveryMetadata.xml'

# Names of areas covered by satellite imagery
AREAS = ['La_Spezia', 'Toulon'] # Spelled like the directory names

# Specify the xmlns URL declared at the top of the metadata .xml file
# (should be on the second line)
XMLNS = 'http://xsd.digitalglobe.com/xsd/dm'

## Metadata parsing from xml to pandas dataframe

Every satellite image delivery from Maxar contains a `DeliveryMetadata.xml` file with important specifications for both the multispectral and panchromatic images. The following functions find all the `DeliveryMetadata.xml` files contained in any subdirectory of a directory and parse them into *pandas DataFrames*, which are used for further descriptive statistics of the dataset.

In [None]:
# Collect the delivery metadata found under DATA_PATH; the call returns one
# collection for the panchromatic images and one for the multispectral images.
img_metadata_pan, img_metadata_ms = img_metadata_to_dict(METADATA_NAME, 
                                                         DATA_PATH, XMLNS, 
                                                         path_is_relative = DATA_PATH_IS_RELATIVE)

# Presumably tags every entry with the area (from AREAS) it belongs to —
# confirm against modules.metadata_reader.
img_metadata_pan = add_names_to_metadata_dict(img_metadata_pan, AREAS)
img_metadata_ms = add_names_to_metadata_dict(img_metadata_ms, AREAS)

# The same names are rebound here: from this point on they are used as
# DataFrames (indexed and filtered by column in later cells).
img_metadata_pan = dict_to_df(img_metadata_pan)
img_metadata_ms = dict_to_df(img_metadata_ms)

# Randomly draw 2 images for early trials

In [None]:
# Keep only the panchromatic entries whose sensorVehicle is 'WV02' and whose
# area_name is 'Toulon'.
toulon_wv02_pan = img_metadata_pan[(img_metadata_pan['sensorVehicle'] == 'WV02')
                                   & (img_metadata_pan['area_name'] == 'Toulon')]

# Seed the legacy global NumPy RNG and sort the names first, so the in-place
# shuffle below starts from a fixed order and a fixed seed on every run.
np.random.seed(1)
img_names = sorted(toulon_wv02_pan.index.values)
np.random.shuffle(img_names)
# The first two names after shuffling are the images used for early trials.
images_for_early_trials = img_names[:2]
images_for_early_trials

# Convert .tif files to .npy for easier loading later

In [None]:
def tif_to_npy(path_in, filename, save_to_disk = False, path_out = None):
    """Read a .tif raster and return it, or save it, as a channels-last ndarray.

    Args:
        path_in: Path to the .tif file to read.
        filename: Name of the file to write inside `path_out` (np.save appends
            '.npy' when it is missing). Only used when `save_to_disk` is True.
        save_to_disk: If True, write the array to disk instead of returning it.
        path_out: Output directory. Required when `save_to_disk` is True; it
            is created (including parents) if it does not exist yet.

    Returns:
        True if the array was saved to disk, otherwise the image as an ndarray
        in the layout produced by rasterio's `reshape_as_image`.

    Raises:
        ValueError: If `save_to_disk` is True but `path_out` is None.
    """
    # Fail before the (potentially slow) raster read if the arguments cannot work.
    if save_to_disk and path_out is None:
        raise ValueError('path_out must be given when save_to_disk is True')

    with rasterio.open(pathlib.Path(path_in), 'r') as ds:
        img = reshape_as_image(ds.read())

    if not save_to_disk:
        return img

    # np.save does not create missing directories, so make sure the target exists.
    path_out = pathlib.Path(path_out)
    path_out.mkdir(parents=True, exist_ok=True)
    np.save(pathlib.Path(path_out, filename), img)
    return True

def all_tif_to_npy(metadata_pan, metadata_ms, path_out):
    """Convert every image listed in both metadata frames from .tif to .npy.

    Panchromatic images are written to `<path_out>/pan` and multispectral
    images to `<path_out>/ms`. Each file is named after the frame's index
    label and read from that row's 'tif_path' entry. One progress line is
    printed per saved image.
    """
    # (metadata frame, output subdirectory, wording used in the progress line)
    conversion_jobs = (
        (metadata_pan, 'pan', 'in dir'),
        (metadata_ms, 'ms', 'in'),
    )

    for metadata, subdir, location_word in conversion_jobs:
        target_dir = pathlib.Path(path_out, subdir)
        for image_name in metadata.index.values.tolist():
            tif_path = metadata.loc[image_name]['tif_path']
            tif_to_npy(tif_path, image_name,
                       save_to_disk = True, path_out = target_dir)
            print('Saved', image_name, location_word, str(target_dir))

In [None]:
# Uncomment to actually convert (takes some time):

#all_tif_to_npy(img_metadata_pan, img_metadata_ms, DATA_PATH_NPY)

# Adding paths to .npy files as column in metadata dataframe

The metadata dataframes are kept up to date so that they can be used as the canonical source of information about the images.

In [None]:
def _attach_npy_paths(metadata_df, npy_dir, index_name):
    """Return `metadata_df` with an 'npy_path' column built from the .npy files under `npy_dir`."""
    npy_paths = list(pathlib.Path(npy_dir).glob('**/*.npy'))
    path_df = pd.DataFrame(
        {index_name: [path.stem for path in npy_paths], 'npy_path': npy_paths}
    ).set_index(index_name)
    # Drop a stale 'npy_path' column first: without this, calling the function
    # on an already-augmented frame (e.g. re-running the notebook cell) would
    # leave two 'npy_path' columns and break scalar lookups of the path.
    metadata_df = metadata_df.drop(columns='npy_path', errors='ignore')
    return pd.concat([metadata_df, path_df], axis=1)


def add_npy_paths_to_metadata_df(metadata_pan, metadata_ms, path_to_npy):
    """Add an 'npy_path' column to the panchromatic and multispectral metadata.

    The .npy files are searched recursively in `<cwd>/<path_to_npy>/pan` and
    `<cwd>/<path_to_npy>/ms` (an absolute `path_to_npy` is used as-is) and are
    matched to the metadata rows by file stem == index label.

    Args:
        metadata_pan: DataFrame indexed by panchromatic image name.
        metadata_ms: DataFrame indexed by multispectral image name.
        path_to_npy: Directory containing the 'pan' and 'ms' subdirectories.

    Returns:
        Tuple `(metadata_pan, metadata_ms)` of new DataFrames. The inputs are
        not modified, and calling the function again on its own output yields
        the same result (the 'npy_path' column is replaced, not duplicated).
    """
    npy_root = pathlib.Path(pathlib.Path.cwd(), path_to_npy)
    metadata_pan = _attach_npy_paths(metadata_pan, npy_root / 'pan', 'pan_names')
    metadata_ms = _attach_npy_paths(metadata_ms, npy_root / 'ms', 'ms_names')
    return metadata_pan, metadata_ms

# Attach the .npy file locations to both metadata frames. The results are bound
# back to the same names, so re-running this cell passes the already-augmented
# frames in again.
img_metadata_pan, img_metadata_ms = add_npy_paths_to_metadata_df(img_metadata_pan, 
                                                                 img_metadata_ms, 
                                                                 DATA_PATH_NPY)

# Loading .npy files into memory

In [None]:
def load_npy_to_dict(metadata_df, ID_list = None):
    """Load .npy images into memory as a dict keyed by image ID.

    Args:
        metadata_df: DataFrame indexed by image ID with an 'npy_path' column.
        ID_list: Iterable of image IDs (list, ndarray, Index, ...) to load.
            Defaults to every ID in `metadata_df.index`.

    Returns:
        Dict mapping each requested ID to its ndarray, in `ID_list` order.
    """
    # Identity check on purpose: `ID_list == None` is evaluated element-wise
    # for an ndarray of IDs and makes the `if` raise ValueError.
    if ID_list is None:
        ID_list = metadata_df.index.tolist()
    d = {}
    for ID in ID_list:
        d[ID] = np.load(metadata_df.loc[ID, 'npy_path'])
        print(ID, 'loaded into memory as ndarray with shape', d[ID].shape)
    return d

def load_npy_to_list(metadata_df, ID_list = None):
    """Load .npy images into memory as a list ordered like `ID_list`.

    Args:
        metadata_df: DataFrame indexed by image ID with an 'npy_path' column.
        ID_list: Iterable of image IDs (list, ndarray, Index, ...) to load.
            Defaults to every ID in `metadata_df.index`.

    Returns:
        List of ndarrays; element i belongs to the i-th ID of `ID_list`.
    """
    # Identity check on purpose: `ID_list == None` is evaluated element-wise
    # for an ndarray of IDs and makes the `if` raise ValueError.
    if ID_list is None:
        ID_list = metadata_df.index.tolist()
    l = []
    for ID in ID_list:
        l.append(np.load(metadata_df.loc[ID, 'npy_path']))
        print(ID, 'loaded into memory as ndarray with shape', l[-1].shape)
    return l

## Load only early trials images

In [None]:
# Both lists follow the order of images_for_early_trials, so imgs_pan[i] and
# imgs_ms[i] were loaded for the same image ID.
# NOTE(review): the IDs were drawn from the panchromatic frame's index — this
# assumes the multispectral frame uses the same index labels; confirm.
imgs_pan = load_npy_to_list(img_metadata_pan, images_for_early_trials)
imgs_ms = load_npy_to_list(img_metadata_ms, images_for_early_trials)

## Load all images
Keep an eye on available RAM!

In [None]:
#imgs_pan = load_npy_to_list(img_metadata_pan)
#imgs_ms = load_npy_to_list(img_metadata_ms)

# Data generation pipeline

In [None]:
# Size (in pixels) and band count of one panchromatic tile
PAN_HEIGHT = PAN_WIDTH = 384
PAN_BANDS = 1

# Ratio between panchromatic and multispectral tile sizes; a multispectral
# tile covers the same box at 1/SR_FACTOR of the panchromatic pixel count per side
SR_FACTOR = 4
MS_HEIGHT = PAN_HEIGHT // SR_FACTOR
MS_WIDTH = PAN_WIDTH // SR_FACTOR
MS_BANDS = 8

In [None]:
def _bytes_feature(value):
  """Wrap a string / byte value in a `tf.train.Feature` holding a bytes_list."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList won't unpack a string from an EagerTensor, so pull out the
  # underlying value first.
  raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw_value])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double in a `tf.train.Feature` holding a float_list."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a `tf.train.Feature` holding an int64_list."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def crop(img, yxhwc_box):
    """Crop `img` to the box given as (y offset, x offset, height, width, ...).

    Only the first four entries of `yxhwc_box` are used; any further entry
    (the channel count in the yxhwc layout) is ignored.
    """
    top, left = yxhwc_box[0], yxhwc_box[1]
    height, width = yxhwc_box[2], yxhwc_box[3]
    return tf.image.crop_to_bounding_box(
        img,
        offset_height = top,
        offset_width = left,
        target_height = height,
        target_width = width)

def get_random_box(img_shape, crop_size, rng = None):
    """Draw a random crop box that lies fully inside an image.

    Args:
        img_shape: Shape of the image; the first two entries are (height, width).
        crop_size: Sequence whose first two entries are the crop (height, width).
            All entries are copied into the returned box, so passing
            [height, width, channels] yields a box in yxhwc format.
        rng: Optional `numpy.random.Generator`. Pass a seeded generator for
            reproducible boxes; a fresh unseeded one is created when omitted.

    Returns:
        1-D integer ndarray: (y offset, x offset) followed by `crop_size`.

    Raises:
        ValueError: If the crop is larger than the image in either dimension.
    """
    img_height, img_width = img_shape[:2]
    maxval_y = int(img_height) - crop_size[0]
    maxval_x = int(img_width) - crop_size[1]
    if maxval_y < 0 or maxval_x < 0:
        raise ValueError(
            'crop size {}x{} does not fit into image of size {}x{}'.format(
                crop_size[0], crop_size[1], int(img_height), int(img_width)))

    if rng is None:
        rng = np.random.default_rng()
    # endpoint=True makes `high` inclusive: offset == maxval is still a valid
    # box (it touches the bottom/right edge), and an image exactly as large as
    # the crop (maxval == 0) no longer fails with "low >= high".
    upper_left_yx = rng.integers(0, high=[maxval_y, maxval_x],
                                 endpoint=True, dtype='int32')

    #returning in yxhwc format
    return np.concatenate((upper_left_yx, np.array(crop_size)))

def get_hr_box(lr_box, resize_factor, channels):
    """Scale a low-resolution yxhwc box to the matching high-resolution box.

    Args:
        lr_box: Box as (y offset, x offset, height, width, channels).
        resize_factor: Factor applied to the first four entries.
        channels: Channel count written to the fifth entry of the result.

    Returns:
        New ndarray holding the high-resolution box; `lr_box` is left untouched.
    """
    # Work on a copy: plain assignment would alias the caller's array, so the
    # in-place writes below would overwrite the low-resolution box as well.
    hr_box = np.array(lr_box, copy=True)
    hr_box[:4] = hr_box[:4] * resize_factor
    hr_box[4] = channels
    return hr_box

def scale_image(img):
    """Return `img` converted to float32 with `tf.image.convert_image_dtype`.

    The conversion is what brings the pixel values into the [0,1] float range.
    """
    return tf.image.convert_image_dtype(img, tf.float32)

def add_imgID(img_array, imgID):
    """Unfinished helper: currently has no observable effect and returns None.

    NOTE(review): `imgID` is never used and nothing is returned, so this looks
    like an abandoned stub — presumably it was meant to attach the image ID to
    the array; confirm the intent or remove the function.
    """
    # Adds a leading axis of length 1; the result is only bound to the local name.
    img_array = np.expand_dims(img_array, 0)
    # Bare indexing expression: evaluated and then discarded.
    img_array[:,]

def preprocess_tiles(imgs_pan, imgs_ms, img_IDs, n_tiles):
    """Cut `n_tiles` random, spatially matching MS/PAN tile pairs from the images.

    For every tile a source image is drawn at random, a random MS_HEIGHT x
    MS_WIDTH box is drawn inside its multispectral image, and the same box
    scaled by SR_FACTOR is cut from its panchromatic image.

    Args:
        imgs_pan: Sequence of panchromatic images, index-aligned with `img_IDs`.
        imgs_ms: Sequence of multispectral images, index-aligned with `img_IDs`.
        img_IDs: Sequence of image IDs; position i names imgs_pan[i] / imgs_ms[i].
        n_tiles: Number of tile pairs to generate.

    Returns:
        Tuple `(arr_ms, arr_pan, tile_imgID_map)`:
        arr_ms has shape (n_tiles, MS_HEIGHT, MS_WIDTH, MS_BANDS),
        arr_pan has shape (n_tiles, PAN_HEIGHT, PAN_WIDTH, PAN_BANDS),
        tile_imgID_map[i] is the ID of the image tile i was cut from.

    Raises:
        ValueError: If tiles are requested but `img_IDs` is empty, or if fewer
            images than IDs were passed.
    """
    n_images = len(img_IDs)
    # Validate up front instead of failing with an opaque error on a random
    # iteration of the loop below.
    if n_tiles > 0:
        if n_images == 0:
            raise ValueError('img_IDs must contain at least one image ID')
        if len(imgs_pan) < n_images or len(imgs_ms) < n_images:
            raise ValueError('imgs_pan and imgs_ms must hold one image per entry of img_IDs')

    arr_ms = np.zeros((n_tiles, MS_HEIGHT, MS_WIDTH, MS_BANDS))
    arr_pan = np.zeros((n_tiles, PAN_HEIGHT, PAN_WIDTH, PAN_BANDS))
    tile_imgID_map = []

    rng = np.random.default_rng()

    for i in range(n_tiles):
        img_ID_int = rng.integers(0, high=n_images, dtype='int32')
        tile_imgID_map.append(img_IDs[img_ID_int])

        # NOTE(review): the whole source image is rescaled for every tile;
        # cropping before scaling would avoid that work — left as is here.
        img_pan = scale_image(imgs_pan[img_ID_int])
        img_ms = scale_image(imgs_ms[img_ID_int])

        box_ms = get_random_box(img_ms.shape, [MS_HEIGHT, MS_WIDTH, MS_BANDS])
        arr_ms[i,:,:,:] = crop(img_ms, box_ms)

        # Hand over a copy so the multispectral box stays intact even if the
        # helper writes into the array it receives.
        box_pan = get_hr_box(box_ms.copy(), SR_FACTOR, PAN_BANDS)
        arr_pan[i,:,:,:] = crop(img_pan, box_pan)

    return arr_ms, arr_pan, tile_imgID_map

In [None]:
arr_ms, arr_pan, tile_imgID_map = preprocess_tiles(imgs_pan, imgs_ms, images_for_early_trials, 10)

# Show band 0 of the first generated tile pair side by side. Drawing goes
# through the Axes objects themselves, so ax0 / ax1 keep referring to the axes
# and nothing depends on pyplot's implicit "current axes".
fig, (ax0, ax1) = plt.subplots(1, 2, figsize = (15,15))

ax0.set_title('MS')
ax0.imshow(arr_ms[0,:,:,0], cmap = 'gray')

ax1.set_title('PAN')
# Trailing semicolon keeps the AxesImage repr out of the cell output.
ax1.imshow(arr_pan[0,:,:,0], cmap = 'gray');

In [None]:
# Display the ID of the source image each generated tile was cut from.
tile_imgID_map

In [None]:

    # NOTE(review): this cell is an orphaned function body — it is indented and
    # ends in `return`, but there is no `def` line, so it cannot run as-is.
    # It assembles a tf.train.Example describing one PAN/MS tile pair from
    # `img_pan_cropped` / `img_ms_cropped`, which are not defined at cell level.
    # NOTE(review): the cropped images are handed to _bytes_feature directly;
    # presumably they need to be serialized to bytes first — confirm.
    feature = {
        #'img_ID': _bytes_feature(img_ID),
        'image_pan': _bytes_feature(img_pan_cropped),
        'pan_height': _int64_feature(img_pan_cropped.shape[0]),
        'pan_width': _int64_feature(img_pan_cropped.shape[1]),
        'pan_channels': _int64_feature(img_pan_cropped.shape[2]),
        'image_ms': _bytes_feature(img_ms_cropped),
        'ms_height': _int64_feature(img_ms_cropped.shape[0]),
        'ms_width': _int64_feature(img_ms_cropped.shape[1]),
        'ms_channels': _int64_feature(img_ms_cropped.shape[2])
    }
    
    return tf.train.Example(features=tf.train.Features(feature=feature))