In [None]:
# Root directory of the satellite image deliveries; every entry listed in it is
# treated as one image and searched for a metadata file
DATA_PATH = '/media/landet/Data/master/data/' 

# File name of the metadata .xml file that is looked up for each image
METADATA_NAME = 'DeliveryMetadata.xml'
# XML namespace URI; it is stripped from the element tags while parsing
XMLNS = 'http://xsd.digitalglobe.com/xsd/dm'

import numpy as np
import pandas as pd

import xml.etree.ElementTree as ET
import os

# Metadata parsing from xml to dictionary

Every satellite image delivery from Maxar contains a `DeliveryMetadata.xml` file with important specifications for both the multispectral and panchromatic images. The following functions find all the `DeliveryMetadata.xml` files contained in a folder and parse them into Python dictionaries, which are then used for further descriptive statistics of the dataset.

In [None]:
def remove_xmlns(string, xmlns):
    """Return ``string`` with every ``{xmlns}`` namespace marker removed.

    The marker is the namespace wrapped in curly braces; all of its
    occurrences in ``string`` are deleted.
    """
    namespace_marker = ''.join(('{', xmlns, '}'))
    return string.replace(namespace_marker, '')

def find(name, path):
    """Search ``path`` recursively for a file called ``name``.

    Returns the joined path of the first match met while walking the tree,
    or ``None`` when no directory below ``path`` contains such a file.
    """
    for directory, _subdirs, filenames in os.walk(path):
        if name not in filenames:
            continue
        return os.path.join(directory, name)
    return None

def is_panchromatic_product(xml_node):
    """Tell whether any direct child of ``xml_node`` has the text ``'Pan'``.

    Only the immediate children are inspected; deeper descendants are ignored.
    """
    return any(child.text == 'Pan' for child in xml_node)

def xml_root_children_to_dict(xml_root_element):
    """Map each direct child of ``xml_root_element`` to its text.

    Keys are the child tags with the ``XMLNS`` namespace removed; when several
    children share a tag, the text of the last one is kept.
    """
    return {
        remove_xmlns(element.tag, XMLNS): element.text
        for element in xml_root_element
    }

def xml_product_to_dict(xml_product_element):
    """Convert the children of one ``product`` element into a dictionary.

    ``strip``, ``productFile`` and ``band`` children may occur several times,
    so each is stored under its tag followed by a running index (``strip0``,
    ``strip1``, ...). ``strip`` and ``productFile`` children are expanded with
    ``xml_root_children_to_dict``; ``band`` children and all other children
    contribute their text only. The totals are added as ``n_bands``,
    ``n_products`` and ``n_strips``.
    """
    parsed = {}
    # Running index per repeatable tag, also used for the totals at the end
    counters = {'strip': 0, 'productFile': 0, 'band': 0}
    for element in xml_product_element:
        tag = remove_xmlns(element.tag, XMLNS)
        if tag not in counters:
            parsed[tag] = element.text
            continue
        indexed_key = tag + str(counters[tag])
        if tag == 'band':
            parsed[indexed_key] = element.text
        else:
            parsed[indexed_key] = xml_root_children_to_dict(element)
        counters[tag] += 1
    parsed['n_bands'] = counters['band']
    parsed['n_products'] = counters['productFile']
    parsed['n_strips'] = counters['strip']
    return parsed

def xml_metadata_to_dict(path):
    """Parse one metadata file into a panchromatic and a multispectral dict.

    Root children tagged ``product`` are parsed with ``xml_product_to_dict``
    and kept as the panchromatic result when ``is_panchromatic_product``
    accepts them, otherwise as the multispectral result. The text of every
    other root child is collected as shared metadata and merged into both
    results at the end, replacing product entries that use the same key.

    Returns the tuple ``(panchromatic_dict, multispectral_dict)``.
    """
    shared = {}
    products = {'pan': {}, 'ms': {}}
    for node in ET.parse(path).getroot():
        tag = remove_xmlns(node.tag, XMLNS)
        if tag != 'product':
            shared[tag] = node.text
            continue
        kind = 'pan' if is_panchromatic_product(node) else 'ms'
        products[kind] = xml_product_to_dict(node)
    for product in products.values():
        product.update(shared)
    return products['pan'], products['ms']

def img_metadata_to_dict(metadata_name, data_path):
    """Parse the metadata file of every image folder inside ``data_path``.

    Parameters
    ----------
    metadata_name : str
        File name of the metadata file to look for in each image folder.
    data_path : str
        Directory whose sub-directories each hold one image delivery.

    Returns
    -------
    tuple of dict
        ``(img_metadata_pan, img_metadata_ms)``, both keyed by the image
        folder name.

    Raises
    ------
    FileNotFoundError
        If an image folder contains no file called ``metadata_name``.
    """
    img_metadata_pan, img_metadata_ms = {}, {}
    # Use the arguments rather than the notebook globals, and sort the listing
    # so the result order does not depend on the file system.
    for img in sorted(os.listdir(data_path)):
        img_dir = os.path.join(data_path, img)
        if not os.path.isdir(img_dir):
            # Stray files next to the image folders cannot hold a delivery
            continue
        path_to_metadata_file = find(metadata_name, img_dir)
        if path_to_metadata_file is None:
            raise FileNotFoundError(
                "No '{}' found below '{}'".format(metadata_name, img_dir)
            )
        img_metadata_pan[img], img_metadata_ms[img] = xml_metadata_to_dict(path_to_metadata_file)
    return img_metadata_pan, img_metadata_ms

def add_names_to_metadata(metadata_dictionary, list_of_area_names):
    """Tag each image's metadata with the area name found in its key.

    An ``'area_name'`` entry is written into the per-image dictionary when an
    area name is a substring of the image name; if several names match, the
    last one in ``list_of_area_names`` is kept. Images matching no area are
    left untouched. The input dictionary is modified in place and returned.
    """
    for image_name, image_metadata in metadata_dictionary.items():
        matching_areas = [area for area in list_of_area_names if area in image_name]
        if matching_areas:
            image_metadata['area_name'] = matching_areas[-1]
    return metadata_dictionary

# Area names, spelled like in the directory names
areas = ['La_Spezia', 'Toulon']

# Parse all metadata once, then tag the panchromatic and the multispectral
# dictionary (in that order) with the area each image belongs to
img_metadata_pan, img_metadata_ms = (
    add_names_to_metadata(metadata, areas)
    for metadata in img_metadata_to_dict(METADATA_NAME, DATA_PATH)
)

In [None]:
def _parse_xml_bool(value):
    """Convert a metadata boolean to ``bool`` without reading ``'false'`` as true.

    Strings are compared against the false literals ``'false'``, ``'0'`` and
    the empty string (case-insensitively, ignoring surrounding whitespace);
    any other value falls back to ordinary truthiness.
    """
    if isinstance(value, str):
        # Plain truthiness would turn every non-empty string, including
        # 'false' and '0', into True.
        return value.strip().lower() not in ('false', '0', '')
    return bool(value)


def dict_to_df(img_metadata_dict):
    """Build a typed DataFrame with one row per image from a metadata dict.

    Parameters
    ----------
    img_metadata_dict : dict
        Maps image names to their metadata dictionaries.

    Returns
    -------
    pandas.DataFrame
        Indexed by image name. Known columns are cast to category, float,
        bool, ``int8`` or datetime; every ``band<N>`` column becomes a
        category. Columns that are absent from the data are skipped instead
        of raising ``KeyError``.
    """
    img_metadata_df = pd.DataFrame(img_metadata_dict).transpose()
    columns = img_metadata_df.columns

    column_dtypes = {
        'bitsPerPixel': 'category',
        'cloudCover': 'float',
        'datum': 'category',
        'imageFileFormat': 'category',
        'imageTypeSize': 'category',
        'imagingTilingType': 'category',
        'mapProjection': 'category',
        'mapProjectionUnit': 'category',
        'mapProjectionZone': 'category',
        'mergingAlgorithm': 'category',
        'mergedBand': 'category',
        'offNadirAngle': 'float',
        'pixelHeight': 'float',
        'pixelWidth': 'float',
        'processingLevel': 'category',
        'resamplingKernel': 'category',
        'sensorVehicle': 'category',
        'sunAzimuth': 'float',
        'sunElevation': 'float',
        'n_bands': 'int8',
        'n_products': 'int8',
        'n_strips': 'int8',
    }
    # Band columns are named 'band0', 'band1', ...; match them by name so no
    # upper bound on the number of bands has to be guessed.
    column_dtypes.update({
        column: 'category'
        for column in columns
        if str(column).startswith('band') and str(column)[4:].isdigit()
    })
    boolean_columns = ('isDynamicRangeAdjusted', 'isMosaic')
    datetime_columns = (
        'earliestAcquisitionTime',
        'latestAcquisitionTime',
        'productionDate',
        'updateDate',
    )

    # astype() with a dict raises KeyError for unknown columns, so only the
    # columns actually present are passed on.
    img_metadata_df = img_metadata_df.astype(
        {column: dtype for column, dtype in column_dtypes.items() if column in columns}
    )

    # The flags arrive as text, which a direct astype('bool') would map to
    # True for every non-empty string.
    for column in boolean_columns:
        if column in columns:
            img_metadata_df[column] = img_metadata_df[column].map(_parse_xml_bool).astype('bool')

    for column in datetime_columns:
        if column in columns:
            img_metadata_df[column] = pd.to_datetime(img_metadata_df[column])

    return img_metadata_df

In [None]:
# Build one typed DataFrame per product type: panchromatic first, then multispectral
img_metadata_df_pan, img_metadata_df_ms = (
    dict_to_df(metadata) for metadata in (img_metadata_pan, img_metadata_ms)
)

In [None]:
def calculate_actual_image_areas(img_metadata_df):
    """Print every row of ``img_metadata_df``.

    NOTE(review): despite the name, no area is calculated yet; the function
    only iterates over the rows and prints each one. The printed Series
    carries the row label (the image name) in its ``Name`` footer.

    Parameters
    ----------
    img_metadata_df : pandas.DataFrame
        Metadata frame with one row per image.
    """
    for _image_name, row in img_metadata_df.iterrows():
        # print() is a function in Python 3, and the row is indexed by column
        # labels, so it is printed whole instead of being looked up by the
        # row label.
        print(row)

# Distribution of images

In [None]:
# Sanity check 1: both product types should cover the same number of images
print('Number of panchromatic images:', len(img_metadata_pan))
print('Number of multispectral images:', len(img_metadata_ms))
if len(img_metadata_pan) != len(img_metadata_ms):
    print('Fail: NOT equal number of panchromatic and multispectral images. This is NOT good.')
else:
    print('Pass: Equal number of panchromatic and multispectral images. This is good.')
    # Only defined when the counts agree
    n_images = len(img_metadata_pan)
print()

# Sanity check 2: both dictionaries should be keyed by exactly the same image names
print(
    'Pass: Identical keys. Keys in panchromatic and multispectral dictionaries are identical.'
    if img_metadata_pan.keys() == img_metadata_ms.keys()
    else 'Fail: Identical keys. Keys in panchromatic and multispectral dictionaries are not identical.'
)
print()
    
def count_images_of_area(metadata_dictionary, name_of_area):
    """Count the keys of ``metadata_dictionary`` that contain ``name_of_area``.

    The match is a plain substring test on each key.
    """
    count = 0
    for image_name in metadata_dictionary.keys():
        if name_of_area in image_name:
            count += 1
    return count
# Tally the panchromatic images whose name contains each area name
toulon_n, laspezia_n = (
    count_images_of_area(img_metadata_pan, area) for area in ('Toulon', 'La_Spezia')
)
print('Number of images from each area:')
print('Toulon:', toulon_n)
print('La Spezia:', laspezia_n)

# Set of sensor identifiers (presumably the possible 'sensorVehicle' values; verify)
sensor_count = {'GE01', 'WV02', 'WV03'}
def count_sensor_of_area(metadata_dictionary, name_of_area, sensor_name):
    """Count the images of one area that were acquired by a given sensor.

    Parameters
    ----------
    metadata_dictionary : dict
        Maps image names to their metadata dictionaries.
    name_of_area : str
        Area to restrict the count to; an image belongs to the area when this
        string occurs in its name (same rule as ``count_images_of_area``).
    sensor_name : str
        Value the image's ``'sensorVehicle'`` entry must equal.

    Returns
    -------
    int
        Number of matching images. Each match is also printed as
        ``<image name> <sensor name>``.
    """
    count = 0
    for image_name, image_metadata in metadata_dictionary.items():
        # Without this filter the area argument would be ignored and images
        # from every area would be counted.
        if name_of_area not in image_name:
            continue
        # .get() lets entries without sensor information simply not match
        if image_metadata.get('sensorVehicle') == sensor_name:
            count += 1
            print(image_name, sensor_name)
    return count

# Example call on the panchromatic metadata with area 'La_Spezia' and sensor 'GE01'
count_sensor_of_area(img_metadata_pan, 'La_Spezia', 'GE01')

In [None]:
# Inspect the parsed metadata of a single image; the commented-out line is the
# panchromatic alternative
#img_metadata_pan['GE01_La_Spezia_2009_09_25_011651186010_0']
img_metadata_ms['WV02_Toulon_2013_04_10_011651050010_0']