In [16]:
# default_exp eumetsat

# EUMETSAT API Wrapper Development

In [22]:
#export
import numpy as np
import pandas as pd
import dataset

import FEAutils as hlp
from typing import Union, List
import xmltodict
import dotenv
import datetime
import zipfile
import copy
import os
import io

from requests.auth import HTTPBasicAuth
import requests

from nbdev.showdoc import *
from fastcore.test import test

from satip import utils
from satip.gcp_helpers import get_eumetsat_filenames

from ipypb import track
from IPython.display import JSON

<br>

### User Input

In [23]:
data_dir = '../data/raw'
debug_fp = '../logs/EUMETSAT_download.txt'
env_vars_fp = '../.env'
metadata_db_fp = '../data/EUMETSAT_metadata.db'

download_data = True

<br>

### Authorising API Access

First we'll load the the environment variables

In [24]:
dotenv.load_dotenv(env_vars_fp)

user_key = os.environ.get('USER_KEY')
user_secret = os.environ.get('USER_SECRET')
slack_id = os.environ.get('SLACK_ID')
slack_webhook_url = os.environ.get('SLACK_WEBHOOK_URL')

<br>

And test they were loaded successfully

In [25]:
def check_env_vars_have_loaded(env_vars):
    for name, value in env_vars.items():
        assert value is not None, f'{name}` should not be None'
    
    return

env_vars = {
    'user_key': user_key,
    'user_secret': user_secret,
}

check_env_vars_have_loaded(env_vars)

<br>

We'll then use them to request an access token for the API

In [26]:
#export
def request_access_token(user_key, user_secret):
    """
    Requests an access token from the EUMETSAT data API
    
    Parameters:
        user_key: EUMETSAT API key
        user_secret: EUMETSAT API secret
        
    Returns:
        access_token: API access token
        
    """
    
    token_url = 'https://api.eumetsat.int/token'

    data = {
      'grant_type': 'client_credentials'
    }

    r = requests.post(token_url, data=data, auth=(user_key, user_secret))
    access_token = r.json()['access_token']

    return access_token

<br>

We'll then use them to request an access token for the API

In [27]:
access_token = request_access_token(user_key, user_secret)

access_token

'30203f55-f716-318e-829b-69235a7567ac'

<br>

### Querying Available Data

Before we can download any data we have to know where it's stored. To learn this we can query their search-products API, which returns a JSON containing a list of file metadata.

In [28]:
#export
format_dt_str = lambda dt: pd.to_datetime(dt).strftime('%Y-%m-%dT%H:%M:%SZ')

def query_data_products(
    start_date:str='2020-01-01', 
    end_date:str='2020-01-02', 
    start_index:int=0, 
    num_features:int=10_000,
    product_id:str='EO:EUM:DAT:MSG:MSG15-RSS'
) -> requests.models.Response:
    """
    Queries the EUMETSAT data API for the specified data
    product and date-range. The dates will accept any
    format that can be interpreted by `pd.to_datetime`.
    A maximum of 10,000 entries are returned by the API
    so the indexes of the returned entries can be specified.
    
    Parameters:
        start_date: Start of the query period
        end_date: End of the query period
        start_index: Starting index of returned entries
        num_features: Number of returned entries
        product_id: ID of the EUMETSAT product requested
        
    Returns:
        r: Response from the request
        
    """
    
    search_url = 'https://api.eumetsat.int/data/search-products/os'

    params = {
        'format': 'json',
        'pi': product_id,
        'si': start_index, 
        'c': num_features,
        'sort': 'start,time,0',
        'dtstart': format_dt_str(start_date),
        'dtend': format_dt_str(end_date)
    }

    r = requests.get(search_url, params=params)
    
    assert r.ok, f'Request was unsuccesful - Error code: {r.status_code}'
    
    return r

<br>

We'll quickly make a test request to this end-point

In [29]:
start_date = '2020-10-01'
end_date = '2020-10-07'

r = query_data_products(start_date, end_date)

r_json = r.json()
JSON(r_json)

<IPython.core.display.JSON object>

<br>

However the search-api is capped (at 10,000) for the number of files it will return metadata for, so we'll create a while loop that waits until all the relevant data has been returned. We'll then extract just the list of features from the returned JSONs.

In [30]:
#export
def identify_available_datasets(start_date: str, end_date: str, 
                                product_id='EO:EUM:DAT:MSG:MSG15-RSS'):
    """
    Identifies available datasets from the EUMETSAT data
    API for the specified data product and date-range. 
    The dates will accept any format that can be 
    interpreted by `pd.to_datetime`.
    
    Parameters:
        start_date: Start of the query period
        end_date: End of the query period
        product_id: ID of the EUMETSAT product requested
        
    Returns:
        r: Response from the request
        
    """
    
    num_features = 10000
    start_index = 0
    
    datasets = []
    all_results_returned = False
    
    while all_results_returned == False:
        r_json = query_data_products(start_date, end_date, 
                                     start_index=start_index, 
                                     num_features=num_features, 
                                     product_id='EO:EUM:DAT:MSG:MSG15-RSS').json()

        datasets += r_json['features']

        num_total_results = r_json['properties']['totalResults']
        num_returned_results = start_index + len(r_json['features'])

        if num_returned_results < num_total_results:
            start_index += num_features
        else:
            all_results_returned = True
            
        assert num_returned_results == len(datasets), 'Some features have not been appended'
        
    return datasets

<br>

We'll check that the same number of available datasets are identified

In [31]:
%%time

datasets = identify_available_datasets(start_date, end_date)

print(f'{len(datasets)} datasets have been identified')

1728 datasets have been identified
CPU times: user 144 ms, sys: 20 ms, total: 164 ms
Wall time: 1.78 s


<br>

Finally we'll create a helper function for converting the dataset ids into their file urls.

In [32]:
#export
dataset_id_to_link = lambda data_id: f'https://api.eumetsat.int/data/download/products/{data_id}'

<br>

We'll now test this works.

N.b. You cannot use the link returned directly as it will not be OAuth'ed

In [33]:
dataset_ids = sorted([dataset['id'] for dataset in datasets])
example_data_link = dataset_id_to_link(dataset_ids[0])

example_data_link

'https://api.eumetsat.int/data/download/products/MSG3-SEVI-MSG15-0100-NA-20201001000414.060000000Z-NA'

<br>

### Downloading Data

Now that we know where our data is located we want to download it. First we'll check that the directory we wish to save the data in exists, if not we'll create it

In [34]:
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

<br>

We also want to extract the relevant metadata information from each file. Here we'll create a generalised framework for extracting data from any product, to add a new one please add its metadata mapping under the relevant `product_id`.

N.b. In the future we should look at saving directly to bigquery

In [35]:
# export
def json_extract(json_obj:Union[dict, list], locators:list):
    extracted_obj = copy.deepcopy(json_obj)
    
    for locator in locators:
        extracted_obj = extracted_obj[locator]
        
    return extracted_obj

def extract_metadata(data_dir: str, product_id='EO:EUM:DAT:MSG:MSG15-RSS'):
    with open(f'{data_dir}/EOPMetadata.xml', 'r') as f:
        xml_str = f.read()
        
    raw_metadata = xmltodict.parse(xml_str)
    metadata_map = metadata_maps[product_id]
    
    datatypes_to_transform_func = {
        'datetime': pd.to_datetime,
        'str': str,
        'int': int,
        'float': float
    }
    
    cleaned_metadata = dict()

    for feature, attrs in metadata_map.items():
        location = attrs['location']
        datatype = attrs['datatype']

        value = json_extract(raw_metadata, location)
        formatted_value = datatypes_to_transform_func[datatype](value)

        cleaned_metadata[feature] = formatted_value

    return cleaned_metadata

metadata_maps = {
    'EO:EUM:DAT:MSG:MSG15-RSS': {
        'start_date': {
            'datatype': 'datetime',
            'location': ['eum:EarthObservation', 'om:phenomenonTime', 'gml:TimePeriod', 'gml:beginPosition']
        },
        'end_date': {
            'datatype': 'datetime',
            'location': ['eum:EarthObservation', 'om:phenomenonTime', 'gml:TimePeriod', 'gml:endPosition']
        },
        'result_time': {
            'datatype': 'datetime',
            'location': ['eum:EarthObservation', 'om:resultTime', 'gml:TimeInstant', 'gml:timePosition']
        },
        'platform_short_name': {
            'datatype': 'str',
            'location': ['eum:EarthObservation', 'om:procedure', 'eop:EarthObservationEquipment', 'eop:platform', 'eop:Platform', 'eop:shortName']
        },
        'platform_orbit_type': {
            'datatype': 'str',
            'location': ['eum:EarthObservation', 'om:procedure', 'eop:EarthObservationEquipment', 'eop:platform', 'eop:Platform', 'eop:orbitType']
        },
        'instrument_name': {
            'datatype': 'str',
            'location': ['eum:EarthObservation', 'om:procedure', 'eop:EarthObservationEquipment', 'eop:instrument', 'eop:Instrument', 'eop:shortName']
        },
        'sensor_op_mode': {
            'datatype': 'str',
            'location': ['eum:EarthObservation', 'om:procedure', 'eop:EarthObservationEquipment', 'eop:sensor', 'eop:Sensor', 'eop:operationalMode']
        },
        'center_srs_name': {
            'datatype': 'str',
            'location': ['eum:EarthObservation', 'om:featureOfInterest', 'eop:Footprint', 'eop:centerOf', 'gml:Point', '@srsName']
        },
        'center_position': {
            'datatype': 'str',
            'location': ['eum:EarthObservation', 'om:featureOfInterest', 'eop:Footprint', 'eop:centerOf', 'gml:Point', 'gml:pos']
        },
        'file_name': {
            'datatype': 'str',
            'location': ['eum:EarthObservation', 'om:result', 'eop:EarthObservationResult', 'eop:product', 'eop:ProductInformation', 'eop:fileName', 'ows:ServiceReference', '@xlink:href']
        },
        'file_size': {
            'datatype': 'int',
            'location': ['eum:EarthObservation', 'om:result', 'eop:EarthObservationResult', 'eop:product', 'eop:ProductInformation', 'eop:size', '#text']
        },
        'missing_pct': {
            'datatype': 'float',
            'location': ['eum:EarthObservation', 'eop:metaDataProperty', 'eum:EarthObservationMetaData', 'eum:missingData', '#text']
        },
    }
}

<br>

We're now ready to create a download manager that will handle all of the querying, processing and retrieving for us

In [46]:
#export
def check_valid_request(r:requests.models.Response):
    """
    Checks that the response from the request is valid
    
    Parameters:
        r: Response object from the request

    """
    
    class InvalidCredentials(Exception):
        pass
    
    if r.ok == False:
        if 'Invalid Credentials' in r.text:
            raise InvalidCredentials('The access token passed in the API request is invalid')
        else:
            raise Exception('The API request was unsuccesful')
            
    return

class DownloadManager:
    """
    The DownloadManager class provides a handler for downloading data
    from the EUMETSAT API, managing: retrieval, logging and metadata
    
    """
    
    def __init__(self, user_key: str, user_secret: str, 
                 data_dir: str, metadata_db_fp: str, log_fp: str, 
                 main_logging_level: str='DEBUG', slack_logging_level: str='CRITICAL', 
                 slack_webhook_url: str=None, slack_id: str=None, 
                 bucket_name=None, bucket_prefix=None):
        """
        Initialises the download manager by:
        * Setting up the logger
        * Requesting an API access token
        * Configuring the download directory
        * Connecting to the metadata database
        * Adding satip helper functions

        Parameters:
            user_key: EUMETSAT API key
            user_secret: EUMETSAT API secret
            data_dir: Path to the directory where the satellite data will be saved
            metadata_db_fp: Path to where the metadata database is stored/will be saved
            log_fp: Filepath where the logs will be stored
            main_logging_level: Logging level for file and Jupyter
            slack_logging_level: Logging level for Slack
            slack_webhook_url: Webhook for the log Slack channel
            slack_id: Option user-id to mention in Slack
            bucket_name: (Optional) Google Cloud Storage bucket name to check for existing files
            bucket_prefix: (Optional) Prefix for cloud bucket files

        Returns:
            download_manager: Instance of the DownloadManager class

        """
        
        # Configuring the logger
        self.logger = utils.set_up_logging('EUMETSAT Download', log_fp, 
                                           main_logging_level, slack_logging_level, 
                                           slack_webhook_url, slack_id)
        
        self.logger.info(f'********** Download Manager Initialised **************')
        
        # Requesting the API access token
        self.user_key = user_key
        self.user_secret = user_secret
        
        self.request_access_token()
        
        # Configuring the data directory
        self.data_dir = data_dir
        
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)
        
        # Initialising the metadata database
        self.metadata_db = dataset.connect(f'sqlite:///{metadata_db_fp}')
        self.metadata_table = self.metadata_db['metadata']
        
        # Adding satip helper functions
        self.identify_available_datasets = identify_available_datasets
        self.query_data_products = query_data_products

        # Google Cloud integration
        self.bucket_name = bucket_name
        self.bucket_prefix = bucket_prefix
        self.bucket_filenames = None

        if bucket_name:
            print(f'Checking files in GCP bucket {bucket_name}, this will take a few seconds')
            self.bucket_filenames = get_eumetsat_filenames(bucket_name, prefix=bucket_prefix)
        
        return
    
    
    def request_access_token(self, user_key=None, user_secret=None): 
        """
        Requests an access token from the EUMETSAT data API.
        If no key or secret are provided then they will default
        to the values provided in the download manager initialisation

        Parameters:
            user_key: EUMETSAT API key
            user_secret: EUMETSAT API secret

        Returns:
            access_token: API access token

        """
    
        if user_key is None:
            user_key = self.user_key
        if user_secret is None:
            user_secret = self.user_secret
            
        self.access_token = request_access_token(user_key, user_secret)
        
        return
        
    def download_single_dataset(self, data_link:str):
        """
        Downloads a single dataset from the EUMETSAT API

        Parameters:
            data_link: Url link for the relevant dataset
        
        """
        
        params = {
            'access_token': self.access_token
        }

        r = requests.get(data_link, params=params)

        check_valid_request(r)

        zipped_files = zipfile.ZipFile(io.BytesIO(r.content))
        zipped_files.extractall(f'{self.data_dir}')

        return


    def check_if_downloaded(self, filenames: List[str]):
        """Checks which files should be downloaded based on 
        local file contents and a cloud storage bucket, if specified.
        
        Parameters:
            filenames: List of filename strings
        
        Returns:
            List of filenames to download
        """
        in_bucket = []
        local = []
        for filename in filenames:
            if self.bucket_name:
                if filename in self.bucket_filenames:
                    in_bucket.append(filename)

            if f'{filename}.nat' in os.listdir(self.data_dir):
                local.append(filename)

        # Download files if they are not locally downloaded or in the bucket
        to_download = set(filenames).difference(set(local).union(set(in_bucket)))

        if self.bucket_name:
            self.logger.info(f'{len(filenames)} files queried, {len(in_bucket)} found in bucket, {len(local)} found in {self.data_dir}, {len(to_download)} to download.')
        else:
            self.logger.info(f'{len(filenames)} files queried, {len(local)} found in {self.data_dir}, {len(to_download)} to download.')

        return to_download


    def download_datasets(self, start_date:str, end_date:str, product_id='EO:EUM:DAT:MSG:MSG15-RSS'):
        """
        Downloads a set of dataset from the EUMETSAT API
        in the defined date range and specified product

        Parameters:
            start_date: Start of the requested data period
            end_date: End of the requested data period
            product_id: ID of the EUMETSAT product requested
        
        """

        # Identifying dataset ids to download
        datasets = identify_available_datasets(start_date, end_date, product_id=product_id)
        dataset_ids = sorted([dataset['id'] for dataset in datasets])

        # Check which datasets to download
        dataset_ids = self.check_if_downloaded(dataset_ids)
        
        # Downloading specified datasets
        if not dataset_ids:
            self.logger.info('No files will be downloaded. Set DownloadManager bucket_name argument for local download')
            return 

        for dataset_id in track(dataset_ids):
            dataset_link = dataset_id_to_link(dataset_id)

            # Download the raw data
            try:
                self.download_single_dataset(dataset_link)
            except:
                self.logger.info('The EUMETSAT access token has been refreshed')
                self.request_access_token()
                self.download_single_dataset(dataset_link)

            # Extract and save metadata
            dataset_metadata = extract_metadata(self.data_dir, product_id=product_id)
            dataset_metadata.update({'downloaded': pd.Timestamp.now()})
            self.metadata_table.insert(dataset_metadata)

            # Delete old metadata files
            for xml_file in ['EOPMetadata.xml', 'manifest.xml']:
                xml_filepath = f'{self.data_dir}/{xml_file}'

                if os.path.isfile(xml_filepath):
                    os.remove(xml_filepath)
                    
        return
    
    get_df_metadata = lambda self: pd.DataFrame(self.metadata_table.all()).set_index('id')

<br>

We'll now see what it looks like when we initialise the download manager

In [47]:
dm = DownloadManager(user_key, user_secret, data_dir, metadata_db_fp, debug_fp)
start_date = '2020-10-01 12:00'
end_date = '2020-10-01 12:05'

if download_data == True:
    dm.download_datasets(start_date, end_date)

2020-11-13 12:45:56,692 - INFO - ********** Download Manager Initialised **************
2020-11-13 12:45:56,692 - INFO - ********** Download Manager Initialised **************
2020-11-13 12:45:56,692 - INFO - ********** Download Manager Initialised **************
2020-11-13 12:45:56,692 - INFO - ********** Download Manager Initialised **************
2020-11-13 12:45:56,692 - INFO - ********** Download Manager Initialised **************
2020-11-13 12:45:56,692 - INFO - ********** Download Manager Initialised **************
2020-11-13 12:45:57,339 - INFO - 1 files queried, 1 found in ../data/raw, 0 to download.
2020-11-13 12:45:57,339 - INFO - 1 files queried, 1 found in ../data/raw, 0 to download.
2020-11-13 12:45:57,339 - INFO - 1 files queried, 1 found in ../data/raw, 0 to download.
2020-11-13 12:45:57,339 - INFO - 1 files queried, 1 found in ../data/raw, 0 to download.
2020-11-13 12:45:57,339 - INFO - 1 files queried, 1 found in ../data/raw, 0 to download.
2020-11-13 12:45:57,339 - I

In [48]:
dm = DownloadManager(user_key, user_secret, data_dir, metadata_db_fp, debug_fp, 
                     slack_webhook_url=slack_webhook_url, slack_id=slack_id)

start_date = '2020-10-01 12:00'
end_date = '2020-10-01 12:05'

if download_data == True:
    dm.download_datasets(start_date, end_date)

df_metadata = dm.get_df_metadata()

df_metadata.head()

NameError: name 'DownloadManager' is not defined

Unnamed: 0_level_0,start_date,end_date,result_time,platform_short_name,platform_orbit_type,instrument_name,sensor_op_mode,center_srs_name,center_position,file_name,file_size,missing_pct,downloaded
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2020-10-30 00:00:09.204,2020-10-30 00:04:15.571,2020-10-30 00:04:15.571,MSG3,GEO,SEVIRI,RSS,EPSG:4326,0 9.5,MSG3-SEVI-MSG15-0100-NA-20201030000415.5710000...,99819,0.0,2020-11-13 10:26:18.634153
2,2020-10-30 00:05:09.050,2020-10-30 00:09:15.416,2020-10-30 00:09:15.416,MSG3,GEO,SEVIRI,RSS,EPSG:4326,0 9.5,MSG3-SEVI-MSG15-0100-NA-20201030000915.4160000...,99819,0.0,2020-11-13 10:26:40.368230
3,2020-10-30 00:10:08.895,2020-10-30 00:14:15.262,2020-10-30 00:14:15.262,MSG3,GEO,SEVIRI,RSS,EPSG:4326,0 9.5,MSG3-SEVI-MSG15-0100-NA-20201030001415.2620000...,99819,0.0,2020-11-13 10:27:05.045463
4,2020-10-30 00:15:08.741,2020-10-30 00:19:15.108,2020-10-30 00:19:15.108,MSG3,GEO,SEVIRI,RSS,EPSG:4326,0 9.5,MSG3-SEVI-MSG15-0100-NA-20201030001915.1080000...,99819,0.0,2020-11-13 10:27:29.583702
5,2020-10-30 00:20:09.789,2020-10-30 00:24:16.155,2020-10-30 00:24:16.155,MSG3,GEO,SEVIRI,RSS,EPSG:4326,0 9.5,MSG3-SEVI-MSG15-0100-NA-20201030002416.1550000...,99819,0.0,2020-11-13 10:27:54.587581


<br>

The `get_size` function was adapted from <a href="https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python">this stackoverflow answer</a>

In [None]:
#export
def get_dir_size(directory='.'):
    total_size = 0
    
    for dirpath, dirnames, filenames in os.walk(directory):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)

    return total_size

In [None]:
data_dir_size = get_dir_size(data_dir)

print(f'The data directory is currently {round(data_dir_size/1_000_000_000, 2):,} Gb')

The data directory is currently 0.61 Gb


In [5]:
#hide
from nbdev.export import *
notebook2script()

Converted 00_utils.ipynb.
Converted 01_eumetsat.ipynb.
Converted 02_reproj.ipynb.
Converted 03_zarr.ipynb.
Converted 04_gcp.ipynb.
Converted 05_usage.ipynb.
