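This little script untars the uncompressed native (`.nat`) satellite image files downloaded from EUMETSAT, compresses each one with `pbzip2`, and sorts the compressed files into a date-based directory tree.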

Set `SRC_PATH` to be the directory holding all the `.tar` files downloaded from EUMETSAT.

Set `DST_PATH` to be the destination path.  This script will store data in a directory structure of the form `<year>/<month>/<day>/<hour>/<minute>/`, e.g. `2019/01/10/20/55`.

Set `TMP_PATH` to a place which will be used to temporarily store the un-tarred `.nat` files before they are compressed and sorted.  For speed, use an SSD or RAM disk (although a spinning disk is also fine!)

## Requirements

* Install [pbzip2](https://linux.die.net/man/1/pbzip2) (a parallel-processing version of bzip2).  On Ubuntu: `sudo apt install pbzip2`

In [12]:
import tarfile
import re
import os
import glob
from datetime import datetime
import subprocess
import shutil
import logging
import sys
import math

from eumetsat import get_filesize_megabytes
from consts import PATH

In [5]:
# The directory containing the tar files downloaded from EUMETSAT
SRC_PATH = os.path.join(PATH, 'unsorted_downloads/1353166')
# Scratch directory where each tar file is extracted before compression
TMP_PATH = '/home/jack/temp'
# Root of the sorted <year>/<month>/<day>/<hour>/<minute>/ directory tree
DST_PATH = os.path.join(PATH, 'sorted')
DEL_PATH = os.path.join(PATH, 'delete_me')  # The path where we put files ready to be deleted
NATIVE_FILESIZE_MB = 102.210123  # Expected filesize of each Native file.

LOG_PATH = os.path.join(PATH, 'logs', 'sort_and_compress')

## Logging
STREAM_HANDLER = True  # If True, log messages are also echoed to stdout
LOG_FILENAME = os.path.join(LOG_PATH, 'eumetsat_sort_and_compress.log')

# Make sure every working directory exists.  `exist_ok=True` avoids the
# check-then-create race, and fails loudly (FileExistsError) if one of
# these paths turns out to be a regular file rather than a directory.
for directory in [TMP_PATH, DST_PATH, DEL_PATH, LOG_PATH]:
    os.makedirs(directory, exist_ok=True)

# This is a list of the tar files which have already been processed.
# This is useful so that this script can re-start from where it left off,
# if needs be.
LIST_OF_COMPLETED_FILES = os.path.join(PATH, 'sorted_and_compressed_files.txt')

In [6]:
# Set up logging
log = logging.getLogger('eumetsat_sort')
log.setLevel(logging.DEBUG)

# This cell may be run more than once in the same kernel.  Detach *and close*
# any handlers left over from a previous run: simply overwriting
# `log.handlers` would avoid duplicate messages but leave the old log file
# handle open.
for old_handler in list(log.handlers):
    log.removeHandler(old_handler)
    old_handler.close()

formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Always log to file (appending); optionally echo to stdout as well.
handlers = [logging.FileHandler(filename=LOG_FILENAME, mode='a')]
if STREAM_HANDLER:
    handlers.append(logging.StreamHandler(sys.stdout))
for handler in handlers:
    handler.setFormatter(formatter)
    log.addHandler(handler)

In the code below, we use the term `full_filename` to refer to a filename including the full path, e.g. `/storage/data/eumetsat/native/auto_downloads/1354458-1-1of8.tar`

The `base_filename` is the filename _without_ the path, e.g. `1354458-1-1of8.tar`

In [7]:
# Read back the record of tar files finished by earlier runs (one full
# filename per line).  If no record exists yet, start with an empty list.
full_tar_filenames_completed = []
if os.path.exists(LIST_OF_COMPLETED_FILES):
    with open(LIST_OF_COMPLETED_FILES, 'r') as fh:
        full_tar_filenames_completed = sorted(line.strip() for line in fh)

In [8]:
# Notebook display only: peek at the first five completed tar filenames.
full_tar_filenames_completed[:5]

['/storage/data/eumetsat/native/auto_downloads/1354458-1-1of8.tar',
 '/storage/data/eumetsat/native/auto_downloads/1354458-1-2of8.tar',
 '/storage/data/eumetsat/native/auto_downloads/1354458-1-3of8.tar',
 '/storage/data/eumetsat/native/auto_downloads/1354458-1-4of8.tar',
 '/storage/data/eumetsat/native/auto_downloads/1354458-1-5of8.tar']

In [9]:
# Every tar file sitting directly in SRC_PATH, in sorted order.
full_tar_filenames = sorted(glob.glob(os.path.join(SRC_PATH, '*.tar')))

# Notebook display only: how many tar files were found.
len(full_tar_filenames)

83

In [10]:
# Notebook display only: peek at the first five tar filenames found in SRC_PATH.
full_tar_filenames[:5]

['/storage/data/eumetsat/native/unsorted_downloads/1353166/1353166-1-1of8.tar',
 '/storage/data/eumetsat/native/unsorted_downloads/1353166/1353166-1-2of8.tar',
 '/storage/data/eumetsat/native/unsorted_downloads/1353166/1353166-1-3of8.tar',
 '/storage/data/eumetsat/native/unsorted_downloads/1353166/1353166-1-4of8.tar',
 '/storage/data/eumetsat/native/unsorted_downloads/1353166/1353166-1-5of8.tar']

In [11]:
# Drop tar files already handled by a previous run.  The comparison is on the
# base filename only, so a tar file still counts as completed even if it now
# lives in a different directory from the one recorded.
completed_base_filenames = list(
    map(os.path.basename, full_tar_filenames_completed))
full_tar_filenames_filtered = sorted(
    candidate for candidate in full_tar_filenames
    if os.path.basename(candidate) not in completed_base_filenames)

# Number of tar files still to process (also displayed by the notebook).
n = len(full_tar_filenames_filtered)
n

83

In [None]:
# Compiled once rather than on every call.  Raw string so that `\d` and `\.`
# are regex escapes, not (invalid) Python string escapes.  The capture group
# is the 14-digit timestamp (YYYYmmddHHMMSS) between the fixed product prefix
# and the first '.' in the filename.
_NATIVE_FILENAME_RE = re.compile(r'^MSG[23]-SEVI-MSG15-0100-NA-(\d{14})\.')


def get_datetime(inner_tar_name):
    """Parse the timestamp embedded in a native file's base filename.

    Args:
        inner_tar_name: Base filename (no directory) of a native file, e.g.
            'MSG3-SEVI-MSG15-0100-NA-20190101040917.449000000Z-...-1.nat'.

    Returns:
        A naive `datetime` built from the first 14-digit timestamp in the
        filename (sub-second digits after the '.' are ignored).

    Raises:
        ValueError: If the filename does not start with the expected
            'MSG[23]-SEVI-MSG15-0100-NA-<14 digits>.' prefix, or the digits
            do not form a valid date/time.
    """
    title_match = _NATIVE_FILENAME_RE.match(inner_tar_name)
    if title_match is None:
        # Fail with a message naming the offending file instead of an opaque
        # AttributeError on `None.group`.
        raise ValueError(
            'Cannot extract datetime from native filename: {!r}'.format(
                inner_tar_name))
    return datetime.strptime(title_match.group(1), "%Y%m%d%H%M%S")


# Main loop.  For each tar file not yet processed:
#   1. extract it into its own scratch directory under TMP_PATH,
#   2. check, compress (pbzip2) and move every native file it contained into
#      DST_PATH/<year>/<month>/<day>/<hour>/<minute>/,
#   3. remove the scratch directory, move the tar file to DEL_PATH and record
#      it in LIST_OF_COMPLETED_FILES so a re-run skips it.
# Any failure raises and stops the run; the tar file is then *not* recorded
# as completed.
for i, full_tar_filename in enumerate(full_tar_filenames_filtered):
    log.info('%d of %d = %.1f %% : %s', i+1, n, ((i+1)/n)*100, full_tar_filename)
    base_tar_filename = os.path.basename(full_tar_filename)

    # Extract tar file to TMP_PATH/base_filename/
    new_tmp_path = os.path.join(TMP_PATH, os.path.splitext(base_tar_filename)[0])
    os.makedirs(new_tmp_path, exist_ok=True)
    with tarfile.open(full_tar_filename) as tar:
        # `extractall` will throw a `ReadError: unexpected end of data` if the
        # tar file is incomplete.
        # NOTE(review): `extractall` trusts the member paths inside the
        # archive; this assumes the EUMETSAT downloads are trusted input.
        log.debug('Extracting %s to %s', full_tar_filename, new_tmp_path)
        tar.extractall(new_tmp_path)

    # Now compress & move each native file
    full_native_filenames = glob.glob(os.path.join(new_tmp_path, '*.nat'))
    log.info('Found %d native files.', len(full_native_filenames))
    for full_native_filename in full_native_filenames:
        # Check filesize is correct (within 1 MB of the expected size).
        native_filesize_mb = get_filesize_megabytes(full_native_filename)
        if not math.isclose(native_filesize_mb, NATIVE_FILESIZE_MB, abs_tol=1):
            msg = 'Filesize incorrect for {}!  Expected {} MB.  Actual = {} MB.'.format(
                full_native_filename, NATIVE_FILESIZE_MB, native_filesize_mb)
            log.error(msg)
            raise RuntimeError(msg)

        log.debug('Compressing %s', full_native_filename)
        # TODO: Capture output of subprocess
        completed_process = subprocess.run(['pbzip2', '-5', full_native_filename])
        try:
            completed_process.check_returncode()
        except subprocess.CalledProcessError:
            # Only a non-zero exit status is expected here; a bare `except`
            # would also intercept e.g. KeyboardInterrupt and mislabel it.
            log.exception('Compression failed!')
            raise

        # The compressed file is expected at the original name plus '.bz2'.
        full_compressed_filename = full_native_filename + '.bz2'
        compressed_filesize_mb = get_filesize_megabytes(full_compressed_filename)
        log.debug('Filesizes: Before compression = %.1f MB. After compression = %.1f MB.  Compression %.1f',
                 native_filesize_mb, compressed_filesize_mb, compressed_filesize_mb / native_filesize_mb)

        # File the compressed image under its timestamp, parsed from the name.
        base_native_filename = os.path.basename(full_native_filename)
        dt = get_datetime(base_native_filename)
        new_dst_path = os.path.join(DST_PATH, dt.strftime("%Y/%m/%d/%H/%M"))
        os.makedirs(new_dst_path, exist_ok=True)

        log.debug('Moving %s to %s', full_compressed_filename, new_dst_path)
        shutil.move(src=full_compressed_filename, dst=new_dst_path)

    # Tidy up
    log.debug('Removing %s', new_tmp_path)
    # `os.rmdir` only removes an empty directory, so this raises if anything
    # is still left in the scratch directory at this point.
    os.rmdir(new_tmp_path)
    log.debug('Moving %s to %s', full_tar_filename, DEL_PATH)
    shutil.move(src=full_tar_filename, dst=DEL_PATH)

    # Record success last, so an interrupted run re-processes this tar file.
    with open(LIST_OF_COMPLETED_FILES, 'a') as fh:
        fh.write('{}\n'.format(full_tar_filename))

2019-10-01 12:43:51,665 - eumetsat_sort - INFO - 1 of 83 = 1.204819 % : /storage/data/eumetsat/native/unsorted_downloads/1353166/1353166-1-1of8.tar
2019-10-01 12:43:51,667 - eumetsat_sort - DEBUG - Extracting /storage/data/eumetsat/native/unsorted_downloads/1353166/1353166-1-1of8.tar to /home/jack/temp/1353166-1-1of8
2019-10-01 12:43:54,751 - eumetsat_sort - INFO - Found 29 native files.
2019-10-01 12:43:54,752 - eumetsat_sort - DEBUG - Compressing /home/jack/temp/1353166-1-1of8/MSG3-SEVI-MSG15-0100-NA-20190101040917.449000000Z-20190101040935-1353166-1.nat
2019-10-01 12:43:55,507 - eumetsat_sort - DEBUG - Moving /home/jack/temp/1353166-1-1of8/MSG3-SEVI-MSG15-0100-NA-20190101040917.449000000Z-20190101040935-1353166-1.nat.bz2 to /storage/data/eumetsat/native/sorted/2019/01/01/04/09
2019-10-01 12:43:55,526 - eumetsat_sort - DEBUG - Compressing /home/jack/temp/1353166-1-1of8/MSG3-SEVI-MSG15-0100-NA-20190101032416.756000000Z-20190101032435-1353166-1.nat
2019-10-01 12:43:56,419 - eumetsat_so