This little script untars uncompressed native (`.nat`) satellite image files downloaded from EUMETSAT.

Set `SRC_PATH` to be the directory holding all the `.tar` files downloaded from EUMETSAT.

Set `DST_PATH` to be the destination path.  This script will store data in a directory structure of the form `<year>/<month>/<day>/<hour>/<minute>/`, e.g. `2019/01/10/20/55`.

Set `TMP_PATH` to a place which will be used to temporarily store the un-tarred `.nat` files before they are compressed and sorted.  For speed, use an SSD or RAM disk (although a spinning disk is also fine!)

## Requirements

* Install [pbzip2](https://linux.die.net/man/1/pbzip2) (a parallel-processing version of bzip2).  On Ubuntu: `sudo apt install pbzip2`

In [17]:
import tarfile
import re
import os
import glob
from datetime import datetime
import subprocess
import shutil

In [18]:
# Root directory under which the EUMETSAT native data is kept.
PATH = '/storage/data/eumetsat/native/'

# Text file listing the tar files which have already been processed, one
# per line.  It lets this script re-start from where it left off, if
# needs be.
LIST_OF_COMPLETED_FILES = os.path.join(PATH, 'completed_files.txt')

# Input: the directory containing the tar files downloaded from EUMETSAT.
SRC_PATH = os.path.join(PATH, 'testing')

# Scratch space: each tar file is extracted into a sub-directory of this.
TMP_PATH = '/home/jack/temp'

# Output: root of the sorted directory tree.
DST_PATH = os.path.join(PATH, 'sorted')

# The path where we put tar files once processed, ready to be deleted.
DEL_PATH = os.path.join(PATH, 'delete_me')

In the code below, we use the term `full_filename` to refer to a filename including the full path, e.g. `/storage/data/eumetsat/native/auto_downloads/1354458-1-1of8.tar`

The `base_filename` is the filename _without_ the path, e.g. `1354458-1-1of8.tar`

In [11]:
# Read the list of tar files completed by earlier runs (one full filename
# per line, surrounding whitespace stripped) in sorted order.  If the list
# file does not exist, start with an empty list.
if os.path.exists(LIST_OF_COMPLETED_FILES):
    with open(LIST_OF_COMPLETED_FILES, 'r') as fh:
        full_tar_filenames_completed = sorted(line.strip() for line in fh)
else:
    full_tar_filenames_completed = []

In [12]:
# Show the first five entries of the completed-files list.
full_tar_filenames_completed[:5]

['/storage/data/eumetsat/native/auto_downloads/1354458-1-1of8.tar',
 '/storage/data/eumetsat/native/auto_downloads/1354458-1-2of8.tar',
 '/storage/data/eumetsat/native/auto_downloads/1354458-1-3of8.tar',
 '/storage/data/eumetsat/native/auto_downloads/1354458-1-4of8.tar',
 '/storage/data/eumetsat/native/auto_downloads/1354458-1-5of8.tar']

In [13]:
# Every `.tar` file directly inside SRC_PATH, sorted by full filename.
full_tar_filenames = sorted(glob.glob(os.path.join(SRC_PATH, '*.tar')))

# Number of tar files found.
len(full_tar_filenames)

102

In [14]:
# Show the first five tar files found in SRC_PATH.
full_tar_filenames[:5]

['/storage/data/eumetsat/native/testing/1354464-1-1of8.tar',
 '/storage/data/eumetsat/native/testing/1354464-1-2of8.tar',
 '/storage/data/eumetsat/native/testing/1354464-1-3of8.tar',
 '/storage/data/eumetsat/native/testing/1354464-1-4of8.tar',
 '/storage/data/eumetsat/native/testing/1354464-1-5of8.tar']

In [15]:
# Remove files which have previously been completed.
#
# Files are matched on their base filename (not the full path), so a tar
# file is still recognised as completed if it was recorded under a
# different directory from the one it is found in now.
completed_base_filenames = [os.path.basename(filename) for filename in full_tar_filenames_completed]

# Membership tests against a set are O(1); testing against the list above
# would re-scan it once per candidate tar file.
_completed_base_filenames_lookup = set(completed_base_filenames)

full_tar_filenames_filtered = sorted(
    full_tar_filename for full_tar_filename in full_tar_filenames
    if os.path.basename(full_tar_filename) not in _completed_base_filenames_lookup)

# Number of tar files still to be processed.
n = len(full_tar_filenames_filtered)
n

91

In [16]:
# Matches the start of a native filename such as
# `MSG3-SEVI-MSG15-0100-NA-20190131050418.075000000Z-...-1.nat` and captures
# the 14 digits after `NA-` (YYYYMMDDHHMMSS).  Written as a raw string so
# that `\d` and `\.` reach the regex engine instead of being treated as
# (invalid) string escape sequences, and compiled once rather than on
# every call.
_NATIVE_FILENAME_PATTERN = re.compile(r'^MSG[23]-SEVI-MSG15-0100-NA-(\d{14})\.')


def get_datetime(inner_tar_name):
    """Return the timestamp embedded in a native file's base filename.

    Args:
        inner_tar_name: Base filename (no directory) of a `.nat` file, e.g.
            `MSG3-SEVI-MSG15-0100-NA-20190131050418.075000000Z-...-1.nat`.

    Returns:
        A naive `datetime` built from the 14-digit `YYYYMMDDHHMMSS` stamp
        that follows `NA-`.  The fractional seconds are ignored.

    Raises:
        ValueError: If the filename does not have the expected form, or
            the digits do not form a valid date and time.
    """
    title_match = _NATIVE_FILENAME_PATTERN.match(inner_tar_name)
    if title_match is None:
        # Fail with a message naming the offending file, rather than an
        # AttributeError from calling `.group` on None.
        raise ValueError(
            'Cannot extract datetime from filename: {!r}'.format(inner_tar_name))
    return datetime.strptime(title_match.group(1), "%Y%m%d%H%M%S")

# TODO: Logging

# For each tar file not yet processed: extract it into its own temporary
# directory, compress every `.nat` file with pbzip2, move the resulting
# `.bz2` files into DST_PATH/<year>/<month>/<day>/<hour>/<minute>/, move the
# tar file to DEL_PATH and record it in LIST_OF_COMPLETED_FILES.

# `shutil.move` only moves a file *into* `dst` when `dst` is an existing
# directory; otherwise it renames the file to `dst`.  Create DEL_PATH up
# front so that finished tar files always end up inside that directory.
os.makedirs(DEL_PATH, exist_ok=True)

for i, full_tar_filename in enumerate(full_tar_filenames_filtered):
    print(i+1, 'of', n, '= {:.0%}'.format((i+1)/n), ' : ', full_tar_filename)
    base_tar_filename = os.path.basename(full_tar_filename)

    # Extract tar file to TMP_PATH/base_filename/
    new_tmp_path = os.path.join(TMP_PATH, os.path.splitext(base_tar_filename)[0])
    os.makedirs(new_tmp_path, exist_ok=True)
    with tarfile.open(full_tar_filename) as tar:
        # `extractall` will throw a `ReadError: unexpected end of data` if the
        # tar file is incomplete.
        tar.extractall(new_tmp_path)

    # Now compress & move each native file
    full_native_filenames = glob.glob(os.path.join(new_tmp_path, '*.nat'))
    print('Found', len(full_native_filenames), 'native files')
    # TODO: Check filesize of the native files?
    for full_native_filename in full_native_filenames:
        # '\r' returns to the start of the line so each filename overwrites
        # the previous one instead of printing one line per file.
        print('\r', full_native_filename, end='', flush=True)
        # TODO: Capture output of subprocess
        # pbzip2 replaces `<name>.nat` with `<name>.nat.bz2` in place.
        # `check=True` raises CalledProcessError on a non-zero exit status.
        subprocess.run(['pbzip2', '-5', full_native_filename], check=True)
        base_native_filename = os.path.basename(full_native_filename)
        dt = get_datetime(base_native_filename)
        new_dst_path = os.path.join(DST_PATH, dt.strftime("%Y/%m/%d/%H/%M"))
        os.makedirs(new_dst_path, exist_ok=True)
        shutil.move(src=full_native_filename + '.bz2', dst=new_dst_path)

    # Tidy up.  `os.rmdir` only removes an empty directory, so this raises
    # if anything other than the moved native files was extracted.
    os.rmdir(new_tmp_path)
    shutil.move(src=full_tar_filename, dst=DEL_PATH)

    # Record the tar file as completed only after everything above
    # succeeded, so that an interrupted run re-processes it.
    with open(LIST_OF_COMPLETED_FILES, 'a') as fh:
        fh.write('{}\n'.format(full_tar_filename))

    # End the line that the '\r' progress output was written on.
    print()

1 of 91 = 1%  :  /storage/data/eumetsat/native/testing/1355708-1-1of8.tar
Found 29 native files
 /storage/data/eumetsat/native/temp/1355708-1-1of8/MSG3-SEVI-MSG15-0100-NA-20190131050418.075000000Z-20190131050435-1355708-1.nat
2 of 91 = 2%  :  /storage/data/eumetsat/native/testing/1355708-1-2of8.tar
Found 29 native files
 /storage/data/eumetsat/native/temp/1355708-1-2of8/MSG3-SEVI-MSG15-0100-NA-20190131073418.039000000Z-20190131073435-1355708-1.nat
3 of 91 = 3%  :  /storage/data/eumetsat/native/testing/1355708-1-3of8.tar
Found 29 native files
 /storage/data/eumetsat/native/temp/1355708-1-3of8/MSG3-SEVI-MSG15-0100-NA-20190131085918.180000000Z-20190131085936-1355708-1.nat
4 of 91 = 4%  :  /storage/data/eumetsat/native/testing/1355708-1-4of8.tar
Found 29 native files
 /storage/data/eumetsat/native/temp/1355708-1-4of8/MSG3-SEVI-MSG15-0100-NA-20190131105917.679000000Z-20190131105935-1355708-1.nat
5 of 91 = 5%  :  /storage/data/eumetsat/native/testing/1355708-1-5of8.tar
Found 29 native files
