This little script untars satellite image files downloaded from EUMETSAT.

Set `SRC_PATH` to be the directory holding all the `.tar` files downloaded from EUMETSAT.

Set `DST_PATH` to be the destination path.  This script will store data in a directory structure of the form `<year>/<month>/<day>/<hour>/<minute>/`, e.g. `2019/01/10/20/55`.

In [1]:
import tarfile
import re
import os
from datetime import datetime

In [2]:
# Root directory under which the inputs, outputs and progress log all live.
PATH = "/media/jack/Backup/data_not_stored_on_laptop/eumetsat/dexter"

# Input: the directory containing the tar files downloaded from EUMETSAT.
SRC_PATH = os.path.join(PATH, "downloaded_tar")

# Output: root of the directory tree the archives are unpacked into.
DST_PATH = os.path.join(PATH, "jpegs")

# Progress log: a text file naming the tar files which have already been
# processed, so that this script can restart from where it left off if
# it is interrupted.
LIST_OF_COMPLETED_FILES = os.path.join(PATH, "completed_files.txt")

In [3]:
# Load list of completed files
# Start from an empty list; it is replaced below if a progress log from a
# previous run exists.
completed_files = []
if os.path.exists(LIST_OF_COMPLETED_FILES):
    with open(LIST_OF_COMPLETED_FILES, 'r') as fh:
        # One filename per line; strip the trailing newline (and any
        # surrounding whitespace) from each entry.
        completed_files = [line.strip() for line in fh]

In [4]:
# Display the names loaded from the progress log as this cell's output.
completed_files

['1313606-1-1of2.tar', '1284046-1of1.tar', '1313642-2-1of1.tar']

In [5]:
# Every entry in SRC_PATH is treated as a tar file to process.
# NOTE(review): os.listdir does not filter by extension, so this assumes
# the directory holds nothing but the downloaded tar files -- confirm.
filenames = os.listdir(SRC_PATH)
# Display how many entries were found.
len(filenames)

95

In [6]:
# Remove files which have previously been completed
filenames = list(set(filenames) - set(completed_files))
filenames.sort()
n = len(filenames)
n

92

In [7]:
# Compiled once rather than on every call.  A raw string is used so that
# `\d` reaches the regex engine instead of being treated as an (invalid)
# string escape sequence.
_INNER_TAR_NAME_RE = re.compile(r'^MSG[23]-SEVI-MSG15-0100-NA-(\d*)\.')


def get_datetime(inner_tar_name):
    """Return the timestamp encoded in the name of an inner tar file.

    The name must start with ``MSG2-`` or ``MSG3-`` followed by
    ``SEVI-MSG15-0100-NA-`` and a run of digits terminated by a dot, e.g.
    ``MSG2-SEVI-MSG15-0100-NA-20140109224916.597000000Z-...``.  The digits
    are parsed as ``YYYYmmddHHMMSS``; anything after the dot is ignored.

    Args:
        inner_tar_name: Name of the tar member, as a string.

    Returns:
        A naive ``datetime`` built from the digits in the name.

    Raises:
        ValueError: If the name does not follow the expected pattern, or
            if the digits cannot be parsed as a timestamp.
    """
    title_match = _INNER_TAR_NAME_RE.match(inner_tar_name)
    if title_match is None:
        # Fail with a message naming the offending file instead of an
        # opaque AttributeError on `None.group`.
        raise ValueError(
            'Unexpected inner tar name: {!r}'.format(inner_tar_name))
    return datetime.strptime(title_match.group(1), "%Y%m%d%H%M%S")


# Unpack every outstanding outer tar file.  Each outer tar contains inner
# tar files; the contents of each inner tar are extracted into
# DST_PATH/<year>/<month>/<day>/<hour>/<minute>, taken from the timestamp
# in the inner tar's name.
for i, filename in enumerate(filenames):
    print(i+1, 'of', n, '= {:.0%}'.format((i+1)/n), ' : ', filename)
    full_filename = os.path.join(SRC_PATH, filename)
    # Context managers make sure every archive handle is closed, even if
    # extraction fails part-way through.
    with tarfile.open(full_filename) as tar:
        for inner_tar in tar:
            # '\r' keeps the progress output on a single terminal line.
            print('\r', inner_tar.name, end='', flush=True)
            extracted = tar.extractfile(inner_tar)
            if extracted is None:
                # extractfile returns None for members that are not
                # regular files or links (e.g. directories); there is
                # nothing to unpack for those.
                continue
            with extracted:
                dt = get_datetime(inner_tar.name)
                dir_name = os.path.join(
                    DST_PATH, dt.strftime("%Y/%m/%d/%H/%M"))
                # exist_ok avoids the check-then-create race.
                os.makedirs(dir_name, exist_ok=True)
                with tarfile.open(fileobj=extracted) as extracted_tar:
                    # NOTE(review): extractall trusts the member paths
                    # stored in the archive.  Consider passing
                    # filter='data' on Python versions that support it.
                    extracted_tar.extractall(dir_name)
    # Record the file only after all of its members were extracted, so an
    # interrupted run re-processes it from scratch next time.
    with open(LIST_OF_COMPLETED_FILES, 'a') as fh:
        fh.write('{}\n'.format(filename))
    print()

1 of 92 = 1%  :  1284044-1of1.tar
 MSG2-SEVI-MSG15-0100-NA-20140109224916.597000000Z-20140109224935-1284044.tarMSG2-SEVI-MSG15-0100-NA-20140109070415.949000000Z-20140109070435-1284044.tarMSG2-SEVI-MSG15-0100-NA-20140103160415.986000000Z-20140103160436-1284044.tar
2 of 92 = 2%  :  1284048-1of2.tar
 MSG2-SEVI-MSG15-0100-NA-20140216185416.488000000Z-20140216185435-1284048.tarMSG2-SEVI-MSG15-0100-NA-20140228095417.667000000Z-20140228095505-1284048.tarMSG2-SEVI-MSG15-0100-NA-20140227213916.707000000Z-20140227213936-1284048.tar
3 of 92 = 3%  :  1284048-2of2.tar
 MSG2-SEVI-MSG15-0100-NA-20140218021915.393000000Z-20140218021935-1284048.tar
4 of 92 = 4%  :  1284288-1-1of2.tar
 MSG2-SEVI-MSG15-0100-NA-20160702030917.275000000Z-20160702030935-1284288-1.tarMSG2-SEVI-MSG15-0100-NA-20160706083414.454000000Z-20160706083432-1284288-1.tar
5 of 92 = 5%  :  1284288-1-2of2.tar
 MSG2-SEVI-MSG15-0100-NA-20160716221918.092000000Z-20160716221936-1284288-1.tarMSG2-SEVI-MSG15-0100-NA-20160716182917.798000000Z-2