In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Root directory of the SEVIRI RSS native files. The cells below glob underneath it
# (three directory levels deep) to find `.nat` / `.nat.bz2` files and per-day directories.
EUMETSAT_PATH = Path("/mnt/storage_a/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/satellite/EUMETSAT/SEVIRI_RSS/native/")

## Delete `.nat` files if a compressed `.nat.bz2` copy already exists:

In [38]:
%%time
# Collect every `.nat` file that sits three directory levels below EUMETSAT_PATH.
nat_filenames = list(EUMETSAT_PATH.glob('*/*/*/*.nat'))

CPU times: user 249 ms, sys: 131 ms, total: 380 ms
Wall time: 1.04 s


In [40]:
# Remove each uncompressed `.nat` file whose `.bz2` sibling is already on disk.
for nat_filename in nat_filenames:
    # The compressed sibling has the same name with `.bz2` appended.
    bz2_filename = nat_filename.with_name(nat_filename.name + '.bz2')
    if not bz2_filename.exists():
        # No compressed copy exists, so leave the `.nat` file alone.
        continue
    print(f'Deleting {nat_filename}')
    nat_filename.unlink()

Deleting /mnt/storage_a/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/satellite/EUMETSAT/SEVIRI_RSS/native/2019/06/01/MSG3-SEVI-MSG15-0100-NA-20190601124416.900000000Z-20190601124434-1358336-1.nat
Deleting /mnt/storage_a/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/satellite/EUMETSAT/SEVIRI_RSS/native/2019/06/01/MSG3-SEVI-MSG15-0100-NA-20190601124916.989000000Z-20190601124935-1358336-1.nat
Deleting /mnt/storage_a/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/satellite/EUMETSAT/SEVIRI_RSS/native/2019/06/01/MSG3-SEVI-MSG15-0100-NA-20190601125417.078000000Z-20190601125504-1358336-1.nat
Deleting /mnt/storage_a/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/satellite/EUMETSAT/SEVIRI_RSS/native/2019/06/01/MSG3-SEVI-MSG15-0100-NA-20190601125917.167000000Z-20190601125935-1358336-1.nat
Deleting /mnt/storage_a/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/satellite/EUMETSAT/SEVIRI_RSS/native/2019/06/01/MSG3-SEVI-MSG15-0100-NA-20190601122416.

In [43]:
# Re-scan for `.nat` files and report how many currently exist under EUMETSAT_PATH.
nat_filenames = list(EUMETSAT_PATH.glob('*/*/*/*.nat'))
print(f'{len(nat_filenames)} nat filenames exist.')

0 nat filenames exist.


## Remove duplicate `.nat.bz2` files left over from accidentally ordering the same datetime twice

In [7]:
def find_duplicate_files(year_month_day_directory: Path) -> list[Path]:
    """Return the redundant `.nat.bz2` files found directly inside one directory.

    Each filename (minus the `.nat.bz2` extension) is split on `-`. The first
    seven fields form the key that identifies a file; the remaining fields are
    treated as the order number. When several files share a key, the one whose
    order number sorts last is kept and all the others are returned.

    Note that order numbers are compared as strings, so the ordering is
    lexicographic (e.g. '1402904-10' sorts before '1402904-2').

    Args:
        year_month_day_directory: Directory to scan (not searched recursively).

    Returns:
        Full paths of the files to delete, sorted by key and then order number.
        Empty if the directory holds no duplicates.
    """
    extension = '.nat.bz2'

    # Build one (key, order number) record per compressed file in the directory.
    records = []
    for compressed_path in year_month_day_directory.glob('*.nat.bz2'):
        fields = compressed_path.name.replace(extension, '').split('-')
        records.append(('-'.join(fields[:7]), '-'.join(fields[7:])))

    table = pd.DataFrame(
        records,
        columns=['filename_without_order_number', 'order_number'],
    ).sort_values(by=['filename_without_order_number', 'order_number'])

    # After sorting, every row except the last one for a given key is redundant.
    is_redundant = table['filename_without_order_number'].duplicated(keep='last')

    # Re-assemble the original filenames of the redundant rows.
    return [
        year_month_day_directory / f'{key}-{order_number}{extension}'
        for key, order_number in table[is_redundant].itertuples(index=False, name=None)
    ]

In [14]:
def list_directories(path: Path, pattern: str) -> list[Path]:
    """Return the directories under `path` that match a glob pattern.

    Args:
        path: Directory the glob is evaluated relative to.
        pattern: Glob pattern passed to `Path.glob`.

    Returns:
        The matches that are directories, in the order the glob yields them.
        Matches that are not directories are dropped.
    """
    directories = []
    for candidate in path.glob(pattern):
        if candidate.is_dir():
            directories.append(candidate)
    return directories

In [16]:
%%time
# Collect every directory three levels below EUMETSAT_PATH
# (one per year/month/day, going by the variable name and the paths in the outputs).
year_month_day_directories = list_directories(path=EUMETSAT_PATH, pattern='*/*/*')
# Directory count; also used as the denominator of the progress line in the scan loop.
n = len(year_month_day_directories)
n

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 33.6 ms


613

In [21]:
%%time

# Dry run: gather the duplicate files from every day directory and total their size.
# Nothing is deleted here.
files_to_remove = []
total_bytes_to_remove = 0

for i, year_month_day_directory in enumerate(year_month_day_directories):
    # The leading '\r' rewrites a single status line instead of printing one line per directory.
    print(f'\r{i+1:3d}/{n:3d}: {year_month_day_directory}', end='', flush=True)

    filenames_to_delete = find_duplicate_files(year_month_day_directory)
    files_to_remove += filenames_to_delete

    # Add the on-disk size of this directory's duplicates to the running total.
    total_bytes_to_remove += sum(x.stat().st_size for x in filenames_to_delete)

613/613: /mnt/storage_a/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/satellite/EUMETSAT/SEVIRI_RSS/native/2020/02/09CPU times: user 3.09 s, sys: 304 ms, total: 3.4 s
Wall time: 4.14 s


In [27]:
# Number of duplicate files gathered in `files_to_remove` by the scan loop.
# `files_to_remove` is the list that loop builds; no cell defines `files_removed`,
# so referencing it fails on a fresh kernel.
n_files_to_remove = len(files_to_remove)
n_files_to_remove

14646

In [23]:
# Total size of the duplicate files in gigabytes (1e9 bytes).
# `total_bytes_to_remove` is the counter the scan loop accumulates;
# `total_bytes_removed` is never defined and fails on a fresh kernel.
total_bytes_to_remove / 1E9

533.745391468

In [29]:
%%time

# Actually remove the files
# Recompute the count here so this cell only depends on `files_to_remove` from the scan loop.
n_files_to_remove = len(files_to_remove)
for i, filename in enumerate(files_to_remove):
    # Refresh the status line on every 100th file and on the last one.
    # (`if i % 100:` did the opposite: it printed for every file except each 100th.)
    if i % 100 == 0 or i + 1 == n_files_to_remove:
        print(f'\r{i+1:5d}/{n_files_to_remove:5d}: {filename}', end='', flush=True)
    # missing_ok lets the cell be re-run after an interrupted pass without
    # failing on files that were already deleted.
    filename.unlink(missing_ok=True)

14646/14646: /mnt/storage_a/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/satellite/EUMETSAT/SEVIRI_RSS/native/2020/02/09/MSG3-SEVI-MSG15-0100-NA-20200209235916.881000000Z-20200209235934-1402904-10.nat.bz2CPU times: user 10.3 s, sys: 2.24 s, total: 12.5 s
Wall time: 2min 9s
