In [56]:
# standard library
import os, sys
from os.path import join, exists
from json import dump
# external
import numpy as np
import torch
import pandas as pd
# local
# -

Splitting images into Train/Val/Test

- Caveat: multiple images might capture the same area, so a purely random per-image split can place the same area in more than one subset.

In [58]:
# --- configuration ---------------------------------------------------------
# Fractions of the usable images assigned to train and val; whatever is left
# over becomes the test subset.
split_train = 0.8
split_val = 0.1

# Directory layout below the interim data folder.
path_data = '/mnt/sdb1/code/sentinel2/interim'
path_images, path_masks, path_filelists = (
    join(path_data, subdir) for subdir in ('subscenes', 'masks', 'filelists')
)

# Seed NumPy's global RNG so the shuffle that follows is repeatable.
seed = 42
np.random.seed(seed)

In [59]:
# Split images into train, val, test.
#
# Only images that have a same-named file in `path_masks` are used. The sorted
# file list is shuffled with an RNG seeded from `seed` and cut into three
# contiguous slices: the first `split_train` fraction is train, the next
# `split_val` fraction is val, and everything remaining is test.
images_filenames = sorted(os.listdir(path_images))  # os.listdir order is arbitrary
files = np.array([f for f in images_filenames if exists(join(path_masks, f))])

# Cell-local RNG: yields the same permutation as `np.random.permutation` called
# right after `np.random.seed(seed)`, but the result no longer depends on how
# far the global NumPy RNG has advanced, so re-running this cell is idempotent.
# Note: this cell therefore does not consume numbers from the global RNG.
rng_split = np.random.RandomState(seed)
indices = rng_split.permutation(len(files))

n_train = int(split_train * len(files))  # int() truncates the fractional part
n_val = int(split_val * len(files))
indices_train = indices[:n_train]
indices_val = indices[n_train:n_train+n_val]
indices_test = indices[n_train+n_val:]  # remainder, absorbs the truncation
print(f'{len(indices_train)} train, {len(indices_val)} val, {len(indices_test)} test')

files_train = files[indices_train]
files_val = files[indices_val]
files_test = files[indices_test]

410 train, 51 val, 52 test


In [61]:
# Write each subset's filenames to its own single-column CSV
# (no header row, no index column).
path_train, path_val, path_test = (
    join(path_filelists, csv_name)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)
# exist_ok=False: raise instead of writing into an already existing directory
os.makedirs(path_filelists, exist_ok=False)
for csv_path, subset_files in zip(
    (path_train, path_val, path_test),
    (files_train, files_val, files_test),
):
    pd.DataFrame(subset_files).to_csv(csv_path, index=False, header=False)

In [63]:
# Store the split parameters next to the filelists as JSON.
created_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')  # local time of this run
metadata = {
    'split_train': split_train,
    'split_val': split_val,
    'seed': seed,
    'total': len(files),  # number of images that had a matching mask
    'timestamp': created_at,
}
path_metadata = join(path_filelists, 'metadata.json')
with open(path_metadata, 'w') as fp:
    dump(metadata, fp, indent=4)