# Distribute YAML config files
This notebook is used to copy a template YAML file (where some information has already been filled in) to a set of dataset directories. Each YAML file is then modified, filling in the details that are specific to each dataset (date, grouping, area, number of raw files). These details (except number of files) are inferred from the name of the dataset directory, which follows the pattern `massimal_<grouping>_<area>_<datetime>-<optional_identifier(s)>_hsi`. 

In [1]:
import yaml
from pathlib import Path  
import shutil

In [2]:
base_dirs = ['/home/mha114/data/seabee-minio']
yaml_file_name = 'config.seabee.yaml'

In [3]:
def parse_config(yaml_path):
    """Parse YAML config file, accepting only basic YAML tags"""
    with open(yaml_path, "r") as stream:
        data = yaml.safe_load(stream)
    return data

def write_config(data,yaml_path):
    """Write YAML config file, using only basic YAML tags"""
    with open(yaml_path, mode="wt",encoding="utf-8") as stream:
        yaml.safe_dump(data,stream,sort_keys=False)

In [4]:
def find_hsi_datasets(base_dir:Path,dataset_subdir_search_str='0_raw'):
    """Find HSI dataset paths based on expected subdirectory in dataset"""
    subdirs = base_dir.rglob(dataset_subdir_search_str)
    return [subdir.parent for subdir in subdirs]

In [5]:
# "Dry-run", show found dataset dirs
for base_dir in base_dirs:
    print(f'BASE DIR: {base_dir}')
    dataset_dirs = find_hsi_datasets(Path(base_dir))
    for dd in dataset_dirs:
        print('    ' + dd.name)

BASE DIR: /home/mha114/data/seabee-minio
    massimal_smola_skalmen_202306201736-nw_hsi
    massimal_smola_skalmen_202306201520-nw_hsi
    massimal_smola_skalmen_202306201815-se_hsi
    massimal_smola_skalmen_202306201640-nw_hsi
    massimal_smola_skalmen_202306201552-nw_hsi
    massimal_smola_skalmen_202306201709-nw_hsi
    massimal_smola_skalmen_202306201842-se_hsi
    massimal_smola_maholmen_202306191438-all_hsi
    massimal_smola_maholmen_202306211432-3_hsi
    massimal_smola_maholmen_202306211228-2_hsi
    massimal_smola_maholmen_202306211155-2_hsi
    massimal_smola_maholmen_202306211324-2_hsi
    massimal_smola_maholmen_202306211129-2_hsi
    massimal_smola_maholmen_202306211355-3_hsi
    massimal_larvik_olbergholmen_202108251029-south_hsi
    massimal_larvik_olbergholmen_202108251318-north2_hsi
    massimal_larvik_olbergholmen_202108250952-north1_hsi
    massimal_larvik_olbergholmen_202308301025-north_hsi
    massimal_larvik_olbergholmen_202308300939-south2_hsi
    massimal_lar

In [6]:
# Copy template to all dataset dirs
for base_dir in base_dirs:
    dataset_dirs = find_hsi_datasets(Path(base_dir))
    for dataset_dir in dataset_dirs:
        shutil.copy(yaml_file_name,dataset_dir)

In [7]:
# Update YAML files with date, area etc. specific to each dataset
for dataset_dir in dataset_dirs:
    # Get dataset information from dataset directory name
    try:
        _,grouping,area,datetime,_ = dataset_dir.name.split('_')
        datetime = datetime.split('-')[0] # Remove additional dataset identifier, e.g. "north"
        nfiles = sum([Path(raw_dir).is_dir() for raw_dir in (dataset_dir / '0_raw').glob('*')])
    except ValueError:
        print(f'Error while parsing dataset {dataset_dir}')
        
    # Read current YAML file
    dataset_yaml_path = dataset_dir / yaml_file_name
    yaml_data = parse_config(dataset_yaml_path)

    # Update with information from directory name
    yaml_data['grouping'] = grouping
    yaml_data['area'] = area
    yaml_data['datetime'] = datetime
    yaml_data['nfiles'] = nfiles
    write_config(yaml_data,dataset_yaml_path)

    