### Merge two HF Audio Datasets

In this notebook we load two Hugging Face datasets (ATCO2 and ATCOSIM), give them the same structure (same columns and column names), and merge them.

We choose to merge:
* train with train
* test with test

Finally, we save the concatenated dataset locally.

In [1]:
from datasets import load_dataset, load_from_disk, concatenate_datasets
from datasets import Dataset, DatasetDict

In [2]:
# --- input datasets (directories readable by load_from_disk) ---
HF_DS1 = "atco2_hf"
HF_DS2 = "atcosim_hf"

# --- output directory for the merged dataset ---
HF_MERGED = "atco2_atcosim_hf"

# seed used when shuffling, so the row order is reproducible
SEED = 42

# columns that exist only in ds2: they are dropped
# so that both datasets end up with the same structure
to_remove = [
    "segment_start_time",
    "segment_end_time",
    "duration",
]

In [3]:
# Load the first dataset (directory HF_DS1) from disk
# and display its splits, columns and row counts
ds1 = load_from_disk(HF_DS1)

ds1

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'sentence'],
        num_rows: 504
    })
    test: Dataset({
        features: ['path', 'audio', 'sentence'],
        num_rows: 56
    })
})

In [4]:
# Inspect the first example of the ds1 test split to see the content of each column
ds1["test"][0]

{'path': './atco2_orig2/LSGS_SION_Ground_Control_121_7MHz_20210505_105917.wav',
 'audio': {'path': 'LSGS_SION_Ground_Control_121_7MHz_20210505_105917.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -9.15527344e-05, -9.15527344e-05, -9.15527344e-05]),
  'sampling_rate': 16000},
 'sentence': 'Tango Three Three Five frequency change approved good bye good bye '}

In [5]:
# Load the second dataset (directory HF_DS2) from disk
# and display its splits, columns and row counts
ds2 = load_from_disk(HF_DS2)

ds2

DatasetDict({
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1901
    })
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 7638
    })
})

In [6]:
# Inspect the first example of the ds2 test split to compare its columns with those of ds1
ds2["test"][0]

{'id': 'atcosim_zf1_05_146_000000_000350',
 'audio': {'path': None,
  'array': array([-0.01065305, -0.0091649 ,  0.00305132, ...,  0.00235061,
          0.00200877, -0.00022785]),
  'sampling_rate': 16000},
 'text': 'lufthansa four three nine three descend to flight level two seven zero',
 'segment_start_time': 0.0,
 'segment_end_time': 3.5,
 'duration': 3.5}

#### Rename columns and remove the extra ones

In [7]:
# Bring both datasets to the common schema: id, audio, sentence.
#
# Every step is guarded by a column check so that this cell can be re-run
# safely: an unguarded rename/remove raises an error on the second run,
# because the source column no longer exists.

# ds1: "path" is used as the record identifier -> rename it to "id"
if all("path" in cols for cols in ds1.column_names.values()):
    ds1 = ds1.rename_column("path", "id")

# ds2: the transcription column is called "text" -> rename it to "sentence"
if all("text" in cols for cols in ds2.column_names.values()):
    ds2 = ds2.rename_column("text", "sentence")

# ds2: drop the columns that have no counterpart in ds1
# (columns already removed by a previous run are skipped)
cols_to_drop = [
    col
    for col in to_remove
    if all(col in cols for cols in ds2.column_names.values())
]
if cols_to_drop:
    ds2 = ds2.remove_columns(cols_to_drop)

# Fail early, with a readable message, if the two schemas still differ
# on the splits that are going to be merged.
for split in ("train", "test"):
    cols1 = ds1[split].column_names
    cols2 = ds2[split].column_names
    assert set(cols1) == set(cols2), (
        f"Column mismatch in split '{split}': {cols1} vs {cols2}"
    )

In [8]:
# Check the structure of ds1 after the column renaming
ds1

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'sentence'],
        num_rows: 504
    })
    test: Dataset({
        features: ['id', 'audio', 'sentence'],
        num_rows: 56
    })
})

In [9]:
# Check the structure of ds2 after the column renaming and removal
ds2

DatasetDict({
    test: Dataset({
        features: ['id', 'audio', 'sentence'],
        num_rows: 1901
    })
    train: Dataset({
        features: ['id', 'audio', 'sentence'],
        num_rows: 7638
    })
})

In [10]:
# Merge split by split: train with train, test with test
train_parts = [ds1["train"], ds2["train"]]
test_parts = [ds1["test"], ds2["test"]]

# Concatenate, then shuffle with a fixed seed so that the rows coming from
# the two sources are mixed in a reproducible order
ds_train = concatenate_datasets(train_parts).shuffle(seed=SEED)
ds_test = concatenate_datasets(test_parts).shuffle(seed=SEED)

Loading cached shuffled indices for dataset at /Users/lsaetta/Progetti/hf-audio-datasets/atco2_hf/train/cache-3461719dad91c431.arrow
Loading cached shuffled indices for dataset at /Users/lsaetta/Progetti/hf-audio-datasets/atco2_hf/test/cache-9b102643e96462bf.arrow


In [11]:
# Bundle the merged splits into a single DatasetDict
ds = DatasetDict({"train": ds_train, "test": ds_test})

In [12]:
# Display the merged dataset: splits, columns and row counts
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'sentence'],
        num_rows: 8142
    })
    test: Dataset({
        features: ['id', 'audio', 'sentence'],
        num_rows: 1957
    })
})

In [13]:
# Look at the first two rows of the merged (shuffled) train split
ds["train"][0:2]

{'id': ['./atco2_orig2/LSZB_BERN_Tower_121_0MHz_20210421_094113.wav',
  'atcosim_zf1_04_064_000000_000594'],
 'audio': [{'path': 'LSZB_BERN_Tower_121_0MHz_20210421_094113.wav',
   'array': array([0.        , 0.        , 0.        , ..., 0.07315063, 0.07467651,
          0.07498169]),
   'sampling_rate': 16000},
  {'path': None,
   'array': array([-0.00191666,  0.00078971, -0.00110885, ...,  0.00132176,
           0.00324113,  0.00264592]),
   'sampling_rate': 16000}],
 'sentence': ["Hotel Golf Papa unknown VFR traffic a helicopter at your eleven o'clock position two miles same altitude opposite direction ",
  'air france one five five four due traffic turn disregard descend to flight level three three zero']}

In [14]:
# Save the merged DatasetDict to disk in HF format, in the directory named by HF_MERGED

ds.save_to_disk(HF_MERGED)

print(f"Dataset saved in HF format in {HF_MERGED}")

Loading cached processed dataset at /Users/lsaetta/Progetti/hf-audio-datasets/atco2_hf/train/cache-e0918e897043a2d5.arrow


Saving the dataset (0/5 shards):   0%|          | 0/8142 [00:00<?, ? examples/s]

Loading cached processed dataset at /Users/lsaetta/Progetti/hf-audio-datasets/atco2_hf/test/cache-7575155bb3dd26af.arrow


Saving the dataset (0/1 shards):   0%|          | 0/1957 [00:00<?, ? examples/s]

Dataset saved in HF format in atco2_atcosim_hf
