### Prepare ATCO2 datasets

* checks have been moved to check_files

In [1]:
import os
import glob
from os import path
import pandas as pd
from datasets import Dataset, load_dataset, Audio

In [2]:
# directory that contains the wav files and the CSV index
DATA_DIR = "atco2_orig2"
# name of the CSV file (read from DATA_DIR)
ATCO2_CSV = "atco2.csv"

# fraction of the examples assigned to the test split
TEST_FRAC = 0.1

# the name of the HF dataset (also the dir where it is saved)
HF_DIR = "atco2_hf"

# the expected sampling rate
SAMPLING_RATE = 16000

In [3]:
def remove_path(f_name):
    """Return the part of ``f_name`` that follows its last "/".

    A name that contains no "/" is returned unchanged; a name that ends
    with "/" yields an empty string.
    """
    # rpartition gives (head, separator, tail): the tail is the bare name
    _, _, tail = f_name.rpartition("/")
    return tail

#### Make some minimal checks

In [4]:
# read the CSV stored next to the audio files
csv_file = path.join(DATA_DIR, ATCO2_CSV)
df_atco2 = pd.read_csv(csv_file)

# keep only the bare file name: the directory recorded in the CSV is dropped
df_atco2["path"] = df_atco2["path"].map(remove_path)

df_atco2.head()

Unnamed: 0,path,sentence
0,LKPR_RUZYNE_Radar_120_520MHz_20201025_091112.wav,Oscar Kilo Papa Mike Bravo descend flight leve...
1,LKPR_RUZYNE_Radar_120_520MHz_20201025_120512.wav,Oscar Kilo Kilo Echo Alfa Praha Radar identifi...
2,LKPR_RUZYNE_Radar_120_520MHz_20201025_121325.wav,Ryanair Seven Three Alpha Hotel turn left head...
3,LKPR_RUZYNE_Radar_120_520MHz_20201025_130407.wav,Oscar Kilo Kilo Uniform November proceed direc...
4,LKPR_RUZYNE_Radar_120_520MHz_20201025_140929.wav,Eurowings Seven Alfa Bravo turn right heading ...


In [5]:
# check that for every row there is a wav file
list_wav = glob.glob(path.join(DATA_DIR, "*.wav"))
# set copy of the glob result: constant-time membership tests in the loop below
wav_lookup = set(list_wav)

# check that the CSV and the directory hold the same number of entries
assert df_atco2.shape[0] == len(list_wav)

# check that every row of the CSV has its wav file on disk
df_paths = list(df_atco2["path"].values)

not_found = 0
for my_path in df_paths:
    # add prefix (built with path.join, exactly like the glob pattern above,
    # so both sides of the comparison use the same separator)
    new_path = path.join(DATA_DIR, my_path)
    if new_path not in wav_lookup:
        # report the missing file name (the old message printed the os.path module)
        print(f"{my_path} not found")
        not_found += 1

print()
if not_found == 0:
    print("All files match...")
else:
    print(f"{not_found} files not found...")


All files match...


#### Create the dataset

In [6]:
# create lists
# pull the two columns out of the dataframe as plain lists
list_path = df_atco2["path"].tolist()
list_text = df_atco2["sentence"].tolist()

# prefix every file name with the relative data directory
list_new_path = [f"./{DATA_DIR}/{f_name}" for f_name in list_path]

# "path" and "audio" both carry the file location; "sentence" carries the text
audio_dict = {
    "path": list_new_path,
    "audio": list_new_path,
    "sentence": list_text,
}

In [7]:
# create the dataset
# audio feature configured with the expected sampling rate (16 kHz)
audio_feature = Audio(sampling_rate=SAMPLING_RATE)

# create the dataset from the dictionary and cast its "audio" column in one go
audio_dataset = Dataset.from_dict(audio_dict).cast_column("audio", audio_feature)

#### Check

In [8]:
# display the first example of the dataset as a quick visual check
audio_dataset[0]

{'path': './atco2_orig2/LKPR_RUZYNE_Radar_120_520MHz_20201025_091112.wav',
 'audio': {'path': './atco2_orig2/LKPR_RUZYNE_Radar_120_520MHz_20201025_091112.wav',
  'array': array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -6.1035156e-05, -6.1035156e-05, -6.1035156e-05], dtype=float32),
  'sampling_rate': 16000},
 'sentence': 'Oscar Kilo Papa Mike Bravo descend flight level one hundred level one hundred Oscar Kilo Papa Mike Bravo '}

In [9]:
# fixed seed: the shuffled train/test split is the same on every run
SPLIT_SEED = 42

# split into "train" and "test", keeping TEST_FRAC of the examples for test
atco2_ds = audio_dataset.train_test_split(
    shuffle=True, test_size=TEST_FRAC, seed=SPLIT_SEED
)

In [10]:
# display the first example of the train split
atco2_ds["train"][0]

{'path': './atco2_orig2/LSGS_SION_Ground_Control_121_7MHz_20210501_065354.wav',
 'audio': {'path': './atco2_orig2/LSGS_SION_Ground_Control_121_7MHz_20210501_065354.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00045776,
         -0.00045776, -0.00045776], dtype=float32),
  'sampling_rate': 16000},
 'sentence': 'Hotel Echo X-ray report downwind wind calm runway two five cleared for takeoff '}

#### Save the dataset

In [11]:
# save the dataset in HF format
# (atco2_ds holds both the train and the test split; it is written under HF_DIR)

atco2_ds.save_to_disk(HF_DIR)

# confirmation message with the target directory
print(f"Dataset saved in HF format in {HF_DIR}")

Flattening the indices:   0%|          | 0/504 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/504 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/56 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/56 [00:00<?, ? examples/s]

Dataset saved in HF format in atco2_hf
