### Prepare ATCO2 dataset with augmentation

* checks have been moved to check_files
* augmentation: change speed (0.9, 1, 1.1): see https://jonathanbgn.com/2021/08/30/audio-augmentation.html

In [1]:
import glob
import os
import random
from os import path

import pandas as pd
from datasets import Audio, Dataset, concatenate_datasets, load_dataset
from tqdm import tqdm

# for augmentation
import torchaudio

In [2]:
# seed passed to the train/test shuffle
SEED = 42

# fraction of the records used as test split
TEST_FRAC = 0.1

# input: directory of the original data and name of the CSV index read from it
DATA_DIR = "atco2_orig2"
ATCO2_CSV = "atco2.csv"

# output: directory where the augmented wav files are written
LOCAL_AUG_DIR = "atco2_augmentation"

# output: directory for the augmented dataset
HF_DIR = "atco2_hf_augmented"

# the expected sampling rate
SAMPLING_RATE = 16000

In [3]:
def remove_path(f_name):
    """Return the part of f_name that follows its last "/".

    A name without any "/" is returned unchanged; a name ending in "/"
    yields an empty string.
    """
    _, _, base_name = f_name.rpartition("/")
    return base_name

# for audio augmentation
# from https://jonathanbgn.com/2021/08/30/audio-augmentation.html

class AudioSpeedChanger:
    """Callable that changes the speed of an audio tensor with sox effects.

    The instance stores the sample rate that is passed to sox both as the
    rate of the input and as the target of the final "rate" effect.
    """

    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def __call__(self, audio_data, speed_factor):
        """Return audio_data played at speed_factor (one of 0.9, 1.0, 1.1).

        A factor of 1.0 returns the input object untouched; any other
        accepted factor goes through the sox "speed" effect followed by
        "rate" to bring the result back to self.sample_rate.
        """
        # only these three factors are supported
        assert speed_factor in [0.9, 1.0, 1.1]

        if speed_factor != 1.0:
            # change speed, then resample to the original rate
            effect_chain = [
                ["speed", str(speed_factor)],
                ["rate", str(self.sample_rate)],
            ]
            audio_data, _ = torchaudio.sox_effects.apply_effects_tensor(
                audio_data, self.sample_rate, effect_chain)

        return audio_data

#### Read the list of files

In [4]:
# the CSV index lives inside the data directory
csv_path = os.path.join(DATA_DIR, ATCO2_CSV)
df_atco2 = pd.read_csv(csv_path)

# keep only the file name: drop whatever directory the CSV recorded
df_atco2["path"] = df_atco2["path"].apply(remove_path)

df_atco2.head()

Unnamed: 0,path,sentence
0,LKPR_RUZYNE_Radar_120_520MHz_20201025_091112.wav,Oscar Kilo Papa Mike Bravo descend flight leve...
1,LKPR_RUZYNE_Radar_120_520MHz_20201025_120512.wav,Oscar Kilo Kilo Echo Alfa Praha Radar identifi...
2,LKPR_RUZYNE_Radar_120_520MHz_20201025_121325.wav,Ryanair Seven Three Alpha Hotel turn left head...
3,LKPR_RUZYNE_Radar_120_520MHz_20201025_130407.wav,Oscar Kilo Kilo Uniform November proceed direc...
4,LKPR_RUZYNE_Radar_120_520MHz_20201025_140929.wav,Eurowings Seven Alfa Bravo turn right heading ...


#### Create the non augmented dataset

In [5]:
# file names and transcripts as plain Python lists
list_path = df_atco2["path"].tolist()
list_text = df_atco2["sentence"].tolist()

# prefix every file name with the local data directory
list_new_path = [f"./{DATA_DIR}/{f_name}" for f_name in list_path]

# "path" and "audio" both start out as the same list of path strings
audio_dict = {
    "path": list_new_path,
    "audio": list_new_path,
    "sentence": list_text,
}

In [6]:
# create the dataset from the in-memory lists
# at this point "audio" still holds the same path strings as "path":
# the cast to Audio is done after augmentation
ds_atco2 = Dataset.from_dict(audio_dict)

# cast to audio moved after augmentation

In [7]:
# shuffle and split in train, test
# TEST_FRAC is passed as the test size and SEED as the shuffle seed
ds_atco2_train_valid = ds_atco2.train_test_split(shuffle=True, test_size=TEST_FRAC, seed=SEED)

In [8]:
# handles to the two splits
ds_atco2_train = ds_atco2_train_valid['train']
ds_atco2_test = ds_atco2_train_valid['test']

# report the size of each split
for split_name in ("train", "test"):
    print(f"We have {len(ds_atco2_train_valid[split_name])} records in {split_name} dataset.")

We have 504 records in train dataset.
We have 56 records in test dataset.


In [9]:
# we only augment train set
# For every training file two extra versions are written to LOCAL_AUG_DIR:
# 0.9 speed (prefix "09_") and 1.1 speed (prefix "11_"). The new paths and
# the matching transcripts are collected in list_new_file_names and
# list_new_sentences, in the order 0.9 then 1.1 for each file.
speed_prefixes = ((0.9, "09_"), (1.1, "11_"))

speed_transform = AudioSpeedChanger(SAMPLING_RATE)

# make sure the output directory exists before the first save
os.makedirs(LOCAL_AUG_DIR, exist_ok=True)

list_new_file_names = []
list_new_sentences = []

train_paths = ds_atco2_train['path']
train_sentences = ds_atco2_train['sentence']

# total lets tqdm show a real progress bar (zip has no length)
for f_name, the_sentence in tqdm(zip(train_paths, train_sentences), total=len(train_paths)):
    # remove the dir
    only_path_name = remove_path(f_name)
    audio_data, sample_rate = torchaudio.load(f_name)

    # speed_transform and torchaudio.save both assume SAMPLING_RATE:
    # bring any file recorded at another rate to it first
    if sample_rate != SAMPLING_RATE:
        audio_data = torchaudio.functional.resample(audio_data, sample_rate, SAMPLING_RATE)

    for speed_factor, prefix in speed_prefixes:
        transformed_audio = speed_transform(audio_data, speed_factor)

        # save to local dir
        aug_pathname = LOCAL_AUG_DIR + "/" + prefix + only_path_name
        torchaudio.save(aug_pathname, transformed_audio, SAMPLING_RATE)

        list_new_file_names.append(aug_pathname)
        list_new_sentences.append(the_sentence)

504it [00:14, 34.34it/s]


In [10]:
# adding to ds_train
# All augmented records are put in one dataset and appended with a single
# concatenation, instead of calling add_item (which returns a new Dataset)
# once per record.
train_split = ds_atco2_train_valid['train']

# reuse the train features so both datasets have exactly the same schema
ds_augmented = Dataset.from_dict(
    {
        "path": list_new_file_names,
        "audio": list_new_file_names,
        "sentence": list_new_sentences,
    },
    features=train_split.features,
)

ds_atco2_train_valid['train'] = concatenate_datasets([train_split, ds_augmented])

In [11]:
# finally we cast to audio
# the cast is applied to ds_atco2_train_valid (both splits) with the expected SAMPLING_RATE
ds_atco2_train_valid = ds_atco2_train_valid.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))

In [12]:
# the splits were replaced by the cast above: refresh the handles
ds_atco2_train = ds_atco2_train_valid['train']
ds_atco2_test = ds_atco2_train_valid['test']

# see if the numbers are ok
for split_name in ("train", "test"):
    n_records = len(ds_atco2_train_valid[split_name])
    print(f"We have {n_records} records in {split_name} dataset.")

We have 1512 records in train dataset.
We have 56 records in test dataset.


In [13]:
# make a final check for compatibility with HF example

# seeded generator: the same record is inspected on every run
rand_int = random.Random(SEED).randint(0, len(ds_atco2_train) - 1)

# fetch the record once instead of indexing the dataset for every print
# (each access to an Audio column decodes the file again)
sample = ds_atco2_train[rand_int]

print()
print("Target text:", sample["sentence"])
print("Input array shape:", sample["audio"]["array"].shape)
print("Sampling rate:", sample["audio"]["sampling_rate"])


Target text: Vistajet Four Six Seven we expect to descend 
Input array shape: (44032,)
Sampling rate: 16000


#### Save the dataset

In [14]:
# save the dataset in HF format
# ds_atco2_train_valid (train and test splits) is written under HF_DIR

ds_atco2_train_valid.save_to_disk(HF_DIR)

print(f"Dataset saved in HF format in {HF_DIR}")

Flattening the indices:   0%|          | 0/1512 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1512 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/56 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/56 [00:00<?, ? examples/s]

Dataset saved in HF format in atco2_hf_augmented
