### Prepare ATCO2 Dataset Step 1

In [1]:
import torch
import librosa
import numpy as np
from datasets import load_dataset, Audio
from datasets import Dataset
from tqdm import tqdm

import glob
import soundfile as sf
import re
import pandas as pd
import xml.etree.ElementTree as ET

In [2]:
# global setting
PATH_ATCO2_DATA = "/home/datascience/atco2-data/ATCO2-ASRdataset-v1_beta/DATA"

TEST_FRAC = 0.1

In [3]:
# functions
def extract_text_from_xml(f_name):
    tree = ET.parse(f_name)

    root = tree.getroot()

    all_text = ""

    for elem in root.iter():
        if elem.tag == "text":
            # replace [ with <
            new_text = elem.text.replace("[", "<")
            # replace ] with >
            new_text = new_text.replace("]", ">")
        
            new_text = re.sub('<[^>]+>', '', new_text)
        
            all_text += new_text + " "
    
    return all_text

In [4]:
list_wav = sorted(glob.glob(PATH_ATCO2_DATA + "/*.wav"))
list_xml = sorted(glob.glob(PATH_ATCO2_DATA + "/*.xml"))

print(f"We have {len(list_wav)} wav files.")
print(f"We have {len(list_xml)} xml files.")

We have 560 wav files.
We have 560 xml files.


In [5]:
# check that all files have sample rate = 16000

no_except = 0

for file_wav in list_wav:
    data, samplerate = sf.read(file_wav)
    
    if samplerate != 16000:
        n_except += 1
        print(f"{file_wav}: sample_rate: {samplerate}")

if no_except == 0:
    print("All files with sample rate = 16000.")

All files with sample rate = 16000.


#### Read all xml and extract only text

In [6]:
# for Pandas DataFrame
list_path_names = []
list_txts = []
list_arrays = []

with tqdm(total=len(list_wav)) as pbar:
    
    for wav_name in list_wav:
        # get only name
        only_name = wav_name.split(".")[0]
    
        xml_name = only_name + ".xml"
        name_withouth_path = xml_name.split("/")[-1]

        txt = extract_text_from_xml(xml_name)
    
        # to remove from list a special unicode char
        txt = txt.replace(u'\xa0', u' ')
    
        list_path_names.append(wav_name)
        list_txts.append(txt)
    
        data, sr = librosa.load(wav_name, sr=None)
        
        list_arrays.append({"array": np.array(data, dtype=np.float32)})
        
        pbar.update(1)
    
dict_res = {"path": list_path_names, "audio" : list_arrays, "transcription": list_txts}

100%|██████████| 560/560 [01:01<00:00,  9.05it/s]


#### Create the HF Dataset 
* as https://huggingface.co/docs/transformers/tasks/asr

In [7]:
ds_atco2 = Dataset.from_dict(dict_res)

In [8]:
# split in train, test
ds_atco2_train_valid = ds_atco2.train_test_split(shuffle=True, test_size=TEST_FRAC, seed=1243)

In [9]:
print(f"Abbiamo {len(ds_atco2_train_valid['train'])} record nel train dataset.")
print(f"Abbiamo {len(ds_atco2_train_valid['test'])} record nel validation dataset.")

Abbiamo 504 record nel train dataset.
Abbiamo 56 record nel validation dataset.


In [13]:
ds_atco2_train_valid['train'][10]

{'path': '/home/datascience/atco2-data/ATCO2-ASRdataset-v1_beta/DATA/LSZB_BERN_Tower_121_0MHz_20210413_161649.wav',
 'audio': {'array': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
  

#### Salvo il dataset

In [14]:
# salvo il dataset in una directory
DS_DIR = "ds_atco2"

ds_atco2_train_valid.save_to_disk(DS_DIR)

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]