### Imports

In [1]:
import os
import re
import string
import librosa
import numpy as np
import pandas as pd
import IPython.display as ipd

### Config

TODO: Use a ``.yaml`` file to set up the configuration variables.

In [2]:
# Sample rate
from config import SR

# Paths for the directories of the data
from config import DATA_PATH, AF_PATH, XH_PATH

# Exception that is raised if the required data is not downloaded
from config import DataNotDownloadedError

Note that the 2nd column of the ``line_index.tsv`` file is variously referred to as:
 - 'label', 'transcript', 'text' or 'sentence'
 
We mostly use the term 'label' in the code.

### Read in transcript data (labels)

In [11]:
# Locations of the transcript indexes, built from the configured paths so that
# the availability check below looks at exactly the files that are read
# (rather than at a hard-coded directory that may differ from DATA_PATH).
af_index_path = os.path.join(DATA_PATH, AF_PATH, 'line_index.tsv')
xh_index_path = os.path.join(DATA_PATH, XH_PATH, 'line_index.tsv')

if not (os.path.isfile(af_index_path) and os.path.isfile(xh_index_path)):
    raise DataNotDownloadedError('Data is not available. Please download the data first.')

# NOTE(review): read_csv treats the first row as a header by default; if
# line_index.tsv has no header row, the first transcript is consumed as column
# names and lost when the columns are renamed later -- confirm against the file.
af_labels = pd.read_csv(af_index_path, sep='\t')
xh_labels = pd.read_csv(xh_index_path, sep='\t')

def preprocess_transcripts(column):
    """Normalise a Series of transcript strings.

    Each transcript is lower-cased, stripped of every character listed in
    ``string.punctuation``, and has every digit replaced by ``'0'``.

    Parameters
    ----------
    column : pd.Series
        Series whose values are ``str`` transcripts.

    Returns
    -------
    pd.Series
        A new Series holding the cleaned transcripts.
    """
    # The punctuation must be escaped before it is embedded in a character
    # class: unescaped, the backslash in string.punctuation escapes the ']'
    # that follows it, so literal backslashes are never removed.
    punctuation = re.compile(f'[{re.escape(string.punctuation)}]')
    digit = re.compile(r'\d')

    def clean(text):
        # Same order as before: lower-case, drop punctuation, mask digits.
        text = punctuation.sub('', text.lower())
        return digit.sub('0', text)

    column = column.apply(clean)
    # TODO: Use unidecode to remove diacritics (if necessary)
    # TODO: Add <s> and </s> (if necessary)
    #       Also probably better to do after converting sentences
    #       into list of words/strings
    return column

for labels in (af_labels, xh_labels):
    # Give the two TSV columns their working names, then normalise the
    # transcript text in place on each frame.
    labels.columns = ['file_name', 'label']
    labels['label'] = preprocess_transcripts(labels.loc[:, 'label'])

### Read in audio data (using ``librosa``)

In [12]:
# Directories holding the .wav recordings for each language.
af_dir = os.path.join(DATA_PATH, AF_PATH, 'wavs')
xh_dir = os.path.join(DATA_PATH, XH_PATH, 'wavs')

def read_wav_files(directory, expected_sr=48000):
    """Lazily load every ``.wav`` file found directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Directory to scan (not recursive).
    expected_sr : int or None, optional
        Sample rate every file must have. Defaults to 48000, the value that
        was previously hard-coded. Pass ``None`` to skip the check.

    Yields
    ------
    tuple
        ``(filename, audio_data, sample_rate)`` for each ``.wav`` file, in
        sorted file-name order.

    Raises
    ------
    AssertionError
        If a file's sample rate differs from ``expected_sr``.
    """
    # os.listdir returns entries in arbitrary order; sort so that every run
    # visits the files in the same sequence.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.join(directory, filename)
        # sr=None asks librosa for the file's own sample rate (no resampling).
        audio_data, sample_rate = librosa.load(file_path, sr=None)
        assert expected_sr is None or sample_rate == expected_sr, (
            f'{file_path}: expected sample rate {expected_sr}, got {sample_rate}')
        yield filename, audio_data, sample_rate

# Load every clip into memory, keyed by file name. Comprehensions replace the
# two copy-pasted loops and keep the loop temporaries out of the notebook
# namespace; in particular they avoid binding `_` at top level, which would
# shadow IPython's last-output variable for the rest of the session.
audio_data_af = {
    filename: audio_data
    for filename, audio_data, _ in read_wav_files(af_dir)
}
audio_data_xh = {
    filename: audio_data
    for filename, audio_data, _ in read_wav_files(xh_dir)
}

### Test

In [14]:
# audio_data_test = audio_data_af['afr_8924_2922670405.wav']
# ipd.Audio(data=audio_data_test, rate=SR)