In [1]:
import os
import re
from typing import Union

import numpy as np
import pandas as pd
import torch
import yaml
import torchaudio
from torch import Tensor

from torchaudio.compliance import kaldi
from torchaudio.transforms import ComputeDeltas

In [2]:
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)

    def __getattr__(self, item):
        if item not in self:
            return None
        if type(self[item]) is dict:
            self[item] = AttrDict(self[item])
        return self[item]

    def __setattr__(self, item, value):
        self.__dict__[item] = value

In [3]:

CONFIG_PATH = "../config/config.yaml"

with open(CONFIG_PATH) as file:
    config = AttrDict(yaml.load(file, Loader=yaml.FullLoader))
config = config.data

In [4]:
def preprocess_transcript(transcript):
    transcript = re.sub(r'[^\w\s]', '', transcript)
    transcript = transcript.lower()
    transcript = transcript.strip()
    transcript = transcript.replace('"', '')
    return transcript

In [5]:
PATH_PREFIX = 'data/'

def load_transcripts(data_path, transcript_path):
    transcripts = []
    df = pd.read_csv(data_path).dropna(subset=['index'])
    audio_paths = df.loc[df['path_from_data_dir'].str.endswith('.WAV'), 'path_from_data_dir']
    
    for audio_path in audio_paths:
        txt_path = os.path.join(PATH_PREFIX, os.path.splitext(audio_path)[0] + '.TXT')
        audio_path = os.path.join(PATH_PREFIX, audio_path)
        
        with open(txt_path, 'r') as f:
            file_content = f.read().split(' ', 2)
            _, duration, transcript = file_content
            transcript = preprocess_transcript(transcript)
            transcripts.append({
                'audio_path': audio_path,
                'transcript': transcript,
                'duration': duration
            })
    
    transcripts_df = pd.DataFrame(transcripts)
    transcripts_df.to_csv(transcript_path, index=False)
    

In [6]:
train_data_path = config.train_data
core_train_path = config.core_train
load_transcripts(train_data_path, core_train_path)

test_data_path = config.test_data
core_test_path = config.core_test
load_transcripts(test_data_path, core_test_path)

In [7]:
def get_audio_features(
    audio_path: str,
    config: AttrDict,
    mean: Union[float, Tensor] = 0.0,
    std: Union[float, Tensor] = 1.0,
):
    x, _ = torchaudio.load(audio_path, normalize=True)
    mfcc = kaldi.mfcc(
        waveform=x,
        window_type=config.window_type,
        num_mel_bins=config.num_mel_bins,
    )
    delta = ComputeDeltas()(mfcc)
    features = torch.cat((mfcc, delta), dim=1)
    return (features - mean) / std

In [8]:
df = pd.read_csv(core_train_path)

features = []
for audio_path in df['audio_path']:
    feature = get_audio_features(audio_path, config) #TxC
    features.append(feature)

features_concat = torch.cat(features, dim=0)
global_train_mean = torch.mean(features_concat, axis=0, keepdims=True) #1xC
global_train_std = torch.std(features_concat, axis=0, keepdims=True) #1xC

print('TRAIN_MEAN\n', global_train_mean.squeeze())
print('TRAIN_STD\n', global_train_std.squeeze())

TRAIN_MEAN
 tensor([-37.4068,  -9.6411, -12.1189,  -5.6344, -16.7420, -12.8604, -10.2134,
         -9.9208,  -0.7866,  -5.6351,  -1.9424,  -3.0447,  -2.9735,   7.8341,
          8.8833,   4.5336,  -1.1062,  -0.3415,  -0.2044,   3.4851,   2.3877,
          2.0828,   1.2596,  -0.1783,   0.4292,  -0.1991])
TRAIN_STD
 tensor([15.3909, 20.1550, 15.4514, 18.1530, 18.6900, 17.3615, 17.3337, 17.6844,
        15.8430, 15.9828, 13.7187, 13.5874, 11.8107,  6.2806,  6.8959,  7.3200,
         6.3373,  5.3114,  6.1657,  5.2719,  4.9151,  4.8187,  4.7241,  4.2430,
         4.8689,  4.8141])


In [9]:
def save_audios_features(transcript_path: str, config: AttrDict, mean: Tensor, std: Tensor):
    df = pd.read_csv(transcript_path)
    for audio_path in df['audio_path']:
        features = get_audio_features(audio_path, config, mean, std)
        features = features.unsqueeze(0)
        save_path = f"{os.path.splitext(audio_path)[0]}.npy"
        np.save(save_path, features.numpy())
        print(save_path)

In [10]:
save_audios_features(core_train_path, config, global_train_mean, global_train_std)
save_audios_features(core_test_path, config, global_train_mean, global_train_std)

data/TRAIN/DR4/MMDM0/SI681.npy
data/TRAIN/DR4/MMDM0/SA2.npy
data/TRAIN/DR4/MMDM0/SX411.npy
data/TRAIN/DR4/MMDM0/SA1.npy
data/TRAIN/DR4/MMDM0/SX231.npy
data/TRAIN/DR4/MMDM0/SX51.npy
data/TRAIN/DR4/MMDM0/SX141.npy
data/TRAIN/DR4/MMDM0/SI1941.npy
data/TRAIN/DR4/MMDM0/SI1311.npy
data/TRAIN/DR4/MMDM0/SX321.npy
data/TRAIN/DR4/MCSS0/SX300.npy
data/TRAIN/DR4/MCSS0/SX210.npy
data/TRAIN/DR4/MCSS0/SI750.npy
data/TRAIN/DR4/MCSS0/SI1380.npy
data/TRAIN/DR4/MCSS0/SA2.npy
data/TRAIN/DR4/MCSS0/SA1.npy
data/TRAIN/DR4/MCSS0/SX390.npy
data/TRAIN/DR4/MCSS0/SX30.npy
data/TRAIN/DR4/MCSS0/SI688.npy
data/TRAIN/DR4/MCSS0/SX120.npy
data/TRAIN/DR4/MCDR0/SI524.npy
data/TRAIN/DR4/MCDR0/SX164.npy
data/TRAIN/DR4/MCDR0/SI1784.npy
data/TRAIN/DR4/MCDR0/SA2.npy
data/TRAIN/DR4/MCDR0/SI1154.npy
data/TRAIN/DR4/MCDR0/SX74.npy
data/TRAIN/DR4/MCDR0/SA1.npy
data/TRAIN/DR4/MCDR0/SX344.npy
data/TRAIN/DR4/MCDR0/SX434.npy
data/TRAIN/DR4/MCDR0/SX254.npy
data/TRAIN/DR4/MLEL0/SI1246.npy
data/TRAIN/DR4/MLEL0/SI1876.npy
data/TRAIN/DR4/M

In [11]:

test_speakers = {
    "DAB0", "TAS1", "JMP0", "LLL0", "BPM0", "CMJ0", "GRT0", "JLN0", "WBT0", 
    "WEW0", "LNT0", "TLS0", "KLT0", "JDH0", "NJM0", "PAM0", "ELC0", "PAS0", 
    "PKT0", "JLM0", "NLP0", "MGD0", "DHC0", "MLD0"
}

def is_core_test_set(row):
    path_parts = row["audio_path"].split("/")
    speaker = path_parts[3][1:]
    sentence = path_parts[4].split('.')[0]
    return speaker in test_speakers and not sentence.startswith("SA")

df = pd.read_csv(core_test_path)
df = df[df.apply(is_core_test_set, axis=1)]
df['audio_path'] = config.name + '/' + df['audio_path']
df.to_csv(core_test_path, index=False)

print(f'test dataset size: {len(df)}')

test dataset size: 192


In [12]:
def is_core_train_set(row):
    path_parts = row["audio_path"].split("/")
    sentence = path_parts[4].split('.')[0]
    return not sentence.startswith("SA")

df = pd.read_csv(core_train_path)
df = df[df.apply(is_core_train_set, axis=1)]
df['audio_path'] = config.name + '/' + df['audio_path']
df.to_csv(core_train_path, index=False)

print(f'train dataset size: {len(df)}')

train dataset size: 3696


In [13]:

core_val_path = config.core_val

df = pd.read_csv(core_train_path)
df = df.sample(n=184, random_state=42)  
df.to_csv(core_val_path, index=False)

print(f'validation dataset size: {len(df)}')

validation dataset size: 184
