In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory that is scanned for the audio recordings.
audio_dir = Path("recordings/")

# Maps each digit label (as it appears in the file names) to the upper-case
# word used as its transcript.
digit_2_text = {
    "0": "ZERO",
    "1": "ONE",
    "2": "TWO",
    "3": "THREE",
    "4": "FOUR",
    "5": "FIVE",
    "6": "SIX",
    "7": "SEVEN",
    "8": "EIGHT",
    "9": "NINE",
}

In [3]:
# One row per recording. Path.iterdir() yields entries in arbitrary order and
# would also return stray non-audio files (which break the "_"-based parsing
# below), so keep only .wav files and sort them for a deterministic row order.
wav_paths = sorted(p for p in audio_dir.iterdir() if p.suffix.lower() == ".wav")
df = pd.DataFrame(data={"path": wav_paths})

# File stems are "_"-separated: the first field is the digit label and the
# second is the speaker name.
df["label"] = df["path"].apply(lambda x: x.stem.split("_")[0])
df["speaker"] = df["path"].apply(lambda x: x.stem.split("_")[1])

def make_utter_id(p):
    """Return the utterance id for a recording path.

    The file stem is split on "_" and its first three fields are reordered
    from (field 0, field 1, field 2) to "field1-field0-field2", e.g.
    "0_jackson_10" becomes "jackson-0-10". The reordering exists to deal
    with sorting problems in Kaldi.

    Raises IndexError if the stem has fewer than three "_"-separated fields.
    """
    parts = p.stem.split("_")
    return f"{parts[1]}-{parts[0]}-{parts[2]}"
# Derive the utterance id of every recording from its path.
df["utter_id"] = df["path"].map(make_utter_id)
# Placeholder column; filled with "train"/"test" tags by the split cell.
df["subset"] = None

In [4]:
# Stratified train/test split: within every (label, speaker) group,
# `test_ratio` of the recordings (rounded down) are tagged "test" and the
# remainder "train".
test_ratio = 0.2
split_seed = 42  # fixed seed so the shuffle is repeatable for a given row order
rng = np.random.default_rng(split_seed)

for label, speaker in df.groupby(["label", "speaker"]).groups.keys():
    # Build the group mask once and reuse it for counting and for assignment.
    in_group = (df["label"] == label) & (df["speaker"] == speaker)
    num_total = int(in_group.sum())
    # Use the configured ratio (previously a hard-coded 0.2 ignored test_ratio).
    num_test = int(num_total * test_ratio)
    num_train = num_total - num_test

    # One tag per row of the group, shuffled so test rows are picked at random.
    subset_tags = ["train"] * num_train + ["test"] * num_test
    rng.shuffle(subset_tags)
    df.loc[in_group, "subset"] = subset_tags

In [5]:
# Split the frame into its two partitions according to the "subset" tag.
train_df = df.loc[df["subset"].eq("train")]
test_df = df.loc[df["subset"].eq("test")]

In [6]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,path,label,speaker,utter_id,subset
0,recordings/0_jackson_0.wav,0,jackson,jackson-0-0,train
2,recordings/0_jackson_10.wav,0,jackson,jackson-0-10,train
3,recordings/0_jackson_11.wav,0,jackson,jackson-0-11,train
4,recordings/0_jackson_12.wav,0,jackson,jackson-0-12,train
5,recordings/0_jackson_13.wav,0,jackson,jackson-0-13,train


In [7]:
# Subset from which the data files in the following cells are generated.
# NOTE(review): those cells write to hard-coded "data/train/..." paths, so
# pointing this at test_df would overwrite the training files — confirm intent.
audio_df = train_df

In [8]:
# Write the "text" file: one space-separated line per utterance holding the
# utterance id followed by its transcript.
utterance_ids = audio_df["utter_id"]
transcripts = [digit_2_text[digit] for digit in audio_df["label"]]
text_df = pd.DataFrame({"0": utterance_ids, "1": transcripts})

text_df.to_csv("data/train/text", sep=" ", header=False, index=False)

In [9]:
# Write the "wav.scp" file: one space-separated line per utterance holding the
# utterance id followed by the path of its audio file.
wav_scp_df = audio_df[["utter_id", "path"]].rename(
    columns={"utter_id": "0", "path": "1"}
)
wav_scp_df.to_csv("data/train/wav.scp", sep=" ", header=False, index=False)

In [10]:
# Write the "utt2spk" file: one space-separated line per utterance holding the
# utterance id followed by its speaker id.
utt2spk_df = audio_df[["utter_id", "speaker"]].rename(
    columns={"utter_id": "0", "speaker": "1"}
)
utt2spk_df.to_csv("data/train/utt2spk", sep=" ", header=False, index=False)

In [35]:
# Collect every word that occurs in the training transcripts.
present_words = set()
with open("data/train/text") as f:
    for line in f:
        # Each line is "<utterance id> <transcript>". partition() tolerates
        # blank lines and ids without a transcript instead of raising.
        _, _, transcript = line.strip().partition(" ")
        present_words.update(transcript.split())

# Copy only the lexicon entries whose word occurs in the transcripts.
with open("full_lexicon.txt", encoding="ISO-8859-1") as f, \
        open("data/local/lang/lexicon.txt", "w") as target_file:
    for line in f:
        word, _, pronun = line.strip().partition(" ")
        # Normalise the separator: the word and pronunciation used to be
        # concatenated as-is, which only produced a valid line when the source
        # lexicon put more than one space between them.
        pronun = pronun.strip()
        if word in present_words and pronun:
            target_file.write(f"{word} {pronun}\n")

In [28]:
# Display the set of words collected from the training transcripts.
present_words

{'EIGHT',
 'FIVE',
 'FOUR',
 'NINE',
 'ONE',
 'SEVEN',
 'SIX',
 'THREE',
 'TWO',
 'ZERO'}