# Data Processing Instruction

This document describes how we split each spoken sentence into word-level MFCCs and how the following `.pkl` files are generated:
- 500h_word_counter.pkl
- 500h_word_split.pkl
- 500h_word2wav_keys.pkl
- split_mfcc_dict.pkl

## 1. Get LibriSpeech Dataset

We use [LibriSpeech](https://www.openslr.org/12) Clean dataset for training, which includes:

- train-clean-100
- train-clean-360
- dev-clean
- test-clean

Download and merge them into a single folder; the merged folder should contain 1252 first-level subfolders.

Then create the vocabulary from the text transcripts:

In [10]:
import sys
import glob
import os
import collections
import random
import numpy as np

sys.path.append("/".join(sys.path[0].split("/")[:-1]))
from utils import pickle_util,seed_util


def read_txt(txt_path):
    """Read a transcript file and return its lines as lowercase word lists.

    Every line is lowercased, stripped and split on whitespace. The first
    token of each line is discarded (presumably the utterance id -- confirm
    against the transcript format) and the remaining tokens are kept.

    Args:
        txt_path: Path of the transcript text file.

    Returns:
        A list with one entry per line; each entry is the list of words
        that follow the first token.

    Raises:
        AssertionError: If a line has fewer than two whitespace-separated
            tokens (this includes blank lines).
    """
    with open(txt_path) as handle:
        raw_lines = handle.readlines()

    transcripts = []
    for raw_line in raw_lines:
        tokens = raw_line.lower().strip().split()
        # A valid line needs the leading token plus at least one word.
        assert len(tokens) >= 2, raw_line
        transcripts.append(tokens[1:])
    return transcripts


def create_wrod_counter(root_folder, expected_num_files=2866):
    """Count word occurrences over every transcript file under ``root_folder``.

    Transcript files are located with the glob ``<root_folder>/*/*/*.txt``,
    processed in sorted path order, and parsed with ``read_txt``.

    Args:
        root_folder: Directory that holds the merged dataset. A trailing
            path separator is optional.
        expected_num_files: Number of transcript files that must be found.
            Defaults to 2866, the value this function previously hard-coded.
            Pass ``None`` to skip the check (e.g. when using a subset).

    Returns:
        A ``collections.Counter`` mapping each word to its occurrence count.

    Raises:
        AssertionError: If ``expected_num_files`` is not ``None`` and the
            number of files found differs from it.
    """
    # os.path.join keeps the pattern correct with or without a trailing
    # separator; plain concatenation would turn "/data/500h" into the
    # pattern "/data/500h*/*/*.txt" and match sibling folders instead.
    pattern = os.path.join(root_folder, "*", "*", "*.txt")
    txt_paths = sorted(glob.glob(pattern))
    if expected_num_files is not None:
        assert len(txt_paths) == expected_num_files, (
            "expected %s transcript files, found %d" % (expected_num_files, len(txt_paths))
        )

    counter = collections.Counter()
    for txt_path in txt_paths:
        for words in read_txt(txt_path):
            counter.update(words)
    return counter

# root_folder = "/ssd/1_libri_flac/500h/"
# assert len(os.listdir(root_folder)) == 1252
# counter = create_wrod_counter(root_folder)
# pickle_util.save_pickle("../dataset/info/500h_word_counter.pkl", counter)

## 2. Get Alignment Files

Download the `.txt` alignment files from: https://github.com/CorentinJ/librispeech-alignments.

Process them using the following code:

In [11]:
def handle_single_alignment_file(p1):
    """Parse one alignment text file into a list of per-utterance records.

    Each non-blank line must split on whitespace into exactly three fields:
    a key, a word field and a time field. The first and last character of
    the word and time fields are dropped (presumably surrounding quotes --
    confirm against the alignment files), and both are split on commas. The
    i-th time is used as the end of the i-th entry and the previous time
    (0 for the first entry) as its start. Entries with an empty word are
    skipped.

    Args:
        p1: Path of the alignment file.

    Returns:
        A list of dicts with the keys ``"key"``, ``"word_array"`` (lowercase
        words), ``"time_array"`` (``(start, end)`` tuples aligned with
        ``"word_array"``) and ``"duration"`` (the last time on the line).

    Raises:
        ValueError: If a line does not have exactly three fields or a time
            value cannot be converted to ``float``.
        AssertionError: If the number of words and times on a line differ.
    """
    with open(p1) as f:
        raw_lines = f.readlines()

    obj_list = []
    for raw in raw_lines:
        line = raw.strip()
        if not line:
            continue

        key, text, time_string = line.split()
        words = text[1:-1].lower().split(",")
        boundaries = [float(c) for c in time_string[1:-1].split(",")]
        assert len(boundaries) == len(words)

        # Every entry starts where the previous one ended; the first at 0.
        starts = [0] + boundaries[:-1]
        kept = [
            (word, (start, end))
            for word, start, end in zip(words, starts, boundaries)
            if word
        ]
        obj_list.append({
            "key": key,
            "word_array": [word for word, _ in kept],
            "time_array": [span for _, span in kept],
            "duration": boundaries[-1],
        })
    return obj_list


def convert_alignment_files(alignment_folder, expected_num_files=2866,
                            expected_num_utterances=137833):
    """Parse every alignment file of the four subsets into utterance records.

    For each subset name the glob
    ``<alignment_folder>/<subset>/*/*/*.alignment.txt`` is evaluated and each
    match is parsed with ``handle_single_alignment_file``. Subsets are
    visited in the order listed below and the files of each subset in sorted
    path order, so the result does not depend on filesystem listing order.

    Args:
        alignment_folder: Directory containing one subfolder per subset. A
            trailing path separator is optional.
        expected_num_files: Number of alignment files that must be found.
            Defaults to 2866, the value previously hard-coded. ``None``
            disables the check.
        expected_num_utterances: Number of records that must be produced.
            Defaults to 137833, the value previously hard-coded. ``None``
            disables the check.

    Returns:
        A list with the records of all parsed files, concatenated.

    Raises:
        AssertionError: If an enabled count check fails.
    """
    # 1. get all alignment files:
    names = [
        'train-clean-360',
        'train-clean-100',
        'dev-clean',
        'test-clean',
    ]
    txt_file_arr = []
    for name in names:
        # os.path.join avoids gluing the subset name onto the folder name
        # when alignment_folder has no trailing separator.
        pattern = os.path.join(alignment_folder, name, "*", "*", "*.alignment.txt")
        # glob returns matches in arbitrary order; sort for a stable result.
        txt_file_arr.extend(sorted(glob.glob(pattern)))
    if expected_num_files is not None:
        assert len(txt_file_arr) == expected_num_files, (
            "expected %s alignment files, found %d" % (expected_num_files, len(txt_file_arr))
        )

    # 2. parse them one by one:
    all_objs = []
    for f in txt_file_arr:
        all_objs.extend(handle_single_alignment_file(f))
    if expected_num_utterances is not None:
        assert len(all_objs) == expected_num_utterances, (
            "expected %s utterances, found %d" % (expected_num_utterances, len(all_objs))
        )
    return all_objs

def create_word2keys(all_objs):
    """Build an index from each word to the names of its per-word wav files.

    For the word at 1-based position ``n`` of an utterance with key ``key``
    the generated name is ``"<key>/<n zero-padded to 2 digits>_<word>.wav"``.

    Args:
        all_objs: Iterable of dicts that provide ``"key"`` and
            ``"word_array"``.

    Returns:
        A ``collections.defaultdict(list)`` mapping every word to the list
        of generated names, in the order the words are encountered.
    """
    word2keys = collections.defaultdict(list)
    for obj in all_objs:
        utt_key = obj["key"]
        for position, word in enumerate(obj["word_array"], start=1):
            word2keys[word].append("%s/%02d_%s.wav" % (utt_key, position, word))
    return word2keys


# alignment_folder = "/data/1_dataset/LibriSpeechAlignmentTxt/"
# all_objs = convert_alignment_files(alignment_folder)
# pickle_util.save_pickle("../dataset/info/500h_word_split.pkl", all_objs)

# word2keys = create_word2keys(all_objs)
# pickle_util.save_pickle("../dataset/info/500h_word2wav_keys.pkl", word2keys)

## 3. Convert Speech Sentence into Word MFCCs

1. Run `python scripts/1_split_sentence_flac_to_word_wav.py` to split each sentence `.flac` file into per-word `.wav` files.

2. Run `python scripts/2_convert_word_wav_to_mfcc.py` to convert each word `.wav` file into MFCC format.

3. Aggregate these MFCC files into a dict using the following code:

In [12]:
def calc_mfcc_mean_std(pattern, num_samples=10000):
    """Estimate mean and std along axis 0 from a random sample of MFCC files.

    The files matching ``pattern`` are sorted, shuffled after calling
    ``seed_util.set_seed(100)``, and the first ``num_samples`` of them are
    loaded with ``np.load`` and concatenated along axis 0.

    Args:
        pattern: Glob pattern selecting the MFCC files.
        num_samples: Maximum number of files to sample. Defaults to 10000,
            the value previously hard-coded. If fewer files match, all of
            them are used instead of failing with ``IndexError``.

    Returns:
        A dict ``{"mean": ..., "std": ...}`` holding ``np.mean`` and
        ``np.std`` of the concatenated data over axis 0.

    Raises:
        ValueError: If no file matches ``pattern`` (or, from
            ``np.concatenate``, if ``num_samples`` selects no file).
    """
    # glob returns matches in arbitrary, filesystem-dependent order. Sorting
    # first gives the shuffle a fixed starting point, so the sample can be
    # repeated across machines (assuming seed_util.set_seed seeds the
    # ``random`` module -- confirm in utils/seed_util).
    voice_list = sorted(glob.glob(pattern))
    if not voice_list:
        raise ValueError("no files match pattern: %r" % (pattern,))

    seed_util.set_seed(100)
    random.shuffle(voice_list)

    # NOTE: allow_pickle=True can execute code embedded in a file; only use
    # this on files produced by the project's own scripts.
    arr = [np.load(path, allow_pickle=True) for path in voice_list[:num_samples]]

    cated = np.concatenate(arr, axis=0)
    save_data = {
        "mean": np.mean(cated, axis=0),
        "std": np.std(cated, axis=0),
    }
    return save_data

def load_mfcc(mfcc_mean_std, the_path, aim_len=20):
    """Load one MFCC file, normalize it and truncate it to ``aim_len`` rows.

    Args:
        mfcc_mean_std: Mapping with the entries ``"mean"`` and ``"std"``;
            the loaded data is transformed as ``(data - mean) / std``.
        the_path: Path of the file, read with ``np.load``.
        aim_len: Maximum length along the first axis. Longer data is cut to
            its first ``aim_len`` entries; shorter data is returned as is
            (no padding is applied).

    Returns:
        The normalized, possibly truncated data.
    """
    raw = np.load(the_path, allow_pickle=True)
    mean, std = mfcc_mean_std["mean"], mfcc_mean_std["std"]
    normalized = (raw - mean) / std
    if len(normalized) <= aim_len:
        return normalized
    return normalized[0:aim_len]


def create_mfcc_dict(pattern, mfcc_mean_std):
    """Load every MFCC file matching ``pattern`` into a single dict.

    Each file is loaded through ``load_mfcc`` and stored under the part of
    its path that follows ``"split_mfcc/"``. The fraction of files processed
    so far is printed once every 10000 files.

    Args:
        pattern: Glob pattern selecting the MFCC files; each matched path
            must contain ``"split_mfcc/"``.
        mfcc_mean_std: Normalization statistics forwarded to ``load_mfcc``.

    Returns:
        A dict mapping the relative path of each file to its loaded data.
    """
    voice_list = glob.glob(pattern)
    total = len(voice_list)

    ans_dict = {}
    for idx, mfcc_path in enumerate(voice_list):
        # e.g. ../dataset/split_mfcc/7140-72212-0014/16_with.mfcc
        #      -> 7140-72212-0014/16_with.mfcc
        relative_key = mfcc_path.split("split_mfcc/")[1]
        ans_dict[relative_key] = load_mfcc(mfcc_mean_std, mfcc_path)
        if idx % 10000 == 0:
            print(idx / total)
    return ans_dict

# pattern = "../dataset/split_mfcc/*/*.mfcc"
# mfcc_mean_std = calc_mfcc_mean_std(pattern)
# ans_dict = create_mfcc_dict(pattern, mfcc_mean_std)
# pickle_util.save_pickle("/ssd/1_libri_flac/split_mfcc_dict.pkl", ans_dict)