In [1]:
import soundfile as sf
import random
import os
import matplotlib.pyplot as plt
import textgrid
import numpy as np

In [2]:
# Get the paths and base names of all files with a certain suffix
def get_filelist(dir, Filelist, namelist, suffix):
    """Recursively collect the files under ``dir`` whose path ends with ``suffix``.

    Args:
        dir: Path of a file or a directory to inspect. Directories are walked
            recursively; anything that is neither a file nor a directory is
            ignored.
        Filelist: List that every matching path is appended to (mutated in place).
        namelist: List that every matching base name, with ``suffix`` stripped
            from its end, is appended to (mutated in place).
        suffix: Ending to match, e.g. ``'.wav'``.

    Returns:
        The same ``Filelist`` and ``namelist`` objects that were passed in.
    """
    if os.path.isfile(dir):
        if dir.endswith(suffix):
            Filelist.append(dir)
            name = os.path.basename(dir)
            namelist.append(name[:len(name) - len(suffix)])
    elif os.path.isdir(dir):
        # os.listdir returns entries in arbitrary order; sorting makes the
        # result deterministic, so scans of identically laid-out trees yield
        # lists in the same order.
        for entry in sorted(os.listdir(dir)):
            get_filelist(os.path.join(dir, entry), Filelist, namelist, suffix)
    return Filelist, namelist

In [3]:
# Keywords to extract (an arbitrary selection)
KEYWORDS = ['was','his','which','from','any','she','people','without','little','about']

root_dir = './LibriSpeech/Librispeech' # Root directory of audios
text_dir = './librispeech_MFA/Documents/aligned_librispeech' # Root directory of transcripts generated by MFA

# get_filelist appends to the lists it is given, so pass fresh ones each time.
filelist, namelist = get_filelist(root_dir, [], [], '.wav')
scriptlist, scriptname = get_filelist(text_dir, [], [], '.TextGrid')

# Generate a folder for outputs; exist_ok keeps the cell safe to re-run.
os.makedirs('./Outputs', exist_ok=True)

In [4]:
# Cut a fixed-length audio segment around every occurrence of every keyword and
# write it to ./Outputs/<keyword>/ together with a .wrd file that stores the
# keyword's sample positions inside the segment.

WINDOW_SEC = 1.0  # length of every exported segment, in seconds

# Pair each recording with its alignment by base name instead of by list
# position: the two lists come from separate directory scans, so a single
# missing TextGrid would otherwise shift every later pairing.
# Assumes base names are unique across the corpus.
script_by_name = dict(zip(scriptname, scriptlist))

# Position of each keyword in KEYWORDS; it becomes the first part of the file name.
keyword_index = {keyword: word_idx for word_idx, keyword in enumerate(KEYWORDS)}

# Number of segments written so far per keyword; second part of the file name.
segment_counts = {keyword: 0 for keyword in keyword_index}

for keyword in keyword_index:
    os.makedirs('./Outputs/{0}'.format(keyword), exist_ok=True)

for file_dir, file_name in zip(filelist, namelist): # loop over all audio files
    script_dir = script_by_name.get(file_name)
    if script_dir is None:
        continue # no alignment found for this recording

    tg = textgrid.TextGrid()
    tg.read(script_dir)

    # tiers[0] is presumably the word tier of the MFA output -- verify against the TextGrids.
    hits = [interval for interval in tg.tiers[0] if interval.mark in keyword_index]
    if not hits:
        continue # nothing to cut, so skip reading the audio

    y, sr = sf.read(file_dir) # read the original audio once per file

    for interval in hits:
        keyword = interval.mark
        t_start = interval.minTime
        t_end = interval.maxTime
        duration = t_end - t_start

        # Randomly place the keyword inside the window. The offset may be
        # neither larger than t_start (the window would start before the audio)
        # nor larger than the free space left in the window; it is clamped to 0
        # when the keyword is longer than the window.
        max_offset = max(0.0, min(t_start, WINDOW_SEC - duration))
        # Rounding can push the value slightly above the limit, hence min().
        offset = min(round(random.uniform(0, max_offset), 2), max_offset)

        seg_start = t_start - offset
        seg_end = seg_start + WINDOW_SEC

        # Location of the keyword relative to the start of the segment (seconds).
        word_start = t_start - seg_start
        word_end = min(t_end - seg_start, WINDOW_SEC)

        stem = './Outputs/{0}/{1}-{2}'.format(
            keyword,
            str(keyword_index[keyword]).zfill(4),
            str(segment_counts[keyword]).zfill(4))

        # NOTE: slicing truncates silently, so a window that starts less than
        # WINDOW_SEC before the end of the recording yields a shorter segment.
        y_cut = y[int(seg_start*sr) : int(seg_end*sr)]
        sf.write(stem + '.wav', y_cut, sr)

        # generate the .wrd file with the location of the keyword, in samples
        with open(stem + '.wrd', 'w') as wrd_file:
            wrd_file.write('{0} {1} {2}'.format(int(word_start*sr), int(word_end*sr), keyword))

        segment_counts[keyword] += 1

print('Files are generated successifully!')

Files are generated successifully!


In [5]:
# Split the generated segments of every keyword into train, validation, and
# test sets by copying each .wav/.wrd pair into ./newOutputs/<split>/<keyword>/.

import shutil
from shutil import copy2

train_ratio = 0.7
test_ratio = 0.15
# The validation set receives whatever is left between the two thresholds.

# A fixed seed plus a sorted file listing makes the split reproducible.
# Files are only ever copied, never removed, so with an unseeded shuffle a
# re-run could place the same segment into a different split and leak data
# between the sets.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

train_dir_root = './newOutputs/train'
val_dir_root = './newOutputs/validation'
test_dir_root = './newOutputs/test'
for split_root in (train_dir_root, val_dir_root, test_dir_root):
    os.makedirs(split_root, exist_ok=True) # also creates ./newOutputs

for keyword in KEYWORDS:
    file_dir = "./Outputs/{0}/".format(keyword)

    # Base names (without '.wav') of all segments of this keyword.
    name_list = sorted(f[:-4] for f in os.listdir(file_dir) if f.endswith('.wav'))
    num_audios = len(name_list)
    split_rng.shuffle(name_list)

    train_dir = os.path.join(train_dir_root, keyword)
    val_dir = os.path.join(val_dir_root, keyword)
    test_dir = os.path.join(test_dir_root, keyword)
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(split_dir, exist_ok=True)

    for num, name in enumerate(name_list):
        # First train_ratio of the shuffled names -> train,
        # last test_ratio -> test, everything in between -> validation.
        if num < num_audios*train_ratio:
            target_dir = train_dir
        elif num >= num_audios*(1-test_ratio):
            target_dir = test_dir
        else:
            target_dir = val_dir

        copy2(os.path.join(file_dir, name + '.wav'), target_dir)
        copy2(os.path.join(file_dir, name + '.wrd'), target_dir)

print('Files are separated into train, validation, and test sets.')

Files are separated into train, validation, and test sets.
