In [8]:
# imports
import os
from os.path import join
from tqdm import tqdm
import numpy as np
import pandas as pd
import json
import librosa

In [9]:
# Configuration: where the dataset lives and how the manifest files are named.

# Folder that is searched recursively for audio and transcript files.
ROOT_FOLDER = './librispeech_data/'

# Base names (the '.json' extension is appended when the file is written).
MANIFEST_FILENAME = 'manifest'
MANIFEST_FILENAME_NO_LABEL = 'manifest_no_annotation'

# NOTE(review): the calls further down pass got_annotation explicitly and do
# not read this flag -- confirm whether it is still needed.
GOT_ANNOTATION = False

In [10]:
def build_lookup_table(root_folder):
    """Collect every transcript line under ``root_folder`` into one lookup table.

    Walks ``root_folder`` recursively and reads each file whose name ends with
    ``.txt`` as plain UTF-8 text. Every non-blank line is split at its first
    space: the part before it becomes the ``id`` and the rest the
    ``annotations``.

    The files are read line by line instead of through ``pd.read_csv`` because
    a transcript is free text: a comma or a double quote in it would be parsed
    as CSV syntax and either raise or alter the text.

    Parameters
    ----------
    root_folder : str
        Directory searched recursively for ``.txt`` transcript files.

    Returns
    -------
    pandas.DataFrame
        One row per transcript line with the columns ``id`` and
        ``annotations``. A line that contains no space gets an empty
        annotation. The frame is empty (but still has both columns) when no
        transcript line is found.
    """
    records = []

    for folder, _subdirs, filenames in os.walk(root_folder):
        for filename in filenames:
            if not filename.endswith(".txt"):
                continue

            transcript_path = os.path.join(folder, filename)
            with open(transcript_path, encoding="utf-8") as transcript_file:
                for raw_line in transcript_file:
                    line = raw_line.strip()
                    if not line:
                        # blank lines carry no utterance
                        continue
                    # only the first space separates the id from the text
                    utterance_id, _sep, text = line.partition(" ")
                    records.append((utterance_id, text))

    # 'id' and 'annotations' are the column names the manifest builder looks up
    return pd.DataFrame(records, columns=["id", "annotations"])

In [11]:
def _audio_duration(audio_path):
    """Return the duration of the audio file at ``audio_path`` via librosa."""
    try:
        # librosa 0.10 renamed the ``filename`` keyword to ``path``
        return librosa.get_duration(path=audio_path)
    except TypeError:
        # older librosa releases only accept ``filename``
        return librosa.get_duration(filename=audio_path)


def create_json_manifest(root_folder, manifest_filename, got_annotation):
    """Write a JSON-lines manifest describing every ``.flac`` file under ``root_folder``.

    The manifest is written to ``<root_folder>/<manifest_filename>.json`` with
    one JSON object per line. Each object has ``audio_filepath`` and
    ``duration``; when ``got_annotation`` is true it also has ``text``, taken
    from the table returned by ``build_lookup_table`` using the audio file
    name without its extension as the id.

    The manifest is opened once and overwritten, so running the function
    again replaces the previous manifest instead of appending duplicate
    entries to it.

    Parameters
    ----------
    root_folder : str
        Directory searched recursively for ``.flac`` files. A trailing path
        separator is optional.
    manifest_filename : str
        Manifest file name without the ``.json`` extension.
    got_annotation : bool
        Whether to add the transcript of each audio file as ``text``.

    Raises
    ------
    IndexError
        If ``got_annotation`` is true and an audio file has no transcript in
        the lookup table.
    """
    transcripts = {}
    if got_annotation:
        lookup = build_lookup_table(root_folder)
        # Build the id -> text mapping once instead of scanning the table for
        # every audio file; setdefault keeps the first row of a duplicated id.
        for utterance_id, text in zip(lookup['id'], lookup['annotations']):
            transcripts.setdefault(utterance_id, text)

    # os.path.join works whether or not root_folder ends with a separator
    manifest_path = os.path.join(root_folder, f'{manifest_filename}.json')

    with open(manifest_path, 'w', encoding='utf-8') as manifest:
        for folder, _subdirs, filenames in os.walk(root_folder):
            for filename in filenames:
                if not filename.endswith('.flac'):
                    continue

                audio_path = os.path.join(folder, filename)
                entry = {'audio_filepath': audio_path,
                         'duration': _audio_duration(audio_path)}

                if got_annotation:
                    # splitext drops only the extension, so ids that contain
                    # dots are kept whole
                    utterance_id = os.path.splitext(filename)[0]
                    if utterance_id not in transcripts:
                        # IndexError is what the table lookup raised before
                        raise IndexError(
                            f'no transcript found for {audio_path!r} '
                            f'(id {utterance_id!r})'
                        )
                    entry['text'] = transcripts[utterance_id]

                manifest.write(json.dumps(entry) + '\n')

In [7]:
# Manifest with transcripts: got_annotation=True
create_json_manifest(
    root_folder=ROOT_FOLDER,
    manifest_filename=MANIFEST_FILENAME,
    got_annotation=True,
)

In [12]:
# Manifest without transcripts: got_annotation=False
create_json_manifest(
    root_folder=ROOT_FOLDER,
    manifest_filename=MANIFEST_FILENAME_NO_LABEL,
    got_annotation=False,
)