In [1]:
# imports
import os
from os.path import join
import numpy as np
import pandas as pd
import json
import librosa
from pathlib import Path
import yaml
from tqdm import tqdm

In [2]:
# get the filepath of the file


In [19]:
# adopt from previous class
class GenerateManifest():
    
    def __init__(self, root_folder, manifest_filename, got_annotation):
        self.root_folder = root_folder
        self.manifest_filename = manifest_filename
        self.got_annotation = got_annotation
    
    # check if the json file name already existed (if existed, need to throw error or else the new json manifest will be appended to the old one, hence causing a file corruption)
    def json_existence(self):
        assert not os.path.isfile(f'{self.manifest_filename}'), "json filename exists! Please remove old json file!"
    
    # helper function to build the lookup table for the id and annotations from all the text files and return the table
    def build_lookup_table(self):
        #initiate list to store the id and annotations lookup
        split_list_frame = []

        # get all the annotations into a dataframe
        for root, subdirs, files in os.walk(self.root_folder):
            for file in files:
                if file.endswith(".txt"):
                    # add on to the code here
                    df = pd.read_csv(os.path.join(root, file), header=None)
                    df.columns = ['name']

                    for i,j in enumerate(df.name):
                        split_list = j.split(" ",1)
                        split_list_frame.append(split_list)

        df_new = pd.DataFrame(split_list_frame, columns=['id', 'annotations']) # id and annotations are just dummy headers here
        return df_new
    
    # helper function to create the json manifest file
    def create_json_manifest(self):
        data_list = []
        
        # check if the json filename have existed in the directory
        self.json_existence()
        
        if self.got_annotation:
            # get the lookup table
            df_new = self.build_lookup_table()

        # retrieve the dataframe lookup table
        for root, subdirs, files in os.walk(self.root_folder):
            
            # since self.root_folder is a subset of the root, can just replace self.root with empty string
            modified_root_ = str(Path(root)).replace(str(Path(self.root_folder)), '')
            # replace the slash with empty string after Path standardization
            modified_root = modified_root_.replace('/', '', 1)

            for _, file in enumerate(files):
                if file.endswith(".flac"):
                    # retrieve the base path for the particular audio file
                    base_path = os.path.basename(os.path.join(root, file)).split('.')[0]
                    
                    audio_array, _ = librosa.load(os.path.join(root, file), sr=None)
                    
                    # create the dictionary that is to be appended to the json file
                    if self.got_annotation:
                        data = {
                                # 'audio_filepath' : os.path.join(modified_root, file),
                                'audio_filepath' : os.path.join(root, file),
                                'duration' : librosa.get_duration(filename=os.path.join(root, file)),
                                'text' : df_new.loc[df_new['id'] == base_path, 'annotations'].to_numpy()[0].replace('<FIL>', '&').replace('  ', ' '),
                                'array' : {'audio': audio_array, 'path': os.path.join(root, file)}
                                }
                        #print(data)
                        data_list.append(data)
                    else:
                        data = {
                                'audio_filepath' : os.path.join(modified_root, file),
                                'duration' : librosa.get_duration(filename=os.path.join(root, file)),
                               }

                    # write to json file
                    #with open(f'{self.root_folder}{self.manifest_filename}', 'a+', encoding='utf-8') as f:
                    # with open(f'{self.manifest_filename}', 'a+', encoding='utf-8') as f:
                    #     f.write(json.dumps(data) + '\n')
                        # json.dump(data, f, ensure_ascii=False, indent=2)
                        # f.write('\n')

        return f'{self.manifest_filename}', data_list

    def __call__(self):
        return self.create_json_manifest()

In [20]:
get_manifest_a = GenerateManifest(root_folder='datasets/magister_data_flac_16000/train', 
                                    manifest_filename='test2.json', 
                                    got_annotation=True)
get_manifest_b = GenerateManifest(root_folder='datasets/magister_data_flac_16000/dev', 
                                    manifest_filename='test2.json', 
                                    got_annotation=True)

In [21]:
_,a = get_manifest_a()
_,b = get_manifest_b()

In [22]:
from datasets import Dataset, DatasetDict

In [23]:
df_a = pd.DataFrame(a)
df_b = pd.DataFrame(b)

dataset_a = Dataset.from_pandas(df_a)
dataset_b = Dataset.from_pandas(df_b)

In [24]:
df_a.head()

Unnamed: 0,audio_filepath,duration,text,array
0,datasets/magister_data_flac_16000/train/11056/...,6.084812,FCS GUNNERY BROADCAST POLICY SURFACE AND AIR F...,"{'audio': [-0.00091552734, -0.0007324219, -0.0..."
1,datasets/magister_data_flac_16000/train/11056/...,0.983313,ROGER,"{'audio': [-0.00012207031, 0.00018310547, 0.00..."
2,datasets/magister_data_flac_16000/train/11056/...,3.094813,AGUN ENGAGE MISSILE NORTH AT TWO MILES,"{'audio': [0.0077209473, 0.009490967, 0.011291..."
3,datasets/magister_data_flac_16000/train/11056/...,8.364937,G P M G PROVIDE ONE BOX TWO FIVE ZERO ROUNDS L...,"{'audio': [-0.0057373047, -0.0053100586, -0.00..."
4,datasets/magister_data_flac_16000/train/11056/...,4.495687,OOW COME LEFT ZERO TWO ZERO INCREASE ONE TWO OOW,"{'audio': [3.0517578e-05, -3.0517578e-05, -6.1..."


In [25]:
df_b.head()

Unnamed: 0,audio_filepath,duration,text,array
0,datasets/magister_data_flac_16000/dev/11251/31...,19.898937,FOXTROT ONE DELTA THIS IS ALFA FOUR QUEBEC SHO...,"{'audio': [0.01727295, 0.01184082, 0.006408691..."
1,datasets/magister_data_flac_16000/dev/11251/31...,6.426563,FOXTROT ONE DELTA THIS IS ALFA FOUR QUEBEC SHO...,"{'audio': [0.0015258789, 0.001159668, 0.000793..."
2,datasets/magister_data_flac_16000/dev/11251/31...,6.191813,ALFA FOUR QUEBEC & POSITIONING & FOR P A C RUN...,"{'audio': [0.005645752, 0.004852295, 0.0040588..."
3,datasets/magister_data_flac_16000/dev/11251/31...,25.105563,ALL IN THIS IS PWO INTEND CONTINUES FIRING NO ...,"{'audio': [0.001739502, 0.0017700195, 0.001831..."
4,datasets/magister_data_flac_16000/dev/11251/31...,11.548937,FOXTROT ONE DELTA THIS IS ALFA FOUR QUEBEC COM...,"{'audio': [0.0011291504, 0.0005493164, 0.0, 0...."


In [27]:
df_a.shape, df_b.shape

((1818, 4), (360, 4))

In [8]:
dataset = DatasetDict({'train': dataset_a, 'test': dataset_b})

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio_filepath', 'duration', 'text', 'array'],
        num_rows: 22
    })
    test: Dataset({
        features: ['audio_filepath', 'duration', 'text', 'array'],
        num_rows: 2
    })
})

In [10]:
len(dataset['train']['array'][20]['audio'])

61227

In [11]:
dataset['train']['array'][20]['audio'][:30]

[0.00048828125,
 0.000701904296875,
 0.00091552734375,
 0.0013427734375,
 0.001800537109375,
 0.00146484375,
 0.001129150390625,
 0.000640869140625,
 0.00018310546875,
 6.103515625e-05,
 -6.103515625e-05,
 0.000518798828125,
 0.0010986328125,
 0.000885009765625,
 0.000701904296875,
 0.000396728515625,
 9.1552734375e-05,
 0.000152587890625,
 0.000213623046875,
 0.00054931640625,
 0.00091552734375,
 0.001922607421875,
 0.002960205078125,
 0.002197265625,
 0.001434326171875,
 0.00146484375,
 0.001495361328125,
 0.00177001953125,
 0.002044677734375,
 0.000640869140625]

In [12]:
dataset['array'][0]['path']

KeyError: 'array'

In [None]:
dataset['array'][0]['audio'][:30]

In [None]:
dataset['text'][8]

In [None]:
k = dataset['array'][0]

In [13]:
k[:20]

NameError: name 'k' is not defined

In [219]:
type(k)

list

In [220]:
len(k)

97357

In [221]:
kk = np.array(k, dtype='float32')

In [222]:
kk

array([-0.00091553, -0.00073242, -0.0005188 , ..., -0.00949097,
       -0.00546265, -0.00143433], dtype=float32)