In [6]:
# imports
import json
import os
from os.path import join
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import yaml
from datasets import Audio, Dataset, DatasetDict
from tqdm import tqdm

In [7]:
# generate the csv with all the data required to build the DatasetDict for the finetuning step 
class GenerateCSV():
    """Build the (file, audio, text) table for one dataset split and export it to csv.

    The split folder is walked twice: once to collect every "<id> <annotation>"
    line from the ``.txt`` transcript files, and once to load every audio file
    and pair it with the annotation whose id equals the audio file name up to
    its first dot.

    Args:
        root_folder: folder that is walked recursively for transcripts and audio.
        csv_filename: path of the csv file written by ``build_csv``.
        audio_format: file-name suffix of the audio files to pick up (e.g. ``'.flac'``).
    """

    # sampling rate the audio is resampled to when loaded (16000 due to w2v2 requirement)
    SAMPLING_RATE = 16000

    def __init__(self, root_folder, csv_filename, audio_format):
        self.root_folder = root_folder
        self.csv_filename = csv_filename
        self.audio_format = audio_format

    def build_lookup_table(self):
        """Collect the id -> annotation pairs from every ``.txt`` file under ``root_folder``.

        Each non-blank line is split on its first space: the left part is the
        id, the remainder is the annotation (an empty string when the line has
        no space).

        Returns:
            pd.DataFrame with the columns ``id`` and ``annotations``, one row per
            transcript line.
        """
        rows = []

        for root, _subdirs, files in os.walk(self.root_folder):
            for file in files:
                if not file.endswith(".txt"):
                    continue

                # Read the transcript as plain text instead of through a csv
                # parser: a csv parser treats every comma inside an annotation
                # as a column separator, which breaks the single-column layout.
                # 'utf-8-sig' also accepts files that start with a BOM.
                with open(os.path.join(root, file), encoding='utf-8-sig') as handle:
                    for raw_line in handle:
                        line = raw_line.rstrip('\r\n')
                        if not line.strip():
                            continue  # ignore blank lines
                        utt_id, _sep, annotation = line.partition(' ')
                        rows.append([utt_id, annotation])

        # id and annotations are just dummy headers here
        return pd.DataFrame(rows, columns=['id', 'annotations'])

    def build_csv(self):
        """Load every audio file, attach its annotation, and write the table to csv.

        Returns:
            pd.DataFrame with the columns ``file`` (audio path), ``audio`` (dict
            with ``array``, ``path`` and ``sampling_rate``) and ``text`` (the
            annotation with ``<FIL>`` / ``<FILL>`` replaced by ``&``).

        Raises:
            IndexError: if an audio file has no matching id in the transcripts.
        """
        df_new = self.build_lookup_table()

        # Index the annotations by id once instead of scanning the table for
        # every audio file; setdefault keeps the first annotation of a repeated id.
        annotations = {}
        for utt_id, annotation in zip(df_new['id'], df_new['annotations']):
            annotations.setdefault(utt_id, annotation)

        data_list = []

        for root, _subdirs, files in tqdm(os.walk(self.root_folder)):
            for file in files:
                if not file.endswith(self.audio_format):
                    continue

                audio_path = os.path.join(root, file)

                # the id is the file name up to its first dot
                base_path = file.split('.')[0]
                if base_path not in annotations:
                    raise IndexError(
                        f"no annotation found for audio file {audio_path!r} (id {base_path!r})"
                    )

                # get the array of values from the audio file, resampled to SAMPLING_RATE
                audio_array, _ = librosa.load(audio_path, sr=self.SAMPLING_RATE)

                # NOTE THAT THERE ARE TWO LEVELS OF DICTIONARY:
                #   the sub dictionary for the audio component
                #   the main dictionary which comprises the file, audio and text component
                data_list.append({
                    'file': audio_path,
                    'audio': {
                        'array': audio_array,
                        'path': audio_path,
                        'sampling_rate': self.SAMPLING_RATE
                    },
                    'text': annotations[base_path].replace('<FIL>', '&').replace('<FILL>', '&').replace('  ', ' ')
                })

        df_final = pd.DataFrame(data_list)

        # make sure the target folder exists before exporting
        out_dir = os.path.dirname(self.csv_filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # NOTE(review): the csv stores the 'audio' dict as text, so the waveform
        # presumably cannot be recovered from the csv alone; reload it from the
        # 'file' column when reading the csv back - confirm against the consumer.
        df_final.to_csv(self.csv_filename, index=False)

        return df_final

    def __call__(self):
        """Shortcut for ``build_csv()``."""
        return self.build_csv()

In [8]:
# Set up one generator per split (train / dev), then run both. Calling a
# generator builds the split's dataframe and writes its csv as a side effect.
generate_csv_train = GenerateCSV(
    root_folder='./datasets/magister_data_flac_16000_finetune/train/',
    csv_filename='./csv/magister_data_flac_16000_train.csv',
    audio_format='.flac',
)
generate_csv_dev = GenerateCSV(
    root_folder='./datasets/magister_data_flac_16000_finetune/dev/',
    csv_filename='./csv/magister_data_flac_16000_dev.csv',
    audio_format='.flac',
)

df_train, df_dev = generate_csv_train(), generate_csv_dev()

38it [00:03,  9.96it/s]
4it [00:00,  4.93it/s]


In [17]:
# inspect the train split: print its (rows, columns) shape, then preview the first five rows
print(df_train.shape)
df_train.head(n=5)

(1818, 3)


Unnamed: 0,file,audio,text
0,./datasets/magister_data_flac_16000_finetune/t...,"{'array': array([-0.00091553, -0.00073242, -0....",FCS GUNNERY BROADCAST POLICY SURFACE AND AIR F...
1,./datasets/magister_data_flac_16000_finetune/t...,"{'array': array([-0.00012207, 0.00018311, 0....",ROGER
2,./datasets/magister_data_flac_16000_finetune/t...,"{'array': array([ 0.00772095, 0.00949097, 0....",AGUN ENGAGE MISSILE NORTH AT TWO MILES
3,./datasets/magister_data_flac_16000_finetune/t...,"{'array': array([-0.0057373 , -0.00531006, -0....",G P M G PROVIDE ONE BOX TWO FIVE ZERO ROUNDS L...
4,./datasets/magister_data_flac_16000_finetune/t...,"{'array': array([ 3.0517578e-05, -3.0517578e-0...",OOW COME LEFT ZERO TWO ZERO INCREASE ONE TWO OOW


In [18]:
# inspect the dev split: print its (rows, columns) shape, then preview the first five rows
print(df_dev.shape)
df_dev.head(n=5)

(360, 3)


Unnamed: 0,file,audio,text
0,./datasets/magister_data_flac_16000_finetune/d...,"{'array': array([0.01727295, 0.01184082, 0.006...",FOXTROT ONE DELTA THIS IS ALFA FOUR QUEBEC SHO...
1,./datasets/magister_data_flac_16000_finetune/d...,"{'array': array([0.00152588, 0.00115967, 0.000...",FOXTROT ONE DELTA THIS IS ALFA FOUR QUEBEC SHO...
2,./datasets/magister_data_flac_16000_finetune/d...,"{'array': array([ 0.00564575, 0.00485229, 0....",ALFA FOUR QUEBEC & POSITIONING & FOR P A C RUN...
3,./datasets/magister_data_flac_16000_finetune/d...,"{'array': array([ 1.7395020e-03, 1.7700195e-0...",ALL IN THIS IS PWO INTEND CONTINUES FIRING NO ...
4,./datasets/magister_data_flac_16000_finetune/d...,"{'array': array([0.00112915, 0.00054932, 0. ...",FOXTROT ONE DELTA THIS IS ALFA FOUR QUEBEC COM...


In [19]:
# or load from csv that are saved (simulate the finetuning code later)
df_train = pd.read_csv('./csv/magister_data_flac_16000_train.csv')
df_dev = pd.read_csv('./csv/magister_data_flac_16000_dev.csv')

# The csv stores the 'audio' column as text, so after the round-trip it is a
# plain string rather than a waveform. Rebuild it from the 'file' path column
# and cast it to the Audio feature (16000 Hz, the rate used when the csv was
# generated) so each example's 'audio' is decoded from the file on access.
# Then turn the pandas dataframes into huggingface Datasets and combine the
# two into a DatasetDict object.
dataset = DatasetDict({
    split: Dataset.from_pandas(frame.assign(audio=frame['file']))
                  .cast_column('audio', Audio(sampling_rate=16000))
    for split, frame in (('train', df_train), ('test', df_dev))
})
dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text'],
        num_rows: 1818
    })
    test: Dataset({
        features: ['file', 'audio', 'text'],
        num_rows: 360
    })
})