In [1]:
from glob import glob
import os

# Restrict CUDA to device 0 (torch is only imported in a later cell).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
# Voice letter found in each file name -> (speaker name, integer speaker id, gender).
speakers = {
    'A': ('Bunga', 0, 'female'),
    'B': ('Ariff', 1, 'male'),
    'C': ('Ayu', 2, 'female'),
    'D': ('Kamarul', 3, 'male'),
}

In [None]:
import IPython.display as ipd
# Listen to a single synthesized clip as a quick sanity check.
ipd.Audio('../output-gtts/ms-MY-Wavenet-A_23851.mp3')

In [3]:
# Every synthesized clip, sorted by path string (lexicographic, not numeric, order).
files = sorted(glob('../output-gtts/*.mp3'))

In [4]:
from datasets import Dataset, Audio
from datasets import load_dataset, load_from_disk
from dataspeech import rate_apply, pitch_apply, snr_apply, squim_apply
import numpy as np
import pandas as pd
from datasets import DatasetDict
from multiprocess import set_start_method
import argparse
from pathlib import Path
import os
import matplotlib.pyplot as plt
import json
import torch

In [5]:
# Transcripts for the clips; later indexed by the row number embedded in each file name.
with open('../gtts-text.json') as fopen:
    t = json.load(fopen)

In [6]:
# Peek at the first transcript.
t[0]

'Ketika perang Aceh meletus pada tahun seribu lapan ratus tujuh puluh tiga , Teuku Ibrahim Lamnga aktif berjuang di garisan depan .'

In [7]:
# Build one metadata record per clip. The last '-'-separated piece of a path looks like
# '<voice letter>_<row>.mp3': the letter selects the speaker, the row selects the transcript.
data = []
for path in files:
    name_parts = path.split('-')[-1].split('_')
    voice = name_parts[0]
    row = int(name_parts[1].split('.')[0])

    speaker_name, speaker_index, speaker_gender = speakers[voice]

    record = {
        'audio': path,
        'transcription': t[row],
        'speaker': speaker_name,
        'speaker_id': speaker_index,
        'gender': speaker_gender,
    }
    data.append(record)

In [8]:
# Wrap the list of records in a Hugging Face `Dataset`.
dataset = Dataset.from_list(data)

In [9]:
# Number of clips collected.
len(dataset)

124276

In [10]:
# Turn the `audio` path column into an Audio feature with a 22050 Hz sampling rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate = 22050))

In [11]:
# Settings shared by the enrichment cells below.
audio_column_name = 'audio'
text_column_name = 'transcription'
# Worker processes per visible GPU for each enrichment step.
num_workers_per_gpu_for_squim = 4
# Worker count used instead when no GPU is visible.
cpu_num_workers = 5
# Forwarded to `pitch_apply` through `fn_kwargs`.
penn_batch_size = 512
num_workers_per_gpu_for_pitch = 4
num_workers_per_gpu_for_snr = 1
# NOTE(review): not referenced anywhere in the visible cells.
cpu_writer_batch_size = 1000

In [None]:
# Apply `squim_apply` to every clip (its stoi / sdr / pesq outputs are merged back later).
# One process per GPU worker when a GPU is visible, otherwise a CPU worker pool.
gpu_count = torch.cuda.device_count()
has_gpu = gpu_count > 0
squim_dataset = dataset.map(
    squim_apply,
    batched=True,
    batch_size=1,
    with_rank=has_gpu,
    num_proc=gpu_count * num_workers_per_gpu_for_squim if has_gpu else cpu_num_workers,
    remove_columns=[audio_column_name],  # trick to avoid rewriting the audio
    fn_kwargs={"audio_column_name": audio_column_name},
)

In [None]:
# Persist the SQUIM results so the expensive map does not have to be repeated.
squim_dataset.save_to_disk('gtts-squim')

In [12]:
# Reload the cached SQUIM results from disk.
squim_dataset = load_from_disk('gtts-squim')

In [None]:
# Apply `pitch_apply` on a 16 kHz view of the audio (its utterance_pitch_mean / _std
# outputs are merged back later). Worker layout mirrors the other enrichment steps.
gpu_count = torch.cuda.device_count()
has_gpu = gpu_count > 0
audio_16k = dataset.cast_column(audio_column_name, Audio(sampling_rate=16_000))
pitch_dataset = audio_16k.map(
    pitch_apply,
    batched=True,
    batch_size=1,
    with_rank=has_gpu,
    num_proc=gpu_count * num_workers_per_gpu_for_pitch if has_gpu else cpu_num_workers,
    remove_columns=[audio_column_name],  # trick to avoid rewriting the audio
    fn_kwargs={"audio_column_name": audio_column_name, "penn_batch_size": penn_batch_size},
)

In [None]:
# Persist the pitch results so the expensive map does not have to be repeated.
pitch_dataset.save_to_disk('gtts-pitch')

In [13]:
# Reload the cached pitch results from disk.
pitch_dataset = load_from_disk('gtts-pitch')

In [None]:
# Apply `snr_apply` to every clip (its snr / c50 / speech_duration outputs are merged
# back later). Worker layout mirrors the other enrichment steps.
gpu_count = torch.cuda.device_count()
has_gpu = gpu_count > 0
snr_dataset = dataset.map(
    snr_apply,
    batched=True,
    batch_size=1,
    with_rank=has_gpu,
    num_proc=gpu_count * num_workers_per_gpu_for_snr if has_gpu else cpu_num_workers,
    remove_columns=[audio_column_name],  # trick to avoid rewriting the audio
    fn_kwargs={"audio_column_name": audio_column_name},
)

In [None]:
# Persist the SNR results so the expensive map does not have to be repeated.
snr_dataset.save_to_disk('gtts-snr')

In [14]:
# Reload the cached SNR results from disk.
snr_dataset = load_from_disk('gtts-snr')

In [15]:
# Copy the per-utterance pitch statistics onto the main dataset.
# Not idempotent: re-running this cell tries to add columns that already exist.
dataset = dataset.add_column('utterance_pitch_mean', pitch_dataset['utterance_pitch_mean']).add_column(
    'utterance_pitch_std', pitch_dataset['utterance_pitch_std'])

In [16]:
# Copy the `snr` and `c50` columns onto the main dataset.
dataset = dataset.add_column("snr", snr_dataset["snr"]).add_column("c50", snr_dataset["c50"])

In [17]:
# Copy the speech duration and the SQUIM scores onto the main dataset.
# Note that the `si-sdr` column is filled from the SQUIM output column named `sdr`.
dataset = dataset.add_column("speech_duration", snr_dataset["speech_duration"])
dataset = dataset.add_column("stoi", squim_dataset["stoi"]).add_column("si-sdr", squim_dataset["sdr"]).add_column("pesq", squim_dataset["pesq"])

In [18]:
# Drop rows whose SNR estimate is NaN. `input_columns` hands only the `snr` value to the
# predicate, so the other columns (notably the audio, which would otherwise be
# materialised for every row) are not read while filtering.
dataset = dataset.filter(lambda snr: not np.isnan(snr), input_columns=["snr"])

Filter:   0%|          | 0/124276 [00:00<?, ? examples/s]

In [19]:
# The binning helpers below iterate over split names, so wrap the data in a single
# `train` split.
dataset_dict = DatasetDict({
    'train': dataset
})

In [20]:
# Text labels for each binned attribute, ordered from the lowest-valued bin to the highest.
SPEAKER_RATE_BINS = ["very slowly", "quite slowly", "slightly slowly", "moderate speed", "slightly fast", "quite fast", "very fast"]
SNR_BINS = ["very noisy", "quite noisy", "slightly noisy", "moderate ambient sound", "slightly clear", "quite clear", "very clear"]
REVERBERATION_BINS = ["very roomy sounding", "quite roomy sounding", "slightly roomy sounding", "moderate reverberation", "slightly confined sounding", "quite confined sounding", "very confined sounding"]
UTTERANCE_LEVEL_STD = ["very monotone", "quite monotone", "slightly monotone", "moderate intonation", "slightly expressive", "quite expressive", "very expressive"]

# These labels are meant to be applied to the speaker-level mean pitch, relative to gender.
SPEAKER_LEVEL_PITCH_BINS = ["very low pitch", "quite low pitch", "slightly low pitch", "moderate pitch", "slightly high pitch", "quite high pitch", "very high pitch"]

In [21]:
# Optional overrides for the label lists and for precomputed bin edges. Both dicts are
# empty here, so every `.get` below falls back to the default constant.
text_bins_dict = {}
bin_edges_dict = {}

speaker_level_pitch_bins = text_bins_dict.get("speaker_level_pitch_bins", SPEAKER_LEVEL_PITCH_BINS)
speaker_rate_bins = text_bins_dict.get("speaker_rate_bins", SPEAKER_RATE_BINS)
snr_bins = text_bins_dict.get("snr_bins", SNR_BINS)
reverberation_bins = text_bins_dict.get("reverberation_bins", REVERBERATION_BINS)
utterance_level_std = text_bins_dict.get("utterance_level_std", UTTERANCE_LEVEL_STD)

In [22]:
def bins_to_text(dataset, text_bins, column_name, output_column_name, leading_split_for_bins="train", batch_size = 4, num_workers = 1, std_tolerance=5, save_dir=None, only_save_plot=False, lower_range=None, bin_edges=None):
    '''
    Compute bins of `column_name` from the splits `leading_split_for_bins` and apply text bins to every split.
    `leading_split_for_bins` can be a string or a list.

    Args:
        dataset: iterable of split-name -> split mappings; each element must offer `.map`.
        text_bins: labels ordered from the lowest bin to the highest; one bin per label.
        column_name: numeric column to bin.
        output_column_name: column receiving the text label.
        leading_split_for_bins: `None` (use every split), a substring, or a list of
            substrings; a split contributes to the bin computation when its name
            contains one of them.
        batch_size, num_workers: forwarded to `.map` as `batch_size` / `num_proc`.
        std_tolerance: values further than this many standard deviations from the mean
            are ignored when computing bin edges; `None` disables the filtering.
        save_dir: when set, distributions are plotted through `visualize_bins_to_text`.
        only_save_plot: return right after computing the edges, without labelling.
        lower_range: lower bound of the histogram range (`0` is a valid bound);
            `None` uses the data minimum.
        bin_edges: precomputed edges; when given, nothing is recomputed.

    Returns:
        `(dataset, bin_edges)`, where `dataset` is the list of mapped elements (or the
        untouched input when `only_save_plot` is true).
    '''
    if bin_edges is None:
        # Normalise the split selector so a list works as the docstring promises
        # (`["train"] in "train"` would raise TypeError).
        if leading_split_for_bins is None:
            split_markers = None
        elif isinstance(leading_split_for_bins, str):
            split_markers = [leading_split_for_bins]
        else:
            split_markers = list(leading_split_for_bins)

        values = []
        for df in dataset:
            for split in df:
                if split_markers is None or any(marker in split for marker in split_markers):
                    values.extend(df[split][column_name])

        # filter out outliers
        values = np.array(values)
        if std_tolerance is not None:
            filtered_values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]
        else:
            filtered_values = values

        # NOTE(review): `visualize_bins_to_text` is not defined in the visible part of this
        # notebook; it has to be in scope before `save_dir` is used.
        if save_dir is not None:
            visualize_bins_to_text(values, filtered_values, "Before filtering", "After filtering", text_bins, save_dir, output_column_name, lower_range=lower_range)

        # speaking_rate can easily have outliers
        if save_dir is not None and output_column_name=="speaking_rate":
            visualize_bins_to_text(filtered_values, filtered_values, "After filtering", "After filtering", text_bins, save_dir, f"{output_column_name}_after_filtering", lower_range=lower_range)

        values = filtered_values
        # Compare against None so that an explicit lower bound of 0 is honoured.
        histogram_range = (lower_range, values.max()) if lower_range is not None else None
        hist, bin_edges = np.histogram(values, bins = len(text_bins), range=histogram_range)

        if only_save_plot:
            return dataset, bin_edges
    else:
        print(f"Already computed bin edges have been passed for {output_column_name}. Will use: {bin_edges}.")

    def batch_association(batch):
        index_bins = np.searchsorted(bin_edges, batch, side="left")
        # do min(max(...)) when values are outside of the main bins
        # it happens when value = min or max or have been filtered out from bins computation
        batch_bins = [text_bins[min(max(i-1, 0), len(text_bins)-1)] for i in index_bins]
        return {
            output_column_name: batch_bins
        }

    dataset = [df.map(batch_association, batched=True, batch_size=batch_size, input_columns=[column_name], num_proc=num_workers) for df in dataset]
    return dataset, bin_edges

def speaker_level_relative_to_gender(dataset, text_bins, speaker_column_name, gender_column_name, column_name, output_column_name, batch_size = 4, num_workers=1, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=None):
    '''
    Label every row with a text bin derived from its speaker's mean `column_name`,
    where the bins are computed separately for "male" and "female" speakers.

    All splits of every element of `dataset` contribute to the per-speaker means.
    When `bin_edges` is None, one set of equal-width edges per gender is computed
    (optionally after dropping speakers further than `std_tolerance` standard
    deviations from the gender mean); otherwise the given dict of edges is used.

    Returns `(dataset, bin_edges)`: the list of mapped elements (or the untouched
    input when `only_save_plot` is true and edges were computed here) and the
    per-gender edges.
    '''
    wanted_columns = {speaker_column_name, column_name, gender_column_name}

    # Gather only the three relevant columns of every split into pandas frames.
    frames = []
    for splits in dataset:
        for split_name in splits:
            unused = [name for name in splits[split_name].column_names if name not in wanted_columns]
            frames.append(splits[split_name].remove_columns(unused).to_pandas())

    # One row per speaker: mean of the value column, first seen gender.
    per_speaker = pd.concat(frames, ignore_index=True)
    per_speaker = per_speaker.groupby(speaker_column_name).agg({column_name: "mean", gender_column_name: "first"})

    if bin_edges is None:
        plotting = save_dir is not None
        filtering = std_tolerance is not None

        bin_edges = {}
        raw_by_gender = {}
        kept_by_gender = {}
        for category in ("male", "female"):
            in_category = per_speaker[gender_column_name] == category
            values = np.array(per_speaker[in_category][column_name])
            if plotting:
                raw_by_gender[category] = values
            if filtering:
                # drop outliers before computing the edges
                deviation = np.abs(values - np.mean(values))
                values = values[deviation < std_tolerance * np.std(values)]
                if plotting:
                    kept_by_gender[category] = values
            _, bin_edges[category] = np.histogram(values, len(text_bins))

        if plotting:
            visualize_bins_to_text(raw_by_gender["male"], raw_by_gender["female"], "Male distribution", "Female distribution", text_bins, save_dir, output_column_name)
            if filtering:
                visualize_bins_to_text(kept_by_gender["male"], kept_by_gender["female"], "Male distribution", "Female distribution", text_bins, save_dir, f"{output_column_name}_after_filtering")

        if only_save_plot:
            return dataset, bin_edges

    def bin_index_for(speaker_row):
        gender_edges = bin_edges[speaker_row[gender_column_name]]
        return np.searchsorted(gender_edges, speaker_row[column_name])

    speaker_id_to_bins = per_speaker.apply(bin_index_for, axis=1).to_dict()

    last_label = len(text_bins) - 1

    def batch_association(batch):
        # Clamp to the valid label range: an index falls outside it when the value
        # equals the min or max edge, or was filtered out of the bin computation.
        labels = []
        for speaker in batch:
            position = speaker_id_to_bins[speaker] - 1
            labels.append(text_bins[min(max(position, 0), last_label)])
        return {output_column_name: labels}

    dataset = [
        splits.map(batch_association, batched=True, input_columns=[speaker_column_name], batch_size=batch_size, num_proc=num_workers)
        for splits in dataset
    ]
    return dataset, bin_edges

In [23]:
# Label each row with a gender-relative pitch bin based on its speaker's mean pitch.
# From here on `dataset` is a one-element list holding the mapped DatasetDict.
bin_edges = None
# NOTE(review): not used by the call below, which passes std_tolerance=None.
pitch_std_tolerance = 5.
dataset, pitch_bin_edges = speaker_level_relative_to_gender(
    [dataset_dict], speaker_level_pitch_bins, 'speaker_id', 
    'gender', "utterance_pitch_mean", "pitch", 
    batch_size=100, num_workers=5, std_tolerance=None, 
    save_dir=None, only_save_plot=False, bin_edges=bin_edges)

Map (num_proc=5):   0%|          | 0/124276 [00:00<?, ? examples/s]

In [24]:
# Derive the `speaking_rate` labels by binning `speech_duration` over all splits.
# NOTE(review): the binned quantity is a duration, not a rate, so larger durations land
# in the later ("fast") labels — confirm this is the intended mapping.
dataset, speaking_rate_bin_edges = bins_to_text(
    dataset, 
    speaker_rate_bins, "speech_duration", "speaking_rate", 
    batch_size=100, num_workers=5, 
    leading_split_for_bins=None, 
    std_tolerance=None, save_dir=None, 
    only_save_plot=False, bin_edges=bin_edges_dict.get("speaking_rate",None))

Map (num_proc=5):   0%|          | 0/124276 [00:00<?, ? examples/s]

In [25]:
# Derive the `noise` labels by binning `snr` over all splits, without outlier filtering.
dataset, noise_bin_edges = bins_to_text(
    dataset, snr_bins, "snr", "noise", 
    batch_size=100, num_workers=5, 
    leading_split_for_bins=None, 
    std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("noise",None), 
                                        lower_range=None)

Map (num_proc=5):   0%|          | 0/124276 [00:00<?, ? examples/s]

In [26]:
# Derive the `reverberation` labels by binning `c50` over all splits.
dataset, reverberation_bin_edges = bins_to_text(
    dataset, reverberation_bins, "c50", "reverberation", 
    batch_size=100, num_workers=5, 
    leading_split_for_bins=None, 
    std_tolerance=None, 
    save_dir=None, only_save_plot=False, 
    bin_edges=bin_edges_dict.get("reverberation",None))

Map (num_proc=5):   0%|          | 0/124276 [00:00<?, ? examples/s]

In [27]:
# Derive the `speech_monotony` labels by binning `utterance_pitch_std` over all splits.
dataset, speech_monotony_bin_edges = bins_to_text(
    dataset, utterance_level_std, 
    "utterance_pitch_std", 
    "speech_monotony", 
    batch_size=100, 
    num_workers=5, 
    leading_split_for_bins=None, 
    std_tolerance=None, 
    save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("speech_monotony",None))

Map (num_proc=5):   0%|          | 0/124276 [00:00<?, ? examples/s]

In [28]:
# `dataset` is a one-element list at this point; persist its single DatasetDict.
dataset[0].save_to_disk('gtts-metadata')

Saving the dataset (0/17 shards):   0%|          | 0/124276 [00:00<?, ? examples/s]

In [29]:
!du -hs gtts-metadata

7.8G	gtts-metadata


In [30]:
# Inspect the first annotated row (row access also decodes its audio).
dataset[0]['train'][0]

{'audio': {'path': '../output-gtts/ms-MY-Wavenet-A_0.mp3',
  'array': array([-0.00083028, -0.00129574, -0.00129971, ...,  0.00044667,
          0.00043588,  0.0003486 ]),
  'sampling_rate': 22050},
 'transcription': 'Ketika perang Aceh meletus pada tahun seribu lapan ratus tujuh puluh tiga , Teuku Ibrahim Lamnga aktif berjuang di garisan depan .',
 'speaker': 'Bunga',
 'speaker_id': 0,
 'gender': 'female',
 'utterance_pitch_mean': 220.41683959960938,
 'utterance_pitch_std': 42.64897537231445,
 'snr': 62.63203430175781,
 'c50': 59.670066833496094,
 'speech_duration': 8.133750000000003,
 'stoi': 0.9968859553337097,
 'si-sdr': 25.412067413330078,
 'pesq': 3.9492032527923584,
 'pitch': 'very low pitch',
 'speaking_rate': 'quite slowly',
 'noise': 'quite clear',
 'reverberation': 'very confined sounding',
 'speech_monotony': 'quite monotone'}

In [45]:
# Instruction template sent to the LLM. The placeholders `[speaker_name]`, `[gender]`,
# `[reverberation]`, `[noise]`, `[speech_monotony]`, `[pitch]` and `[speaking_rate]` are
# replaced per sample before the request is made.
# NOTE(review): the example vocabulary in this template ("distant-sounding",
# "close-sounding", "clean", "balanced in clarity", "moderately dynamic", "low-pitch")
# differs from the bin labels defined earlier in this notebook ("roomy sounding",
# "confined sounding", "clear", "moderate ambient sound", "moderate intonation",
# "low pitch"), so the default terms and the recording-quality rules may never match
# the substituted keywords literally — confirm whether the two should be aligned.
PROMPT = """You will be given 6 descriptive keywords related to an audio sample of [speaker_name]'s speech. These keywords include:
1. The gender (e.g., male, female)
2. The level of reverberation (e.g., very distant-sounding, quite distant-sounding, slightly distant-sounding, moderately balanced reverberation, slightly close-sounding, quite close-sounding, very close-sounding)
3. The amount of noise the sample (e.g., very noisy, quite noisy, slightly noisy, balanced in clarity, slightly clean, quite clean, very clean)
4. The tone of the speaker's voice (e.g., very monotone, quite monotone, slightly monotone, moderately dynamic, slightly expressive and animated, quite expressive and animated, very expressive and animated)
5. The pace of the speaker's delivery (e.g., very slowly, quite slowly, slightly slowly, moderate speed, slightly fast, quite fast, very fast)
6. The pitch of the speaker's voice (e.g., very low-pitch, quite low-pitch, slightly low-pitch, moderate pitch, slightly high-pitch, quite high-pitch, very high-pitch)

Your task is to create a text description using these keywords that accurately describes the speech sample. Ensure that the generated description is grammatically correct, easy to understand, and most importantly, concise. 
You can optionally change the order of keywords, and replace synonymous terms. You can also optionally omit the following terms, as they are default terms: 'moderately balanced reverberation', 'balanced in clarity', 'moderately dynamic', 'moderate speed' and 'moderate pitch'.
If the amount of noise is 'very noisy' and the level of reverberation is 'distant-sounding', you must include words such as 'very poor recording' in the description. Likewise, if the amount of noise is 'very clear' and the level of reverberation is 'very close-sounding', you must include terms like 'very good recording' in the description. 
Otherwise, do not add extra details beyond what has been provided, and only return the generated description.

For example, given the following keywords: 'female', 'slightly distant-sounding', 'slightly noisy', 'very expressive', 'moderate pitch', 'very slowly', a valid description would be: '[speaker_name], a woman with a moderately pitched voice speaks very slowly but has an animated delivery in an echoey room with some background noise.'.
Another valid description would be: '[speaker_name] in a room with slight background noise, a female speaker delivers an animated and expressive speech,at a very slow pace.'
Another valid description would be: '[speaker_name], with female voice enunciates an animated and expressive speech. Her voice is slightly distant-sounding, with some background noise present. She speaks very slowly with a moderate pitch but a very expressive tone.'
For the keywords: '[gender]', '[reverberation]', '[noise]', '[speech_monotony]', '[pitch]', '[speaking_rate]', return the corresponding description in JSON {'result'}"""

In [46]:
# Columns whose values replace the same-named `[placeholder]` tokens in PROMPT.
EXPECTED_COLUMNS = {"gender", "pitch", "noise", "reverberation", "speech_monotony", "speaking_rate"}

In [47]:
# Show the schema and row count of the annotated split.
dataset[0]['train']

Dataset({
    features: ['audio', 'transcription', 'speaker', 'speaker_id', 'gender', 'utterance_pitch_mean', 'utterance_pitch_std', 'snr', 'c50', 'speech_duration', 'stoi', 'si-sdr', 'pesq', 'pitch', 'speaking_rate', 'noise', 'reverberation', 'speech_monotony'],
    num_rows: 124276
})

In [48]:
import requests

In [49]:
# Take the first annotated row as a working example and display it.
sample = dataset[0]['train'][0]
sample

{'audio': {'path': '../output-gtts/ms-MY-Wavenet-A_0.mp3',
  'array': array([-0.00083028, -0.00129574, -0.00129971, ...,  0.00044667,
          0.00043588,  0.0003486 ]),
  'sampling_rate': 22050},
 'transcription': 'Ketika perang Aceh meletus pada tahun seribu lapan ratus tujuh puluh tiga , Teuku Ibrahim Lamnga aktif berjuang di garisan depan .',
 'speaker': 'Bunga',
 'speaker_id': 0,
 'gender': 'female',
 'utterance_pitch_mean': 220.41683959960938,
 'utterance_pitch_std': 42.64897537231445,
 'snr': 62.63203430175781,
 'c50': 59.670066833496094,
 'speech_duration': 8.133750000000003,
 'stoi': 0.9968859553337097,
 'si-sdr': 25.412067413330078,
 'pesq': 3.9492032527923584,
 'pitch': 'very low pitch',
 'speaking_rate': 'quite slowly',
 'noise': 'quite clear',
 'reverberation': 'very confined sounding',
 'speech_monotony': 'quite monotone'}

In [52]:
from tqdm import tqdm

# Settings for the local OpenAI-compatible chat-completions server queried below.
LLM_ENDPOINT = 'http://localhost:7860/v1/chat/completions'
LLM_REQUEST_TIMEOUT = 300  # seconds; keeps a stalled server from hanging the loop forever
LLM_MAX_ATTEMPTS = 20      # per sample; once exhausted the row gets `None`

# Only the keyword columns and the speaker name are needed to fill the template. Dropping
# the rest (in particular `audio`) keeps each row access from decoding the waveform.
train_split = dataset[0]['train']
prompt_columns = EXPECTED_COLUMNS | {'speaker'}
keyword_rows = train_split.remove_columns(
    [name for name in train_split.column_names if name not in prompt_columns]
)

# One generated description per row, in row order, so the list can later be attached as
# a column. Rows the model never answers properly are recorded as `None`.
prompts = []
for i in tqdm(range(len(keyword_rows))):
    sample = keyword_rows[i]
    sample_prompt = PROMPT
    for key in EXPECTED_COLUMNS:
        sample_prompt = sample_prompt.replace(f"[{key}]", sample[key])

    sample_prompt = sample_prompt.replace("[speaker_name]", sample['speaker'])

    sample_prompt = [{"role": "user", "content": sample_prompt}]

    description = None
    last_error = None
    for attempt in range(LLM_MAX_ATTEMPTS):
        try:
            response = requests.post(
                LLM_ENDPOINT,
                json = {'messages': sample_prompt, 'model': 'mallam-small',
                        'temperature': 0.6, 'max_tokens': 256},
                timeout = LLM_REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            r = json.loads(response.json()['choices'][0]['message']['content'])['result']
        except (requests.Timeout, requests.HTTPError, ValueError, KeyError, IndexError, TypeError) as e:
            # Timeouts, HTTP error statuses and malformed / non-JSON answers are retried.
            # A refused connection is deliberately not caught: the server is down and
            # retrying every remaining row would only fill the column with `None`.
            last_error = e
            continue

        if isinstance(r, str):
            description = r
            break

    if description is None:
        tqdm.write(f"row {i}: no valid description after {LLM_MAX_ATTEMPTS} attempts (last error: {last_error!r})")
    prompts.append(description)

  8%|███████▌                                                                                          | 9625/124276 [3:48:53<45:19:54,  1.42s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 63%|████████████████████████████████████████████████████████████▊                                   | 78747/124276 [27:00:20<14:48:26,  1.17s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|█████████████████████████████████████████████████████

In [53]:
# Persist the generated descriptions so the long-running generation loop need not be repeated.
with open('gtts-prompts.json', 'w') as fopen:
    json.dump(prompts, fopen)

In [54]:
# Replace, in place, every entry that is not a string with None.
for position, value in enumerate(prompts):
    if not isinstance(value, str):
        prompts[position] = None

In [55]:
# Attach the descriptions as a `prompt` column; `prompts` must have one entry per row.
# Not idempotent: re-running this cell tries to add a column that already exists.
dataset[0]['train'] = dataset[0]['train'].add_column('prompt', prompts)

In [56]:
# Inspect the last row, now including its generated `prompt`.
dataset[0]['train'][-1]

{'audio': {'path': '../output-gtts/ms-MY-Wavenet-D_9999.mp3',
  'array': array([-0.00089931, -0.00147248, -0.0015128 , ...,  0.00062253,
          0.00067943,  0.00061922]),
  'sampling_rate': 22050},
 'transcription': 'Di Grup A , tiket babak enam belas besar masih diperebutkan oleh Prancis , Norwegia , dan Nigeria .',
 'speaker': 'Kamarul',
 'speaker_id': 3,
 'gender': 'male',
 'utterance_pitch_mean': 155.5963592529297,
 'utterance_pitch_std': 39.80721664428711,
 'snr': 61.4404182434082,
 'c50': 59.63848876953125,
 'speech_duration': 6.4799999999999995,
 'stoi': 0.9979360103607178,
 'si-sdr': 22.45107078552246,
 'pesq': 3.5210366249084473,
 'pitch': 'very high pitch',
 'speaking_rate': 'quite slowly',
 'noise': 'quite clear',
 'reverberation': 'very confined sounding',
 'speech_monotony': 'quite monotone',
 'prompt': 'Kamarul, a male speaker with a very high-pitched voice delivers a monotonous speech in a very confined-sounding room with minimal background noise. He speaks at a very 

In [57]:
# Publish the annotated split (audio included) to the Hugging Face Hub.
dataset[0]['train'].push_to_hub('mesolitica/gtts-annotated')

Uploading the dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

Map:   0%|          | 0/7311 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7311 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7311 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7311 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7311 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7311 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7310 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/gtts-annotated/commit/eefc6884a70f25cb820fcc8a5cebcdbd96ddbd68', commit_message='Upload dataset', commit_description='', oid='eefc6884a70f25cb820fcc8a5cebcdbd96ddbd68', pr_url=None, pr_revision=None, pr_num=None)