Script 1: Creating the Audiofolder

In [1]:
from datasets import load_dataset
from pydub import AudioSegment
import numpy as np
import os
import re
import csv
import requests

# Pull the "fil_ph" configuration of google/fleurs through the datasets library
dataset = load_dataset("google/fleurs", "fil_ph", trust_remote_code=True)
print(". . . Dataset is loaded")

# Audiofolder layout: the root folder holds the metadata, the recordings go in <root>/data
output_dir = "./audiofolder_Filipino_dataset"
data_dir = os.path.join(output_dir, "data")

# Create the root first, then the nested data folder; both calls tolerate existing folders
for folder in (output_dir, data_dir):
    os.makedirs(folder, exist_ok=True)
print(". . . Folders audiofolder_Filipino_dataset and data are created")



Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [None]:
# Function to download README file from Hugging Face Hub
def download_readme(dataset_name, output_dir):
    """Download a dataset's README.md from the Hugging Face Hub.

    Args:
        dataset_name: Repository id of the dataset on the Hub, e.g. "google/fleurs".
        output_dir: Existing directory in which ``README.md`` is written.

    Returns:
        None. A success or failure message is printed instead.
    """
    # "/resolve/" serves the raw file contents; "/blob/" returns the HTML
    # viewer page, which would be saved as a bogus README.md.
    readme_url = f"https://huggingface.co/datasets/{dataset_name}/resolve/main/README.md"

    # Bound the wait so a stalled connection cannot hang the notebook cell
    response = requests.get(readme_url, timeout=30)

    if response.status_code == 200:
        # Save the README file (bytes are written unchanged)
        output_path = os.path.join(output_dir, "README.md")
        with open(output_path, "wb") as f:
            f.write(response.content)
        print(". . . README.md File Successfully Downloaded")
    else:
        print(". . . Failed to download README file.")

# Store the README of the google/fleurs repository in the audiofolder root
dataset_name = "google/fleurs"
download_readme(dataset_name=dataset_name, output_dir=output_dir)

In [None]:
train_dataset = dataset['train']
audiosToBeDownloaded = {}
processed_ids = set()

metadata_csv_path = os.path.join(output_dir, "metadata.csv")

# Script 2 - Remove rows and audio that has text of more than 200 characters, numbers, and special characters
MAX_TEXT_LENGTH = 200
# One pre-compiled pattern: any digit or one of ! . , " ' ( ) [ ] { }
disallowed_chars = re.compile(r'[\d!.,\"\'\(\)\[\]{}]')

# Gender arrives as an integer class label; anything that is not 0 or 1 is
# reported as "Other" instead of being silently labelled "Male".
# NOTE(review): the dataset card lists the labels as male / female / other - confirm.
GENDER_LABELS = {0: "Male", 1: "Female"}

# Write the metadata to a CSV file
with open(metadata_csv_path, mode='w', newline='', encoding='utf-8') as csv_file:

    fieldnames = ['file_name', 'normalized_text', 'speaker_id']

    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    # Write each row of metadata
    for example in train_dataset:
        # Named example_id so the builtin id() is not shadowed for later cells
        example_id = example['id']

        if example_id in processed_ids:
            continue  # Skip this row if it's already processed

        transcription = example['transcription']

        # Drop rows that are too long or contain a digit / listed punctuation mark
        if len(transcription) > MAX_TEXT_LENGTH or disallowed_chars.search(transcription):
            continue

        audio = f"Filipino_Audio_Recording_{example_id}.wav"
        speaker_id = GENDER_LABELS.get(example['gender'], "Other")

        # Write the row to the CSV file
        writer.writerow({
            'file_name': audio,
            'normalized_text': transcription,
            'speaker_id': speaker_id
        })
        # Remember which recording has to be exported, and under which file name
        audiosToBeDownloaded[example_id] = {
            'id': example_id,
            'audio': audio,
        }
        # Mark the ID as processed so later rows with the same id are skipped
        processed_ids.add(example_id)

print(". . . Metadata CSV file created successfully.")


In [None]:
# Script 4 - Sampling rate of 16kHz or 16000
TARGET_SAMPLE_RATE = 16000
INT16_MAX = 2**15 - 1

# Ids whose recording has already been written during this run
exported_ids = set()

for row in dataset['train']:
    row_id = row['id']

    # Only export ids selected for the metadata, and only once per id: a later
    # row sharing the id would otherwise overwrite the file with different audio
    # than the row the metadata describes.
    # NOTE(review): assumes the metadata row came from the first row with this id - confirm.
    if row_id not in audiosToBeDownloaded or row_id in exported_ids:
        continue
    exported_ids.add(row_id)

    # Access the audio data from the current row
    audio_info = row['audio']

    # Ensure the audio data is in the correct format (16-bit PCM); clip first so
    # samples outside [-1, 1] saturate instead of wrapping around in int16
    samples = np.clip(audio_info['array'], -1.0, 1.0)
    audio_data = (samples * INT16_MAX).astype(np.int16)

    # Convert numpy array to Pydub AudioSegment using the rate the samples were
    # decoded at, then resample only if that differs from the 16 kHz target
    audio_segment = AudioSegment(
        audio_data.tobytes(),
        frame_rate=audio_info['sampling_rate'],
        sample_width=2,
        channels=1,
    )
    if audio_segment.frame_rate != TARGET_SAMPLE_RATE:
        audio_segment = audio_segment.set_frame_rate(TARGET_SAMPLE_RATE)

    # Get the audio filename from audiosToBeDownloaded dictionary
    audio_filename = audiosToBeDownloaded[row_id]['audio']

    output_file_path = os.path.join(data_dir, audio_filename)

    # Save audio recording to file using Pydub (Script 4 - WAV Extension)
    audio_segment.export(output_file_path, format="wav")

print("... All audio recordings downloaded successfully.")
