In [1]:
from datasets import load_dataset
from datasets import DatasetDict

In [2]:
# Load the ASR task data from the local data directory (path is relative to
# the notebook's working directory).
dataset = load_dataset("../data/springlab-asr-task-data/")

# Show the available splits, feature names, and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'language'],
        num_rows: 8000
    })
})

In [3]:
# Peek at the first raw example to see the layout of the audio/text/language fields.
dataset["train"][0]

{'audio': {'path': None,
  'array': array([0.13391113, 0.118927  , 0.10662842, ..., 0.14904785, 0.14758301,
         0.14038086], shape=(79360,)),
  'sampling_rate': 16000},
 'text': 'block which lets get this one running so the simple part is just go over there and then',
 'language': 'en-IN'}

In [4]:
# Carve the single loaded split into train / validation / test (80% / 10% / 10%).
full_dataset = dataset["train"]

# First cut: hold out 20% of the rows; the remaining 80% is the training set.
split_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, temp_dataset = split_dataset['train'], split_dataset['test']

# Second cut: halve the held-out rows into validation and test
# (each ends up with 10% of the full data). The seed is fixed for reproducibility.
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset, test_dataset = val_test_split['train'], val_test_split['test']

# Bundle the three splits under their conventional names.
new_dataset_dict = DatasetDict(
    {
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset,
    }
)

new_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'language'],
        num_rows: 6400
    })
    validation: Dataset({
        features: ['audio', 'text', 'language'],
        num_rows: 800
    })
    test: Dataset({
        features: ['audio', 'text', 'language'],
        num_rows: 800
    })
})

In [5]:
# Sanity-check the first training example after the split.
new_dataset_dict["train"][0]

{'audio': {'path': None,
  'array': array([0.0005188 , 0.00186157, 0.0015564 , ..., 0.        , 0.00119019,
         0.00021362], shape=(109120,)),
  'sampling_rate': 16000},
 'text': 'the players have spoken up for the captain through a letter sent to the sports governing body in the country',
 'language': 'en-IN'}

In [6]:
import os
import json
import numpy as np
import soundfile as sf
from datasets import Dataset  # Optional for type hinting

In [7]:
def create_manifest_from_split(dataset_split: Dataset, split_name: str, output_dir: str = "../data/springlab-asr-task-wavs") -> str:
    """
    Export one dataset split as 16-bit PCM WAV files plus a JSON-lines manifest.

    Every usable example is peak-normalized, written to
    ``<output_dir>/<split_name>_audio_<i>.wav`` and described by one JSON
    object per line in ``<output_dir>/<split_name>_manifest.json`` with the
    keys ``audio_filepath`` (absolute path), ``duration`` (sample count
    divided by the sampling rate) and ``text`` (stripped and lower-cased).
    ``i`` is the example's position in ``dataset_split``, so skipped
    examples leave gaps in the file numbering. An existing manifest file
    with the same name is overwritten.

    An example is skipped (and produces neither a WAV nor a manifest line)
    when:
      * its ``audio`` or ``text`` field is missing or empty,
      * its text is blank after stripping whitespace,
      * its audio array has no samples or contains non-finite values,
      * any other error occurs while exporting it (the error is printed).

    Args:
        dataset_split (Dataset): HuggingFace Dataset split (e.g. train/validation/test)
            whose items expose ``audio`` (with ``array`` and ``sampling_rate``) and ``text``.
        split_name (str): Name of the split; used as the prefix of all output files.
        output_dir (str): Directory to store audio files and manifest. Created if missing.

    Returns:
        str: Path to the generated manifest file.
    """
    os.makedirs(output_dir, exist_ok=True)
    manifest_path = os.path.join(output_dir, f"{split_name}_manifest.json")

    with open(manifest_path, "w", encoding="utf-8") as fout:
        for i, item in enumerate(dataset_split):
            if not item.get('audio') or not item.get('text'):
                continue

            try:
                # Validate the transcript BEFORE touching the disk so a bad
                # text value cannot leave an orphan WAV without a manifest row.
                text = item["text"].strip().lower()
                if not text:
                    # Whitespace-only transcript: treat it like a missing one.
                    continue

                # np.asarray is a no-op for ndarrays and also accepts plain lists.
                audio_array = np.asarray(item['audio']['array'])
                sr = item['audio']['sampling_rate']

                if audio_array.size == 0:
                    print(f"Skipping {split_name}_audio_{i}: empty audio array")
                    continue

                # Peak amplitude, computed once and reused for the scaling below.
                peak = np.max(np.abs(audio_array))
                if not np.isfinite(peak):
                    # NaN/Inf samples would turn into garbage after the int16 cast.
                    print(f"Skipping {split_name}_audio_{i}: non-finite audio samples")
                    continue

                # Normalize audio to [-1, 1] (all-zero audio is left as is),
                # then convert to int16.
                if peak != 0:
                    audio_array = audio_array / peak
                audio_array = (audio_array * 32767).astype(np.int16)

                # Save audio to WAV file
                audio_path = os.path.join(output_dir, f"{split_name}_audio_{i}.wav")
                sf.write(audio_path, audio_array, sr, format='WAV', subtype='PCM_16')

                # Create manifest entry
                duration = len(audio_array) / sr
                manifest_entry = {
                    "audio_filepath": os.path.abspath(audio_path),
                    "duration": duration,
                    "text": text
                }

                # Write to file with readable Unicode
                fout.write(json.dumps(manifest_entry, ensure_ascii=False) + "\n")

            except Exception as e:
                # Best-effort export: report the failing example and keep going.
                print(f"Skipping {split_name}_audio_{i}: {e}")

    return manifest_path

In [8]:
# Export each split to WAV files and a manifest (default output directory).
train_manifest = create_manifest_from_split(
    dataset_split=new_dataset_dict["train"],
    split_name="train",
)
validation_manifest = create_manifest_from_split(
    dataset_split=new_dataset_dict["validation"],
    split_name="validation",
)
test_manifest = create_manifest_from_split(
    dataset_split=new_dataset_dict["test"],
    split_name="test",
)

In [9]:
# Show where each split's manifest was written, one path per line.
print(train_manifest, validation_manifest, test_manifest, sep="\n")

../data/springlab-asr-task-wavs/train_manifest.json
../data/springlab-asr-task-wavs/validation_manifest.json
../data/springlab-asr-task-wavs/test_manifest.json
