# In [11]:
import os
import json
import pandas as pd

# Dataset root: the 'data' directory one level above the directory that
# contains this file.
_SCRIPT_DIR = os.path.dirname(__file__)
DATASET_PATH = os.path.join(_SCRIPT_DIR, '..', 'data')

# 'clips' sub-directory of the dataset root. Nothing here creates it, so make
# sure it exists before relying on it.
CLIPS_DIR = os.path.join(DATASET_PATH, 'clips')

def create_manifest(csv_filename, manifest_rel_name, manifest_abs_name):
    """Write relative- and absolute-path JSON-lines manifests for a processed CSV.

    Reads ``<DATASET_PATH>/processed/<csv_filename>`` and writes one JSON
    object per row, with the keys ``audio_filepath`` and ``text``, to two
    files under ``../manifests`` (a relative path, so it is resolved against
    the current working directory):

    * ``manifest_rel_name`` stores the ``wav_path`` column value unchanged.
    * ``manifest_abs_name`` stores that value joined onto ``DATASET_PATH``
      and converted to an absolute path.

    Rows with a missing ``wav_path`` or ``transcript`` are skipped, and the
    output directory is created when it does not exist yet.

    Args:
        csv_filename: Name of a CSV file inside ``<DATASET_PATH>/processed``
            that provides ``wav_path`` and ``transcript`` columns.
        manifest_rel_name: File name of the relative-path manifest.
        manifest_abs_name: File name of the absolute-path manifest.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If a required column is absent from the CSV.
    """
    csv_path = os.path.join(DATASET_PATH, 'processed', csv_filename)
    df = pd.read_csv(csv_path)

    # A missing transcript would be serialised as the bare token NaN, which is
    # not valid JSON, and a missing path cannot be joined onto the dataset
    # root, so incomplete rows are dropped up front.
    df = df.dropna(subset=["wav_path", "transcript"])

    rel_manifest_path = os.path.join('..', 'manifests', manifest_rel_name)
    abs_manifest_path = os.path.join('..', 'manifests', manifest_abs_name)

    # open(..., 'w') does not create parent directories.
    for manifest_path in (rel_manifest_path, abs_manifest_path):
        os.makedirs(os.path.dirname(manifest_path), exist_ok=True)

    with open(rel_manifest_path, 'w', encoding='utf-8') as f_rel, \
         open(abs_manifest_path, 'w', encoding='utf-8') as f_abs:
        # Stream row by row instead of buffering every entry in memory first.
        for rel_path, text in zip(df["wav_path"], df["transcript"]):
            abs_path = os.path.abspath(os.path.join(DATASET_PATH, rel_path))

            rel_entry = {"audio_filepath": rel_path, "text": text}
            abs_entry = {"audio_filepath": abs_path, "text": text}

            f_rel.write(json.dumps(rel_entry, ensure_ascii=False) + '\n')
            f_abs.write(json.dumps(abs_entry, ensure_ascii=False) + '\n')

def create_test_manifest(test_tsv_filename, rel_output_name, abs_output_name):
    """Write relative- and absolute-path JSON-lines manifests for a raw test TSV.

    Reads the tab-separated file ``<DATASET_PATH>/raw/<test_tsv_filename>``
    and writes one JSON object per row, with the keys ``audio_filepath`` and
    ``text``, to two files under ``../manifests`` (a relative path, so it is
    resolved against the current working directory).

    The audio path is built from the ``path`` column: every ``.mp3`` in the
    value is replaced with ``.wav`` and the result is placed under ``clips``.
    The relative manifest stores that path as-is; the absolute manifest
    stores it joined onto ``DATASET_PATH`` and converted to an absolute path.
    The ``text`` value is taken from the ``sentence`` column.

    Rows with a missing ``path`` or ``sentence`` are skipped, and the output
    directory is created when it does not exist yet.

    Args:
        test_tsv_filename: Name of a TSV file inside ``<DATASET_PATH>/raw``
            that provides ``path`` and ``sentence`` columns.
        rel_output_name: File name of the relative-path manifest.
        abs_output_name: File name of the absolute-path manifest.

    Raises:
        FileNotFoundError: If the TSV file does not exist.
        KeyError: If a required column is absent from the TSV.
    """
    test_tsv_path = os.path.join(DATASET_PATH, 'raw', test_tsv_filename)
    df = pd.read_csv(test_tsv_path, sep='\t')

    # A missing path is a float NaN with no .replace(), and a missing sentence
    # would be serialised as the bare token NaN (not valid JSON), so
    # incomplete rows are dropped up front.
    df = df.dropna(subset=["path", "sentence"])

    rel_manifest_path = os.path.join('..', 'manifests', rel_output_name)
    abs_manifest_path = os.path.join('..', 'manifests', abs_output_name)

    # open(..., 'w') does not create parent directories.
    for manifest_path in (rel_manifest_path, abs_manifest_path):
        os.makedirs(os.path.dirname(manifest_path), exist_ok=True)

    with open(rel_manifest_path, 'w', encoding='utf-8') as f_rel, \
         open(abs_manifest_path, 'w', encoding='utf-8') as f_abs:
        for clip_name, sentence in zip(df["path"], df["sentence"]):
            relative_wav_path = os.path.join('clips', clip_name.replace(".mp3", ".wav"))
            absolute_wav_path = os.path.abspath(os.path.join(DATASET_PATH, relative_wav_path))

            entry_rel = {"audio_filepath": relative_wav_path, "text": sentence}
            entry_abs = {"audio_filepath": absolute_wav_path, "text": sentence}

            f_rel.write(json.dumps(entry_rel, ensure_ascii=False) + '\n')
            f_abs.write(json.dumps(entry_abs, ensure_ascii=False) + '\n')


# Build the train, validation and test manifests, each in a relative-path and
# an absolute-path variant. Runs at module level, in this order.
_MANIFEST_JOBS = (
    (create_manifest, ("final_train.csv", "train_manifest.jsonl", "train_manifest_abs.jsonl")),
    (create_manifest, ("validation.csv", "val_manifest.jsonl", "val_manifest_abs.jsonl")),
    (create_test_manifest, ("test.tsv", "test_manifest.jsonl", "test_manifest_abs.jsonl")),
)
for _build, _file_names in _MANIFEST_JOBS:
    _build(*_file_names)