In [None]:
import os
import json
import pandas as pd

# Define your dataset path: the directory that holds the split CSV files.
# create_manifest() joins the CSV filename, each row's "wav_path", and the
# output manifest filenames onto this directory.
# My example (for Google Colab with mounted Drive):
# DATASET_PATH = "/content/drive/MyDrive/Capstone/cv-corpus-20.0-2024-12-06/hy-AM"

DATASET_PATH = "/your/local/or/colab/path/to/dataset"

def create_manifest(csv_filename, manifest_rel_name, manifest_abs_name,
                    dataset_path=None):
    """Write two JSON-lines manifests from a CSV of audio paths and transcripts.

    The CSV must contain a ``wav_path`` column and a ``transcript`` column.
    One JSON object per CSV row is written to each manifest, with the keys
    ``audio_filepath`` and ``text``. The "relative" manifest stores
    ``wav_path`` exactly as it appears in the CSV; the "absolute" manifest
    stores it joined onto the dataset directory and made absolute.

    Args:
        csv_filename: CSV file name, relative to the dataset directory.
        manifest_rel_name: Output file name for the relative-path manifest,
            relative to the dataset directory.
        manifest_abs_name: Output file name for the absolute-path manifest,
            relative to the dataset directory.
        dataset_path: Dataset directory. Defaults to the module-level
            ``DATASET_PATH`` when omitted.

    Raises:
        KeyError: If the CSV lacks a ``wav_path`` or ``transcript`` column.
    """
    base_dir = DATASET_PATH if dataset_path is None else dataset_path

    csv_path = os.path.join(base_dir, csv_filename)
    # Read every cell as a plain string and disable pandas' NA detection.
    # Otherwise an empty transcript (or one that is literally "null", "NA",
    # "nan", ...) becomes float NaN, which json would serialise as the bare
    # token NaN -- not valid JSON. Empty transcripts are kept as "".
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    # Pull the columns out before opening the outputs so that a missing
    # column fails without creating or truncating the manifest files.
    wav_paths = df["wav_path"].tolist()
    transcripts = df["transcript"].tolist()

    rel_manifest_path = os.path.join(base_dir, manifest_rel_name)
    abs_manifest_path = os.path.join(base_dir, manifest_abs_name)

    with open(rel_manifest_path, 'w', encoding='utf-8') as f_rel, \
         open(abs_manifest_path, 'w', encoding='utf-8') as f_abs:
        for rel_path, transcript in zip(wav_paths, transcripts):
            abs_path = os.path.abspath(os.path.join(base_dir, rel_path))

            rel_entry = {"audio_filepath": rel_path, "text": transcript}
            abs_entry = {"audio_filepath": abs_path, "text": transcript}

            # ensure_ascii=False keeps non-ASCII transcripts readable in the
            # UTF-8 output instead of expanding them to \uXXXX escapes.
            f_rel.write(json.dumps(rel_entry, ensure_ascii=False) + '\n')
            f_abs.write(json.dumps(abs_entry, ensure_ascii=False) + '\n')

    print(f"Created: {rel_manifest_path}")
    print(f"Created: {abs_manifest_path}")

# Build the relative- and absolute-path manifests for each dataset split.
# Each tuple is (source CSV, relative-path manifest, absolute-path manifest).
for split_csv, rel_manifest, abs_manifest in (
    ("train.csv", "train_manifest.jsonl", "train_manifest_abs.jsonl"),
    ("validation.csv", "val_manifest.jsonl", "val_manifest_abs.jsonl"),
):
    create_manifest(split_csv, rel_manifest, abs_manifest)