In [1]:
import os
import warnings

import pandas as pd
from datasets import Dataset, Audio, DatasetInfo, Features, Value

In [2]:
# load the audio files under the "sanskrit" folder together with their transcriptions

In [3]:
def search_folder(folder):
    """Pair every ``.wav`` file in ``folder`` with its transcription text.

    The folder is scanned for audio files named like ``sent_<n>.wav`` and for a
    ``.txt`` file, which is parsed as a header-less, comma-separated table.
    Column 1 of that table, stripped of surrounding whitespace, is used as the
    transcription. Audio files are ordered by ``<n>`` and matched to the table
    rows by position.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A dict mapping ``os.path.join(folder, <wav name>)`` to its transcription,
        in ascending ``<n>`` order, or ``None`` when the folder contains no
        ``.wav`` file or no ``.txt`` file.

    Raises:
        IndexError, ValueError: if a ``.wav`` name has no integer between its
            first ``_`` and the following ``.``.
    """
    # List the directory once so the audio and transcription lookups share one snapshot.
    entries = os.listdir(folder)

    # Numeric sort on the file number: sent_1.wav, sent_2.wav, ..., sent_10.wav
    # (a plain string sort would put sent_10 before sent_2).
    audio_files = sorted(
        (f for f in entries if f.endswith(".wav")),
        key=lambda f: int(f.split("_")[1].split(".")[0]),
    )
    if not audio_files:
        return None

    text_files = [f for f in entries if f.endswith(".txt")]
    if not text_files:
        # Audio without any transcription cannot be used; report it instead of
        # failing with an opaque IndexError on the empty list.
        warnings.warn(f"No transcription (.txt) file found in {folder!r}; skipping folder.")
        return None
    # NOTE(review): when several .txt files exist the first one listed is used.
    transcription_file = os.path.join(folder, text_files[0])

    audio_files = [os.path.join(folder, audio_file) for audio_file in audio_files]

    # NOTE(review): assumes column 0 is an identifier and column 1 the text, and that
    # the text itself contains no commas -- confirm against the corpus format.
    transcription_texts = pd.read_csv(transcription_file, sep=",", header=None)[1].str.strip().values

    if len(audio_files) != len(transcription_texts):
        # zip() below stops at the shorter sequence; make the mismatch visible because
        # positional pairing may be misaligned when the counts differ.
        warnings.warn(
            f"{folder!r}: found {len(audio_files)} audio files but "
            f"{len(transcription_texts)} transcription rows; extra items are ignored."
        )

    return dict(zip(audio_files, transcription_texts))

In [4]:
# Collect the audio-path -> transcription pairs from every recording folder
# inside the "sanskrit" corpus directory into one mapping.
audiofile2transcription = {}
ROOT_DIR = "sanskrit"
RECORDING_FOLDER = os.path.join(ROOT_DIR, "newsonair_v5", "sanskrit")
# os.listdir returns entries in arbitrary order; sorting keeps the dataset row
# order reproducible across runs and machines.
for entry in sorted(os.listdir(RECORDING_FOLDER)):
    folder = os.path.join(RECORDING_FOLDER, entry)
    # Skip stray files sitting next to the recording folders: search_folder
    # lists the path it is given, which fails for a non-directory.
    if not os.path.isdir(folder):
        continue
    more_data = search_folder(folder)
    if more_data is not None:
        audiofile2transcription.update(more_data)


In [5]:
# Column-oriented view of the mapping: one list of audio paths and one list of
# transcriptions, index-aligned because both come from the same dict.
dict_for_dataset = dict(
    audio=list(audiofile2transcription),
    transcriptions=list(audiofile2transcription.values()),
)

In [6]:
# Build the Dataset from the two columns, then cast the "audio" column of file
# paths to the Audio feature with a 16 kHz sampling rate.
dataset_san = (
    Dataset.from_dict(dict_for_dataset)
    .cast_column("audio", Audio(sampling_rate=16_000))
)

In [7]:
# Spot-check the first example: the cell output shows its file path, the decoded
# sample array and the sampling rate.
dataset_san[0]["audio"]

{'path': 'sanskrit\\newsonair_v5\\sanskrit\\NSD-Sanskrit-Sanskrit-0655-0700-2019101072720\\sent_1.wav',
 'array': array([-0.00112915, -0.00106812, -0.00100708, ..., -0.00848389,
        -0.01177979, -0.00576782]),
 'sampling_rate': 16000}

In [8]:
# Metadata (description, schema, homepage, citation) for the Sanskrit dataset.
dataset_san_info = DatasetInfo(
    description="Shrutilipi is a labelled ASR corpus for Sanskrit",
    # Keep the declared audio feature consistent with the 16 kHz cast applied to
    # the "audio" column; a bare Audio() here would drop that sampling rate from
    # the dataset's features once this info object is attached to the dataset.
    features=Features({"audio": Audio(sampling_rate=16_000), "transcriptions": Value("string")}),
    supervised_keys=None,
    homepage="https://ai4bharat.org/shrutilipi",
    citation="""
        @misc{https://doi.org/10.48550/arxiv.2208.12666,
            doi = {10.48550/ARXIV.2208.12666},
            url = {https://arxiv.org/abs/2208.12666},
            author = {Bhogale, Kaushal Santosh and Raman, Abhigyan and Javed, Tahir and Doddapaneni, Sumanth and Kunchukuttan, Anoop and Kumar, Pratyush and Khapra, Mitesh M.},
            title = {Effectiveness of Mining Audio and Text Pairs from Public Data for Improving ASR Systems for Low-Resource Languages},
            publisher = {arXiv},
            year = {2022},
            copyright = {arXiv.org perpetual, non-exclusive license}
        }
    """,
)


In [9]:
# Attach the metadata to the dataset by replacing its private `_info` attribute.
# NOTE(review): passing `info=dataset_san_info` to `Dataset.from_dict` would avoid
# touching a private attribute -- consider moving it there.
dataset_san._info = dataset_san_info
# The row count must not be set by hand: `len(dict_for_dataset)` is the number of
# columns in the dict (2), not the number of examples. Verify the real row count
# against the number of audio paths instead.
assert dataset_san.num_rows == len(dict_for_dataset["audio"])
dataset_san.save_to_disk("shrutilipi_sanskrit")

Saving the dataset (0/16 shards):   0%|          | 0/14414 [00:00<?, ? examples/s]

In [10]:
# Read the Hub token from the environment instead of hardcoding it in the notebook,
# so the credential never ends up in the saved file or its history.
# When HF_TOKEN is unset, TOKEN is None and push_to_hub falls back to the token
# stored by a prior Hugging Face login.
TOKEN = os.environ.get("HF_TOKEN")
dataset_san.push_to_hub("shrutilipi_sanskrit", token=TOKEN)

Map:   0%|          | 0/901 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/16 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]