In [None]:
import os
import json
import csv
import regex as re
import subprocess
import numpy as np
from youtubesearchpython import VideosSearch
import soundfile as sf
from typing import List, Dict

## Utils

In [None]:
def duration_to_int(dur):
    """Convert a colon-separated duration string into a number of seconds.

    Fields are weighted by powers of 60 from the right, so ``"SS"``,
    ``"MM:SS"`` and ``"H:MM:SS"`` are all accepted.

    Args:
        dur: Duration text such as ``"12:34"`` or ``"1:02:03"``. ``None`` or
            an empty/blank string is treated as "no duration".

    Returns:
        Total number of seconds as an int; ``0`` when no duration is given.

    Raises:
        ValueError: If a field is not an integer.
    """
    if dur is None or not dur.strip():
        # Without this guard a missing duration raises here and takes down any
        # caller that sorts or filters a whole result list by duration.
        # NOTE(review): presumably search hits such as live streams carry no
        # duration -- confirm against the search library's output.
        return 0

    seconds = 0
    for field in dur.split(":"):
        # Horner form of sum(60**k * field_k), most significant field first
        seconds = seconds * 60 + int(field)
    return seconds

In [None]:
def read_audio_section(filename, start_time, stop_time):
    """Read the part of an audio file that lies between two timestamps.

    Args:
        filename: Path of the audio file, opened with ``sf.SoundFile``.
        start_time: Section start in seconds; truncated to a whole second.
        stop_time: Section end in seconds. The section length
            ``stop_time - start_time`` is truncated to a whole second, so a
            section shorter than one second reads zero frames.

    Returns:
        A tuple ``(audio_section, sr)``: whatever ``track.read`` returned for
        the requested frames, and the sample rate of the file.

    Raises:
        ValueError: If the file does not support seeking.
    """
    # The context manager closes the file on every exit path, including the
    # ValueError below; previously the handle was left open.
    with sf.SoundFile(filename) as track:
        if not track.seekable():
            raise ValueError("Not compatible with seeking")

        sr = track.samplerate
        start_frame = sr * int(start_time)
        frames_to_read = sr * int(stop_time - start_time)
        track.seek(start_frame)
        audio_section = track.read(frames_to_read)

    return audio_section, sr

In [None]:
# Captures the text inside every [...] pair of a string. Despite its name this is
# not a URL matcher: the chunking code applies it to downloaded file names and
# keeps the last bracketed group.
URL_PATTERN = re.compile(r'\[([^\]]+)\]')

## Raag Search

### Query Builder

In [None]:
# Placeholder artist lists; they are not filled or read anywhere in the visible notebook.
vocal_artists = []
instr_artists = []

In [None]:
# One entry per kind of recording to collect:
# (search-term suffix, minimum seconds, maximum seconds)
_RAAG_SEARCHES = (
    ("Khyaal", 1800, 5400),        # Bada Khyal: 30-90 minutes
    ("Vocal Drut", 420, 1500),     # Chhota Khyal: 7-25 minutes
    ("Instrumental", 600, 2700),   # Instrumental: 10-45 minutes
)


def _links_within_duration(query, min_seconds, max_seconds, limit):
    """Run one video search and return the links whose duration is in range, longest first."""
    videos = VideosSearch(query, limit=limit).result()["result"]

    timed = []
    untimed = []
    for video in videos:
        raw_duration = video.get("duration")
        if raw_duration:
            timed.append((duration_to_int(raw_duration), video))
        else:
            # No usable duration, so the length constraint cannot be checked.
            # NOTE(review): presumably live streams -- confirm.
            untimed.append(video)

    # Ascending stable sort followed by a reversal, so equal-length videos keep
    # the same relative order the previous sorted(...)[::-1] produced.
    timed = sorted(timed, key=lambda pair: pair[0])[::-1]

    links = []
    for seconds, video in timed:
        if min_seconds <= seconds <= max_seconds:
            print(f"adding {video['title']}")
            links.append(video["link"])
        else:
            print(f"skipping {video['title']}")
    for video in untimed:
        print(f"skipping {video['title']}")
    return links


def query_builder(raag, limit=7) -> List[str]:
    """Collect video links for a raag across three kinds of recordings.

    For each entry of ``_RAAG_SEARCHES`` the query ``"Raag {raag} {suffix}"``
    is searched and only results whose duration lies within that entry's
    bounds (inclusive) are kept. Every result is reported on stdout as either
    "adding" or "skipping".

    Args:
        raag: Raag name inserted into each search query.
        limit: Maximum number of results requested per search (default 7).

    Returns:
        The kept links: all Khyaal hits, then Vocal Drut hits, then
        Instrumental hits, each group ordered from longest to shortest.
    """
    links = []
    for suffix, min_seconds, max_seconds in _RAAG_SEARCHES:
        links.extend(
            _links_within_duration(f"Raag {raag} {suffix}", min_seconds, max_seconds, limit)
        )
    return links

In [None]:
# # List of all raags
# raag_names = []
# refs = []
# with open("tanarang/raaglist.csv", "r") as fp:
#     wr = csv.reader(fp, delimiter=",")
#     for line in wr:
#         raag_names.append(line[0])
#         refs.append(line[1])
# raag_include = [False] * len(raag_names)


In [None]:
# # To create a SMALL dataset
# for ii, rn in enumerate(raag_names):
#     y = input(f"include {rn}?\n")
#     if "y" in y:
#         raag_include[ii] = True
# with open("raags_small.txt", "w") as fp:
#     for raag in np.array(raag_names)[raag_include]:
#         fp.write(f"{raag}\n")
# Load the hand-picked raag names: one name per line, surrounding whitespace removed.
with open("raags_small.txt", "r") as fp:
    raag_names_small = [line.strip() for line in fp]

### Gather Links + Download Audios

In [None]:
# Root folder for the full-length downloads; fetch_audio creates one sub-folder per raag in it.
DOWNLOAD_DIR = "hindustani-raag-fullaudios"

In [None]:
def fetch_audio(raag):
    """Download the audio of every link found for ``raag`` as mp3 files.

    Links come from ``query_builder(raag)``. Files are written by yt-dlp into
    ``DOWNLOAD_DIR/<raag_concise>/``, where ``raag_concise`` is the raag name
    cut at the first "(" with all spaces removed. yt-dlp's stdout is printed
    for each link, and anything it wrote to stderr is printed after "Error:".

    Args:
        raag: Raag name used both for the search and for the folder name.

    Returns:
        None.
    """
    raag_urls = query_builder(raag)

    raag_concise = raag.split("(")[0].replace(" ", "")
    target_dir = DOWNLOAD_DIR + "/" + raag_concise
    os.makedirs(target_dir, exist_ok=True)

    for url in raag_urls:
        # Argument list and no shell: characters such as '&', '?', quotes or
        # spaces in the link or folder name reach yt-dlp verbatim instead of
        # being interpreted by a shell. Flags are the same as before.
        command = [
            "/opt/homebrew/bin/yt-dlp",
            url,
            "-f", "ba",
            "-x",
            "--audio-format", "mp3",
            "--ffmpeg-location", "/opt/homebrew/bin/ffmpeg",
            "-P", target_dir + "/",
        ]
        try:
            result = subprocess.run(command, capture_output=True)
        except OSError as exc:
            # The executable could not be started (e.g. it is not at this
            # path). Report it and move on, as a shell would have done.
            print("Error:", exc)
            continue
        print(result.stdout.decode())
        if len(result.stderr) > 0:
            print("Error:", result.stderr.decode())

In [None]:
# Full run over the small raag list, currently disabled:
# for raag in raag_names_small:
#     fetch_audio(raag)
# Only this single raag is fetched when the cell runs.
fetch_audio("Amritavarshini")

### Get Chunks & Write to Dataset Structure

In [None]:
# Output root for the chunked dataset; the chunking loop creates one sub-folder per raag in it.
DATASET_DIR = "hindustani-raag-small"
# Same value as the DOWNLOAD_DIR assigned in the download section; presumably
# repeated so this section can be run without the download cells.
DOWNLOAD_DIR = "hindustani-raag-fullaudios"

In [None]:
# heuristic sample locations, given as (start, end) fractions of the whole recording
# each chunk spans 0.01 of the audio duration
# Chunks cut from every recording that is not the raag's test file
train_chunks = [
    (0.24, 0.25),
    (0.44, 0.45),
    (0.74, 0.75)  
]
# Chunks cut from the one recording per raag that is picked as the test file
test_chunks = [
    (0.44, 0.45),
    (0.64, 0.65),
]

In [None]:
# Fixed seed so the choice of each raag's test recording is repeatable.
np.random.seed(0)


def _visible_entries(directory):
    """Return the sorted entry names of ``directory`` that do not start with a dot."""
    return [name for name in sorted(os.listdir(directory)) if not name.startswith(".")]


# For every raag folder: pick one recording as the test file, then cut the
# configured chunks out of each recording and write them as
# DATASET_DIR/<raag>/<split>_[<id>]_chunk<n>.mp3.
for raag in _visible_entries(DOWNLOAD_DIR):
    raag_dir = DOWNLOAD_DIR + "/" + raag
    if not os.path.isdir(raag_dir):
        # Stray files next to the raag folders cannot be listed as folders.
        continue

    audio_files = _visible_entries(raag_dir)
    if not audio_files:
        # Nothing downloaded for this raag; choosing from an empty range would raise.
        continue

    os.makedirs(DATASET_DIR + "/" + raag, exist_ok=True)
    # The index is drawn over the visible recordings only. Drawing it over the
    # raw listing could select a hidden entry that the loop skips, leaving the
    # raag without any test chunks.
    test_file_idx = np.random.choice(range(len(audio_files)))

    for ai, audio in enumerate(audio_files):
        audio_path = raag_dir + "/" + audio
        # Only the length is needed here; close the handle right away.
        with sf.SoundFile(audio_path) as f:
            duration = f.frames / f.samplerate
        # audio_concise = re.sub(r'[^\p{L}\p{M}\d]', "-", audio[:-4])
        # Last [...] group of the file name, used to tag the chunk files.
        audio_url = re.findall(URL_PATTERN, audio)[-1]

        if ai == test_file_idx:
            split, chunks = "test", test_chunks
        else:
            split, chunks = "train", train_chunks

        for ci, ch in enumerate(chunks):
            writepath = DATASET_DIR + "/" + raag + "/" + split + "_[" + audio_url + "]_chunk" + str(ci) + ".mp3"
            # Existing chunk files are left untouched, so the cell can be re-run.
            if not os.path.exists(writepath):
                audio_chunk, sr = read_audio_section(audio_path, duration * ch[0], duration * ch[1])
                sf.write(writepath, audio_chunk, sr)

Note: listen to every sample to make sure it is music rather than speech and is indeed in the stated raag.

In [None]:
# TODO: Carry over some metadata from the YouTube search.
# This will be very important for credit attribution!

## Push To Hub

In [None]:
# # check whether this looks good
# from datasets import load_dataset
# ds = load_dataset("hindustani-raag-small")

# # this doesn't work
# ds.push_to_hub(repo_id="neerajaabhyankar/hindustani-raag-small", token="")

In [None]:
from huggingface_hub import HfApi
# Client object whose upload_folder method publishes the dataset folder.
api = HfApi()

In [None]:
# Upload the generated dataset folder to the Hub dataset repository.
# folder_path reuses DATASET_DIR so the upload always targets the folder the
# chunking step wrote to, instead of a second hard-coded copy of the path.
api.upload_folder(
    folder_path=DATASET_DIR,
    repo_id="neerajaabhyankar/hindustani-raag-small",
    repo_type="dataset",
)