In [1]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import os
import subprocess
import shutil

import librosa
import soundfile as sf
import torch

from pyannote.audio import Pipeline


  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")


In [3]:
def download_youtube_audio(youtube_id, out_dir="yt_audio"):
    os.makedirs(out_dir, exist_ok=True)

    url = f"https://www.youtube.com/watch?v={youtube_id}"

    cmd = [
        "yt-dlp",
        "-f", "bestaudio/best",
        "--extract-audio",
        "--audio-format", "wav",
        "--audio-quality", "0",
        "--ffmpeg-location", "C:\\ffmpeg\\bin",  # üîß ‡πÅ‡∏Å‡πâ‡πÉ‡∏´‡πâ‡∏ï‡∏£‡∏á‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á
        "--postprocessor-args", "ffmpeg:-ac 1",  # mono only
        "-o", f"{out_dir}/{youtube_id}.%(ext)s",
        url
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(result.stderr)
        raise RuntimeError("yt-dlp failed")

    print(f"‚úÖ downloaded {youtube_id}")


In [None]:
os.makedirs("inference_audio", exist_ok=True)

yt_id = "Mrax5_z07Y0" # ‡∏î‡∏≤‡∏ß‡∏´‡∏≤‡∏á‡∏Æ‡∏±‡∏•‡πÄ‡∏•‡∏¢‡πå
download_youtube_audio(yt_id, "inference_audio")

‚úÖ downloaded Mrax5_z07Y0


In [None]:
yt_id = "Gy-MZjiFv2M" # ‡∏î‡∏≠‡∏Å‡∏Å‡∏£‡∏∞‡πÄ‡∏à‡∏µ‡∏¢‡∏ß‡∏ö‡∏≤‡∏ô
download_youtube_audio(yt_id, "inference_audio")

‚úÖ downloaded Gy-MZjiFv2M


In [13]:
# !pip install demucs

In [14]:
import subprocess

def get_vocal_voice(file_path):
    
    subprocess.run([
        "demucs",
        "--two-stems=vocals",
        file_path
    ])

In [15]:
file_path_list = ["inference_audio/Gy-MZjiFv2M.wav", "inference_audio/Mrax5_z07Y0.wav"]

In [16]:
for file_path in file_path_list:
    get_vocal_voice(file_path)

In [17]:
import os
import pandas as pd

import torch
import whisper
from transformers import pipeline

from transformers import logging
logging.set_verbosity_error()

In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32


whisper_model = whisper.load_model("large")
whisper_model = whisper_model.to(device)

def whisper_trans(audio_path):
    """Transcribe with OpenAI Whisper (single file)"""
    with torch.no_grad():
        result = whisper_model.transcribe(audio_path)
    return result["text"]

In [19]:
task = "transcribe"

pipe_pathumma = pipeline(
    task="automatic-speech-recognition",
    model="nectec/Pathumma-whisper-th-large-v3",
    torch_dtype=torch_dtype,
    device=device,
)

pipe_pathumma.model.config.forced_decoder_ids = (
    pipe_pathumma.tokenizer.get_decoder_prompt_ids(
        language=None,
        task=task
    )
)

def pathumma_trans(audio_path):
    """Transcribe with Pathumma Whisper (single file)"""
    with torch.no_grad():
        result = pipe_pathumma(audio_path)
    return result["text"]


In [20]:
file_path_list = ["separated/htdemucs/Gy-MZjiFv2M/vocals.wav", "separated/htdemucs/Mrax5_z07Y0/vocals.wav"]

rows = []

for file_path in file_path_list:
    whisper_text = whisper_trans(file_path)
    pathumma_text = pathumma_trans(file_path)

    speaker = file_path.split("/")[0]

    row_dict = {
        "file_path": file_path,
        "whisper_text": whisper_text,
        "pathumma_text": pathumma_text,
        "speaker": speaker
    }

    rows.append(row_dict)

    print(f"File: {file_path} - Whisper: {whisper_text} - Pathumma: {pathumma_text}")

File: separated/htdemucs/Gy-MZjiFv2M/vocals.wav - Whisper: ‡∏™‡∏á‡∏™‡∏≤‡∏£ Sew goes further‡πÑ‡∏î‡πâ‡∏°‡∏≤‡∏û‡πâ‡∏≠ ‡∏´‡∏ô‡∏≠‡∏°‡πÅ‡∏Å‡πâ‡∏°‡∏≠‡∏á ‡∏ï‡πâ‡∏≠‡∏á‡∏ô–ø–æ–ª–æ–∂‡∏ß‡∏±‡∏ô‡∏™‡∏á‡∏Å‡∏£‡∏≤‡∏ô‡πÄ‡∏ï‡πâ‡∏ô‡∏≠‡∏¢‡∏π‡πà‡∏´‡∏ô‡πâ‡∏≤‡∏≠‡∏≤‡∏´‡∏≤‡∏£ ‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏Ñ‡∏∑‡∏ô‡πÄ‡∏ó‡∏®‡∏Å‡∏≤‡∏•‡∏ß‡∏±‡∏ô‡πÑ‡∏´‡∏•‡πÄ‡∏à‡πâ‡∏≤‡∏¢‡∏¥‡πâ‡∏°‡πÄ‡∏Ç‡πâ‡∏≤‡∏ß‡∏∞‡∏¢‡∏≠‡∏Å ‡∏ö‡∏≠‡∏Å‡∏ß‡πà‡∏≤ ‡∏Æ‡∏±‡∏Å‡πÑ‡∏≠‡πâ‡πÄ‡∏ö‡∏¥‡∏î‡πÉ‡∏à‡πÄ‡∏Ç‡πâ‡∏≤‡∏°‡∏≤‡∏™‡∏ö‡∏ó‡∏µ‡πà‡∏ï‡∏£‡∏á‡πÑ‡∏• ‡∏à‡∏∞‡∏ß‡πà‡∏≤ ‡∏ß‡∏∞‡πÉ‡∏à‡πÄ‡∏´‡∏≤‡∏∞‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ô‡∏≠‡∏¢‡∏π‡πà‡∏Å‡∏±‡∏ô‡∏™‡∏≠‡∏á‡∏Ñ‡∏∑‡∏ô ‡πÄ‡∏à‡πâ‡∏≤‡∏°‡∏≤‡∏Ñ‡∏≠‡∏Å‡∏•‡∏∞‡πÑ‡∏õ‡πÄ‡∏´‡πá‡∏î‡∏á‡∏≤‡∏ô‡∏Å‡πà‡∏≠‡∏ô‡∏Å‡∏£‡∏∞‡πÄ‡∏à‡∏µ‡∏¢‡∏ß‡∏ö‡∏≤‡∏ô ‡∏ö‡πà‡∏≠‡∏ï‡πâ‡∏≠‡∏á‡∏¢‡πà‡∏≤‡∏ô‡∏™‡∏µ‡πà‡∏Å‡∏•‡πà‡∏≥‡∏°‡∏´‡∏≤‡πÑ‡∏≠‡πâ‡∏Å‡πá‡∏ó‡∏≤‡∏à‡∏°‡πÄ‡∏®‡∏£‡πâ‡∏≤ ‡∏Å‡∏≠‡∏î‡∏£‡∏π‡∏õ‡πÄ‡∏à‡πâ‡∏≤‡∏ó‡∏±‡πâ‡∏á‡∏ô‡πâ‡∏≥‡∏ï‡∏≤‡πÄ‡∏à‡πâ‡∏≤‡∏Ñ‡∏á‡∏•‡∏∑‡∏°‡∏™‡∏±‡∏ç‡∏ç‡∏≤ ‡∏™‡∏≤‡∏°‡πÄ‡∏î‡∏∑‡∏≠‡∏ô‡∏Å‡∏ß‡πà‡∏≤‡πÜ ‡πÅ‡∏•‡πâ‡∏ß‡πÑ‡∏õ‡πÇ‡∏¢‡∏™‡∏±‡∏¢‡∏≠‡∏¢‡∏≤‡∏Å‡πÄ‡∏à‡∏≠ ‡πÑ‡∏î‡πâ‡πÅ‡∏Ñ‡πà‡πÄ‡∏û‡∏µ‡

In [21]:
df = pd.DataFrame(rows)
df.to_csv("transcription_inference.csv", index=False)

In [22]:
df.head()

Unnamed: 0,file_path,whisper_text,pathumma_text,speaker
0,separated/htdemucs/Gy-MZjiFv2M/vocals.wav,‡∏™‡∏á‡∏™‡∏≤‡∏£ Sew goes further‡πÑ‡∏î‡πâ‡∏°‡∏≤‡∏û‡πâ‡∏≠ ‡∏´‡∏ô‡∏≠‡∏°‡πÅ‡∏Å‡πâ‡∏°‡∏≠‡∏á ‡∏ï‡πâ‡∏≠‡∏á...,‡πÑ‡∏î‡πâ‡∏°‡∏≤‡∏û‡πà‡∏≠‡∏ô‡πâ‡∏≠‡∏á‡πÅ‡∏Å‡πâ‡∏°‡∏≠‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏≠‡∏á‡πÉ‡∏ô‡∏ß‡∏±‡∏ô‡∏™‡∏á‡∏Å‡∏£‡∏≤‡∏ô‡∏ï‡πå ‡πÄ‡∏ï‡πâ‡∏ô‡∏≠‡∏¢‡∏π‡πà...,separated
1,separated/htdemucs/Mrax5_z07Y0/vocals.wav,Thank you. Thank you. Thank you. Thank you. T...,‡πÄ‡∏°‡∏∑‡πà‡∏≠‡πÄ‡∏°‡∏∑‡πà‡∏≠ ‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡∏õ‡∏è‡∏¥‡∏ö‡∏±‡∏ï‡∏¥‡∏Å‡∏≤‡∏£‡∏Å‡∏≤‡∏£‡∏Å‡∏≤‡∏£‡πÄ‡∏á‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡πÉ‡∏ô...,separated
