In [6]:
# Environment bootstrap for the notebook.
# load_dotenv() is expected to read a local .env file into the process
# environment (python-dotenv behaviour — confirm the file's location), after
# which the Hugging Face token is read from it. os.getenv returns None when
# HF_TOKEN is not set, so downstream users of HF_TOKEN must tolerate None.
from dotenv import load_dotenv
import os
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# NOTE(review): security-sensitive. The assignment below replaces the ssl
# module's default HTTPS context factory with the *unverified* one, so code
# that relies on that default (e.g. urllib-based downloads) skips TLS
# certificate verification for the rest of the kernel session. It relies on
# private ssl attributes. Presumably a workaround for local certificate
# errors — confirm it is still needed and prefer fixing the CA bundle.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


# ─── Prepend Homebrew to PATH ──────────────────────────────────────────────────
# so that subprocesses can find /opt/homebrew/bin/ffmpeg, etc.
# PATH is rebuilt from its current entries rather than blindly prefixed, so that
# re-running this cell does not stack duplicate Homebrew entries, and an unset
# or empty PATH does not leave an empty entry behind (an empty PATH entry is
# interpreted as the current directory).
homebrew_bin = "/opt/homebrew/bin"
_path_entries = [
    entry
    for entry in os.environ.get("PATH", "").split(os.pathsep)
    if entry and entry != homebrew_bin
]
os.environ["PATH"] = os.pathsep.join([homebrew_bin, *_path_entries])


In [2]:
import whisperx
from tools.compare_runner import get_hypothesis_text, load_reference_text, compare_texts


def cleaning(text):
    """Normalize `text` with the project's Persian normalizer.

    Args:
        text: The string to normalize.

    Returns:
        Whatever `persian_normalizer({"sentence": text}, return_dict=False)`
        returns (presumably the normalized string — confirm against the
        normalizer), or None when `text` is not a `str`.
    """
    if not isinstance(text, str):
        return None

    # Imported here because this cell does not import the normalizer itself;
    # without it the call below raises NameError on a fresh kernel unless a
    # later cell has already been executed.
    from tools.persian_normalize.persian_normalizer import persian_normalizer

    return persian_normalizer({"sentence": text}, return_dict=False)


def segments_comparison(segments,
    ground_truth_path: str,
    audio_path: str,
    diff: bool      = False,
    print_hype_text: bool     = False,
    print_ref_text: bool      = False,
    msg: str = None,
    ):
    """Compare ASR segments against a reference transcript, raw and normalized.

    The hypothesis text is built from `segments` and the reference text is
    loaded from `ground_truth_path`; both are stripped. `compare_texts` is then
    called twice: once on the raw texts and once on the texts after `cleaning`.

    NOTE(review): `cleaning` applies the Persian normalizer regardless of the
    language of the audio — confirm this is intended for non-Persian samples.

    Args:
        segments: Segment collection accepted by `get_hypothesis_text`.
        ground_truth_path: Path of the reference transcript.
        audio_path: Path of the audio file; only used in the printed header.
        diff: Forwarded to `compare_texts`.
        print_hype_text: Print the raw hypothesis text before comparing.
        print_ref_text: Print the raw reference text before comparing.
        msg: Optional label appended to the report headers; omitted when empty
            or None.

    Returns:
        None. Results are reported by `compare_texts`.
    """
    # 1. hypothesis text
    hyp_text = get_hypothesis_text(segments).strip()
    if print_hype_text:
        print("\nhyp_text:\n", hyp_text)

    # 2. reference text
    ref_text = load_reference_text(ground_truth_path).strip()
    if print_ref_text:
        print("\nref_text:\n", ref_text)

    # 3. normalized variants (non-string values are passed through unchanged)
    hyp = cleaning(hyp_text).strip() if isinstance(hyp_text, str) else hyp_text
    ref = cleaning(ref_text).strip() if isinstance(ref_text, str) else ref_text

    # Leading space keeps the label separated from the header text; an absent
    # label no longer prints the literal "None".
    label = f" {msg}" if msg else ""
    banner = "*" * 70

    # 4. compare & report: raw texts
    print("\n" + banner)
    print(f"Comparing{label}\n  HYP: {audio_path}\n  REF: {ground_truth_path}")
    print(banner + "\n")
    compare_texts(hyp_text, ref_text, diff=diff)

    # 5. compare & report: normalized texts
    print("\n" + banner)
    print(f"Comparing After Normalizing{label}\n  HYP: {audio_path}\n  REF: {ground_truth_path}")
    print(banner + "\n")
    compare_texts(hyp, ref, diff=diff)



def evaluate_transcription(
    audio_path: str,
    ground_truth_path: str,
    model_size: str = "small",
    device: str = "cpu",
    compute_type: str = "float32",
    batch_size: int = 16,
    diff: bool = False,
    print_hype_text: bool = False,
    print_ref_text: bool = False,
):
    """
    Transcribe `audio_path` with WhisperX and report it against a reference.

    Steps:
      1) Load the WhisperX model (`model_size`, `device`, `compute_type`)
      2) Load the audio and transcribe it with `batch_size`
      3) Hand the resulting segments to `segments_comparison`, labelled
         "transcription", together with the reference path and print flags

    Returns the object produced by `model.transcribe(...)`; its "segments"
    entry is what gets compared.
    """
    asr_model = whisperx.load_model(model_size, device, compute_type=compute_type)
    waveform = whisperx.load_audio(audio_path)
    transcription = asr_model.transcribe(waveform, batch_size=batch_size)

    segments_comparison(
        transcription["segments"],
        ground_truth_path,
        audio_path,
        diff=diff,
        print_hype_text=print_hype_text,
        print_ref_text=print_ref_text,
        msg="transcription",
    )

    return transcription

        #
    # # delete model if low on GPU resources
    # # import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model
    #
    #
    # # 2. Align whisper output
    # model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device, )
    # result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
    #
    # segments = result["segments"]
    # print(segments) # after alignment
    #
    # hyp_text = get_hypothesis_text(segments)
    # hyp_text = hyp_text.strip()
    # # hyp_text = hyp_text.lower()
    #
    # # 4. load ground truth
    # ref_text = load_reference_text(ground_truth_file)
    # ref_text = ref_text.strip()
    # # ref_text = ref_text.lower()
    #
    # # print("\n hype text: \n", hyp_text)
    # # print("\n ref text: \n", ref_text)
    #
    # print("\n***********************************************************************")
    # print("\n After Alignment")
    # print("\n***********************************************************************")
    #
    # # 5. compare
    # compare_texts(hyp_text, ref_text, diff=False)
    #
    #
    # # delete model if low on GPU resources
    # # import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model_a
    #
    # # 3. Assign speaker labels
    # diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
    #
    # # add min/max number of speakers if known
    # diarize_segments = diarize_model(audio)
    # # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
    #
    # result = whisperx.assign_word_speakers(diarize_segments, result)
    # print(diarize_segments)
    # print(result["segments"]) # segments are now assigned speaker IDs


def align_whisper_output(
    result,
    audio_path: str,
    ground_truth_path: str,
    device: str     = "cpu",
    diff: bool      = False,
    print_hype_text: bool     = False,
    print_ref_text: bool      = False,
):
    """Align a WhisperX transcription and report it against the reference.

    Args:
        result: Transcription output providing "language" and "segments"
            entries (e.g. the value returned by `evaluate_transcription`).
        audio_path: Path of the audio the transcription was produced from.
        ground_truth_path: Path of the reference transcript.
        device: Device passed to the alignment model and to `whisperx.align`.
        diff: Forwarded to `segments_comparison`.
        print_hype_text: Forwarded to `segments_comparison`.
        print_ref_text: Forwarded to `segments_comparison`.

    Returns:
        The aligned result from `whisperx.align`, so callers can reuse it
        instead of it being discarded.
    """
    audio = whisperx.load_audio(audio_path)

    # Load the alignment model for the language recorded in the transcription.
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    aligned = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    segments_comparison(
        aligned["segments"],
        ground_truth_path,
        audio_path,
        diff=diff,
        print_hype_text=print_hype_text,
        print_ref_text=print_ref_text,
        msg="aligned",
    )

    return aligned

In [48]:
# English sample: the Churchill recording and its reference transcript.
audio_file = "../Data/Training/English/Churchill/english_firstsourcecommons_13_churchill_64kb.mp3"
ground_truth_file = "../Data/Training/English/Churchill/english_firstsourcecommons_13_churchill_transcript-english_translation_hebrew.docx"

# Evaluate with the function's default settings (all keyword defaults apply).
evaluate_transcription(audio_file, ground_truth_file)


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.venv/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.7.1. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.99) in first 30s of audio...

**********************************************************************
Comparing
  HYP: ../Data/Training/English/Churchill/english_firstsourcecommons_13_churchill_64kb.mp3
  REF: ../Data/Training/English/Churchill/english_firstsourcecommons_13_churchill_transcript-english_translation_hebrew.docx
**********************************************************************


📊 WER: 12.70%
   Substitutions: 39
   Deletions:     249
   Insertions:    13



In [4]:
import psutil, time, functools, os
from collections import namedtuple

# Immutable record of one resource reading: a timestamp, process CPU times,
# process memory sizes, and disk byte counters.
Snapshot = namedtuple(
    "Snapshot",
    "ts cpu_user cpu_sys rss vms read_bytes write_bytes",
)

def take_snapshot():
    """Capture the current resource counters as a `Snapshot`.

    CPU times and memory sizes are read for the current process. The disk
    counters come from `psutil.disk_io_counters()`, which is system-wide, so
    they also include I/O performed by other processes.

    Returns:
        Snapshot: wall-clock timestamp, user/system CPU seconds, RSS and VMS
        in bytes, and cumulative disk read/write bytes (0 when the platform
        reports no disk counters).
    """
    proc = psutil.Process(os.getpid())
    cpu_times = proc.cpu_times()
    mem = proc.memory_info()

    # disk_io_counters() returns None on machines without disks; fall back to
    # zero instead of failing with AttributeError.
    disk = psutil.disk_io_counters()
    read_bytes = disk.read_bytes if disk is not None else 0
    write_bytes = disk.write_bytes if disk is not None else 0

    return Snapshot(
        ts=time.time(),
        cpu_user=cpu_times.user,
        cpu_sys=cpu_times.system,
        rss=mem.rss,
        vms=mem.vms,
        read_bytes=read_bytes,
        write_bytes=write_bytes,
    )

def profile_resources(func):
    """Decorator to measure resources used by a single call to func(...).

    A snapshot is taken before and after the call and the differences are
    printed: wall time, CPU time (user + system), RSS/VMS change and disk
    read/write volume, the last four in MB (bytes / 1024**2). The disk figures
    come from `take_snapshot`, which reads system-wide counters, so they are
    not limited to this process.

    The report is printed even when `func` raises; the exception then
    propagates unchanged.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same signature that returns `func`'s result.
    """
    bytes_per_mb = 1024 ** 2

    def _report(before, after):
        """Print the deltas between two snapshots."""
        wall  = after.ts - before.ts
        cpu   = (after.cpu_user + after.cpu_sys) - (before.cpu_user + before.cpu_sys)
        rss   = (after.rss - before.rss) / bytes_per_mb
        vms   = (after.vms - before.vms) / bytes_per_mb
        read  = (after.read_bytes - before.read_bytes) / bytes_per_mb
        write = (after.write_bytes - before.write_bytes) / bytes_per_mb
        print(f"\n🔍 Resource profile for `{func.__name__}`:")
        print(f"  Wall-time:   {wall:.1f}s")
        print(f"  CPU time:    {cpu:.1f}s")
        print(f"  RSS Δ:       {rss:.1f} MB")
        print(f"  VMS Δ:       {vms:.1f} MB")
        print(f"  Disk Read:   {read:.1f} MB")
        print(f"  Disk Write:  {write:.1f} MB\n")

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        before = take_snapshot()
        try:
            return func(*args, **kwargs)
        finally:
            # Runs on success and on failure, so a crashing call still gets
            # its resource profile.
            _report(before, take_snapshot())

    return wrapper


In [7]:

# result = evaluate_transcription(audio_path=audio_file, ground_truth_path=ground_truth_file, diff=False, print_hype_text=True , print_ref_text=True, model_size="medium" )

@profile_resources
def run_test():
    """Evaluate the "medium" WhisperX model on the Farsi second-source sample.

    Both the hypothesis and the reference text are printed; the diff is off.
    Returns whatever `evaluate_transcription` returns.
    """
    sample_audio = "../Data/Training/Farsi/Quran/farsi_secondsource.mp3"
    sample_reference = "../Data/Training/Farsi/Quran/farsi_secondsource_transcript-farsi_translation-hebrew.docx"

    # Other samples that can be swapped in for the two paths above:
    #   "../Data/Training/Farsi/Quran/farsi_thirdsource.mp3"
    #   "../Data/Training/Farsi/Quran/farsi_thirdsource_transcript-farsi_translation-hebrew.docx"
    #
    #   "../Data/Training/Hebrew/hebrew_firstsource.mp4"
    #   "../Data/Training/Hebrew/hebrew_firstsource_transcript-hebrew_translation-english.docx"

    options = dict(
        model_size="medium",
        diff=False,
        print_hype_text=True,
        print_ref_text=True,
    )
    return evaluate_transcription(
        audio_path=sample_audio,
        ground_truth_path=sample_reference,
        **options,
    )

# Run the profiled evaluation; being the last expression, its return value is
# also shown as the cell output.
run_test()

# align_whisper_output(result=result, audio_path=audio_file, ground_truth_path=ground_truth_file, diff=True, print_hype_text=True , print_ref_text=True )


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.venv/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.7.1. Bad things might happen unless you revert torch to 1.x.
Detected language: fa (0.98) in first 30s of audio...

hyp_text:
 به نام خداوند بخشنلی بخشایشگر سوگند به کوه تور و کتابی که نوشته شده در صفحه گسترده و سوگند به بیتون معمور و سقف برپراشته و دریاج مملوغ و برپروخته که عذاب پروردارت واقع می شود و چیزی ازن مانع نخواهد بود. این عذاب الاهی در آن روزی ایس چه آسمان به شدت به حرکت درمیادید. و کوها از جا کنده و متحرک میشن. وای در آن روز بر تکزیب کنندگان. هم آنها که در صخنان باطل به بازی مشکولند. در آن روز شا آنها را به زور به سوی آتش دوزخ نیرانند. و آنها میگویند این همان آتشیست که آن را انکار میکردید. آیا این سهرست یا شما نمیدینی در آن وارد شوید و بسو

{'segments': [{'text': ' به نام خداوند بخشنلی بخشایشگر سوگند به کوه تور و کتابی که نوشته شده در صفحه گسترده و سوگند به بیتون معمور و سقف برپراشته و دریاج مملوغ و برپروخته که عذاب پروردارت واقع می شود و چیزی ازن مانع نخواهد بود. این عذاب الاهی در آن روزی ایس چه آسمان به شدت به حرکت درمیادید.',
   'start': 0.166,
   'end': 29.157},
  {'text': ' و کوها از جا کنده و متحرک میشن. وای در آن روز بر تکزیب کنندگان. هم آنها که در صخنان باطل به بازی مشکولند. در آن روز شا آنها را به زور به سوی آتش دوزخ نیرانند. و آنها میگویند این همان آتشیست که آن را انکار میکردید. آیا این سهرست یا شما نمیدینی',
   'start': 29.427,
   'end': 52.85},
  {'text': ' در آن وارد شوید و بسوزید، میخواید سرب کنید یا نکنید، برای شما یک سان هست چرا که تنها به اعمالتان جزا داده میشدید. ولی پرهیزگاران در میان باق های بهشت و نعمت های پرابان جایی دارند، و از آنچ پرورگارشان با آنها داده و آنان را از عذاب دوزخ نگاه داشته هست شاد و مصروب هست.',
   'start': 53.305,
   'end': 77.386},
  {'text': ' با انها گفته می شود بخورید و بیا شامی

In [68]:
# Each line magic must sit on its own line: written as a single line, %load_ext
# receives everything after it as the extension name and the cell fails.
%load_ext memory_profiler
%time %memit evaluate_transcription(audio_path=audio_file, ground_truth_path=ground_truth_file, diff=False, print_hype_text=True, print_ref_text=True, model_size="medium")


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.venv/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.7.1. Bad things might happen unless you revert torch to 1.x.
Detected language: fa (0.98) in first 30s of audio...

hyp_text:
 به نام خداوند بخشنلی بخشایشگر سوگند به کوه تور و کتابی که نوشته شده در صفحه گسترده و سوگند به بیتون معمور و سقف برپراشته و دریاج مملوغ و برپروخته که عذاب پروردارت واقع می شود و چیزی ازن مانع نخواهد بود. این عذاب الاهی در آن روزی ایس چه آسمان به شدت به حرکت درمیادید. و کوها از جا کنده و متحرک میشن. وای در آن روز بر تکزیب کنندگان. هم آنها که در صخنان باطل به بازی مشکولند. در آن روز شا آنها را به زور به سوی آتش دوزخ نیرانند. و آنها میگویند این همان آتشیست که آن را انکار میکردید. آیا این سهرست یا شما نمیدینی در آن وارد شوید و بسو

In [None]:

from speechbrain.inference.ASR import WhisperASR
# Farsi sample and its reference transcript.
audio_file = "../Data/Training/Farsi/Quran/farsi_secondsource.mp3"
ground_truth_file = "../Data/Training/Farsi/Quran/farsi_secondsource_transcript-farsi_translation-hebrew.docx"

# Build the SpeechBrain Whisper model from its hparams source, using a local
# save directory.
asr_model = WhisperASR.from_hparams(
    source="speechbrain/asr-whisper-large-v2-commonvoice-fa",
    savedir="whisperx_models/asr-whisper-large-v2-commonvoice-fa",
)

# Transcribe, then join the per-segment text into one hypothesis string.
segments = asr_model.transcribe_file(audio_file)
hyp_text = " ".join([seg.words.strip() for seg in segments]).strip()

# Reference text to score against.
ref_text = load_reference_text(ground_truth_file).strip()

# Report, with the diff enabled.
print("\n▶︎ Comparing SpeechBrain Whisper (asr-whisper-large-v2-commonvoice-fa) output")
compare_texts(hyp_text, ref_text, diff=True)


In [None]:


def transcribe_wav2vec2(
    audio_path: str,
    model_name: str = "jonatasgrosman/wav2vec2-large-xlsr-53-persian",
    device: str = "cpu",
    chunk_length_s: float = None
) -> str:
    """
    Transcribe a single audio file with a pretrained Wav2Vec2 CTC model.

    The audio is loaded with torchaudio, downmixed to mono when it has more
    than one channel, and resampled to 16 kHz when needed. Decoding is greedy
    (argmax over the logits followed by `processor.batch_decode`).

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name or path passed to `from_pretrained` for both the
            processor and the model.
        device: Device the model and inputs are moved to.
        chunk_length_s: When set and the audio is longer than this many
            seconds, the audio is decoded in consecutive pieces of this length
            and the pieces are joined with spaces. Otherwise the whole file is
            decoded in one pass.

    Returns:
        The decoded transcription string.
    """
    # Imported here so this cell also works on a fresh kernel: the cell does
    # not import these names itself.
    import torch
    import torchaudio
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

    target_sr = 16_000

    # 1. Load processor & model
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)

    def _decode(samples) -> str:
        """Run the model on one mono sample array and decode it greedily."""
        inputs = processor(samples, sampling_rate=target_sr, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(inputs.input_values.to(device)).logits
        pred_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(pred_ids)[0]

    # 2. Load audio (any format / sample rate) with torchaudio
    waveform, sr = torchaudio.load(audio_path)   # Tensor [channels, num_frames]
    if waveform.size(0) > 1:
        # squeeze(0) below only removes a single-channel axis; stereo input
        # would otherwise reach the processor as a 2-row array.
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(sr, target_sr)(waveform)

    # Convert to a 1-D numpy array for the processor
    speech = waveform.squeeze(0).cpu().numpy()

    # 3. Optionally chunk long files
    if chunk_length_s and speech.shape[0] / target_sr > chunk_length_s:
        step = int(chunk_length_s * target_sr)
        # NOTE(review): a very short final piece may be too short for the
        # model's convolutional front end — confirm and drop/merge if needed.
        return " ".join(
            _decode(speech[start : start + step])
            for start in range(0, speech.shape[0], step)
        )

    # 4. One-shot decode
    return _decode(speech)

from comparison import load_reference_text, compare_texts

# Farsi sample and its reference transcript.
audio_file = "../Data/Training/Farsi/Quran/farsi_secondsource.mp3"
ground_truth_file = "../Data/Training/Farsi/Quran/farsi_secondsource_transcript-farsi_translation-hebrew.docx"

# Hypothesis: Wav2Vec2 transcript on the CPU, decoded in 30-second pieces.
hyp = transcribe_wav2vec2(
    audio_path=audio_file,
    model_name="jonatasgrosman/wav2vec2-large-xlsr-53-persian",
    device="cpu",
    chunk_length_s=30.0,
)

# Reference text, then show both texts and report with the diff enabled.
ref = load_reference_text(ground_truth_file)
print("hyp:\n", hyp, "\nref:\n", ref)
compare_texts(hyp, ref, diff=True)


In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

from comparison import load_reference_text, compare_texts
from tools.persian_normalize.persian_normalizer import persian_normalizer


def cleaning(text):
    """Run `text` through `persian_normalizer`; non-string input yields None.

    The string is wrapped as {"sentence": text} and the normalizer is called
    with return_dict=False; its return value is passed back unchanged.
    """
    if isinstance(text, str):
        return persian_normalizer({"sentence": text}, return_dict=False)
    return None



def transcribe_wav2vec2(audio_path: str,
                        model_name: str = "jonatasgrosman/wav2vec2-large-xlsr-53-persian",
                        device: str = "cpu") -> str:
    """
    Load & run a Wav2Vec2 CTC model on `audio_path`, returning the raw transcription string.

    The audio is downmixed to mono when it has more than one channel and
    resampled to 16 kHz when needed. The whole file is decoded in one pass
    with greedy (argmax) decoding.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name or path passed to `from_pretrained` for both the
            processor and the model.
        device: Device the model and inputs are moved to.

    Returns:
        The decoded transcription string.
    """
    # 1. load model + tokenizer
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model     = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
    model.eval()

    # 2. load, downmix & (if needed) resample
    waveform, sr = torchaudio.load(audio_path)          # [channels, T]
    if waveform.size(0) > 1:
        # squeeze(0) below only removes a single-channel axis; stereo input
        # would otherwise reach the processor as a 2-row array.
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16_000:
        waveform = torchaudio.transforms.Resample(sr, 16_000)(waveform)
        sr = 16_000
    speech = waveform.squeeze(0).cpu().numpy()

    # 3. tokenize & run
    inputs = processor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)
    # Not every processor configuration returns an attention mask; pass None
    # in that case instead of failing on the missing attribute.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)

    # 4. decode
    return processor.batch_decode(pred_ids)[0]


def evaluate_wav2vec2_model(audio_path: str,
                            ground_truth_path: str,
                            model_name: str = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3",
                            device: str = "cpu",
                            diff: bool = False,
                            print_hyp: bool = False,
                            print_ref: bool = False):
    """
    Score a Wav2Vec2 transcription of `audio_path` against a reference text.

    The hypothesis comes from `transcribe_wav2vec2`, the reference from
    `load_reference_text`. Both are passed through `cleaning` and stripped
    when they are strings, optionally printed, and then handed to
    `compare_texts` together with `diff`.
    """
    def _normalised(raw):
        # Anything that is not a string is handed on untouched.
        if not isinstance(raw, str):
            return raw
        return cleaning(raw).strip()

    # Hypothesis: transcribe, then normalise
    hyp = _normalised(transcribe_wav2vec2(audio_path, model_name, device))
    if print_hyp:
        print("\n— Hypothesis —\n", hyp)

    # Reference: load, then normalise
    ref = _normalised(load_reference_text(ground_truth_path))
    if print_ref:
        print("\n— Reference —\n", ref)

    # Report
    print(f"\n▶︎ Evaluating Wav2Vec2 model `{model_name}`\n")
    compare_texts(hyp, ref, diff=diff)


# ───────────────────────────── Usage ─────────────────────────────────────────────

# Farsi sample and its reference transcript.
audio_file = "../Data/Training/Farsi/Quran/farsi_secondsource.mp3"
ground_truth_file = "../Data/Training/Farsi/Quran/farsi_secondsource_transcript-farsi_translation-hebrew.docx"

# Evaluate on the CPU, printing both texts and the diff.
evaluate_wav2vec2_model(
    audio_file,
    ground_truth_file,
    model_name="m3hrdadfi/wav2vec2-large-xlsr-persian-v3",
    device="cpu",
    diff=True,
    print_hyp=True,
    print_ref=True,
)
