In [1]:
# Environment sanity check: report the Python and Torch versions and whether
# CUDA is usable. Device index/name are only queried when CUDA is available;
# otherwise a dash placeholder is shown.
import torch, platform, sys
print(
    f"Python {platform.python_version()}  |  Torch {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"CUDA device count: {torch.cuda.device_count()}",
    f"Current device: {torch.cuda.current_device() if torch.cuda.is_available() else '‑'}",
    f"Device name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else '‑'}",
    sep="\n",
)

Python 3.10.12  |  Torch 2.1.2+cu121
CUDA available: True
CUDA device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 2070 SUPER


# Validation
Test case at 32.10 s: frequent switching between the 2 speakers
- Talk script 
start=32.1s stop=32.8s speaker_speaker_1 # patient
start=33.2s stop=33.7s speaker_speaker_0 # clinician
start=34.4s stop=35.1s speaker_speaker_0 
start=35.3s stop=35.6s speaker_speaker_0
start=35.8s stop=36.5s speaker_speaker_1
start=37.0s stop=37.2s speaker_speaker_0
start=37.5s stop=38.7s speaker_speaker_0
start=38.9s stop=39.5s speaker_speaker_0
start=39.7s stop=40.1s speaker_speaker_1
- pyannote script
start=32.1s stop=34.0s speaker_SPEAKER_01 # clinician
The time range above overlaps the following line from the other speaker; it is actually a long patient utterance followed by a short clinician utterance. The talk script separates these two turns (32.1s to 33.7s), so, relatively, <b>the talk script works better</b>. Further metric evaluation is needed.
start=32.3s stop=32.8s speaker_SPEAKER_00 # patient
start=34.3s stop=36.5s speaker_SPEAKER_01
start=35.9s stop=36.1s speaker_SPEAKER_00
start=37.0s stop=40.3s speaker_SPEAKER_01

In [1]:
# The modified whisnemo_diarization.py
# https://github.com/moomoofarm1/talk_ext/blob/ALF0725/inst/python/mf_whisnemo_diarization.py

import os
import sys
import torch
import torchaudio
import tempfile
import subprocess

def append_rttm_lines(source_rttm_path, output_rttm_path):
    """Append the contents of one RTTM file to the end of another.

    Args:
        source_rttm_path: Path of the RTTM file to read.
        output_rttm_path: Path of the file to append to; it is opened in
            append mode, so it is created when it does not exist yet.

    Returns:
        None. When the source file is missing, an error message is printed
        and the output file is not opened.
    """
    if os.path.exists(source_rttm_path):
        with open(source_rttm_path, "r") as reader, open(output_rttm_path, "a") as writer:
            # Iterating the reader yields the lines unchanged, one by one.
            writer.writelines(reader)
        return

    print(f"[ERROR] Source RTTM file not found: {source_rttm_path}")

def print_rttm_as_segments(rttm_path):
    """Print each SPEAKER record of an RTTM file as a start/stop/speaker line.

    For every line whose first field is ``SPEAKER``, field 4 is read as the
    start time, field 5 as the duration and field 8 as the speaker label
    (1-based field numbers), and one line of the form
    ``start=<s>s stop=<s>s speaker_<label>`` is printed with one decimal.

    Blank lines and non-SPEAKER records are skipped. A SPEAKER record with
    fewer than 8 fields is reported and skipped instead of raising IndexError.

    Args:
        rttm_path: Path of the RTTM file to read.

    Returns:
        None. When the file is missing, an error message is printed.
    """
    if not os.path.exists(rttm_path):
        print(f"[ERROR] RTTM file not found: {rttm_path}")
        return

    with open(rttm_path, "r") as f:
        for line in f:
            parts = line.split()
            # A blank line splits into an empty list, so test for emptiness
            # before looking at the record type in parts[0].
            if not parts or parts[0] != "SPEAKER":
                continue
            if len(parts) < 8:
                # Truncated record: the speaker label (field 8) is missing.
                print(f"[WARN] Skipping malformed RTTM line: {line.strip()}")
                continue
            start = float(parts[3])
            duration = float(parts[4])
            speaker = parts[7]
            stop = start + duration
            print(f"start={start:.1f}s stop={stop:.1f}s speaker_{speaker}")

def run_diarization_with_script(audio_path: str, script_path: str, device: str = "cuda"):
    """Diarize an audio file by delegating to an external worker script.

    The audio is loaded, resampled to 16 kHz when its sample rate differs,
    and averaged down to one channel when it has several. The waveform is
    staged in a temporary directory as ``audio_waveform.pt`` and
    ``mono_file.wav``; ``script_path`` is then run with the current Python
    interpreter and the arguments
    ``--diarize-worker --temp-dir <dir> --device <device>``.

    On success, ``pred_rttms/mono_file.rttm`` from the temporary directory is
    appended to ``mono_file.rttm`` in the current working directory and the
    segments of that file are printed.

    Args:
        audio_path: Path of the audio file to diarize.
        script_path: Path of the worker script to execute.
        device: Value forwarded to the worker's ``--device`` option.

    Returns:
        None. If the subprocess exits with a non-zero code, its stdout and
        stderr are printed and the function returns early.
    """
    target_rate = 16000

    # Load the recording, then bring it to the target rate and a single channel.
    waveform, source_rate = torchaudio.load(audio_path)
    if source_rate != target_rate:
        resampler = torchaudio.transforms.Resample(source_rate, target_rate)
        waveform = resampler(waveform)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    with tempfile.TemporaryDirectory(prefix="diar_temp_") as temp_dir:
        tensor_file = os.path.join(temp_dir, "audio_waveform.pt")
        wav_file = os.path.join(temp_dir, "mono_file.wav")
        rttm_file = os.path.join(temp_dir, "pred_rttms", "mono_file.rttm")

        # Stage the tensor on disk and read it back.
        # NOTE(review): presumably the worker script consumes these staged
        # files from temp_dir; confirm against the worker implementation.
        torch.save(waveform, tensor_file)
        audio_waveform = torch.load(tensor_file)  # shape: (channels, samples)

        # Coerce to a 2-D tensor: add a leading axis to 1-D input, leave 2-D
        # input alone, and squeeze the leading axis of anything else.
        rank = audio_waveform.dim()
        if rank == 1:
            audio_waveform = audio_waveform.unsqueeze(0)
        elif rank != 2:
            audio_waveform = audio_waveform.squeeze(0)

        os.makedirs(os.path.join(temp_dir, "pred_rttms"), exist_ok=True)

        # Save the audio waveform to "mono_file.wav"
        torchaudio.save(
            wav_file,
            audio_waveform.cpu().float(),
            target_rate,
            channels_first=True,
        )

        # Diagnostic: confirm the staged inputs exist before launching the worker.
        staged_checks = (
            ("if exist: audio_waveform.pt", tensor_file),
            ("if exist: mono_file.wav", wav_file),
        )
        for label, path in staged_checks:
            print(label)
            print(os.path.isfile(path))

        # Run external diarization script
        cmd = [
            sys.executable,
            script_path,
            "--diarize-worker",
            "--temp-dir", temp_dir,
            "--device", device,
        ]
        print(f"[INFO] Running diarization script:\n{' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            for message in ("[ERROR] Diarization subprocess failed:", result.stdout, result.stderr):
                print(message)
            return

        # Diagnostic: inputs should still exist and the RTTM should now be present.
        for label, path in staged_checks + (("if exist: mono_file.rttm", rttm_file),):
            print(label)
            print(os.path.isfile(path))

        # The output file is opened in append mode by append_rttm_lines, so
        # repeated runs in the same working directory accumulate lines.
        output_rttm_path = os.path.join(os.getcwd(), "mono_file.rttm")
        append_rttm_lines(rttm_file, output_rttm_path)

        # Print RTTM entries
        print_rttm_as_segments(output_rttm_path)

if __name__ == "__main__":
    # Inputs for this run: the recording to diarize and the worker script.
    AUDIO_FILE, SCRIPT_PATH = "noisereduce_patient.wav", "mf_whisnemo_diarization.py"

    # Default to the CPU and switch to the GPU only when CUDA is usable.
    DEVICE = "cpu"
    if torch.cuda.is_available():
        DEVICE = "cuda"

    run_diarization_with_script(AUDIO_FILE, SCRIPT_PATH, device=DEVICE)

if exist: audio_waveform.pt
True
if exist: mono_file.wav
True
[INFO] Running diarization script:
C:\Users\SoREAL\miniconda3\envs\virtutalk\python.exe talkpack_whisnemo_diarization.py --diarize-worker --temp-dir C:\Users\SoREAL\AppData\Local\Temp\diar_temp_vuphnks9 --device cuda
if exist: audio_waveform.pt
True
if exist: mono_file.wav
True
if exist: mono_file.rttm
True
start=2.0s stop=2.4s speaker_speaker_0
start=2.6s stop=3.5s speaker_speaker_0
start=4.1s stop=4.4s speaker_speaker_0
start=7.8s stop=8.0s speaker_speaker_0
start=8.9s stop=10.2s speaker_speaker_0
start=10.9s stop=11.8s speaker_speaker_0
start=12.6s stop=12.8s speaker_speaker_1
start=13.3s stop=14.2s speaker_speaker_0
start=14.6s stop=14.8s speaker_speaker_0
start=15.0s stop=15.4s speaker_speaker_0
start=16.1s stop=17.1s speaker_speaker_0
start=17.5s stop=20.0s speaker_speaker_0
start=20.7s stop=21.2s speaker_speaker_0
start=22.4s stop=22.7s speaker_speaker_0
start=23.2s stop=23.6s speaker_speaker_0
start=24.1s stop=24.4s 

In [21]:
# NOTE: for research use ONLY.
# https://github.com/pyannote/pyannote-audio?tab=readme-ov-file
# Run in a `virtudiarization` conda env with numpy < 2.0.
# 1. Install pyannote.audio with `pip install pyannote.audio`.
# 2. Accept the pyannote/segmentation-3.0 user conditions (inconvenient for production).
# 3. Accept the pyannote/speaker-diarization-3.1 user conditions (inconvenient for production).
# 4. Create an access token at hf.co/settings/tokens.

from pyannote.audio import Pipeline

# Hugging Face access token for the gated pyannote models. Reuse `hf_token`
# when an earlier cell already defined it; otherwise fall back to the HF_TOKEN
# environment variable, so a fresh kernel does not fail with NameError and no
# secret has to be written into the notebook.
import os
hf_token = globals().get("hf_token") or os.environ.get("HF_TOKEN")

# Load the pretrained pipeline
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token)

# In pyannote.audio 3.x, from_pretrained signals a failed download of a gated
# pipeline by returning None; fail here with a clear message instead of an
# AttributeError on the next line.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Accept the model "
        "user conditions on Hugging Face and provide a valid access token "
        "(define `hf_token` or set the HF_TOKEN environment variable)."
    )

# send pipeline to GPU (when available)
import torch
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) # the script can be used to test GPU.

# NOTE: for research use ONLY.
# Apply the pipeline to your audio file
diarization = pipeline("noisereduce_patient.wav")

# Print the results, one line per speaker turn
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

# write to RTTM (the file is overwritten on every run)
with open("noisereduce_patient_pyannoteaudio.rttm", "w") as f:
    diarization.write_rttm(f)

# Test case: around 32.10 s there is frequent switching between the 2 speakers

  std = sequences.std(dim=-1, correction=1)


start=1.4s stop=3.7s speaker_SPEAKER_01
start=4.1s stop=4.4s speaker_SPEAKER_01
start=7.5s stop=10.4s speaker_SPEAKER_01
start=10.9s stop=11.8s speaker_SPEAKER_01
start=12.7s stop=12.8s speaker_SPEAKER_01
start=13.3s stop=15.5s speaker_SPEAKER_01
start=16.2s stop=20.2s speaker_SPEAKER_01
start=20.7s stop=21.2s speaker_SPEAKER_01
start=22.4s stop=23.0s speaker_SPEAKER_01
start=23.1s stop=26.5s speaker_SPEAKER_01
start=27.4s stop=27.9s speaker_SPEAKER_01
start=28.4s stop=31.0s speaker_SPEAKER_01
start=32.1s stop=34.0s speaker_SPEAKER_01
start=32.3s stop=32.8s speaker_SPEAKER_00
start=34.3s stop=36.5s speaker_SPEAKER_01
start=35.9s stop=36.1s speaker_SPEAKER_00
start=37.0s stop=40.3s speaker_SPEAKER_01
start=40.6s stop=41.3s speaker_SPEAKER_01
start=41.4s stop=47.8s speaker_SPEAKER_01
start=43.8s stop=44.4s speaker_SPEAKER_00
start=46.8s stop=46.9s speaker_SPEAKER_00
start=47.9s stop=51.3s speaker_SPEAKER_01
start=52.7s stop=56.5s speaker_SPEAKER_01
start=58.3s stop=62.6s speaker_SPEAKER_