# In [None]:
import os
from pyannote.audio import Pipeline, Audio
from pyannote.core import Segment
import torch
from transformers import AutoProcessor, WhisperForConditionalGeneration
import numpy as np

# Hugging Face access token used to download the pyannote pipeline.
# Prefer the HF_TOKEN environment variable so the secret does not have to live
# in the source file; the original placeholder is kept as the fallback so the
# script behaves as before when the variable is unset or empty.
auth_token = os.environ.get("HF_TOKEN") or "AUTH_TOKEN"

# Single source of truth for the Whisper checkpoint: the processor and the
# model must come from the same repository, so the id is spelled only once.
whisper_checkpoint = "erdiyalcin/whisper-large-v3-turkish-test1"

# Load models
speaker_diarization = Pipeline.from_pretrained(
    "pyannote/speaker-diarization", use_auth_token=auth_token
)
processor = AutoProcessor.from_pretrained(whisper_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)

# Recording to diarize and transcribe.
audio_file_path = "deneme.wav"

# Apply speaker diarization; the pipeline is told to expect two speakers.
who_speaks_when = speaker_diarization(audio_file_path, num_speakers=2)

# Path of the text file that will receive the per-speaker transcriptions.
output_file = "transcriptions_with_speakers.txt"

# Transcribe every diarized speaker turn and store one line per turn in
# `output_file`, formatted as "<start>s - <end>s - <speaker>: <text>".
with open(output_file, "w", encoding="utf-8") as f_out:
    # Audio loader configured for 16 kHz mono, the sampling rate requested
    # from the crops that are fed to the Whisper processor below.
    audio = Audio(sample_rate=16000, mono=True)

    # Process each speaker segment
    for segment, _, speaker in who_speaks_when.itertracks(yield_label=True):
        waveform, sample_rate = audio.crop(audio_file_path, segment)

        # Drop only the leading channel axis (a bare squeeze() would also
        # collapse a one-sample crop to a 0-d array) and keep the samples in
        # float32: casting to float16 loses precision and can yield
        # half-precision features that a float32 model rejects.
        # NOTE(review): assumes the crop is (channel, time) with one channel
        # because mono=True was requested -- confirm for the installed
        # pyannote.audio version.
        waveform_np = waveform.squeeze(0).numpy().astype(np.float32)
        if waveform_np.size == 0:
            # Nothing to transcribe for a zero-length crop.
            continue

        # NOTE(review): the Whisper feature extractor presumably pads or
        # truncates each input to a fixed 30 s window, so longer turns may be
        # cut off -- confirm if long segments matter.
        inputs = processor(
            waveform_np, return_tensors="pt", sampling_rate=sample_rate
        )

        # Align the features with whatever device/precision the model was
        # loaded with, so the two can never disagree.
        input_features = inputs.input_features.to(
            device=model.device, dtype=model.dtype
        )

        # Passed positionally so the call does not depend on the keyword name
        # of generate()'s first parameter.
        generated_ids = model.generate(input_features)
        transcription = processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]

        # Build the output line once and send it to both the file and stdout.
        line = f"{segment.start:06.1f}s - {segment.end:06.1f}s - {speaker}: {transcription}"
        f_out.write(line + "\n")
        print(line)

print(f"Transcriptions saved to {output_file}")
