In [None]:
import os
from pydub import AudioSegment
from pyannote.audio import Pipeline, Audio
import whisper
from pyannote.core import Segment
from pyannote.database import get_protocol, FileFinder
from pyannote.database.util import load_rttm

# Function to convert audio to WAV format if not already in WAV
def convert_to_wav(audio_file_path):
    """Return the path of a WAV rendition of ``audio_file_path``.

    A path whose extension is already ``.wav`` (compared case-insensitively)
    is returned unchanged and nothing is read or written.  Any other file is
    decoded with ``AudioSegment.from_file`` and exported as ``<stem>.wav``,
    where ``<stem>`` is the input path without its extension; that new path
    is returned.
    """
    stem, extension = os.path.splitext(audio_file_path)
    if extension.lower() == ".wav":
        # Already WAV: hand the caller's path straight back.
        return audio_file_path

    decoded = AudioSegment.from_file(audio_file_path)
    target_path = f"{stem}.wav"
    decoded.export(target_path, format="wav")
    return target_path

# The input recording is expected in the same directory as this notebook
audio_file_path = "deneme.wav"
audio_file_path = convert_to_wav(audio_file_path)

# Load the pretrained pyannote speaker diarization pipeline.
# The Hugging Face access token is taken from the HF_TOKEN environment
# variable so a real secret never has to be pasted into the notebook; the
# original placeholder string is kept as the fallback, so nothing changes
# when the variable is unset or empty.
speaker_diarization = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN") or "AUTH_TOKEN",
)

# Apply speaker diarization and save the result in RTTM format.
# Writing the RTTM file means the diarization output is still available if a
# later step fails, which makes it easier to debug and iterate on the model.
# num_speakers=2 assumes two speakers (can be more or less; min and max can
# also be specified instead).
who_speaks_when = speaker_diarization(audio_file_path, num_speakers=2)
with open("audio2.rttm", "w") as rttm:
    who_speaks_when.write_rttm(rttm)

In [None]:
# Load the Whisper speech-recognition model
model = whisper.load_model("large-v3")

# Transcriptions are written to this file
output_file = "transcriptions_with_speakers2.txt"

# Read the diarization segments back from the RTTM file
diarization = load_rttm("audio2.rttm")

# Crop every speaker turn out of the recording, transcribe it, and emit the
# same formatted line to both the output file and stdout
audio = Audio(sample_rate=16000, mono=True)
with open(output_file, "w", encoding="utf-8") as f_out:
    for file_id, turns in diarization.items():
        for segment, track, label in turns.itertracks(yield_label=True):
            # audio.crop hands back the cropped waveform together with its sample rate
            waveform, sample_rate = audio.crop(audio_file_path, segment)
            # Drop size-1 dimensions and pass Whisper a NumPy array
            samples = waveform.squeeze().numpy()
            text = model.transcribe(samples, language="tr")["text"]
            line = f"{segment.start:06.1f}s - {segment.end:06.1f}s - {label}: {text}"
            f_out.write(line + "\n")
            print(line)


In [None]:
# Import necessary libraries
import os
from pydub import AudioSegment
from pyannote.audio import Pipeline, Audio
from pyannote.core import Segment
import whisper

# Function to convert audio to WAV format if not already in WAV
def convert_to_wav(audio_file_path):
    """Make sure the given audio file exists as WAV and return its path.

    The extension is compared case-insensitively against ``.wav``.  When it
    differs, the file is loaded through ``AudioSegment.from_file`` and
    exported under the same path with the extension swapped for ``.wav``;
    the exported path is returned.  When it matches, the input path is
    returned as-is without touching the file.
    """
    base, suffix = os.path.splitext(audio_file_path)
    needs_conversion = suffix.lower() != ".wav"
    if not needs_conversion:
        return audio_file_path

    source_audio = AudioSegment.from_file(audio_file_path)
    converted_path = f"{base}.wav"
    source_audio.export(converted_path, format="wav")
    return converted_path

# The input recording is expected in the same directory as this notebook
audio_file_path = "deneme.wav"
audio_file_path = convert_to_wav(audio_file_path)

# Load pyannote.audio speaker diarization pipeline.
# The Hugging Face access token is taken from the HF_TOKEN environment
# variable so a real secret never has to be pasted into the notebook; the
# original placeholder string is kept as the fallback, so nothing changes
# when the variable is unset or empty.
speaker_diarization = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN") or "AUTH_TOKEN",
)

# Apply speaker diarization (exactly two speakers requested; no min/max bounds)
who_speaks_when = speaker_diarization(audio_file_path, num_speakers=2, min_speakers=None, max_speakers=None)

# Load OpenAI Whisper automatic speech transcription model
model = whisper.load_model("large-v3")

# Create an audio object for cropping
audio = Audio(sample_rate=16000, mono=True)

# Transcribe every speaker segment of the recording (the loop is not limited
# to any time window) and print one line per segment
for segment, _, speaker in who_speaks_when.itertracks(yield_label=True):
    waveform, sample_rate = audio.crop(audio_file_path, segment)
    text = model.transcribe(waveform.squeeze().numpy(), language="tr")["text"]
    print(f"{segment.start:06.1f}s {segment.end:06.1f}s {speaker}: {text}")

In [None]:
# Import necessary libraries
import os
from pydub import AudioSegment
from pyannote.audio import Pipeline, Audio
from pyannote.core import Segment
import whisper

# Function to convert audio to WAV format if not already in WAV
def convert_to_wav(audio_file_path):
    """Convert ``audio_file_path`` to WAV unless it already has a ``.wav`` extension.

    Returns the original path when its extension (ignoring case) is ``.wav``.
    Otherwise the audio is read with ``AudioSegment.from_file``, exported in
    WAV format to the same path with a ``.wav`` extension, and that path is
    returned.
    """
    root, ext = os.path.splitext(audio_file_path)
    already_wav = ext.lower() == ".wav"
    if already_wav:
        return audio_file_path

    segment = AudioSegment.from_file(audio_file_path)
    wav_path = f"{root}.wav"
    segment.export(wav_path, format="wav")
    return wav_path

# The input recording is expected in the same directory as this notebook
audio_file_path = "deneme.wav"
audio_file_path = convert_to_wav(audio_file_path)

# Load pyannote.audio speaker diarization pipeline.
# The Hugging Face access token is taken from the HF_TOKEN environment
# variable so a real secret never has to be pasted into the notebook; the
# original placeholder string is kept as the fallback, so nothing changes
# when the variable is unset or empty.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN") or "AUTH_TOKEN",
)
import torch
# Run on the GPU when CUDA is available; otherwise stay on the CPU instead of
# failing on machines without a CUDA device.
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Apply speaker diarization (exactly two speakers requested; no min/max bounds)
who_speaks_when = pipeline(audio_file_path, num_speakers=2, min_speakers=None, max_speakers=None)

# Load OpenAI Whisper automatic speech transcription model
model = whisper.load_model("large-v3")

# Create an audio object for cropping
audio = Audio(sample_rate=16000, mono=True)

# Inspect the cropped samples of every speaker segment: transcription is
# commented out in this cell, which only prints each array and its shape
for segment, _, speaker in who_speaks_when.itertracks(yield_label=True):
    waveform, sample_rate = audio.crop(audio_file_path, segment)
    #text = model.transcribe(waveform.squeeze().numpy(), language="tr")["text"]
    #print(f"{segment.start:06.1f}s {segment.end:06.1f}s {speaker}: {text}")
    samples = waveform.squeeze().numpy()
    print(samples)
    print(samples.shape)

In [None]:
# Import necessary libraries
import os
import numpy as np
import ffmpeg
from pydub import AudioSegment
from pyannote.audio import Pipeline, Audio
from pyannote.core import Segment
import whisper

# Function to convert audio to WAV format if not already in WAV
def convert_to_wav(audio_file_path):
    """Return a ``.wav`` path for the given audio file, converting if needed.

    Files whose extension is ``.wav`` in any letter case are passed through
    untouched.  Everything else is opened via ``AudioSegment.from_file`` and
    exported as WAV to ``<path without extension>.wav``, whose path is then
    returned.
    """
    name_part, ext_part = os.path.splitext(audio_file_path)

    if ext_part.lower() == ".wav":
        result_path = audio_file_path
    else:
        loaded = AudioSegment.from_file(audio_file_path)
        result_path = f"{name_part}.wav"
        loaded.export(result_path, format="wav")

    return result_path

# Function to resample audio to 16kHz
def resample_audio(waveform, orig_sr, target_sr=16000):
    """Resample a single-channel waveform to ``target_sr`` by piping it through ffmpeg.

    Args:
        waveform: Array of audio samples. Both ffmpeg streams are declared
            with ``ac=1``, so the data is treated as one channel.
        orig_sr: Sample rate of ``waveform`` in Hz.
        target_sr: Desired sample rate in Hz (default 16000).

    Returns:
        ``waveform`` itself, untouched, when ``orig_sr == target_sr``;
        otherwise a 1-D ``float32`` array decoded from ffmpeg's output.
    """
    if orig_sr == target_sr:
        return waveform

    # ffmpeg is told the incoming bytes are raw 32-bit floats ('f32le'), so
    # the samples must really be float32 before they are serialised; any
    # other dtype (e.g. float64) would be misread as garbage. This is a
    # no-op for arrays that are already contiguous float32.
    pcm_bytes = np.ascontiguousarray(waveform, dtype=np.float32).tobytes()
    out, _ = (
        ffmpeg.input('pipe:', format='f32le', ac=1, ar=orig_sr)
        .output('pipe:', format='f32le', ac=1, ar=target_sr)
        .run_async(pipe_stdin=True, pipe_stdout=True)
        .communicate(input=pcm_bytes)
    )
    return np.frombuffer(out, np.float32)

# Function to convert stereo to mono
def stereo_to_mono(waveform):
    """Down-mix a two-channel waveform to mono by averaging its channels.

    Args:
        waveform: NumPy array of samples. A stereo signal is recognised by a
            first axis of length 2 (channel-first layout).

    Returns:
        The mean over axis 0 when ``waveform`` has at least two dimensions
        and its first axis has length 2; otherwise ``waveform`` unchanged.
    """
    # Require ndim >= 2 as well: a 1-D mono clip that happens to hold exactly
    # two samples also has shape[0] == 2 and must not be collapsed to a scalar.
    if waveform.ndim >= 2 and waveform.shape[0] == 2:
        return np.mean(waveform, axis=0)  # Average the two channels
    return waveform

# The input recording is expected in the same directory as this notebook
audio_file_path = "deneme.wav"
audio_file_path = convert_to_wav(audio_file_path)

# Load pyannote.audio speaker diarization pipeline.
# The Hugging Face access token is taken from the HF_TOKEN environment
# variable so a real secret never has to be pasted into the notebook; the
# original placeholder string is kept as the fallback, so nothing changes
# when the variable is unset or empty.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN") or "YOUR_AUTH_TOKEN",
)
import torch
# Run on the GPU when CUDA is available; otherwise stay on the CPU instead of
# failing on machines without a CUDA device.
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Apply speaker diarization
who_speaks_when = pipeline(audio_file_path, num_speakers=2)

# Load OpenAI Whisper automatic speech transcription model
model = whisper.load_model("large-v3")

# Create an audio object for cropping (16 kHz, mono requested)
audio = Audio(sample_rate=16000, mono=True)

# Transcribe each speaker segment
for segment, _, speaker in who_speaks_when.itertracks(yield_label=True):
    waveform, sample_rate = audio.crop(audio_file_path, segment)
    waveform = waveform.squeeze().numpy()
    # NOTE(review): the cropper above is configured for 16 kHz mono, so the
    # two helper calls below are presumably safeguards that leave the samples
    # unchanged -- confirm against the sample_rate actually returned.
    waveform = stereo_to_mono(waveform)
    waveform = resample_audio(waveform, sample_rate, target_sr=16000)
    text = model.transcribe(waveform, language="tr")["text"]
    print(f"{segment.start:06.1f}s {segment.end:06.1f}s {speaker}: {text}")


In [None]:
# Import necessary libraries
import os
import numpy as np
import ffmpeg
from pydub import AudioSegment
from pyannote.audio import Pipeline, Audio
from pyannote.core import Segment
import whisper

# Function to convert audio to WAV format if not already in WAV
def convert_to_wav(audio_file_path):
    """Give back a WAV path for ``audio_file_path``, creating the WAV file if required.

    If the extension is anything other than ``.wav`` (case-insensitive), the
    file is decoded with ``AudioSegment.from_file`` and written out in WAV
    format under the same path with its extension replaced by ``.wav``; the
    new path is returned.  A ``.wav`` input is returned unchanged.
    """
    path_without_ext, ext = os.path.splitext(audio_file_path)
    is_wav = ext.lower() == ".wav"
    if is_wav:
        # Nothing to convert.
        return audio_file_path

    clip = AudioSegment.from_file(audio_file_path)
    output_path = f"{path_without_ext}.wav"
    clip.export(output_path, format="wav")
    return output_path

# Function to convert stereo to mono
def stereo_to_mono(waveform):
    """Average the two channels of a stereo waveform into a single mono channel.

    Args:
        waveform: NumPy array of samples. Stereo input is identified by a
            first axis of length 2 (channel-first layout).

    Returns:
        ``np.mean(waveform, axis=0)`` when ``waveform`` has two or more
        dimensions and a first axis of length 2; otherwise the input array
        is returned unchanged.
    """
    is_stereo = waveform.ndim >= 2 and waveform.shape[0] == 2
    if not is_stereo:
        # Includes 1-D mono clips, even ones that contain exactly two samples
        # (shape[0] == 2 alone would wrongly reduce those to a scalar).
        return waveform
    return np.mean(waveform, axis=0)  # Average the two channels

# The input recording is expected in the same directory as this notebook;
# it is an MP3 here, so convert_to_wav produces a WAV copy first
audio_file_path = "deneme.mp3"
audio_file_path = convert_to_wav(audio_file_path)

# Load pyannote.audio speaker diarization pipeline.
# The Hugging Face access token is taken from the HF_TOKEN environment
# variable so a real secret never has to be pasted into the notebook; the
# original placeholder string is kept as the fallback, so nothing changes
# when the variable is unset or empty.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN") or "AUTH_TOKEN",
)
import torch
# Moving the pipeline to the GPU is disabled in this cell
#pipeline.to(torch.device("cuda"))

# Apply speaker diarization (three speakers requested)
who_speaks_when = pipeline(audio_file_path, num_speakers=3)

# Load OpenAI Whisper automatic speech transcription model
model = whisper.load_model("large-v3")

# Create an audio object for cropping (16 kHz, mono requested)
audio = Audio(sample_rate=16000, mono=True)

# Transcribe each speaker segment
for segment, _, speaker in who_speaks_when.itertracks(yield_label=True):
    # Crop the audio segment
    waveform, sample_rate = audio.crop(audio_file_path, segment)
    waveform = waveform.squeeze().numpy()  # Convert to numpy array
    # Down-mix to mono if the crop is still two-channel; no resampling is
    # done in this cell -- the 16 kHz rate is only requested from the cropper
    waveform = stereo_to_mono(waveform)
    # Transcribe the processed waveform
    text = model.transcribe(waveform, language="tr")["text"]
    print(f"{segment.start:06.1f}s {segment.end:06.1f}s {speaker}: {text}")


In [None]:
# Show the `who_speaks_when` diarization result (from whichever diarization
# cell was run last) as this cell's output
who_speaks_when