In [1]:
!pip install pyannote.audio
!pip install soundfile



In [3]:
def get_device():
    """Return the preferred torch device: MPS first, then CUDA, then CPU.

    Returns:
        torch.device: ``mps`` when the MPS backend reports itself available,
        otherwise ``cuda`` when CUDA is available, otherwise ``cpu``.
    """
    import torch

    # torch.backends.mps does not exist on every torch build; treat a missing
    # backend module as "MPS unavailable" instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

# Set the device once at module level and report which backend was selected.
device = get_device()
print(f"Using device: {device}")

Using device: mps


In [4]:
import os
import subprocess
import tempfile

import numpy as np
import soundfile as sf
from pyannote.audio import Pipeline

# Hugging Face access token for the pyannote models. It is read from the
# HF_TOKEN environment variable so the secret is never stored in the notebook;
# when the variable is unset, None is passed through to from_pretrained.
hf_token = os.environ.get("HF_TOKEN")

# Initialize the diarization pipeline
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                                use_auth_token=hf_token)

# NOTE(review): pyannote.audio 3.x appears to return None (after printing a
# hint) when the model cannot be fetched - confirm against the installed
# version. Failing here gives a clearer error than the AttributeError that
# `.to(device)` would otherwise raise on None.
if diarization_pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization@2.1'. Set the HF_TOKEN "
        "environment variable to a Hugging Face token that has accepted the "
        "model's user conditions."
    )
diarization_pipeline.to(device)

# Load your audio file
audio_file = "./audio/group1.wav"

# Apply the diarization pipeline
diarization = diarization_pipeline(audio_file)

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1. Bad things might happen unless you revert torch to 1.x.


In [8]:
# Locations of the whisper.cpp executable and model. Override them with the
# WHISPER_CPP_BIN / WHISPER_CPP_MODEL environment variables instead of editing
# the notebook; the defaults are the original machine-specific paths.
WHISPER_BIN = os.environ.get(
    "WHISPER_CPP_BIN", "/Users/mclem/Development/whisper.cpp/main"
)
WHISPER_MODEL = os.environ.get(
    "WHISPER_CPP_MODEL",
    "/Users/mclem/Development/whisper.cpp/models/ggml-medium.en.bin",
)

# Load the full audio file
audio, sample_rate = sf.read(audio_file)

# One dict per diarized turn: speaker label, start/end in seconds, transcript text
results = []

# TemporaryDirectory deletes the segment files and the directory itself even
# when a Whisper call fails part-way through the loop, so nothing is left
# behind for the next run to trip over.
with tempfile.TemporaryDirectory(prefix="temp_segments_") as segment_dir:
    for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
        # Convert the turn boundaries (seconds) to sample indices and slice the audio
        start_sample = int(turn.start * sample_rate)
        end_sample = int(turn.end * sample_rate)
        segment = audio[start_sample:end_sample]

        if len(segment) == 0:
            # Nothing to transcribe; keep the turn in the results with empty text
            transcription = ""
        else:
            # Save the segment as a temporary file for the Whisper CLI to read
            segment_file = os.path.join(segment_dir, f"segment_{i}.wav")
            sf.write(segment_file, segment, sample_rate)

            # Call Whisper CLI on the segment
            whisper_command = [
                WHISPER_BIN,
                "-m", WHISPER_MODEL,
                "-l", "en",
                "--output-txt",
                "-f", segment_file
            ]
            completed = subprocess.run(whisper_command, capture_output=True, text=True)
            if completed.returncode != 0:
                # Surface Whisper's own diagnostics instead of a later
                # FileNotFoundError on the missing .txt output.
                raise RuntimeError(
                    f"Whisper failed on segment {i} "
                    f"(exit code {completed.returncode}):\n{completed.stderr}"
                )

            # Read the transcription from the output text file written next to the wav
            with open(f"{segment_file}.txt", "r", encoding="utf-8") as f:
                transcription = f.read().strip()

        results.append({
            "speaker": speaker,
            "start": turn.start,
            "end": turn.end,
            "text": transcription
        })

In [9]:
# Print and save results.
# The file is written as UTF-8 explicitly so transcripts containing non-ASCII
# characters are saved the same way regardless of the platform's default encoding.
with open("diarization_and_transcription_results.txt", "w", encoding="utf-8") as f:
    for result in results:
        # Header line with speaker and time span, then the transcript text
        line = f"Speaker {result['speaker']}: {result['start']:.1f}s - {result['end']:.1f}s\n{result['text']}\n"
        print(line)
        # Extra newline leaves a blank line between entries in the file
        f.write(line + "\n")

Speaker SPEAKER_00: 0.0s - 0.1s


Speaker SPEAKER_00: 2.1s - 5.6s
Okay. It should have given you the everything's being recorded.

Speaker SPEAKER_01: 7.1s - 15.2s
Um, quick question, do you happen to know the, um, the tempo for the song?
 Just cause I'm thinking time, time-based effects.

Speaker SPEAKER_00: 15.9s - 20.3s
- That's a really good question.
 Let me see if I have the metadata.

Speaker SPEAKER_01: 27.8s - 32.5s
Also, after I changed the tempo, I may or may not have to
 put the things back in again.

Speaker SPEAKER_00: 31.9s - 35.0s
Right on.
 I didn't think about that.

Speaker SPEAKER_06: 34.5s - 36.5s
- I'm blocking this so hard.

Speaker SPEAKER_01: 38.7s - 39.4s


Speaker SPEAKER_01: 40.4s - 42.7s
or if we do lock it, will that fix it?

Speaker SPEAKER_06: 42.8s - 48.0s
Try, try it. And you might be able to detect the tempo from like the smart tools as well.

Speaker SPEAKER_01: 49.7s - 52.0s
Where is the lock? I can't remember.

Speaker SPEAKER_06: 52.2s - 56.0s
Try