# Speaker Identification and Transcribing Model


1. Making the necessary imports

In [16]:
import os
import torch
import torchaudio
import numpy as np
from pyannote.audio import Pipeline, Model, Inference
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances
import warnings
from transformers import pipeline
warnings.filterwarnings("ignore")

# Select the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"The device being used is {device}")

# Hugging Face access token.
# SECURITY: never commit a literal token to a notebook. The token is read from
# the HF_TOKEN environment variable instead. When the variable is unset `hf` is
# None; presumably the Hugging Face client then falls back to credentials
# cached by `huggingface-cli login` -- TODO confirm for the installed version.
# Any token that was previously hardcoded here must be treated as leaked and
# revoked in the Hugging Face account settings.
hf = os.environ.get("HF_TOKEN")

The device being used is cuda


In [17]:
# Initialize diarization pipeline.
# NOTE(review): in pyannote.audio 3.x `Pipeline.from_pretrained` appears to
# return None (instead of raising) when the gated model cannot be downloaded,
# e.g. because the token is missing or the model's user conditions were not
# accepted. Fail early with a readable message rather than an AttributeError
# on `.to(device)`.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf
)
if diarization_pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that a valid "
        "Hugging Face token is available and that the model's user conditions "
        "have been accepted."
    )
diarization_pipeline = diarization_pipeline.to(device)

# Initialize embedding model. window="whole" yields a single embedding for the
# entire chunk of audio that is passed in.
embedding_model = Inference(
    Model.from_pretrained("pyannote/embedding", use_auth_token=hf).to(device),
    window="whole"
)

# Initialize Whisper ASR model on the same device as the other models; without
# an explicit `device` the transformers pipeline stays on the CPU.
whisper_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    device=device
)


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\shoke\.cache\torch\pyannote\models--pyannote--embedding\snapshots\4db4899737a38b2d618bbd74350915aa10293cb2\pytorch_model.bin`
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\shoke\.cache\torch\pyannote\models--pyannote--embedding\snapshots\4db4899737a38b2d618bbd74350915aa10293cb2\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.2.1+cu118. Bad things might happen unless you revert torch to 1.x.
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.2.1+cu118. Bad things might happen unless you revert torch to 1.x.


In [39]:
# Recordings that form the reference set
audio_files = [
    "./data/sample1.wav",
    "./data/sample2.wav",
    "./data/john1.wav",
    "./data/sample5.wav",
    "./data/trump1.wav",
]

# Diarize every recording and remember the result together with its path
diarization_results = []
for audio_file in audio_files:
    waveform, sample_rate = torchaudio.load(audio_file)
    waveform = waveform.to(device)

    audio_input = {"waveform": waveform, "sample_rate": sample_rate}
    diarization = diarization_pipeline(audio_input)
    diarization_results.append(dict(file=audio_file, diarization=diarization))

# Print the speaker turns found in each processed file
for file_result in diarization_results:
    print(f"\nDiarization results for {file_result['file']}:")
    tracks = file_result['diarization'].itertracks(yield_label=True)
    for turn, _, speaker_label in tracks:
        print(f"Segment: {turn.start:.2f}s - {turn.end:.2f}s, Speaker: {speaker_label}")


Diarization results for ./data/sample1.wav:
Segment: 1.59s - 9.89s, Speaker: SPEAKER_00
Segment: 10.87s - 11.91s, Speaker: SPEAKER_00
Segment: 12.06s - 15.46s, Speaker: SPEAKER_00
Segment: 16.65s - 22.35s, Speaker: SPEAKER_00
Segment: 23.51s - 29.72s, Speaker: SPEAKER_00
Segment: 30.59s - 36.80s, Speaker: SPEAKER_00
Segment: 37.38s - 38.23s, Speaker: SPEAKER_00
Segment: 39.43s - 45.22s, Speaker: SPEAKER_00
Segment: 45.32s - 46.78s, Speaker: SPEAKER_00
Segment: 48.65s - 59.14s, Speaker: SPEAKER_00
Segment: 60.48s - 61.28s, Speaker: SPEAKER_00
Segment: 62.88s - 66.27s, Speaker: SPEAKER_00
Segment: 67.05s - 71.57s, Speaker: SPEAKER_00
Segment: 73.79s - 84.25s, Speaker: SPEAKER_00
Segment: 85.07s - 86.24s, Speaker: SPEAKER_00
Segment: 88.28s - 91.47s, Speaker: SPEAKER_00
Segment: 91.62s - 93.47s, Speaker: SPEAKER_00
Segment: 94.66s - 98.70s, Speaker: SPEAKER_00
Segment: 100.45s - 108.90s, Speaker: SPEAKER_00
Segment: 110.86s - 118.90s, Speaker: SPEAKER_00
Segment: 119.77s - 128.28s, Speak

In [40]:
# Build one unit-length embedding per sufficiently long speaker turn, and keep
# a parallel list describing where each embedding came from.
all_embeddings = []
speaker_info = []

for result in diarization_results:
    audio_file, diarization = result['file'], result['diarization']
    waveform, sample_rate = torchaudio.load(audio_file)
    waveform = waveform.to(device)

    for turn, _, speaker_label in diarization.itertracks(yield_label=True):
        start_time, end_time = turn.start, turn.end

        # Turns shorter than one second are ignored
        if (end_time - start_time) >= 1.0:
            first_sample = int(start_time * sample_rate)
            last_sample = int(end_time * sample_rate)
            segment = waveform[:, first_sample:last_sample].cpu()

            embedding = embedding_model({"waveform": segment, "sample_rate": sample_rate})
            # Scale to unit L2 norm
            embedding = embedding / np.linalg.norm(embedding)

            all_embeddings.append(embedding)
            speaker_info.append(dict(
                audio_file=audio_file,
                start_time=start_time,
                end_time=end_time,
                local_speaker_label=speaker_label,
            ))

print(f"Extracted {len(all_embeddings)} embeddings.")


Extracted 188 embeddings.


In [41]:
# Stack the per-segment embeddings into one (n_segments, dim) matrix.
embeddings_array = np.vstack(all_embeddings)

# Pairwise cosine distance is what DBSCAN consumes. It is computed directly
# instead of as 1 - (1 - d), which only added floating-point round-off.
# The similarity matrix is still derived so the name stays available.
distance_matrix = cosine_distances(embeddings_array)
cosine_sim_matrix = 1 - distance_matrix

# Cluster segments across all files; metric='precomputed' tells DBSCAN that
# `distance_matrix` already holds pairwise distances.
clustering = DBSCAN(eps=0.5, min_samples=2, metric='precomputed')
clustering.fit(distance_matrix)

# One cluster index per embedding; -1 marks points DBSCAN treats as noise.
labels = clustering.labels_

# Assign custom labels.
# NOTE(review): names are tied to raw cluster indices. Those indices presumably
# depend on the order in which the files were processed, so re-check this
# mapping whenever `audio_files` or the clustering parameters change.
speaker_mapping = {
    0: "Vaibhav",
    1: "Sushmit",
    2: "John",
    3:"S Jaishankar",
    4:"Donald Trump"
}

# Name every real cluster (noise excluded); clusters without a custom name get
# a generic SPEAKER_xx label. Sorting makes the printed order deterministic.
cluster_to_name = {
    label: speaker_mapping.get(label, f"SPEAKER_{label:02d}")
    for label in sorted(set(labels))
    if label != -1
}

# Display cluster mapping
print("\nCluster to Name Mapping:")
for cluster, name in cluster_to_name.items():
    print(f"Cluster {cluster}: {name}")

# Display results with global labels
print("\nEmbedding Details with Global Labels:")
for info, cluster_label in zip(speaker_info, labels):
    # -1 is never a key of cluster_to_name, so noise falls through to "Noise".
    global_label = cluster_to_name.get(cluster_label, "Noise")
    print(f"File: {info['audio_file']}")
    print(f"  Segment: {info['start_time']:.2f}s - {info['end_time']:.2f}s")
    print(f"  Local Label: {info['local_speaker_label']}")
    print(f"  Global Label: {global_label} (Cluster {cluster_label})")
    print()


Cluster to Name Mapping:
Cluster 0: Vaibhav
Cluster 1: Sushmit
Cluster 2: John
Cluster 3: S Jaishankar
Cluster 4: Donald Trump

Embedding Details with Global Labels:
File: ./data/sample1.wav
  Segment: 1.59s - 9.89s
  Local Label: SPEAKER_00
  Global Label: Vaibhav (Cluster 0)

File: ./data/sample1.wav
  Segment: 10.87s - 11.91s
  Local Label: SPEAKER_00
  Global Label: Vaibhav (Cluster 0)

File: ./data/sample1.wav
  Segment: 12.06s - 15.46s
  Local Label: SPEAKER_00
  Global Label: Vaibhav (Cluster 0)

File: ./data/sample1.wav
  Segment: 16.65s - 22.35s
  Local Label: SPEAKER_00
  Global Label: Vaibhav (Cluster 0)

File: ./data/sample1.wav
  Segment: 23.51s - 29.72s
  Local Label: SPEAKER_00
  Global Label: Vaibhav (Cluster 0)

File: ./data/sample1.wav
  Segment: 30.59s - 36.80s
  Local Label: SPEAKER_00
  Global Label: Vaibhav (Cluster 0)

File: ./data/sample1.wav
  Segment: 39.43s - 45.22s
  Local Label: SPEAKER_00
  Global Label: Vaibhav (Cluster 0)

File: ./data/sample1.wav
  Seg

In [42]:
# Persist everything needed to recognise these speakers later: the embeddings,
# their per-segment metadata, the cluster labels and the cluster-to-name mapping.
archive_contents = {
    "embeddings": all_embeddings,
    "speaker_info": speaker_info,
    "labels": labels,
    "cluster_to_name": cluster_to_name,
}
np.savez_compressed("known_embeddings.npz", **archive_contents)
print("Known embeddings saved to 'known_embeddings.npz'.")


Known embeddings saved to 'known_embeddings.npz'.


In [43]:
# Reload the reference embeddings, their cluster labels and the name mapping.
# SECURITY: allow_pickle=True is needed because the archive holds object arrays
# (`cluster_to_name` is read back with `.item()`). Unpickling can execute
# arbitrary code, so only load archives produced by this notebook.
known_data = np.load("known_embeddings.npz", allow_pickle=True)
known_embeddings = known_data['embeddings']
known_labels = known_data['labels']
cluster_to_name = dict(known_data['cluster_to_name'].item())

if len(known_embeddings) == 0:
    raise ValueError("known_embeddings.npz contains no reference embeddings.")

# Load new audio file
new_audio_file = "./data/john2.wav"
waveform, sample_rate = torchaudio.load(new_audio_file)
waveform = waveform.to(device)

# Perform diarization
diarization = diarization_pipeline({"waveform": waveform, "sample_rate": sample_rate})

# Transcribe with speaker labels
transcriptions = []
for turn, _, speaker_label in diarization.itertracks(yield_label=True):
    start_time = turn.start
    end_time = turn.end
    duration = end_time - start_time

    if duration < 1.0:
        continue  # Skip short segments

    start_idx = int(start_time * sample_rate)
    end_idx = int(end_time * sample_rate)
    segment = waveform[:, start_idx:end_idx].cpu()
    embedding = embedding_model({"waveform": segment, "sample_rate": sample_rate})
    embedding = embedding / np.linalg.norm(embedding)

    # Match with known speakers: take the label of the most similar reference
    # embedding. reshape(1, -1) guarantees a single 2-D query row.
    query = np.asarray(embedding).reshape(1, -1)
    similarities = 1 - cosine_distances(query, known_embeddings).flatten()
    matched_label = int(known_labels[np.argmax(similarities)])
    if matched_label == -1:
        # The closest reference segment was DBSCAN noise, i.e. it belongs to no
        # named speaker; report that instead of the malformed "SPEAKER_-1".
        speaker_name = "Unknown"
    else:
        speaker_name = cluster_to_name.get(matched_label, f"SPEAKER_{matched_label:02d}")

    # Transcribe the segment. Average the channels so the input is always 1-D
    # (squeeze() leaves multi-channel audio 2-D), and pass the sampling rate so
    # the ASR pipeline can resample to the rate the model expects.
    mono_audio = segment.mean(dim=0).numpy()
    transcription = whisper_pipeline({"raw": mono_audio, "sampling_rate": sample_rate})
    transcriptions.append({
        'speaker_name': speaker_name,
        'start_time': start_time,
        'end_time': end_time,
        'text': transcription['text']
    })

# Display transcriptions
transcript_lines = []
for entry in transcriptions:
    transcript_line = (
        f"[{entry['speaker_name']}] {entry['text']} "
        f"({entry['start_time']:.2f}s - {entry['end_time']:.2f}s)"
    )
    print(transcript_line)
    transcript_lines.append(transcript_line)
# One transcript entry per line, each terminated by a newline.
transcript_text = "".join(line + "\n" for line in transcript_lines)

# Save transcript to a text file next to the audio file
transcript_file = os.path.splitext(new_audio_file)[0] + "_transcript.txt"
with open(transcript_file, "w", encoding="utf-8") as file:
    file.write(transcript_text)

print(f"\nTranscript saved to {transcript_file}")

[John]  But Abhinash was not able to complete his sentence. He felt such an intense pleasure in abusing Binoi that he was incapable of even pretending to be anxious on his account. (1.10s - 10.98s)
[John]  In less than no time, all the important members of Kura's party came in turn. (11.45s - 15.49s)
[John]  And when they were all gathered, a heated discussion soon started on the subject of Benoit's Kanda. (15.75s - 21.06s)
[John]  The majority of them had only one comment to make, and it was that this present affair was no matter for surprise, because they had all of them again and again noticed signs of weakness and hesitation in Benoit's character. (21.55s - 34.13s)
[John]  They declared that Benoit had never surrendered himself wholeheartedly to their party. (35.25s - 39.38s)
[John]  Many of them said they had always felt how intolerable was the way in which he had tried. (39.74s - 44.52s)
[John]  Somehow or other from the very beginning in which he had tried somehow or other (44.6