# Speaker Recognition with SpeechBrain ECAPA-TDNN
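This notebook demonstrates speaker identification: it extracts speaker embeddings from audio clips with SpeechBrain's pre-trained ECAPA-TDNN model, trains a logistic-regression classifier on those embeddings, evaluates it on a held-out validation split, and saves the fitted classifier and label encoder.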

## 1. Setup and Imports

In [1]:
import functools
import sys
from pathlib import Path

import joblib
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torchaudio
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme and a 10x6 inch
# default figure size for any figure that does not set its own.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

### Fix torchaudio backend compatibility

In [2]:
# Some torchaudio builds do not expose `list_audio_backends`. Use the real
# answer when the function exists, otherwise report a single 'torchcodec' entry.
backends = (
    torchaudio.list_audio_backends()
    if hasattr(torchaudio, 'list_audio_backends')
    else ['torchcodec']
)

# Install a stub on the loaded module that always returns the list captured above.
# NOTE(review): presumably a library imported later calls this function — confirm.
setattr(sys.modules['torchaudio'], 'list_audio_backends', lambda: backends)

### Monkey-patch huggingface_hub for compatibility

In [None]:
import huggingface_hub


def _forward_use_auth_token(download_fn):
    """Wrap `download_fn` so the legacy `use_auth_token` keyword is sent as `token`.

    The wrapped function is captured in a closure rather than looked up as a
    notebook global, so rebinding names in the notebook can never make the
    wrapper call itself.
    """
    @functools.wraps(download_fn)
    def patched_hf_hub_download(*args, **kwargs):
        if 'use_auth_token' in kwargs:
            legacy_token = kwargs.pop('use_auth_token')
            # An explicitly supplied `token` wins; the legacy value only fills the gap.
            if kwargs.get('token') is None:
                kwargs['token'] = legacy_token
        return download_fn(*args, **kwargs)

    # Marker used below to detect that the shim is already installed.
    patched_hf_hub_download._use_auth_token_shim = True
    return patched_hf_hub_download


if getattr(huggingface_hub.hf_hub_download, '_use_auth_token_shim', False):
    # The cell already ran in this kernel: reuse the installed wrapper instead of
    # wrapping the wrapper (which previously led to infinite recursion on re-run).
    patched_hf_hub_download = huggingface_hub.hf_hub_download
    original_hf_hub_download = patched_hf_hub_download.__wrapped__
else:
    original_hf_hub_download = huggingface_hub.hf_hub_download
    patched_hf_hub_download = _forward_use_auth_token(original_hf_hub_download)
    huggingface_hub.hf_hub_download = patched_hf_hub_download

## 2. Load Pre-trained ECAPA-TDNN Model

In [None]:
# NOTE(review): presumably imported here, not at the top, so the compatibility
# shims from the earlier cells are in place before SpeechBrain loads — confirm.
from speechbrain.inference import EncoderClassifier

print("Loading speechbrain ECAPA-TDNN model...")

# Model identifier and the local directory passed to `from_hparams` as `savedir`.
pretrained_spec = {
    "source": "speechbrain/spkrec-ecapa-voxceleb",
    "savedir": "pretrained_models/",
}
classifier = EncoderClassifier.from_hparams(**pretrained_spec)

print("✓ Model loaded successfully!")

## 3. Load Audio Dataset

In [None]:
CSV_PATH = "audio_features.csv"
# Columns the rest of the notebook reads from the CSV.
REQUIRED_COLUMNS = ("audio_path", "member_name")

df = pd.read_csv(CSV_PATH)

# Fail fast on a wrong schema. Without this check a missing `audio_path` column
# only shows up later, as every file being "skipped" by the extraction loop.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"{CSV_PATH} is missing required column(s) {missing_columns}; "
        f"found {list(df.columns)}"
    )

print(f"Loaded {len(df)} audio samples")
print(df.head())

# Number of rows per speaker, most frequent first.
speaker_counts = df['member_name'].value_counts()
print(speaker_counts)

### Visualize Speaker Distribution

In [None]:
# Bar chart of the per-speaker sample counts computed in `speaker_counts`.
fig, ax = plt.subplots(figsize=(12, 6))
speaker_counts.plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
ax.set_title('Number of Audio Samples per Speaker', fontsize=14, fontweight='bold')
ax.set_xlabel('Speaker')
ax.set_ylabel('Number of Samples')
# Tilt the speaker names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 4. Extract Speaker Embeddings

In [None]:
def get_embedding(audio_path):
    """Return the speaker embedding of one audio file as a NumPy array.

    The file is loaded with librosa at a 16 kHz sample rate, wrapped into a
    batch of one, and passed to the notebook-level `classifier.encode_batch`
    with gradient tracking disabled.

    Args:
        audio_path: Path of the audio file; anything `librosa.load` accepts.

    Returns:
        The encoder output moved to CPU, with all singleton dimensions removed.
        NOTE(review): presumably a 1-D vector of the model's embedding size — confirm.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    y, _ = librosa.load(audio_path, sr=16000)
    # Reject empty audio here so the caller sees a clear reason instead of an
    # obscure failure from inside the encoder.
    if y.size == 0:
        raise ValueError(f"Audio file contains no samples: {audio_path}")
    waveform = torch.tensor(y).unsqueeze(0)  # add the batch dimension
    with torch.no_grad():
        emb = classifier.encode_batch(waveform)
    return emb.squeeze().cpu().numpy()

In [None]:
embeddings = []
labels = []
failed_files = []

print("Extracting embeddings...")
print("-" * 50)

total = len(df)
# Look the columns up once, outside the try block: a missing column must fail
# loudly here instead of being reported as every file being "skipped".
audio_paths = df['audio_path']
speaker_names = df['member_name']

for position, (audio_path, speaker) in enumerate(zip(audio_paths, speaker_names)):
    # Progress is based on the running position, not the DataFrame index label,
    # so it stays correct for filtered, shuffled or non-integer indexes.
    if position % 10 == 0:
        print(f"Progress: {position}/{total} ({position/total*100:.1f}%)")

    try:
        emb = get_embedding(audio_path)
    except Exception as e:
        # Best effort: one unreadable file should not abort the whole run.
        print(f"✗ Skipping {audio_path}: {e}")
        failed_files.append(audio_path)
    else:
        # Append both lists together so embeddings and labels stay aligned.
        embeddings.append(emb)
        labels.append(speaker)

print("-" * 50)
print(f"✓ Extracted {len(embeddings)} embeddings")
if failed_files:
    print(f"✗ Failed files: {len(failed_files)}")

if len(embeddings) == 0:
    raise ValueError("No embeddings extracted!")

### Embedding Statistics

In [None]:
# Stack the per-file embeddings into a feature matrix with a matching label vector.
X = np.array(embeddings)
y = np.array(labels)

# Feature-matrix shape and number of distinct speakers.
print(X.shape, len(np.unique(y)))

# Side-by-side histograms of each embedding's mean and standard deviation.
fig, (mean_ax, std_ax) = plt.subplots(1, 2, figsize=(14, 5))
mean_ax.hist(X.mean(axis=1), bins=30)
mean_ax.set_title("Mean Embedding Values")
std_ax.hist(X.std(axis=1), bins=30)
std_ax.set_title("Embedding Std Dev")
plt.show()

## 5. Prepare Training and Validation Sets

In [None]:
VAL_FRACTION = 0.3   # share of samples held out for validation
RANDOM_STATE = 42    # fixed seed so the split is reproducible

# Map speaker names to integer class ids; `le.classes_` keeps the id -> name mapping.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# A stratified split needs at least two samples of every class. Check up front
# so the error names the affected speakers.
class_sizes = np.bincount(y_enc)
too_small = le.classes_[class_sizes < 2]
if len(too_small) > 0:
    raise ValueError(
        "Stratified split needs at least 2 samples per speaker; "
        f"too few for: {list(too_small)}"
    )

X_train, X_val, y_train, y_val = train_test_split(
    X, y_enc, test_size=VAL_FRACTION, stratify=y_enc, random_state=RANDOM_STATE
)

print(len(X_train), len(X_val))

## 6. Train Logistic Regression Classifier

In [None]:
# Fit a logistic-regression classifier on the training embeddings.
# `class_weight='balanced'` weights classes inversely to their frequency, and
# `max_iter=1000` raises the solver's iteration cap above the default.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)
print("Training complete.")

## 7. Evaluate Model Performance

In [None]:
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)

# Train vs. validation accuracy; a large gap suggests overfitting.
print("Train Acc:", accuracy_score(y_train, y_pred_train))
print("Val Acc:", accuracy_score(y_val, y_pred_val))

# Pass every class id explicitly so the report rows always line up with
# `le.classes_`. Without `labels`, a speaker absent from both `y_val` and the
# predictions makes `target_names` the wrong length and the call raises.
all_class_ids = np.arange(len(le.classes_))
print(classification_report(
    y_val, y_pred_val, labels=all_class_ids, target_names=le.classes_
))

### Confusion Matrix

In [None]:
# Fix the class order explicitly so the matrix always has one row/column per
# entry of `le.classes_` and therefore matches the tick labels below.
all_class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_val, y_pred_val, labels=all_class_ids)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
# scikit-learn puts true labels on rows and predictions on columns.
ax.set_xlabel('Predicted speaker')
ax.set_ylabel('True speaker')
ax.set_title('Validation Confusion Matrix')
plt.show()

### Per-Class Metrics

In [None]:
# Per-speaker precision, recall, F1 and support on the validation split.
# Explicit `labels` keeps the result arrays the same length and order as
# `le.classes_` even when a speaker is absent from the validation data.
precision, recall, f1, support = precision_recall_fscore_support(
    y_val, y_pred_val, labels=np.arange(len(le.classes_))
)

metrics_df = pd.DataFrame({
    "Speaker": le.classes_,
    "Precision": precision,
    "Recall": recall,
    "F1": f1,
    "Support": support,  # number of validation samples behind each row
})
metrics_df

## 8. Save Model and Label Encoder

In [None]:
# Persist the fitted classifier together with the label encoder; the encoder
# is needed to turn predicted class ids back into speaker names.
artifacts = {
    "speechbrain_classifier.pkl": model,
    "speechbrain_label_encoder.pkl": le,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)
print("Models saved.")