In [4]:
import torch.nn as nn
import torch.nn.functional as F

class SpeakerVerificationModel(nn.Module):
    """Small CNN that maps a single-channel 2-D input to a unit-length embedding.

    Layout: three 3x3 convolutions (1 -> 16 -> 32 -> 64 channels) with ReLU,
    a 2x2 max-pool after each of the first two, global average pooling down to
    1x1, and a linear projection to ``embed_dim``. The result is L2-normalised
    per sample.

    Args:
        embed_dim: Size of the output embedding (default 128).
    """

    def __init__(self, embed_dim=128):
        super().__init__()
        # Attribute names double as state-dict key prefixes, so they must stay
        # stable for previously saved checkpoints to load.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64, embed_dim)

    def forward(self, x):
        """Return L2-normalised embeddings of shape [B, embed_dim] for input [B, 1, H, W]."""
        # Two conv -> ReLU -> pool stages, each halving the spatial size.
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Final conv stage has no pooling of its own; the adaptive average
        # pool collapses whatever spatial extent is left to 1x1.
        x = F.relu(self.conv3(x))
        pooled = self.global_pool(x).flatten(1)   # [B, 64]
        projected = self.fc(pooled)               # [B, embed_dim]
        return F.normalize(projected, p=2, dim=1)

# Build the network with the default embedding size (embed_dim=128); later cells
# load trained weights into this instance and use it for inference.
model = SpeakerVerificationModel()

In [7]:
import torch
# Load the trained weights onto the CPU. weights_only=True restricts unpickling
# to tensors and plain containers, so a tampered .pth file cannot execute
# arbitrary code on load; it also avoids the warning torch.load emits when the
# argument is left unspecified.
state_dict = torch.load(
    'siamese_speaker_model.pth',
    map_location=torch.device('cpu'),
    weights_only=True,
)
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(state_dict)

  model.load_state_dict(torch.load('siamese_speaker_model.pth', map_location=torch.device('cpu')))


<All keys matched successfully>

In [8]:
# Switch the model to evaluation mode before inference. eval() returns the
# module itself, so as the cell's last expression it also prints the layer summary.
model.eval()

SpeakerVerificationModel(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (global_pool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc): Linear(in_features=64, out_features=128, bias=True)
)

In [9]:
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
import soundfile as sf

def load_mel_spec(file_path, sr=16000, n_mels=40):
    """
    Load an audio file and return its Mel-spectrogram in decibels.

    Multi-channel recordings are down-mixed to mono by averaging the channels,
    and the audio is resampled to ``sr`` when the file uses a different rate.

    Args:
        file_path: Path to an audio file readable by ``soundfile``.
        sr: Target sample rate in Hz used for the Mel transform.
        n_mels: Number of Mel filter banks.

    Returns:
        Tensor of shape (1, n_mels, time).

    Raises:
        ValueError: If the file contains no audio frames.
    """
    # always_2d=True yields (frames, channels) regardless of channel count, so
    # mono and stereo files follow the same path below.
    audio_data, orig_sr = sf.read(file_path, always_2d=True)
    if audio_data.shape[0] == 0:
        raise ValueError(f"No audio frames found in {file_path!r}")

    # Average over the channel axis, then add the leading channel dimension
    # the torchaudio transforms expect: (frames, channels) -> (1, frames).
    audio_tensor = torch.from_numpy(audio_data).float().mean(dim=1).unsqueeze(0)

    if orig_sr != sr:
        audio_tensor = torchaudio.functional.resample(audio_tensor, orig_sr, sr)

    mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_mels=n_mels)
    to_db = torchaudio.transforms.AmplitudeToDB()
    mel_spec = to_db(mel_transform(audio_tensor))
    return mel_spec

def enroll_speaker(model, file_paths, device='cpu'):
    """
    Enroll a speaker by averaging the embeddings of several recordings.

    Each file is converted to a Mel-spectrogram, embedded by ``model``, and the
    per-file embeddings are averaged and re-normalised to unit L2 length.

    Args:
        model: Embedding network; it is put in eval mode. Only the inputs are
            moved to ``device``, so the model must already live there.
        file_paths: Iterable of audio file paths. A single path string is
            accepted and treated as a one-element list.
        device: Device on which the forward passes run.

    Returns:
        CPU float tensor of shape [1, embed_dim] with unit L2 norm.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    # A lone path string would otherwise be iterated character by character.
    if isinstance(file_paths, (str, bytes)):
        file_paths = [file_paths]
    # Materialise once so generators work and emptiness can be checked up front,
    # instead of failing later on the mean of an empty list.
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("enroll_speaker requires at least one audio file path")

    model.eval()
    embeddings = []
    with torch.no_grad():
        for fp in file_paths:
            mel_spec = load_mel_spec(fp).unsqueeze(0).to(device)  # [1, 1, n_mels, time]
            embeddings.append(model(mel_spec).cpu())              # [1, embed_dim]

    # Stack to [N, embed_dim] and average over the recordings, keeping the
    # leading batch dimension so the result stays [1, embed_dim].
    avg_emb = torch.cat(embeddings, dim=0).mean(dim=0, keepdim=True).float()
    # The mean of unit vectors is generally shorter than unit length; re-normalise.
    return F.normalize(avg_emb, p=2, dim=1)

In [10]:
def verify_speaker(model, enrolled_embedding, test_wav, threshold=0.5, device='cpu'):
    """
    Decide whether a test recording matches an enrolled speaker.

    The recording is embedded with ``model`` and compared to
    ``enrolled_embedding`` by Euclidean distance; the distance is printed.

    Returns:
        "ACCEPT" when the distance is strictly below ``threshold``,
        otherwise "REJECT".
    """
    model.eval()

    # Add the batch dimension: [1, n_mels, time] -> [1, 1, n_mels, time].
    features = load_mel_spec(test_wav).unsqueeze(0).to(device)
    with torch.no_grad():
        probe = model(features)  # [1, embed_dim]
    probe = F.normalize(probe, p=2, dim=1)

    reference = enrolled_embedding.to(device)
    dist = torch.norm(probe - reference, p=2).item()
    print(f"Distance to enrolled embedding: {dist:.3f}")

    return "ACCEPT" if dist < threshold else "REJECT"