In [6]:
import sys
from pathlib import Path

# Auto-find project root (works across platforms)
if str(Path.cwd().parent) not in sys.path:
    sys.path.append(str(Path.cwd().parent))

import torch
import torchaudio
from voice_cloning.speaker_encoder.ecapa_tdnn import ECAPA_TDNN_SMALL

import librosa
import torch.nn.functional as F

In [7]:
speaker_encoder_path = "../voice_cloning/speaker_encoder/checkpoints/speaker_encoder.pt"

In [13]:
# Speaker Encoder for extracting speaker embedding
print('Initializing Speaker Encoder...')

spk_embedder = ECAPA_TDNN_SMALL(
    feat_dim=1024,
    feat_type="fbank",
)

state_dict = torch.load(speaker_encoder_path, map_location=lambda storage, loc: storage)
spk_embedder.load_state_dict(state_dict['model'], strict=False)
_ = spk_embedder.eval()

Initializing Speaker Encoder...


In [9]:
# Load reference samples
reference_path = "../voice_cloning/speaker_encoder/data/samples/speaker1_sample1.mp3"
reference_path1 = "../voice_cloning/speaker_encoder/data/samples/speaker1_sample2.mp3"  # Same speaker, different sample
reference_path2 = "../voice_cloning/speaker_encoder/data/samples/speaker2_sample1.mp3"  # Different speaker

# Process reference sample
wav_ref, sr = librosa.load(reference_path)
wav_ref = torch.FloatTensor(wav_ref).unsqueeze(0)
resample_fn = torchaudio.transforms.Resample(sr, 16000)
wav_ref = resample_fn(wav_ref)
spk_emb_ref = spk_embedder(wav_ref)
spk_emb_ref = spk_emb_ref / spk_emb_ref.norm()

# Process reference1 (same speaker)
wav_ref1, sr = librosa.load(reference_path1)
wav_ref1 = torch.FloatTensor(wav_ref1).unsqueeze(0)
wav_ref1 = resample_fn(wav_ref1)
spk_emb_ref1 = spk_embedder(wav_ref1)
spk_emb_ref1 = spk_emb_ref1 / spk_emb_ref1.norm()

# Process reference2 (different speaker)
wav_ref2, sr = librosa.load(reference_path2)
wav_ref2 = torch.FloatTensor(wav_ref2).unsqueeze(0)
wav_ref2 = resample_fn(wav_ref2)
spk_emb_ref2 = spk_embedder(wav_ref2)
spk_emb_ref2 = spk_emb_ref2 / spk_emb_ref2.norm()

# Calculate cosine distances
distance_same_speaker = 1 - F.cosine_similarity(spk_emb_ref, spk_emb_ref1)
distance_diff_speaker = 1 - F.cosine_similarity(spk_emb_ref, spk_emb_ref2)

print(f"Distance between same speaker samples: {distance_same_speaker.item():.4f}")
print(f"Distance between different speaker samples: {distance_diff_speaker.item():.4f}")

Distance between same speaker samples: 0.0181
Distance between different speaker samples: 0.0682
