In [1]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable so the
# `voice_cloning` package imported below can be resolved.
# NOTE(review): this assumes the notebook is launched from a directory that
# sits directly under the project root — confirm for other launch setups.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import torch
import torchaudio
from voice_cloning.speaker_encoder.ecapa_tdnn import ECAPA_TDNN_SMALL

import librosa
import torch.nn.functional as F


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/user/Documents/Inno/GenAI/VoiceCloning/.venv/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance

In [None]:
# Checkpoint of the trained speaker encoder.
# Built relative to the parent of the working directory (the same location the
# import cell appends to sys.path) instead of a machine-specific absolute path,
# so the notebook keeps working when the repository is cloned elsewhere.
# Kept as a plain `str` so existing consumers see the same type as before.
speaker_encoder_path = str(
    Path.cwd().parent / "voice_cloning" / "Grad-TTS" / "checkpts" / "best_linear_model.pt"
)

In [5]:
# Speaker Encoder for extracting speaker embedding
print('Initializing Speaker Encoder...')

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the encoder exactly once, already placed on the target device.
# (An earlier throw-away instance with feat_dim=1024 was constructed and then
# immediately overwritten; it has been removed because it was never used.)
spk_embedder = ECAPA_TDNN_SMALL(feat_dim=256).to(device)

# `map_location` remaps the stored tensors onto `device`, so a checkpoint
# saved on a GPU machine can still be loaded on a CPU-only one.
checkpoint = torch.load(speaker_encoder_path, map_location=device)

# The weights live under the 'model_state_dict' key of the checkpoint dict.
# load_state_dict is strict by default, so the architecture built above must
# match the checkpoint exactly or this line raises.
spk_embedder.load_state_dict(checkpoint['model_state_dict'])

# Switch to inference behaviour (e.g. BatchNorm uses its running statistics).
# As the last expression of the cell this also displays the module structure.
spk_embedder.eval()

Initializing Speaker Encoder...




ECAPA_TDNN(
  (feature_extract): MelSpectrogram(
    (spectrogram): Spectrogram()
    (mel_scale): MelScale()
  )
  (instance_norm): InstanceNorm1d(256, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  (layer1): Conv1dReluBn(
    (conv): Conv1d(256, 512, kernel_size=(5,), stride=(1,), padding=(2,))
    (bn): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (layer2): SE_Res2Block(
    (Conv1dReluBn1): Conv1dReluBn(
      (conv): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      (bn): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (Res2Conv1dReluBn): Res2Conv1dReluBn(
      (convs): ModuleList(
        (0-6): 7 x Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      )
      (bns): ModuleList(
        (0-6): 7 x BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (Conv1dReluBn2): Conv1dReluBn(
      (conv): Conv1d

In [None]:
# Load reference samples
reference_path = "../voice_cloning/speaker_encoder/data/samples/speaker1_sample1.mp3"
reference_path1 = "../voice_cloning/speaker_encoder/data/samples/speaker1_sample2.mp3"  # Same speaker, different sample
reference_path2 = "../voice_cloning/speaker_encoder/data/samples/speaker2_sample1.mp3"  # Different speaker

# Sample rate every waveform is resampled to before it is fed to the encoder.
TARGET_SAMPLE_RATE = 16000


def extract_speaker_embedding(audio_path, target_sr=TARGET_SAMPLE_RATE):
    """Load an audio file and return its resampled waveform and unit-norm embedding.

    Args:
        audio_path: Path of the audio file to load with ``librosa.load``.
        target_sr: Sample rate (Hz) the waveform is resampled to.

    Returns:
        A ``(wav, embedding)`` tuple. ``wav`` is the resampled waveform with a
        leading dimension of size 1 added, kept on the CPU. ``embedding`` is
        the output of ``spk_embedder`` divided by its norm; it lives on
        ``device``.
    """
    samples, source_sr = librosa.load(audio_path)
    wav = torch.FloatTensor(samples).unsqueeze(0)
    # Build the resampler from THIS file's sample rate rather than reusing one
    # created for a different file.
    wav = torchaudio.transforms.Resample(source_sr, target_sr)(wav)
    # Inference only: skip autograd bookkeeping, and move the input to the
    # device the model was placed on (otherwise the forward pass fails with a
    # device mismatch whenever CUDA is in use).
    with torch.no_grad():
        embedding = spk_embedder(wav.to(device))
    return wav, embedding / embedding.norm()


# Process reference sample
wav_ref, spk_emb_ref = extract_speaker_embedding(reference_path)

# Process reference1 (same speaker)
wav_ref1, spk_emb_ref1 = extract_speaker_embedding(reference_path1)

# Process reference2 (different speaker)
wav_ref2, spk_emb_ref2 = extract_speaker_embedding(reference_path2)

# Calculate cosine distances (1 - cosine similarity; smaller means more similar)
distance_same_speaker = 1 - F.cosine_similarity(spk_emb_ref, spk_emb_ref1)
distance_diff_speaker = 1 - F.cosine_similarity(spk_emb_ref, spk_emb_ref2)

print(f"Distance between same speaker samples: {distance_same_speaker.item():.4f}")
print(f"Distance between different speaker samples: {distance_diff_speaker.item():.4f}")

Distance between same speaker samples: 0.1896
Distance between different speaker samples: 0.4913
