In [2]:
import torch
import torchaudio
import numpy
import transformers
print("torch:", torch.__version__)
print("torchaudio:", torchaudio.__version__)
print("transformers:", transformers.__version__)
print("numpy:", numpy.__version__)
print("cuda:", torch.cuda.is_available())

torch: 2.7.0+cu118
torchaudio: 2.7.0+cu118
transformers: 4.57.6
numpy: 1.26.4
cuda: True


In [14]:
folder_path = r"E:\speech_data\train_vi_7s"

In [23]:
def quick_sight(embeddings_data):
    print("Embeddings Structure:")
    print(f"Keys: {embeddings_data.keys()}")
    print(f"\nEmbeddings shape: {embeddings_data['embeddings'].shape}")
    print(f"Number of embeddings: {embeddings_data['embeddings'].shape[0]}")
    print(f"Embedding dimension: {embeddings_data['embeddings'].shape[1]}")

    print(f"\nSpeaker IDs count: {len(embeddings_data['speaker_ids'])}")
    print(f"Unique speakers: {set(embeddings_data['speaker_ids'])}")

    # Show first few speaker IDs
    print(f"\nFirst 5 speaker IDs: {embeddings_data['speaker_ids'][:5]}")

    # Check for any NaN or Inf values
    print(f"\nHas NaN values: {torch.isnan(embeddings_data['embeddings']).any()}")
    print(f"Has Inf values: {torch.isinf(embeddings_data['embeddings']).any()}")

    # Show min/max values
    print(f"\nEmbeddings min: {embeddings_data['embeddings'].min():.6f}")
    print(f"Embeddings max: {embeddings_data['embeddings'].max():.6f}")
    print(f"Embeddings mean: {embeddings_data['embeddings'].mean():.6f}")

    # Print speaker ID and embedding pairs
    print("Speaker ID - Embedding Pairs:\n")
    for i, (speaker_id, embedding) in enumerate(zip(embeddings_data['speaker_ids'], embeddings_data['embeddings'])):
        print(f"{i+1}. Speaker ID: {speaker_id}")
        print(f"   Embedding (first 10 dims): {embedding[:10].numpy()}")
        print(f"   Full shape: {embedding.shape}")
        print()
        # Show only first 10 for clarity
        if i >= 9:
            remaining = len(embeddings_data['speaker_ids']) - 10
            print(f"... and {remaining} more embeddings")
            break   

HuBERT Embedding

In [None]:
import embedding

embedding_hubert = embedding.process_speaker_folder(folder_path, embedding.hubert_embedding, save_path='embeddings_hubert.pt')

In [24]:
import torch

# Load the embeddings file
embeddings_data = torch.load('embeddings_hubert.pt')
quick_sight(embeddings_data=embeddings_data)

Embeddings Structure:
Keys: dict_keys(['embeddings', 'speaker_ids'])

Embeddings shape: torch.Size([26279, 768])
Number of embeddings: 26279
Embedding dimension: 768

Speaker IDs count: 26279
Unique speakers: {'id02224', 'id01659', 'id01631', 'id01851', 'id01287', 'id00024', 'id01152', 'id01904', 'id02077', 'id02161', 'id02100', 'id01679', 'id01934', 'id01336', 'id01702', 'id01754', 'id01516', 'id01733', 'id01780', 'id00925', 'id01000', 'id01811', 'id01209', 'id00916', 'id01269', 'id02021', 'id02093', 'id01577', 'id00016', 'id00010', 'id01959', 'id00757', 'id01136', 'id00023', 'id01925', 'id01647', 'id02217', 'id00779', 'id00999', 'id01208', 'id01633', 'id02216', 'id01473', 'id01966', 'id02080', 'id00956', 'id00991', 'id00001', 'id01796', 'id01248', 'id00953', 'id00976', 'id01896', 'id01615', 'id02243', 'id01738', 'id01409', 'id00020', 'id01848', 'id00009', 'id02144', 'id00079', 'id01506', 'id01570', 'id02052', 'id01353', 'id01415', 'id00013', 'id00627', 'id02067', 'id01339', 'id00979'

Wav2vec2 Embedding

In [8]:
import embedding

embedding_wav2vec2 = embedding.process_speaker_folder(r"E:\speech_data\train_vi_7s", embedding.wav2vec2_embedding, save_path='embeddings_wav2vec2.pt')

Processed: E:\speech_data\train_vi_7s\id00000_00000_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00000_seg_002.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00001_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00001_seg_002.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00001_seg_003.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00002_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00002_seg_002.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00002_seg_003.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00015_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00015_seg_002.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00016_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00016_seg_002.wav (Speaker: id00000)
Proc

In [25]:
import torch

# Load the embeddings file
embeddings_data = torch.load('embeddings_wav2vec2.pt')
quick_sight(embeddings_data)

Embeddings Structure:
Keys: dict_keys(['embeddings', 'speaker_ids'])

Embeddings shape: torch.Size([26279, 768])
Number of embeddings: 26279
Embedding dimension: 768

Speaker IDs count: 26279
Unique speakers: {'id02224', 'id01659', 'id01631', 'id01851', 'id01287', 'id00024', 'id01152', 'id01904', 'id02077', 'id02161', 'id02100', 'id01679', 'id01934', 'id01336', 'id01702', 'id01754', 'id01516', 'id01733', 'id01780', 'id00925', 'id01000', 'id01811', 'id01209', 'id00916', 'id01269', 'id02021', 'id02093', 'id01577', 'id00016', 'id00010', 'id01959', 'id00757', 'id01136', 'id00023', 'id01925', 'id01647', 'id02217', 'id00779', 'id00999', 'id01208', 'id01633', 'id02216', 'id01473', 'id01966', 'id02080', 'id00956', 'id00991', 'id00001', 'id01796', 'id01248', 'id00953', 'id00976', 'id01896', 'id01615', 'id02243', 'id01738', 'id01409', 'id00020', 'id01848', 'id00009', 'id02144', 'id00079', 'id01506', 'id01570', 'id02052', 'id01353', 'id01415', 'id00013', 'id00627', 'id02067', 'id01339', 'id00979'

WavLM Embedding

In [None]:
import embedding
embedding_wavlm = embedding.process_speaker_folder(r"E:\speech_data\train_vi_7s", embedding.wavlm_embedding, save_path='embeddings_wavlm.pt')



Processed: E:\speech_data\train_vi_7s\id00000_00000_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00000_seg_002.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00001_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00001_seg_002.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00001_seg_003.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00002_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00002_seg_002.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00002_seg_003.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00015_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00015_seg_002.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00016_seg_001.wav (Speaker: id00000)
Processed: E:\speech_data\train_vi_7s\id00000_00016_seg_002.wav (Speaker: id00000)
Proc

In [26]:
import torch

# Load the embeddings file
embeddings_data = torch.load('embeddings_wavlm.pt')
quick_sight(embeddings_data)

Embeddings Structure:
Keys: dict_keys(['embeddings', 'speaker_ids'])

Embeddings shape: torch.Size([26279, 768])
Number of embeddings: 26279
Embedding dimension: 768

Speaker IDs count: 26279
Unique speakers: {'id02224', 'id01659', 'id01631', 'id01851', 'id01287', 'id00024', 'id01152', 'id01904', 'id02077', 'id02161', 'id02100', 'id01679', 'id01934', 'id01336', 'id01702', 'id01754', 'id01516', 'id01733', 'id01780', 'id00925', 'id01000', 'id01811', 'id01209', 'id00916', 'id01269', 'id02021', 'id02093', 'id01577', 'id00016', 'id00010', 'id01959', 'id00757', 'id01136', 'id00023', 'id01925', 'id01647', 'id02217', 'id00779', 'id00999', 'id01208', 'id01633', 'id02216', 'id01473', 'id01966', 'id02080', 'id00956', 'id00991', 'id00001', 'id01796', 'id01248', 'id00953', 'id00976', 'id01896', 'id01615', 'id02243', 'id01738', 'id01409', 'id00020', 'id01848', 'id00009', 'id02144', 'id00079', 'id01506', 'id01570', 'id02052', 'id01353', 'id01415', 'id00013', 'id00627', 'id02067', 'id01339', 'id00979'