In [None]:
import torch
import torchaudio
import numpy
import transformers
# Report the versions of the libraries this notebook depends on, then whether
# torch can see a CUDA device.
for label, module in (
    ("torch", torch),
    ("torchaudio", torchaudio),
    ("transformers", transformers),
    ("numpy", numpy),
):
    print(f"{label}:", module.__version__)
print(f"cuda: {torch.cuda.is_available()}")

: 

In [1]:
import embedding

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2vec2 Embedding

In [2]:
# Run embedding.process_speaker_folder over the raw training folder using the
# module's wav2vec2 embedding function. save_path is forwarded to the helper;
# presumably the result is written to that file (a later cell loads it) —
# confirm against the embedding module.
train_raw_dir = r"E:\speech_data\train_raw"
wav2vec2_save_path = 'embeddings_wav2vec2.pt'
embedding_wav2vec2 = embedding.process_speaker_folder(
    train_raw_dir,
    embedding.wav2vec2_embedding,
    save_path=wav2vec2_save_path,
)

Error processing E:\speech_data\train_raw\id00000_00000.wav: Could not load libtorchcodec. Likely causes:
          1. FFmpeg is not properly installed in your environment. We support
             versions 4, 5, 6, 7, and 8, and we attempt to load libtorchcodec
             for each of those versions. Errors for versions not installed on
             your system are expected; only the error for your installed FFmpeg
             version is relevant. On Windows, ensure you've installed the
             "full-shared" version which ships DLLs.
          2. The PyTorch version (2.10.0+cpu) is not compatible with
             this version of TorchCodec. Refer to the version compatibility
             table:
             https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.
          3. Another runtime dependency; see exceptions below.

        The following exceptions were raised as we tried to load libtorchcodec:
        
[start of libtorchcodec loading traceback]

KeyboardInterrupt: 

In [7]:
import torch

# Load the embeddings file.
# - map_location='cpu' keeps the tensors on the CPU whatever device they were
#   saved from, so the later .numpy() calls work.
# - weights_only=True restricts unpickling to tensors and plain containers
#   rather than arbitrary Python objects; this is explicit so the behavior does
#   not depend on the installed torch version's default.
embeddings_data = torch.load('embeddings_wav2vec2.pt', map_location='cpu', weights_only=True)

# Bind the two entries once instead of re-indexing the dict on every line.
embedding_matrix = embeddings_data['embeddings']
speaker_ids = embeddings_data['speaker_ids']

# Sanity check: there must be exactly one speaker ID per embedding row.
assert len(speaker_ids) == embedding_matrix.shape[0], (
    f"{len(speaker_ids)} speaker IDs but {embedding_matrix.shape[0]} embeddings"
)

print("Embeddings Structure:")
print(f"Keys: {embeddings_data.keys()}")
print(f"\nEmbeddings shape: {embedding_matrix.shape}")
print(f"Number of embeddings: {embedding_matrix.shape[0]}")
print(f"Embedding dimension: {embedding_matrix.shape[1]}")

print(f"\nSpeaker IDs count: {len(speaker_ids)}")
# Sorted so the output is stable between runs (set iteration order is not).
print(f"Unique speakers: {sorted(set(speaker_ids))}")

# Show first few speaker IDs
print(f"\nFirst 5 speaker IDs: {speaker_ids[:5]}")

# Check for any NaN or Inf values
print(f"\nHas NaN values: {torch.isnan(embedding_matrix).any()}")
print(f"Has Inf values: {torch.isinf(embedding_matrix).any()}")

# Show min/max/mean values
print(f"\nEmbeddings min: {embedding_matrix.min():.6f}")
print(f"Embeddings max: {embedding_matrix.max():.6f}")
print(f"Embeddings mean: {embedding_matrix.mean():.6f}")


Embeddings Structure:
Keys: dict_keys(['embeddings', 'speaker_ids'])

Embeddings shape: torch.Size([101, 768])
Number of embeddings: 101
Embedding dimension: 768

Speaker IDs count: 101
Unique speakers: {'id00000'}

First 5 speaker IDs: ['id00000', 'id00000', 'id00000', 'id00000', 'id00000']

Has NaN values: False
Has Inf values: False

Embeddings min: -1.118027
Embeddings max: 0.672648
Embeddings mean: -0.001198


In [8]:
# Print speaker ID and embedding pairs
MAX_PAIRS_SHOWN = 10  # show only the first few pairs for clarity

preview_ids = embeddings_data['speaker_ids']
preview_embeddings = embeddings_data['embeddings']

print("Speaker ID - Embedding Pairs:\n")
# The loop variable is deliberately NOT called `embedding`: that name is the
# imported `embedding` module, and rebinding it here would break any later
# call such as embedding.process_speaker_folder(...).
# Slicing both sequences limits the preview to MAX_PAIRS_SHOWN entries.
for position, (speaker_id, speaker_embedding) in enumerate(
    zip(preview_ids[:MAX_PAIRS_SHOWN], preview_embeddings[:MAX_PAIRS_SHOWN]),
    start=1,
):
    print(f"{position}. Speaker ID: {speaker_id}")
    print(f"   Embedding (first 10 dims): {speaker_embedding[:10].numpy()}")
    print(f"   Full shape: {speaker_embedding.shape}")
    print()

# Summarize what was left out, once, and only if something was actually omitted.
remaining = len(preview_ids) - MAX_PAIRS_SHOWN
if remaining > 0:
    print(f"... and {remaining} more embeddings")

Speaker ID - Embedding Pairs:

1. Speaker ID: id00000
   Embedding (first 10 dims): [-0.00187777  0.0059448  -0.11905094 -0.06194381  0.07164005 -0.11579791
  0.03576373 -0.03564595  0.02763155 -0.31219035]
   Full shape: torch.Size([768])

2. Speaker ID: id00000
   Embedding (first 10 dims): [-0.00268058 -0.0046959  -0.11174713 -0.05865612  0.06993213 -0.10732571
  0.03960858 -0.03378287  0.04747562 -0.31506166]
   Full shape: torch.Size([768])

3. Speaker ID: id00000
   Embedding (first 10 dims): [ 0.00245799 -0.00200538 -0.14199188 -0.05642491  0.07246763 -0.11261716
  0.02871719 -0.03202571  0.05043066 -0.31610882]
   Full shape: torch.Size([768])

4. Speaker ID: id00000
   Embedding (first 10 dims): [-0.02444447 -0.00681336 -0.07738816 -0.04130914  0.10990156 -0.09671832
  0.01651118 -0.04010316  0.07333934 -0.30630648]
   Full shape: torch.Size([768])

5. Speaker ID: id00000
   Embedding (first 10 dims): [ 0.02166418 -0.0026431  -0.13937962 -0.06632877  0.09230816 -0.09912664
  0