In [6]:
import torch
import torchaudio
import os
import numpy as np
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Model,
    HubertModel,
    WavLMModel
)


In [7]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


def _load_encoder(model_cls, checkpoint):
    """Load `checkpoint` with `model_cls`, move it to `device`, switch to eval mode."""
    return model_cls.from_pretrained(checkpoint).to(device).eval()


# A single feature extractor is shared by every model below. It is taken from
# the WavLM checkpoint and turns a raw waveform into the model input tensors.
# NOTE(review): the original author's note says reusing this extractor does not
# affect the embeddings of the other models. That holds only if each
# checkpoint's own preprocessor settings (e.g. `do_normalize`) match the WavLM
# ones -- TODO confirm against each checkpoint's preprocessor_config.json.
processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")

wavlm = _load_encoder(WavLMModel, "microsoft/wavlm-base")
hubert = _load_encoder(HubertModel, "facebook/hubert-base-ls960")
wav2vec2 = _load_encoder(Wav2Vec2Model, "facebook/wav2vec2-base-960h")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either 

In [8]:
def load_clean_audio(path, target_sr=16000):
    """Load an audio file as a mono float32 waveform sampled at `target_sr` Hz.

    Args:
        path: Location of the audio file; passed straight to `torchaudio.load`.
        target_sr: Sample rate (Hz) the returned waveform must have. Defaults
            to 16000, the rate the embedding functions in this notebook pass
            to the feature extractor.

    Returns:
        A float32 `torch.Tensor` with a single channel row, i.e. shape
        (1, num_samples).

    Raises:
        TypeError: If the decoded waveform is not a floating-point tensor
            (casting raw integer PCM without rescaling would silently change
            the amplitude range).
    """
    waveform, sr = torchaudio.load(path)

    # Explicit check instead of `assert`, which disappears under `python -O`.
    if not waveform.is_floating_point():
        raise TypeError(
            f"Expected a floating-point waveform from {path!r}, got {waveform.dtype}"
        )
    waveform = waveform.to(torch.float32)  # no-op when already float32

    # Down-mix multi-channel audio by averaging the channel rows.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample instead of failing, so files recorded at other rates are usable.
    if sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, sr, target_sr)

    return waveform


In [9]:
def wavlm_embedding(waveform):
    """Return one fixed-size WavLM embedding for `waveform` as a NumPy array.

    The waveform has its leading dimension squeezed, is run through the shared
    `processor` at 16 kHz, and is fed to the `wavlm` model on `device`. The
    model's `last_hidden_state` is averaged over dim 1 (presumably the frame
    axis -- confirm against the model docs) and the batch dimension is dropped.

    Args:
        waveform: Audio tensor; the notebook passes the (1, num_samples)
            output of `load_clean_audio`.

    Returns:
        A 1-D `numpy.ndarray` (the notebook's sample run prints shape (768,)).
    """
    # Gradients are disabled for the whole call to save memory.
    with torch.no_grad():
        mono = waveform.squeeze(0)
        features = processor(mono, sampling_rate=16000, return_tensors="pt")
        features = features.to(device)

        hidden = wavlm(**features).last_hidden_state
        pooled = hidden.mean(dim=1).squeeze(0)
        return pooled.cpu().numpy()


In [15]:
def hubert_embedding(waveform):
    """Return one fixed-size HuBERT embedding for `waveform` as a NumPy array.

    The waveform has its leading dimension squeezed, is run through the shared
    `processor` at 16 kHz, and is fed to the `hubert` model on `device`. The
    model's `last_hidden_state` is averaged over dim 1 (presumably the frame
    axis -- confirm against the model docs) and the batch dimension is dropped.

    Args:
        waveform: Audio tensor; the notebook passes the (1, num_samples)
            output of `load_clean_audio`.

    Returns:
        A 1-D `numpy.ndarray` (the notebook's sample run prints shape (768,)).
    """
    # Inference only: no gradients are tracked anywhere in this call.
    with torch.no_grad():
        mono = waveform.squeeze(0)
        features = processor(mono, sampling_rate=16000, return_tensors="pt")
        features = features.to(device)

        hidden = hubert(**features).last_hidden_state
        pooled = hidden.mean(dim=1).squeeze(0)
        return pooled.cpu().numpy()

In [16]:
def wav2vec2_embedding(waveform):
    """Return one fixed-size wav2vec 2.0 embedding for `waveform` as a NumPy array.

    The waveform has its leading dimension squeezed, is run through the shared
    `processor` at 16 kHz, and is fed to the `wav2vec2` model on `device`. The
    model's `last_hidden_state` is averaged over dim 1 (presumably the frame
    axis -- confirm against the model docs) and the batch dimension is dropped.

    Args:
        waveform: Audio tensor; the notebook passes the (1, num_samples)
            output of `load_clean_audio`.

    Returns:
        A 1-D `numpy.ndarray` (the notebook's sample run prints shape (768,)).
    """
    # Inference only: no gradients are tracked anywhere in this call.
    with torch.no_grad():
        mono = waveform.squeeze(0)
        features = processor(mono, sampling_rate=16000, return_tensors="pt")
        features = features.to(device)

        hidden = wav2vec2(**features).last_hidden_state
        pooled = hidden.mean(dim=1).squeeze(0)
        return pooled.cpu().numpy()

In [24]:
# Raw string literal: the Windows path contains backslashes (`\S`, `\i`) that a
# normal string treats as invalid escape sequences (a SyntaxWarning on recent
# Python versions). The resulting path value is unchanged.
waveform = load_clean_audio(r"D:\Speak_Verification\id00002-20260115T145043Z-1-001\id00002\id00002_train_small-00000-of-00119_184.wav")
# Embed the sample utterance with WavLM and show the first value and the shape.
embedding = wavlm_embedding(waveform)
print(embedding[0])
print(embedding.shape)

-0.24095412
(768,)


In [26]:
# Embed the same utterance with wav2vec 2.0; print the first value, then the shape.
embedding = wav2vec2_embedding(waveform)
print(embedding[0], embedding.shape, sep="\n")

-0.01149833
(768,)


In [27]:
# Embed the same utterance with HuBERT; print the first value, then the shape.
embedding = hubert_embedding(waveform)
print(embedding[0], embedding.shape, sep="\n")

0.056408197
(768,)
