In [3]:
import torch
import torchaudio
import os
import numpy as np
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Model,
    HubertModel,
    WavLMModel
)


  from .autonotebook import tqdm as notebook_tqdm
  if not hasattr(np, "object"):


In [4]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load one feature extractor that prepares the input for every model below.
# Author's note (translated): the WavLM and HuBERT models do not accept raw
# audio directly, so the Wav2Vec2 feature extractor is used to build the input
# structure from raw audio; this is stated not to affect embeddings computed
# with the other models.
# NOTE(review): the extractor config comes from "microsoft/wavlm-base" and is
# reused for HuBERT and wav2vec2 - confirm its normalization settings match
# what those checkpoints expect.
processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")

# Each model is moved to `device` and switched to eval mode for inference.
wavlm = WavLMModel.from_pretrained(
    "microsoft/wavlm-base"
).to(device).eval()

hubert = HubertModel.from_pretrained(
    "facebook/hubert-base-ls960"
).to(device).eval()

wav2vec2 = Wav2Vec2Model.from_pretrained(
    "facebook/wav2vec2-base-960h"
).to(device).eval()


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Function lấy embedding

In [5]:
def load_clean_audio(path):
    """Load an audio file and return it as a single-channel float32 waveform.

    Args:
        path: Location of the audio file; passed unchanged to
            ``torchaudio.load``.

    Returns:
        The waveform tensor. When the file has more than one channel
        (first dimension > 1) the channels are averaged with
        ``keepdim=True``, so the first dimension of the result is 1.

    Raises:
        ValueError: If the file's sample rate is not 16000 Hz.
        TypeError: If the decoded waveform is not ``torch.float32``.
    """
    waveform, sr = torchaudio.load(path)

    # Explicit exceptions instead of bare asserts: they survive `python -O`
    # and carry a message, so callers that log the exception text see *why*
    # a file was rejected instead of an empty string.
    if sr != 16000:
        raise ValueError(
            f"{path}: expected a sample rate of 16000 Hz, got {sr} Hz"
        )
    if waveform.dtype != torch.float32:
        raise TypeError(
            f"{path}: expected a torch.float32 waveform, got {waveform.dtype}"
        )

    # Validate before down-mixing so a non-float waveform is reported with the
    # message above rather than failing inside `mean`.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    return waveform


In [6]:
@torch.no_grad()  # inference only: skip gradient tracking to save memory
def wavlm_embedding(waveform):
    """Return an utterance-level WavLM embedding for one waveform.

    The waveform has its leading dimension squeezed away, is run through the
    shared ``processor`` (declared sample rate 16000) and the ``wavlm`` model,
    and the model's ``last_hidden_state`` is averaged over dimension 1.

    Args:
        waveform: Audio tensor; assumed to be shaped (1, num_samples) as
            produced by ``load_clean_audio`` - TODO confirm for other callers.

    Returns:
        A NumPy array on the CPU holding the averaged hidden state with the
        leading dimension squeezed out.
    """
    mono_signal = waveform.squeeze(0)
    model_inputs = processor(mono_signal, sampling_rate=16000, return_tensors="pt")
    model_inputs = model_inputs.to(device)

    hidden_states = wavlm(**model_inputs).last_hidden_state
    pooled = hidden_states.mean(dim=1)  # collapse dimension 1 into one vector
    return pooled.squeeze(0).cpu().numpy()


In [7]:
@torch.no_grad()  # inference only: no gradients are needed
def hubert_embedding(waveform):
    """Return an utterance-level HuBERT embedding for one waveform.

    The waveform has its leading dimension squeezed away, is run through the
    shared ``processor`` (declared sample rate 16000) and the ``hubert``
    model, and the model's ``last_hidden_state`` is averaged over dimension 1.

    Args:
        waveform: Audio tensor; assumed to be shaped (1, num_samples) as
            produced by ``load_clean_audio`` - TODO confirm for other callers.

    Returns:
        A NumPy array on the CPU holding the averaged hidden state with the
        leading dimension squeezed out.
    """
    mono_signal = waveform.squeeze(0)
    model_inputs = processor(mono_signal, sampling_rate=16000, return_tensors="pt")
    model_inputs = model_inputs.to(device)

    hidden_states = hubert(**model_inputs).last_hidden_state
    pooled = hidden_states.mean(dim=1)  # collapse dimension 1 into one vector
    return pooled.squeeze(0).cpu().numpy()

In [8]:
@torch.no_grad()  # inference only: no gradients are needed
def wav2vec2_embedding(waveform):
    """Return an utterance-level wav2vec 2.0 embedding for one waveform.

    The waveform has its leading dimension squeezed away, is run through the
    shared ``processor`` (declared sample rate 16000) and the ``wav2vec2``
    model, and the model's ``last_hidden_state`` is averaged over dimension 1.

    Args:
        waveform: Audio tensor; assumed to be shaped (1, num_samples) as
            produced by ``load_clean_audio`` - TODO confirm for other callers.

    Returns:
        A NumPy array on the CPU holding the averaged hidden state with the
        leading dimension squeezed out.
    """
    mono_signal = waveform.squeeze(0)
    model_inputs = processor(mono_signal, sampling_rate=16000, return_tensors="pt")
    model_inputs = model_inputs.to(device)

    hidden_states = wav2vec2(**model_inputs).last_hidden_state
    pooled = hidden_states.mean(dim=1)  # collapse dimension 1 into one vector
    return pooled.squeeze(0).cpu().numpy()

Tạo DataFrame ứng với từng embedding

In [53]:
import os
import glob
import pandas as pd

def process_speaker_folder(folder_path, embedding_func, df=None):
    """Embed every ``.wav`` file under a speaker folder into a dataframe.

    The folder is searched recursively. Each file is loaded with
    ``load_clean_audio`` and passed to ``embedding_func``; files that raise
    are reported and skipped so one bad file does not abort the whole folder.

    Args:
        folder_path: Speaker folder. The name of this top-level folder
            (e.g. ``id00002``) is used as the speaker label for every file
            found beneath it.
        embedding_func: Callable taking the loaded waveform and returning the
            embedding to store.
        df: Optional existing dataframe to append the new rows to.

    Returns:
        A dataframe with the columns ``embedding`` and ``id_speaker``: the new
        rows alone when ``df`` is None, otherwise ``df`` followed by the new
        rows with a fresh index. If no ``.wav`` file is found, ``df`` is
        returned unchanged (which is ``None`` when no dataframe was passed).
    """
    # Normalize path
    folder_path = os.path.abspath(folder_path)

    # Extract speaker ID from the TOP folder (id00002)
    id_speaker = os.path.basename(folder_path.rstrip(os.sep))

    # Recursively find all wav files. glob yields results in arbitrary,
    # platform-dependent order, so sort to make the row order reproducible.
    audio_files = sorted(
        glob.glob(os.path.join(folder_path, "**", "*.wav"), recursive=True)
    )

    if not audio_files:
        print(f"No wav files found in {folder_path}")
        return df

    embeddings_list = []

    for audio_file in audio_files:
        try:
            embedding = embedding_func(load_clean_audio(audio_file))
        except Exception as e:
            # Best-effort: skip the file, but include the exception type so
            # errors without a message (e.g. a bare assert) are not logged
            # as an empty reason.
            print(f"Error processing {audio_file}: {type(e).__name__}: {e}")
            continue

        embeddings_list.append(embedding)
        print(f"Processed: {audio_file}")

    print(f"\nTotal files processed for {id_speaker}: {len(embeddings_list)}")

    # Every row of this call carries the same speaker label.
    new_df = pd.DataFrame({
        "embedding": embeddings_list,
        "id_speaker": [id_speaker] * len(embeddings_list)
    })

    # Append to existing dataframe or return new one
    if df is None:
        return new_df
    return pd.concat([df, new_df], ignore_index=True)


In [None]:
# Create a new dataframe, df_train_wavlm, holding the WavLM embeddings.
# Raw string: the Windows path contains "\S" and "\i", which are invalid escape
# sequences in a normal string literal and trigger a warning on recent Pythons.
df_train_wavlm = process_speaker_folder(r'D:\Speak_Verification\id00002', wavlm_embedding)

Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_184.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_185.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_186.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_187.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_188.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_189.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_190.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_191.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_192.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_193.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00

In [None]:
# Check that the number of embeddings produced is correct.
df_train_wavlm.tail()

Unnamed: 0,embedding,id_speaker
82,"[-0.13876869, -0.092820674, -0.08958717, -0.00...",id00002
83,"[-0.08880446, -0.033784118, -0.08266386, 0.056...",id00002
84,"[-0.1370442, -0.05815104, -0.15679246, -0.0881...",id00002
85,"[-0.32138678, 0.23627988, -0.30431208, -0.1218...",id00002
86,"[-0.0666637, -0.032175392, -0.10087337, -0.017...",id00002


In [None]:
# Add the embeddings of another folder (id00005) to the existing df_train_wavlm.
# Usage: pass the existing df_train_wavlm as the third argument of process_speaker_folder.
# NOTE: the result is assigned back to df_train_wavlm, so re-running this cell
# appends the id00005 rows again.
df_train_wavlm = process_speaker_folder(r'D:\Speak_Verification\id00005', wavlm_embedding, df_train_wavlm)



Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_0.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_1.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_10.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_11.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_12.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_13.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_14.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_15.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_16.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_17.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_18.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_1

In [None]:
# Check that the number of embeddings grew: the dataframe should now also
# contain rows labelled id00005.
df_train_wavlm.tail()

Unnamed: 0,embedding,id_speaker
156,"[-0.061753623, -0.1948838, 0.016106505, -0.041...",id00005
157,"[-0.10144407, -0.20887241, -0.030622339, -0.05...",id00005
158,"[-0.073258, -0.23245417, 0.009042546, -0.05305...",id00005
159,"[-0.120365925, -0.11485453, -0.096402265, -0.0...",id00005
160,"[-0.041609768, -0.12069491, -0.03387584, 0.014...",id00005


In [17]:
# Check folder structure: print the directory tree under test_path, showing at
# most the first three files of each directory.
import os
# Raw string: "\S" and "\i" are invalid escape sequences in a normal literal.
test_path = r'D:\Speak_Verification\id00002'
print("Folder structure:")
for root, dirs, files in os.walk(test_path):
    # Depth below test_path = number of separators left after removing the prefix.
    level = root.replace(test_path, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in files[:3]:  # Show first 3 files
        print(f'{subindent}{file}')
    if len(files) > 3:
        print(f'{subindent}... and {len(files) - 3} more files')


Folder structure:
id00002/
  id00002/
    id00002_train_small-00000-of-00119_184.wav
    id00002_train_small-00000-of-00119_185.wav
    id00002_train_small-00000-of-00119_186.wav
    ... and 84 more files


In [61]:
# Create a new dataframe, df_train_hubert, from the id00002 folder using the HuBERT embedding function.
df_train_hubert = process_speaker_folder(r'D:\Speak_Verification\id00002', hubert_embedding)

Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_184.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_185.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_186.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_187.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_188.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_189.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_190.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_191.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_192.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00119_193.wav
Processed: D:\Speak_Verification\id00002\id00002\id00002_train_small-00000-of-00

In [62]:
# Show the last rows of the HuBERT dataframe as a quick check.
df_train_hubert.tail()

Unnamed: 0,embedding,id_speaker
82,"[0.047749765, -4.696302e-05, 0.057328064, -0.0...",id00002
83,"[0.026103087, -0.07959468, 0.08531942, -0.0357...",id00002
84,"[0.011636459, 0.049394533, 0.01956365, -0.0195...",id00002
85,"[0.012662408, 0.017968604, -0.033816285, -0.04...",id00002
86,"[0.019730102, -0.014303559, 0.021670833, -0.01...",id00002


In [63]:
# Append the id00005 embeddings to the existing df_train_hubert, then show the last rows.
# NOTE: the result is assigned back to df_train_hubert, so re-running this cell
# appends the id00005 rows again.
df_train_hubert = process_speaker_folder('D:\\Speak_Verification\\id00005', hubert_embedding, df_train_hubert)
df_train_hubert.tail()

Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_0.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_1.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_10.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_11.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_12.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_13.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_14.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_15.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_16.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_17.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_18.wav
Processed: D:\Speak_Verification\id00005\id00005\id00005_test-00000-of-00038_1

Unnamed: 0,embedding,id_speaker
156,"[0.015212663, 0.010962834, 0.006009621, 0.0026...",id00005
157,"[0.0031089904, 0.019936338, 0.031597055, 0.038...",id00005
158,"[0.029207284, 0.03176958, -0.0011211648, 0.031...",id00005
159,"[0.014634313, 0.047949966, -0.020487344, -0.03...",id00005
160,"[0.003924029, 0.04678671, -0.07017839, 0.02231...",id00005
