IN THIS NOTEBOOK, SAMPLE SEGMENTS ARE SELECTED FROM THE DIARIZATION OUTCOME AND THE CORRESPONDING FEATURES ARE EXTRACTED

STEP 1: select sample segments, randomly select 150 segments from each file

In [None]:
import pandas as pd

# Number of segments drawn from every audio file, and the seed that makes the
# random draw repeatable when the notebook is re-run from a fresh kernel.
N_SEGMENTS_PER_FILE = 150
RANDOM_SEED = 42

segments_df = pd.read_csv("/home/zmi/data/processed_data/diarization_outcome.csv")

selected_segments = []

# Sample every file independently so each one contributes the same number of segments.
grouped = segments_df.groupby('file_name')
for name, group in grouped:
    if len(group) < N_SEGMENTS_PER_FILE:
        # Report the sample size that is actually requested, plus how many
        # segments the offending file really has.
        raise ValueError(
            f"Not enough segments to sample {N_SEGMENTS_PER_FILE} segments from file: {name} "
            f"(only {len(group)} available)"
        )
    selected_segments.append(group.sample(n=N_SEGMENTS_PER_FILE, random_state=RANDOM_SEED))

#concatenate all the sampled segments into one DataFrame
selected_segments_df = pd.concat(selected_segments, ignore_index=True)

#STEP 2: extract MFCCs, x-vectors and embeddings for the selected segments

In [None]:
#prepare the file path
import os

directory_path = "/data/zmi/train_L/wav"

# Turn every entry name found in the directory into a full path by prefixing
# it with the directory itself.
audio_file_paths = list(
    map(lambda entry_name: os.path.join(directory_path, entry_name), os.listdir(directory_path))
)

In [None]:
#extract the features
#difference between the x-vectors and the embeddings extracted here: they come from different models. X-vectors capture global speaker identity across an entire segment,
#while the embeddings are more granular and can capture both global and local speaker characteristics
import os
import librosa
import numpy as np
import torch
import torchaudio
import pandas as pd
from pyannote.audio import Inference
from pyannote.audio.models.embedding.xvector import XVectorMFCC

def extract_segment_mfcc(y, sr, start_time, end_time, n_mfcc=13, n_fft=320):
    """Compute the MFCCs of one time segment of an audio signal.

    Parameters
    ----------
    y : sequence or array
        Audio samples; the segment is taken as ``y[start:end]``
        (assumes a 1-D mono signal - TODO confirm against the caller).
    sr : int or float
        Sampling rate of ``y`` in Hz, used to convert times to sample indices.
    start_time, end_time : float
        Segment boundaries in seconds.
    n_mfcc : int
        Number of coefficients requested from ``librosa.feature.mfcc``.
    n_fft : int
        Largest FFT window to use. For short segments it is reduced to half
        the segment length so the window never exceeds the available samples.

    Returns
    -------
    The value returned by ``librosa.feature.mfcc`` for the segment.

    Raises
    ------
    ValueError
        If the segment holds fewer than two samples, which would leave no
        valid (positive) FFT window size.
    """
    # Clamp at 0 so a negative start time cannot wrap around to the end of the signal.
    start_sample = max(0, int(start_time * sr))
    end_sample = int(end_time * sr)
    y_segment = y[start_sample:end_sample]

    segment_length = len(y_segment)
    if segment_length < 2:
        raise ValueError(
            f"Segment [{start_time}, {end_time}] s contains {segment_length} sample(s); "
            "at least 2 are required to compute MFCCs"
        )

    #ensure n_fft is not larger than the segment length, while honouring the caller's n_fft
    n_fft = min(n_fft, segment_length // 2)

    # NOTE(review): hop_length is left at librosa's default; confirm that it
    # suits an FFT window this short.
    mfccs = librosa.feature.mfcc(y=y_segment, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft)
    return mfccs

# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being stored in the notebook. When the variable is unset, None is
# passed on (presumably falling back to locally cached credentials - TODO confirm).
# NOTE(review): a token that was previously committed in this cell should be
# revoked on huggingface.co.
HF_AUTH_TOKEN = os.environ.get("HF_TOKEN")

embedding_model = Inference("pyannote/embedding", use_auth_token=HF_AUTH_TOKEN)

# NOTE(review): the x-vector model is only constructed here; no pretrained
# weights are loaded in this cell, so its outputs presumably come from randomly
# initialised parameters - confirm that this is intended.
xvector_model = XVectorMFCC(sample_rate=16000)
# The model is used for inference only: switch it to evaluation mode so that any
# train-time-only layer behaviour (e.g. batch statistics) does not affect the outputs.
xvector_model.eval()

# Minimum number of samples a segment must have before it is passed to the
# models; shorter segments are zero-padded up to this length.
MIN_AUDIO_LENGTH = 3000

# Look-up table from bare file name to full path of the audio file.
file_path_mapping = {os.path.basename(path): path for path in audio_file_paths}

# Sample rate the x-vector model was configured with; keep in sync with the
# `sample_rate` argument given to XVectorMFCC.
XVECTOR_SAMPLE_RATE = 16000


def _to_numpy(value):
    """Return `value` as a NumPy array if it is a torch tensor, otherwise unchanged."""
    if isinstance(value, torch.Tensor):
        return value.cpu().numpy()
    return value


# One record (dict) per processed segment; turned into a DataFrame at the end.
segment_info = []

# Group once instead of re-filtering the whole frame for every file;
# sort=False keeps the files in order of first appearance.
for file_name, segments in selected_segments_df.groupby('file_name', sort=False):
    file_path = file_path_mapping.get(file_name)
    if file_path is None:
        print(f"File path for {file_name} not found. Skipping.")
        continue

    #load the audio file using for mfcc extraction (native sample rate)
    y, sr = librosa.load(file_path, sr=None)

    #load the audio file for embedding extraction
    waveform, sr_torch = torchaudio.load(file_path)

    if sr_torch != XVECTOR_SAMPLE_RATE:
        # The audio is not resampled here, so only warn about the mismatch.
        print(
            f"Warning: {file_name} has sample rate {sr_torch} Hz but the x-vector model "
            f"was configured for {XVECTOR_SAMPLE_RATE} Hz."
        )

    # One progress line per file instead of one debug line per segment.
    print(f"Processing {len(segments)} segments from {file_name}")

    for _, segment in segments.iterrows():
        start_time = segment['start_time']
        end_time = segment['end_time']
        speaker = segment['speaker']

        #extract the corresponding audio segment for embedding and x-vector extraction
        # (start clamped at 0 so a negative time cannot wrap around to the end of the file)
        start_sample = max(0, int(start_time * sr_torch))
        end_sample = int(end_time * sr_torch)
        segment_audio = waveform[:, start_sample:end_sample]

        #ensure the segment has the minimum required number of samples for the x-vector model
        if segment_audio.shape[1] < MIN_AUDIO_LENGTH:
            padding_needed = MIN_AUDIO_LENGTH - segment_audio.shape[1]
            # zero-pad on the right (end of the segment) only
            segment_audio = torch.nn.functional.pad(segment_audio, (0, padding_needed))

        #keep the first channel only and reshape to (batch_size=1, num_channels=1, num_samples)
        segment_audio = segment_audio[0, :].unsqueeze(0).unsqueeze(0)

        with torch.no_grad():
            xvector = xvector_model(segment_audio)

        # the embedding model receives the same audio without the batch dimension: (1, num_samples)
        embedding = embedding_model({"waveform": segment_audio.squeeze(0), "sample_rate": sr_torch})

        mfcc = extract_segment_mfcc(y, sr, start_time, end_time)

        segment_info.append({
            'file': file_name,
            'Speaker': speaker,
            'mfcc': mfcc,
            'embedding': _to_numpy(embedding.data),
            'xvector': _to_numpy(xvector)
        })

segment_df = pd.DataFrame(segment_info)

STEP 3: store the extracted features for further analysis

In [27]:
# The CSV is kept for backwards compatibility, but the array-valued columns
# (mfcc, embedding, xvector) are written as their string representation, and
# NumPy abbreviates large arrays with "..." when printing, so the numeric
# values cannot be reliably recovered from this file.
segment_df.to_csv('features_selected_segments.csv', index=False)

# Lossless copy of the same table that preserves the arrays exactly;
# reload it with pd.read_pickle('features_selected_segments.pkl').
segment_df.to_pickle('features_selected_segments.pkl')