In [1]:
import os
import torch
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import soundfile as sf
import librosa
import numpy as np

# ==============================================================================
# 1. GPU ENABLED EXTRACTOR
# ==============================================================================
class HandcraftedFeatureExtractor:
    def __init__(self, device, sample_rate=16000, n_mfcc=40, n_mels=80, n_fft=400, hop_length=160):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.device = device 

        # --- Move transforms to GPU immediately upon initialization ---
        self.mfcc_transform = T.MFCC(
            sample_rate=sample_rate, n_mfcc=n_mfcc,
            melkwargs={"n_fft": n_fft, "n_mels": n_mels, "hop_length": hop_length, "center": False}
        ).to(self.device)

        self.mel_transform = T.MelSpectrogram(
            sample_rate=sample_rate, n_fft=n_fft, n_mels=n_mels, 
            hop_length=hop_length, center=False
        ).to(self.device)

    def _get_pitch_cpu(self, waveform_cpu):
        # This function runs on CPU because Librosa does not support GPU directly
        wav_numpy = waveform_cpu.squeeze().numpy()
        
        try:
            # Using pyin to ensure quality
            f0, _, _ = librosa.pyin(
                wav_numpy, 
                fmin=60, fmax=500, sr=self.sample_rate, 
                hop_length=self.hop_length, frame_length=self.n_fft, center=False
            )
        except:
            f0 = np.zeros(1)

        f0 = np.nan_to_num(f0)
        # Return Tensor (keep on CPU for now)
        return torch.from_numpy(f0).view(1, 1, -1).float()

    def _align_length(self, feat1, feat2):
        min_len = min(feat1.shape[-1], feat2.shape[-1])
        return feat1[..., :min_len], feat2[..., :min_len]

    def _apply_cmvn(self, feature):
        if feature is not None:
            mean = feature.mean(dim=-1, keepdim=True)
            std = feature.std(dim=-1, keepdim=True)
            feature = (feature - mean) / (std + 1e-6)
        return feature

    def extract_all(self, waveform_cpu):
        """
        Extracts ALL 5 modes at once to save computation time.
        Returns a dictionary: { 'ModeName': Tensor, ... }
        """
        output_dict = {}

        # 1. GPU Processing (MFCC / MFBE)
        waveform_gpu = waveform_cpu.to(self.device)
        if waveform_gpu.dim() == 1: waveform_gpu = waveform_gpu.unsqueeze(0)

        # Compute base features once
        mfcc_base = self.mfcc_transform(waveform_gpu)
        mel_base = self.mel_transform(waveform_gpu)
        mfbe_base = torch.log(mel_base + 1e-6)

        # 2. CPU Processing (Pitch)
        # Use the CPU copy of waveform for Librosa
        pitch_cpu = self._get_pitch_cpu(waveform_cpu)
        pitch_base = pitch_cpu.to(self.device) # Move to GPU for merging

        # Ensure pitch dim matches for concatenation
        if mfcc_base.dim() == 2: mfcc_base = mfcc_base.unsqueeze(0)
        if mfbe_base.dim() == 2: mfbe_base = mfbe_base.unsqueeze(0)

        # --- MODE 1: Only MFCC ---
        output_dict["Only MFCC"] = self._apply_cmvn(mfcc_base)

        # --- MODE 2: Only MFBE ---
        output_dict["Only MFBE"] = self._apply_cmvn(mfbe_base)

        # --- MODE 3: Only Pitch ---
        # Note: We squeeze dim 0 for pitch if it's standalone, depending on desired shape.
        # Keeping consistent (1, Time) or (1, 1, Time) -> Let's keep (1, Time) for standalone 1D feature
        output_dict["Only Pitch"] = self._apply_cmvn(pitch_base.squeeze(0))

        # --- MODE 4: MFCC + Pitch ---
        # Align lengths
        mfcc_aligned, pitch_aligned_1 = self._align_length(mfcc_base, pitch_base)
        combined_mfcc_pitch = torch.cat([mfcc_aligned, pitch_aligned_1], dim=1)
        output_dict["MFCC + Pitch"] = self._apply_cmvn(combined_mfcc_pitch)

        # --- MODE 5: MFBE + Pitch ---
        # Align lengths
        mfbe_aligned, pitch_aligned_2 = self._align_length(mfbe_base, pitch_base)
        combined_mfbe_pitch = torch.cat([mfbe_aligned, pitch_aligned_2], dim=1)
        output_dict["MFBE + Pitch"] = self._apply_cmvn(combined_mfbe_pitch)

        return output_dict

# ==============================================================================
# 2. DATASET
# ==============================================================================
class AudioFolderDataset(Dataset):
    def __init__(self, root_dir, extractor, sample_rate=16000):
        self.root_dir = root_dir
        self.extractor = extractor
        self.sample_rate = sample_rate
        self.file_list = []
        
        # Walk through directories
        for root, _, files in os.walk(root_dir):
            for file in files:
                if file.lower().endswith(('.wav', '.flac', '.mp3')):
                    self.file_list.append(os.path.join(root, file))

    def __len__(self): return len(self.file_list)

    def __getitem__(self, idx):
        path = self.file_list[idx]
        try:
            # Read audio
            wav_numpy, sr = sf.read(path)
            waveform = torch.from_numpy(wav_numpy).float()
            
            # Basic preprocessing
            if waveform.dim() == 1: waveform = waveform.unsqueeze(0)
            else: waveform = waveform.t()
            if waveform.shape[0] > 1: waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sr != self.sample_rate: waveform = T.Resample(sr, self.sample_rate)(waveform)

            # Extract ALL features
            # Returns a dict of tensors on GPU
            features_dict_gpu = self.extractor.extract_all(waveform)
            
            # Move all tensors to CPU to prevent GPU OOM during DataLoader batching
            features_dict_cpu = {k: v.cpu().squeeze(0) for k, v in features_dict_gpu.items()}
            
            return features_dict_cpu, path 
        except Exception as e:
            print(f"Error processing {path}: {e}")
            return None, None

def collate_fn(batch):
    """
    Custom collate function to handle dictionary of features.
    """
    # Filter failed samples
    batch = [b for b in batch if b[0] is not None]
    if not batch: return None

    # batch is a list of tuples: (feature_dict, path)
    list_of_dicts, paths = zip(*batch)
    
    # Initialize output dictionary
    batched_output = {}
    
    # Get all mode keys from the first sample
    keys = list_of_dicts[0].keys()

    for key in keys:
        # Collect all tensors for this specific mode
        tensors = [d[key] for d in list_of_dicts]
        
        # Pad them
        max_len = max([t.shape[-1] for t in tensors])
        padded_tensors = [torch.nn.functional.pad(t, (0, max_len - t.shape[-1])) for t in tensors]
        
        # Stack them into a batch
        batched_output[key] = torch.stack(padded_tensors)
        
    return batched_output, paths

# ==============================================================================
# 3. EXECUTION
# ==============================================================================
if __name__ == "__main__":
    # --- CONFIGURATION ---
    INPUT_PATH = r"E:\speech_data\train_vi_7s"
    OUTPUT_BASE_PATH = r"E:\speech_data\7s_extracted_features"
    
    # List of modes to verify logic (The script extracts ALL of these automatically)
    MODES_TO_SAVE = ["Only MFCC", "Only MFBE", "Only Pitch", "MFCC + Pitch", "MFBE + Pitch"]
    
    # --- SETUP GPU ---
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f" Running on GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print(" GPU not found, running on CPU.")

    # Initialize Extractor
    extractor = HandcraftedFeatureExtractor(device=device)
    
    # Initialize Dataset (No mode needed here anymore)
    dataset = AudioFolderDataset(INPUT_PATH, extractor)
    
    # Note: On Windows, num_workers should be 0 to avoid multiprocessing issues with CUDA
    loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, collate_fn=collate_fn)

    print(f"Starting extraction for ALL {len(MODES_TO_SAVE)} modes...")
    print(f"Input: {INPUT_PATH}")
    print(f"Output: {OUTPUT_BASE_PATH}")

    # Create sub-folders for each mode
    for mode in MODES_TO_SAVE:
        mode_folder = os.path.join(OUTPUT_BASE_PATH, mode)
        os.makedirs(mode_folder, exist_ok=True)
        print(f"-> Created/Checked folder: {mode_folder}")

    # Process loop
    for batch in tqdm(loader):
        if batch is None: continue
        
        # batched_features is a dict: {'Only MFCC': TensorBatch, ...}
        batched_features, paths = batch
        
        # Iterate over each mode and save
        for mode, features_tensor in batched_features.items():
            
            # Define output folder for this mode
            mode_output_dir = os.path.join(OUTPUT_BASE_PATH, mode)
            
            for i in range(len(paths)):
                # Calculate relative path to maintain folder structure (SpeakerID/File)
                rel_path = os.path.relpath(paths[i], INPUT_PATH)
                
                # Construct save path: OutputBase/ModeName/SpeakerID/Filename.pt
                save_path = os.path.join(mode_output_dir, os.path.splitext(rel_path)[0] + ".pt")
                
                # Create parent directory (SpeakerID folder) inside the Mode folder
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
                
                # Save the tensor
                # Clone is used to detach from the batch tensor to save memory
                torch.save(features_tensor[i].clone(), save_path)

    print(f"\nCompleted! Please check the folders in: {OUTPUT_BASE_PATH}")

 Running on GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Starting extraction for ALL 5 modes...
Input: D:\Study\7-SP26\DATxSLP\Data_after_cut\test_output
Output: D:\Study\7-SP26\DATxSLP\Data_after_extract_feature
-> Created/Checked folder: D:\Study\7-SP26\DATxSLP\Data_after_extract_feature\Only MFCC
-> Created/Checked folder: D:\Study\7-SP26\DATxSLP\Data_after_extract_feature\Only MFBE
-> Created/Checked folder: D:\Study\7-SP26\DATxSLP\Data_after_extract_feature\Only Pitch
-> Created/Checked folder: D:\Study\7-SP26\DATxSLP\Data_after_extract_feature\MFCC + Pitch
-> Created/Checked folder: D:\Study\7-SP26\DATxSLP\Data_after_extract_feature\MFBE + Pitch


100%|██████████| 20/20 [07:40<00:00, 23.01s/it]


Completed! Please check the folders in: D:\Study\7-SP26\DATxSLP\Data_after_extract_feature



