In [None]:
# Notebook setup cell: installs the Python and system packages used for audio loading.
# NOTE(review): torchcodec is presumably needed as the decoding backend behind
# torchaudio.load in recent torchaudio releases — confirm against the installed version.
!pip install torchcodec

# Install the necessary system library for torchaudio backends (SoX)
# The package index is refreshed first; `-y` below answers the install prompt automatically.
!sudo apt-get update
!sudo apt-get install -y sox libsox-dev libsox-fmt-all

# Verify installation (optional): prints the installed SoX version
!sox --version

In [None]:
# 14.12.2025
# Feature extractor for VoiceMOS 2022 data
#
# Author : Mustafa Ozan Duman

import numpy as np
import torch
import torchaudio
import torch.nn as nn
import torchaudio.transforms as T
import pandas as pd
import os
from google.colab import drive
from tqdm.notebook import tqdm

# --- Step 0: Google Drive Setup and Directory Definitions ---

# 1. Mount Google Drive
drive.mount('/content/drive')

# Base path of the dataset inside the mounted Drive.
# NOTE: this is the author's original location; change it to match your own Drive layout.
# (The assignment previously had its value commented out, which is a SyntaxError.)
DRIVE_BASE_PATH = '/content/drive/MyDrive/BUU_PHD_THESIS/datasets_with_MOS/main/main/'

WAV_DIR = os.path.join(DRIVE_BASE_PATH, 'DATA/wav')
MOS_LISTS_DIR = os.path.join(DRIVE_BASE_PATH, 'DATA/sets')

# Base path for the extracted features. This default lives under the Drive mount
# point, so outputs are written to Google Drive; change it to match your own layout.
OUTPUT_BASE_PATH = '/content/drive/MyDrive/BUU_PHD_THESIS/VoiceMOS_2022_features'

TRAIN_OUTPUT_DIR = os.path.join(OUTPUT_BASE_PATH, 'VoiceMOS_2022_train_data_features')
TEST_OUTPUT_DIR = os.path.join(OUTPUT_BASE_PATH, 'VoiceMOS_2022_test_data_features')
VAL_OUTPUT_DIR = os.path.join(OUTPUT_BASE_PATH, 'VoiceMOS_2022_validation_data_features')

# Create output directories if they don't exist
for _output_dir in (TRAIN_OUTPUT_DIR, TEST_OUTPUT_DIR, VAL_OUTPUT_DIR):
    os.makedirs(_output_dir, exist_ok=True)
# Report the real location instead of a hard-coded "/content/" message.
print(f"Output directories created in {OUTPUT_BASE_PATH}.")

# Define the MOS list files (one per split, located in MOS_LISTS_DIR)
MOS_LIST_FILES = ['train_mos_list.txt', 'test_mos_list.txt', 'val_mos_list.txt']


# --- Feature Extraction Functions (Adapted for Global Max Time Steps) ---

# Step 1: Extract MFCC features and zero-padding
def extract_features(audio_path, max_time_steps):
    """
    Extract MFCC features (2D matrix) and fit them to the GLOBAL max_time_steps.

    The file is loaded with torchaudio, down-mixed to one channel when it has
    several, converted to 40 MFCCs (n_fft=400, hop_length=160, n_mels=40) and
    then zero-padded or truncated along the time axis.

    Args:
        audio_path: Path of the audio file to load.
        max_time_steps: Number of time frames the returned matrix must have.

    Returns:
        A float32 numpy array of shape (max_time_steps, 40), or None when the
        file cannot be loaded or its MFCCs cannot be computed.
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_path, normalize=True)
    except Exception as e:
        print(f"Error loading {audio_path}: {e}")
        return None  # Return None on load failure

    # Down-mix multi-channel audio to mono. Without this the MFCC tensor stays
    # 3-D and the 2-D handling below fails. Mono input is left untouched.
    if waveform.dim() == 2 and waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The transform can fail on files that load fine (e.g. too few samples for
    # the STFT padding). Treat that like a load failure so one bad file does
    # not abort the whole run.
    try:
        mfcc_transform = T.MFCC(
            sample_rate=sample_rate,
            n_mfcc=40,
            melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 40}
        )
        mfcc = mfcc_transform(waveform)
    except Exception as e:
        print(f"Error computing MFCC for {audio_path}: {e}")
        return None

    # (1, n_mfcc, time_steps) -> (time_steps, n_mfcc)
    mfcc = mfcc.squeeze(0).transpose(0, 1)

    # Convert to numpy for padding/truncation
    mfcc_np = mfcc.numpy()
    time_steps = mfcc_np.shape[0]

    if time_steps < max_time_steps:
        # Pad with zeros along the time axis (first dimension)
        padding = max_time_steps - time_steps
        mfcc_np = np.pad(mfcc_np, ((0, padding), (0, 0)), mode='constant')
    elif time_steps > max_time_steps:
        # Truncate files longer than the requested number of frames
        mfcc_np = mfcc_np[:max_time_steps, :]

    # np.array copies, so the result never aliases the torch tensor's memory
    return np.array(mfcc_np, dtype=np.float32)


# Step 2: Apply FFT to MFCC matrix (Unchanged, operates on padded matrix)
def apply_2d_fft(mfcc_matrix):
    """
    Apply a 2D FFT to the MFCC matrix and return a standardized quarter of it.

    The spectrum is shifted with fftshift, its magnitude is taken, and the
    top-left quarter is kept. An odd dimension keeps its middle row/column.
    The quarter is then standardized to zero mean and unit standard deviation;
    if the standard deviation is zero it is only mean-centered.

    Args:
        mfcc_matrix: 2D array-like of shape (rows, cols).

    Returns:
        2D numpy array of shape (ceil(rows / 2), ceil(cols / 2)).
    """
    fft_result_abs = np.abs(np.fft.fftshift(np.fft.fft2(mfcc_matrix)))
    rows, cols = fft_result_abs.shape

    # Ceil-division keeps the middle row/column for odd sizes. Slicing once
    # also handles the case where BOTH dimensions are odd, which previously
    # raised ValueError (hstack of arrays with different row counts).
    row_end = (rows + 1) // 2
    col_end = (cols + 1) // 2
    fft_result_quarter = fft_result_abs[:row_end, :col_end]

    # Standardize the result; skip the division for a constant matrix
    centered = fft_result_quarter - np.mean(fft_result_quarter)
    std_dev = np.std(fft_result_quarter)
    if std_dev != 0:
        return centered / std_dev
    return centered




# Step 3: Custom Data Reader function
def load_mos_list(mos_list_path):
    """
    Read a header-less, comma-separated MOS list file.

    Each line is parsed into two columns, named 'utt_id' and 'mos_score'.
    Returns the parsed content as a pandas DataFrame.
    """
    column_names = ['utt_id', 'mos_score']
    return pd.read_csv(mos_list_path, sep=',', names=column_names, header=None)

# Step 4: Save Features Function
def save_features(mfcc_list, mos_scores, utt_ids, output_dir):
    """
    Write features, MOS scores and utterance IDs of one split to .npy files.

    Three files are created inside output_dir: mfcc_features.npy,
    mos_scores.npy and utt_ids.npy. The saved array shapes are printed afterwards.
    """
    # Convert the collected Python lists into arrays
    features = np.array(mfcc_list)
    scores = np.array(mos_scores)
    ids = np.array(utt_ids)

    features_path = os.path.join(output_dir, "mfcc_features.npy")
    targets = (
        (features_path, features),
        (os.path.join(output_dir, "mos_scores.npy"), scores),
        (os.path.join(output_dir, "utt_ids.npy"), ids),
    )

    # Write each array to its destination file
    for destination, payload in targets:
        np.save(destination, payload)

    print(f"\nSaved MFCCs to {features_path}")
    print(f"Shape of saved MFCC array: {features.shape}")
    print(f"Shape of saved MOS array: {scores.shape}")
    print("---")


# --- Main Execution ---

# Global counter for failed attempts to limit excessive output
global_failures = 0
MAX_PRINT_FAILURES = 5  # Only print errors for the first 5 failed files


def find_global_max_time_step():
    """
    Scan every file listed in all MOS list files and return the largest
    number of MFCC time frames found.

    Files that fail to load or transform are counted in the module-level
    ``global_failures`` (reset at the start of each call). Details are printed
    only for the first ``MAX_PRINT_FAILURES`` failures.

    Returns:
        The maximum number of MFCC frames over all readable files, or 0 when
        no file could be processed.
    """
    global global_failures
    global_failures = 0

    print("\n--- Phase 1: Scanning all files to find GLOBAL Max Time Steps ---")

    all_files_to_scan = []

    # 1. Collect all utterance IDs from all MOS list files
    for file_name in MOS_LIST_FILES:
        mos_list_path = os.path.join(MOS_LISTS_DIR, file_name)
        df = load_mos_list(mos_list_path)
        all_files_to_scan.extend(df['utt_id'].tolist())

    print(f"Total files to scan: {len(all_files_to_scan)}")

    global_max_time_steps = 0

    # 2. Iterate through all files and determine the maximum length
    for utt_id in tqdm(all_files_to_scan, desc="Finding Global Max"):
        audio_path = os.path.join(WAV_DIR, utt_id)

        try:
            waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

            # Same MFCC parameters as the actual feature extraction
            mfcc_transform = T.MFCC(
                sample_rate=sample_rate,
                n_mfcc=40,
                melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 40}
            )
            # The frame axis is the LAST axis of the MFCC output whatever the
            # channel count. Reading shape[0] after squeeze/transpose returned
            # n_mfcc (40) instead of the frame count for multi-channel files.
            time_steps = mfcc_transform(waveform).shape[-1]
        except Exception as e:
            # Best-effort scan: report the first few failures, count all of them
            if global_failures < MAX_PRINT_FAILURES:
                print(f"\n[DEBUG FAILURE #{global_failures + 1}]")
                print(f"File: {audio_path}")
                print(f"Error Type: {type(e).__name__}")
                print(f"Error Message: {e}")

            global_failures += 1
            continue  # Continue to the next file

        global_max_time_steps = max(global_max_time_steps, time_steps)

    if global_failures > 0:
        if global_max_time_steps == 0:
            print("\n--- SCAN FAILED ---")
            print(f"Global max time steps is 0. Failed to load {global_failures} files.")
            print("This confirms a loading issue. Check 'torchaudio' dependencies (like SoX).")
        else:
            # Partial failures used to go unreported once the detail limit was hit
            print(f"\nWarning: {global_failures} of {len(all_files_to_scan)} files "
                  "could not be scanned and were ignored.")

    return global_max_time_steps


def process_all_splits(global_max_time_steps):
    """
    Extract, transform and save features for the train, test and val splits.

    Every utterance listed in a split's MOS file is converted to an MFCC matrix
    with global_max_time_steps frames and passed through apply_2d_fft. The
    results, MOS scores and utterance IDs are saved to the split's output
    directory. Utterances whose extraction returns None are skipped.
    """
    print(f"\n--- Phase 2: Processing All Data using GLOBAL Max Time Steps ({global_max_time_steps}) ---")

    # (split label, MOS list file, destination directory)
    split_specs = (
        ('train', 'train_mos_list.txt', TRAIN_OUTPUT_DIR),
        ('test', 'test_mos_list.txt', TEST_OUTPUT_DIR),
        ('val', 'val_mos_list.txt', VAL_OUTPUT_DIR),
    )

    for split_name, file_name, output_dir in split_specs:
        print(f"\n**Starting processing for {split_name.upper()} split**")
        df = load_mos_list(os.path.join(MOS_LISTS_DIR, file_name))

        mfcc_list, mos_scores, utt_ids = [], [], []

        progress = tqdm(df.iterrows(), total=len(df), desc=f"Extracting {split_name}")
        for _, row in progress:
            utt_id = row['utt_id']
            mos = row['mos_score']

            # Fixed-size MFCC matrix, or None if the file could not be processed
            padded_mfcc = extract_features(os.path.join(WAV_DIR, utt_id), global_max_time_steps)
            if padded_mfcc is None:
                print(f"Failed to process {utt_id}, skipping.")
                continue

            # Keep the FFT-transformed features together with their label and ID
            mfcc_list.append(apply_2d_fft(padded_mfcc))
            mos_scores.append(mos)
            utt_ids.append(utt_id)

        print(f"Successfully processed {len(mfcc_list)} files for {split_name}.")

        # Persist this split before moving on to the next one
        save_features(mfcc_list, mos_scores, utt_ids, output_dir)


if __name__ == '__main__':
    # Phase 1: scan every file to determine the global maximum frame count
    global_max = find_global_max_time_step()
    # global_max = 4575 # found in my trial

    # Phase 2: only run the extraction when the scan produced a usable value
    if global_max <= 0:
        print("Error: Could not determine a valid global max time step. Check data paths.")
    else:
        process_all_splits(global_max)