In [None]:
# ## Step 2: Feature Extraction (with Jitter & Shimmer)
#
# This step takes the preprocessed audio files from Step 1, extracts various acoustic features using `librosa` and `parselmouth-praat`, and saves them into compressed `.npz` files. It includes caching and multiprocessing.
#
# **Features Extracted:**
# - **Librosa (Frame-wise):**
#   - Mel-Frequency Cepstral Coefficients (MFCCs)
#   - Root Mean Square Energy (RMS)
#   - Spectral Centroid
#   - Spectral Bandwidth
#   - Spectral Contrast
#   - Spectral Flatness
#   - Spectral Rolloff
#   - Pitch (Fundamental Frequency - F0 using PYIN)
#   - Zero-Crossing Rate (ZCR)
# - **Parselmouth/Praat (Scalar per file):**
#   - Pitch (Praat's algorithm - Median F0)
#   - Jitter (local, local_absolute, rap, ppq5)
#   - Shimmer (local, local_db, apq3, apq5, apq11)

In [None]:
# !pip install praat-parselmouth

In [1]:
import os
import time
import numpy as np
import librosa
import parselmouth
from parselmouth.praat import call
from multiprocessing import Pool, cpu_count, Manager, Value
import traceback  # For detailed error logging
import warnings  # To suppress specific warnings if needed
import gc  # For garbage collection
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import sys
import logging
import json

In [2]:
# Suppress specific warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="librosa")

In [10]:
# --- Configuration ---
# Consider moving these to a config file for easier management
# PREPROCESSED_FOLDER = "./data/"
PREPROCESSED_FOLDER = "..\\..\\..\\data\\preprocessed_sample\\"
FEATURE_FOLDER = "..\\..\\..\\data\\features\\"  # Output for Step 2

SR = 22050  # Target Sample Rate for librosa features
N_FFT = 2048  # FFT window size
HOP_LENGTH = 256  # Hop length for STFT
N_MELS = 96  # Number of Mel bands
N_MFCC = 13  # Number of MFCCs
FMIN_PITCH_LIBROSA = librosa.note_to_hz("A1")  # Min frequency for librosa pitch
FMAX_PITCH_LIBROSA = librosa.note_to_hz("A7")  # Max frequency for librosa pitch

# Praat Pitch Parameters
PRAAT_PITCH_FLOOR = 50.0  # Pitch floor (Hz) for Praat analysis
PRAAT_PITCH_CEILING = 800.0  # Pitch ceiling (Hz) for Praat analysis

# Processing parameters
MAX_WORKERS = max(1, cpu_count() - 2)  # Leave one core free
PRINT_INTERVAL = 5  # Print progress update every X seconds

In [11]:
# Setup logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # Adjust as needed
handler = logging.StreamHandler()
formatter = logging.Formatter("[%(levelname)s] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

In [1]:
class TimeTracker:
    """Class to track processing time and provide estimates"""

    def __init__(self, total_files):
        self.start_time = time.time()
        self.total_files = total_files
        self.completed = 0
        self.skipped = 0
        self.failed = 0
        self.file_times = []  # Store processing times for better estimates
        self.last_print_time = time.time()

    def update(self, status, processing_time=None):
        """Update tracker with a new file status"""
        if status == "completed":
            self.completed += 1
            if processing_time:
                self.file_times.append(processing_time)
        elif status == "skipped":
            self.skipped += 1
        elif status == "failed":
            self.failed += 1

    def get_progress_str(self):
        """Get a formatted progress string with ETA"""
        elapsed = time.time() - self.start_time
        total_processed = self.completed + self.skipped + self.failed
        progress = total_processed / self.total_files if self.total_files > 0 else 0

        # Calculate ETA based on average of recent processing times
        if len(self.file_times) > 0 and self.completed > 0:
            # Use up to the last 10 files for a more accurate recent average
            recent_times = self.file_times[-min(10, len(self.file_times)) :]
            avg_time_per_file = sum(recent_times) / len(recent_times)
            files_remaining = self.total_files - total_processed
            eta_seconds = avg_time_per_file * files_remaining
        else:
            # Fall back to simple estimation if we don't have timing data
            if progress > 0 and elapsed > 0:
                eta_seconds = (elapsed / progress) - elapsed
            else:
                eta_seconds = 0

        # Format ETA string
        if eta_seconds > 0:
            eta_td = timedelta(seconds=int(eta_seconds))
            if eta_td.days > 0:
                eta_str = (
                    f"{eta_td.days}d {eta_td.seconds//3600}h {(eta_td.seconds//60)%60}m"
                )
            elif eta_td.seconds > 3600:
                eta_str = f"{eta_td.seconds//3600}h {(eta_td.seconds//60)%60}m {eta_td.seconds%60}s"
            elif eta_td.seconds > 60:
                eta_str = f"{eta_td.seconds//60}m {eta_td.seconds%60}s"
            else:
                eta_str = f"{eta_td.seconds}s"
        else:
            eta_str = "calculating..."

        # Format elapsed time string
        elapsed_td = timedelta(seconds=int(elapsed))
        if elapsed_td.days > 0:
            elapsed_str = f"{elapsed_td.days}d {elapsed_td.seconds//3600}h {(elapsed_td.seconds//60)%60}m"
        elif elapsed_td.seconds > 3600:
            elapsed_str = f"{elapsed_td.seconds//3600}h {(elapsed_td.seconds//60)%60}m {elapsed_td.seconds%60}s"
        elif elapsed_td.seconds > 60:
            elapsed_str = f"{elapsed_td.seconds//60}m {elapsed_td.seconds%60}s"
        else:
            elapsed_str = f"{elapsed_td.seconds}s"

        # Build the progress string
        progress_str = (
            f"[{total_processed}/{self.total_files}] {progress:.1%} "
            f"(✅{self.completed} ⏩{self.skipped} ❌{self.failed}) "
            f"Elapsed: {elapsed_str} | ETA: {eta_str}"
        )

        # If we have enough data, add throughput
        if elapsed > 60 and self.completed > 0:  # Minimum 1 minute elapsed
            files_per_minute = (self.completed / elapsed) * 60
            progress_str += f" | Rate: {files_per_minute:.1f} files/min"

        return progress_str

    def should_print_update(self):
        """Check if we should print a progress update"""
        current_time = time.time()
        if current_time - self.last_print_time >= PRINT_INTERVAL:
            self.last_print_time = current_time
            return True
        return False

    def get_final_stats(self):
        """Get final statistics string"""
        elapsed = time.time() - self.start_time
        elapsed_td = timedelta(seconds=int(elapsed))

        # Format elapsed time
        if elapsed_td.days > 0:
            elapsed_str = f"{elapsed_td.days}d {elapsed_td.seconds//3600}h {(elapsed_td.seconds//60)%60}m"
        elif elapsed_td.seconds > 3600:
            elapsed_str = f"{elapsed_td.seconds//3600}h {(elapsed_td.seconds//60)%60}m {elapsed_td.seconds%60}s"
        elif elapsed_td.seconds > 60:
            elapsed_str = f"{elapsed_td.seconds//60}m {elapsed_td.seconds%60}s"
        else:
            elapsed_str = f"{elapsed_td.seconds}s"

        # Calculate average time per file
        if self.completed > 0:
            avg_time = sum(self.file_times) / len(self.file_times)
            avg_time_str = f"{avg_time:.2f}s"
        else:
            avg_time_str = "N/A"

        # Build the stats string
        stats = [
            f"Total time: {elapsed_str}",
            f"Completed:  {self.completed} files",
            f"Skipped:    {self.skipped} files",
            f"Failed:     {self.failed} files",
            f"Total:      {self.total_files} files",
            f"Avg. time:  {avg_time_str} per file",
        ]

        if elapsed > 60 and self.completed > 0:
            files_per_minute = (self.completed / elapsed) * 60
            stats.append(f"Throughput:  {files_per_minute:.1f} files/min")

        return "\n".join(stats)

In [27]:
def extract_praat_features(
    sound, pitch_floor=PRAAT_PITCH_FLOOR, pitch_ceiling=PRAAT_PITCH_CEILING
):
    """
    Extracts scalar Jitter, Shimmer, and median F0 using Parselmouth/Praat.
    """
    praat_features = {
        "f0_median_praat": np.nan,
        "jitter_local": np.nan,
        "jitter_local_abs": np.nan,
        "jitter_rap": np.nan,
        "jitter_ppq5": np.nan,
        "shimmer_local": np.nan,
        "shimmer_local_db": np.nan,
        "shimmer_apq3": np.nan,
        "shimmer_apq5": np.nan,
        "shimmer_apq11": np.nan,
    }

    filename = getattr(sound, "name", "Unknown")

    # Increase minimum duration to ensure robust analysis for diverse voices
    if sound.get_total_duration() < 0.2:
        logger.warning(f"⚠️ Sound too short for Praat analysis: {filename}")
        return praat_features

    try:
        # Use autocorrelation method with adjusted time step for better temporal resolution
        pitch = call(
            sound,
            "To Pitch (ac)",
            0.01,
            pitch_floor,
            15,
            "yes",
            0.03,
            0.45,
            0.01,
            0.35,
            0.14,
            pitch_ceiling,
        )

        num_frames = call(pitch, "Get number of frames")
        if num_frames <= 0:
            logger.warning(f"⚠️ No valid pitch frames found in {filename}")
            return praat_features

        praat_features["f0_median_praat"] = call(
            pitch, "Get quantile", 0.0, 0.0, 0.5, "Hertz"
        )

        # Use cross-correlation for point process to improve accuracy
        point_process = call(
            sound, "To PointProcess (periodic, cc)", pitch_floor, pitch_ceiling
        )

        num_points = call(point_process, "Get number of points")
        if num_points < 8:  # Stricter threshold for diverse voices
            logger.warning(
                f"⚠️ Too few points ({num_points}) for jitter/shimmer analysis in {filename}"
            )
            return praat_features

        # Adjusted voice report parameters for broader pitch range
        report = call(
            [sound, point_process, pitch],
            "Voice report",
            0.0,
            0.0,
            pitch_floor,
            pitch_ceiling,
            1.5,  # Increased max period factor for low-pitched voices
            1.8,  # Increased max amplitude factor for high-pitched voices
            0.03,  # Silence threshold
            0.45,  # Voicing threshold
        )

        values = {}
        for line in report.strip().split("\n"):
            parts = line.split(":")
            if len(parts) == 2:
                key, val = parts[0].strip(), parts[1].strip().split(" ")[0]
                try:
                    values[key] = float(val) if val != "--undefined--" else np.nan
                except ValueError:
                    values[key] = np.nan

        mappings = {
            "Jitter (local)": "jitter_local",
            "Jitter (local, absolute)": "jitter_local_abs",
            "Jitter (rap)": "jitter_rap",
            "Jitter (ppq5)": "jitter_ppq5",
            "Shimmer (local)": "shimmer_local",
            "Shimmer (local, dB)": "shimmer_local_db",
            "Shimmer (apq3)": "shimmer_apq3",
            "Shimmer (apq5)": "shimmer_apq5",
            "Shimmer (apq11)": "shimmer_apq11",
        }

        for praat_key, feature_key in mappings.items():
            if praat_key in values:
                praat_features[feature_key] = values[praat_key]

        # Convert percent-based values
        if not np.isnan(praat_features["jitter_local"]):
            praat_features["jitter_local"] /= 100.0
        if not np.isnan(praat_features["shimmer_local"]):
            praat_features["shimmer_local"] /= 100.0

        # Adjusted thresholds for diverse voices
        for key in ["jitter_local", "jitter_rap", "jitter_ppq5"]:
            if (
                praat_features.get(key, 0) > 0.15
            ):  # Relaxed threshold for elderly/creaky voices
                praat_features[key] = np.nan

        if (
            praat_features["shimmer_local"] > 0.4
        ):  # Relaxed threshold for expressive voices
            praat_features["shimmer_local"] = np.nan

    except parselmouth.PraatError as e:
        logger.error(f"⚠️ PraatError processing {filename}: {e}. Storing NaNs.")
    except Exception as e:
        logger.exception(
            f"❌ Unexpected error during Praat processing for {filename}: {e}"
        )

    return praat_features

In [None]:
file_path = (
    "..\\..\\..\\data\\preprocessed_sample\\common_voice_en_1463_preprocessed.mp3"
)
sound = parselmouth.Sound(file_path)


# === Run extraction ===


features = extract_praat_features(sound)


# === Print output ===


print(json.dumps(features, indent=2))

{
  "f0_median_praat": 176.18724229901935,
  "jitter_local": NaN,
  "jitter_local_abs": 0.000111785,
  "jitter_rap": NaN,
  "jitter_ppq5": NaN,
  "shimmer_local": NaN,
  "shimmer_local_db": 1.211,
  "shimmer_apq3": NaN,
  "shimmer_apq5": NaN,
  "shimmer_apq11": NaN
}


In [29]:
import numpy as np
import librosa
import logging
import soundfile as sf
from pathlib import Path
import gc

# Basic logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def extract_features(audio_path):
    """
    Extract audio features with parameters adjusted to avoid mel filter issues
    """
    try:
        # Step 1: Load the audio file
        logger.info(f"Loading audio: {audio_path}")
        y, sr = sf.read(audio_path)
        logger.info(f"Audio loaded: {len(y)/sr:.2f}s, sr={sr}Hz")

        # Step 2: Convert to mono if needed
        if y.ndim > 1:
            y = y.mean(axis=1)
            logger.info(f"Converted to mono: {len(y)} samples")

        # Step 3: Limit duration to first 5 seconds
        max_samples = 5 * sr
        if len(y) > max_samples:
            y = y[:max_samples]
            logger.info(f"Trimmed to first 5 seconds: {len(y)} samples")

        # Step 4: Calculate suitable fmax based on sample rate
        # The warning suggests adjusting fmax (maximum frequency)
        # A common rule is to set fmax to sr/2.5 or lower
        fmax = min(sr / 2.5, 8000)  # Cap at 8kHz which is usually enough for speech

        # Step 5: Adjust n_mels based on sample rate
        # Lower sample rates need fewer mel bands
        if sr < 16000:
            n_mels = 40  # Fewer mels for lower sample rates
        elif sr < 22050:
            n_mels = 64
        else:
            n_mels = 80  # Standard for higher sample rates

        logger.info(f"Using adjusted parameters: fmax={fmax}, n_mels={n_mels}")

        # Step 6: Extract features with the adjusted parameters
        features = {}

        # MFCC with adjusted parameters
        features["mfcc"] = librosa.feature.mfcc(
            y=y,
            sr=sr,
            n_mfcc=N_MFCC,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            fmax=fmax,
            n_mels=n_mels,
        )
        gc.collect()

        # RMS energy
        features["rms"] = librosa.feature.rms(
            y=y, frame_length=N_FFT, hop_length=HOP_LENGTH
        )
        gc.collect()

        # Spectral centroid with adjusted fmax
        features["spec_cent"] = librosa.feature.spectral_centroid(
            y=y,
            sr=sr,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            freq=None,  # Let librosa calculate frequencies based on sr
        )
        gc.collect()

        # Skip more memory-intensive features initially
        # We can add them back one by one if this works

        logger.info("Feature extraction successful")
        print(features.keys())
        return features

    except Exception as e:
        logger.error(f"Error during processing: {str(e)}")
        import traceback

        traceback.print_exc()
        return None


def test_feature_extraction():
    """
    Test function with better handling of audio files
    """
    # Try to find an audio file
    test_file = Path(
        "..\\..\\..\\data\\preprocessed_sample\\common_voice_en_1463_preprocessed.mp3"
    )

    if not test_file.exists():
        logger.warning(
            f"Test file {test_file} not found, searching for alternatives..."
        )

        # Look in the kaggle input directory
        kaggle_input = Path("..\\..\\..\\data\\preprocessed_sample\\")
        if kaggle_input.exists():
            # Search for any audio file
            for ext in [".mp3", ".wav", ".flac", ".ogg"]:
                for path in kaggle_input.glob(f"**/*{ext}"):
                    logger.info(f"Found alternative audio file: {path}")
                    test_file = path
                    break

                if test_file.exists() and test_file != Path(
                    "..\\..\\..\\data\\preprocessed_sample\\common_voice_en_100166_preprocessed.mp3"
                ):
                    break

    if test_file.exists():
        logger.info(f"Using test file: {test_file}")
        features = extract_features(test_file)

        if features:
            logger.info("Feature extraction results:")
            for name, feature in features.items():
                logger.info(f"- {name}: shape={feature.shape}")
        else:
            logger.error("Feature extraction failed")
    else:
        logger.error("No audio files found. Please upload a sample file.")
        # Create synthetic audio
        logger.info("Creating synthetic audio for testing...")
        sr = 22050
        duration = 2.0
        t = np.linspace(0, duration, int(sr * duration))
        y = 0.5 * np.sin(2 * np.pi * 440 * t)  # 440 Hz sine wave

        # Save synthetic audio to a temporary file
        output_file = Path(
            "..\\..\\..\\data\\preprocessed_sample\\common_voice_en_100166_synthesized.mp3"
        )
        sf.write(output_file, y, sr)
        logger.info(f"Saved synthetic audio to {output_file}")

        # Try extraction on synthetic file
        features = extract_features(output_file)

        if features:
            logger.info("Feature extraction on synthetic audio successful:")
            for name, feature in features.items():
                logger.info(f"- {name}: shape={feature.shape}")
        else:
            logger.error("Feature extraction on synthetic audio failed")


# Run test with proper error handling
if __name__ == "__main__":
    try:
        logger.info("Starting feature extraction test...")
        test_feature_extraction()
        logger.info("Test completed successfully")
        print("done")
    except Exception as e:
        logger.critical(f"Critical error: {e}")
        import traceback

        traceback.print_exc()
        print("failed")

[INFO] Starting feature extraction test...
[INFO] Starting feature extraction test...
INFO:__main__:Starting feature extraction test...
[INFO] Using test file: ..\..\..\data\preprocessed_sample\common_voice_en_1463_preprocessed.mp3
[INFO] Using test file: ..\..\..\data\preprocessed_sample\common_voice_en_1463_preprocessed.mp3
INFO:__main__:Using test file: ..\..\..\data\preprocessed_sample\common_voice_en_1463_preprocessed.mp3
[INFO] Loading audio: ..\..\..\data\preprocessed_sample\common_voice_en_1463_preprocessed.mp3
[INFO] Loading audio: ..\..\..\data\preprocessed_sample\common_voice_en_1463_preprocessed.mp3
INFO:__main__:Loading audio: ..\..\..\data\preprocessed_sample\common_voice_en_1463_preprocessed.mp3
[INFO] Audio loaded: 3.31s, sr=48000Hz
[INFO] Audio loaded: 3.31s, sr=48000Hz
INFO:__main__:Audio loaded: 3.31s, sr=48000Hz
[INFO] Using adjusted parameters: fmax=8000, n_mels=80
[INFO] Using adjusted parameters: fmax=8000, n_mels=80
INFO:__main__:Using adjusted parameters: fmax=

dict_keys(['mfcc', 'rms', 'spec_cent'])
done


In [None]:
import librosa
import logging
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants for feature extraction (could be changed as needed)
N_FFT = 2048
HOP_LENGTH = 512
N_MFCC = 13
FMIN_PITCH_LIBROSA = 70.0
FMAX_PITCH_LIBROSA = 400.0


# Define feature extraction functions (as in the previous code)


def extract_mfcc(y, sr):
    return librosa.feature.mfcc(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mfcc=N_MFCC
    )


def extract_rms(y):
    return librosa.feature.rms(y=y, frame_length=N_FFT, hop_length=HOP_LENGTH)


def extract_spec_cent(y, sr):
    return librosa.feature.spectral_centroid(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH
    )


def extract_spec_bw(y, sr):
    return librosa.feature.spectral_bandwidth(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH
    )


def extract_spec_contrast(y, sr):
    return librosa.feature.spectral_contrast(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH
    )


def extract_spec_flatness(y):
    return librosa.feature.spectral_flatness(y=y, n_fft=N_FFT, hop_length=HOP_LENGTH)


def extract_spec_rolloff(y, sr):
    return librosa.feature.spectral_rolloff(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH
    )


def extract_zcr(y):
    return librosa.feature.zero_crossing_rate(
        y=y, frame_length=N_FFT, hop_length=HOP_LENGTH
    )


def extract_pitch_librosa(y, sr):
    f0, voiced_flag, _ = librosa.pyin(
        y,
        fmin=FMIN_PITCH_LIBROSA,
        fmax=FMAX_PITCH_LIBROSA,
        sr=sr,
        frame_length=N_FFT,
        hop_length=HOP_LENGTH,
    )
    return np.nan_to_num(f0, nan=0.0), voiced_flag


def extract_librosa_features(y, sr):
    try:
        if len(y) < sr / 10:
            logger.warning("⚠️ Audio too short for meaningful feature extraction")
            return None

        features = {}
        features["mfcc"] = extract_mfcc(y, sr)
        features["rms"] = extract_rms(y)
        features["spec_cent"] = extract_spec_cent(y, sr)
        features["spec_bw"] = extract_spec_bw(y, sr)
        features["spec_contrast"] = extract_spec_contrast(y, sr)
        features["spec_flatness"] = extract_spec_flatness(y)
        features["spec_rolloff"] = extract_spec_rolloff(y, sr)
        features["zcr"] = extract_zcr(y)
        features["pitch_librosa"], features["voiced_flag"] = extract_pitch_librosa(
            y, sr
        )

        # Frame consistency alignment
        try:
            max_frames = max(
                val.shape[1] if val.ndim > 1 else len(val)
                for val in features.values()
                if isinstance(val, np.ndarray)
            )

            for key, val in features.items():
                if not isinstance(val, np.ndarray):
                    continue

                if val.ndim == 1:
                    val_len = len(val)
                    if val_len < max_frames:
                        features[key] = np.pad(
                            val, (0, max_frames - val_len), mode="edge"
                        )
                    elif val_len > max_frames:
                        features[key] = val[:max_frames]

                elif val.ndim == 2:
                    val_len = val.shape[1]
                    if val_len < max_frames:
                        features[key] = np.pad(
                            val, ((0, 0), (0, max_frames - val_len)), mode="edge"
                        )
                    elif val_len > max_frames:
                        features[key] = val[:, :max_frames]

        except ValueError as e:
            logger.warning(f"⚠️ Frame alignment issue: {e}")

        # Validate numerical stability
        for key, val in features.items():
            if isinstance(val, np.ndarray) and (
                np.isnan(val).any() or np.isinf(val).any()
            ):
                logger.warning(f"⚠️ NaN/Inf detected in {key}, replacing with zeros")
                features[key] = np.nan_to_num(val, nan=0.0, posinf=0.0, neginf=0.0)

        return features

    except MemoryError:
        logger.error(
            "❌ MemoryError during Librosa feature extraction - possibly large input"
        )
        return None
    except Exception as e:
        logger.exception(f"❌ Unexpected error during Librosa feature extraction: {e}")
        return None


# Test with an example audio file
def test_audio_features(audio_path):
    try:
        # Load audio file
        y, sr = librosa.load(
            audio_file_path, sr=None, duration=30
        )  # Only load the first 30 seconds

        # Extract features
        features = extract_librosa_features(y, sr)

        # Display the features (for demonstration purposes)
        if features:
            print("Extracted Features:")
            for feature_name, feature_data in features.items():
                print(
                    f"{feature_name}: {feature_data.shape if isinstance(feature_data, np.ndarray) else len(feature_data)}"
                )
        else:
            print("No features extracted.")
    except Exception as e:
        print(f"Error: {e}")


# Test with a sample file (use a path to your audio file)
audio_file_path = "/kaggle/input/tiny-sample/common_voice_en_100168_preprocessed.mp3"  # Replace with the path to your audio file
test_audio_features(audio_file_path)

In [1]:
# def extract_librosa_features(y, sr):
#     """
#     Extracts frame-wise features using Librosa.

#     Args:
#         y (np.ndarray): Audio time series.
#         sr (int): Sample rate.

#     Returns:
#         dict: Dictionary containing frame-wise librosa features. Returns None on failure.
#     """
#     try:
#         if len(y) < sr / 10:
#             logger.warning("⚠️ Audio too short for meaningful feature extraction")
#             return None

#         features = {
#             "mfcc": librosa.feature.mfcc(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mfcc=N_MFCC),
#             "rms": librosa.feature.rms(y=y, frame_length=N_FFT, hop_length=HOP_LENGTH),
#             "spec_cent": librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH),
#             "spec_bw": librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH),
#             "spec_contrast": librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH),
#             "spec_flatness": librosa.feature.spectral_flatness(y=y, n_fft=N_FFT, hop_length=HOP_LENGTH),
#             "spec_rolloff": librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH),
#             "zcr": librosa.feature.zero_crossing_rate(y=y, frame_length=N_FFT, hop_length=HOP_LENGTH),
#         }

#         f0, voiced_flag, _ = librosa.pyin(
#             y,
#             fmin=FMIN_PITCH_LIBROSA,
#             fmax=FMAX_PITCH_LIBROSA,
#             sr=sr,
#             frame_length=N_FFT,
#             hop_length=HOP_LENGTH,
#         )
#         features["pitch_librosa"] = np.nan_to_num(f0, nan=0.0)
#         features["voiced_flag"] = voiced_flag

#         # Frame consistency alignment
#         try:
#             max_frames = max(
#                 val.shape[1] if val.ndim > 1 else len(val)
#                 for val in features.values()
#                 if isinstance(val, np.ndarray)
#             )

#             for key, val in features.items():
#                 if not isinstance(val, np.ndarray):
#                     continue

#                 if val.ndim == 1:
#                     val_len = len(val)
#                     if val_len < max_frames:
#                         features[key] = np.pad(val, (0, max_frames - val_len), mode="edge")
#                     elif val_len > max_frames:
#                         features[key] = val[:max_frames]

#                 elif val.ndim == 2:
#                     val_len = val.shape[1]
#                     if val_len < max_frames:
#                         features[key] = np.pad(val, ((0, 0), (0, max_frames - val_len)), mode="edge")
#                     elif val_len > max_frames:
#                         features[key] = val[:, :max_frames]

#         except ValueError as e:
#             logger.warning(f"⚠️ Frame alignment issue: {e}")

#         # Validate numerical stability
#         for key, val in features.items():
#             if isinstance(val, np.ndarray) and (np.isnan(val).any() or np.isinf(val).any()):
#                 logger.warning(f"⚠️ NaN/Inf detected in {key}, replacing with zeros")
#                 features[key] = np.nan_to_num(val, nan=0.0, posinf=0.0, neginf=0.0)

#         return features

#     except MemoryError:
#         logger.error("❌ MemoryError during Librosa feature extraction - possibly large input")
#         return None
#     except Exception as e:
#         logger.exception(f"❌ Unexpected error during Librosa feature extraction: {e}")
#         return None

In [None]:
def extract_features(audio_path):
    """
    Loads an audio file and extracts features using both Librosa and Parselmouth.

    Args:
        audio_path (str): Path to the preprocessed audio file.

    Returns:
        dict: Dictionary of feature names mapped to numpy arrays or scalars.
              Returns None if loading or critical processing fails.
    """
    all_features = {}
    basename = os.path.basename(audio_path)

    try:
        # --- Librosa Extraction ---
        try:
            y, sr = librosa.load(audio_path, sr=SR)
            if len(y) == 0:
                logger.warning(f"⚠️ Empty audio data in {basename}")
                return None

            if np.isnan(y).any() or np.isinf(y).any():
                logger.warning(
                    f"⚠️ NaN or Inf in audio data for {basename}, replacing with zeros"
                )
                y = np.nan_to_num(y)

            librosa_features = extract_librosa_features(y, sr)
            if librosa_features:
                all_features.update(librosa_features)
                logger.info(f"✅ Librosa features extracted for {basename}")
            else:
                logger.warning(f"⚠️ Failed to extract Librosa features for {basename}")
        except Exception as e:
            logger.exception(f"❌ Librosa processing error for {basename}: {e}")

        # --- Parselmouth Extraction ---
        try:
            sound = parselmouth.Sound(audio_path)
            sound.name = basename

            praat_features = extract_praat_features(
                sound, PRAAT_PITCH_FLOOR, PRAAT_PITCH_CEILING
            )
            all_features.update(praat_features)

            if not all(np.isnan(v) for v in praat_features.values()):
                logger.info(f"✅ Praat features extracted for {basename}")
            else:
                logger.warning(f"⚠️ All Praat features are NaN for {basename}")

            del sound  # Free memory

        except parselmouth.PraatError as e:
            logger.error(f"❌ PraatError processing {basename}: {e}")
        except Exception as e:
            logger.exception(f"❌ Parselmouth processing error for {basename}: {e}")

        # Final check
        if not all_features:
            logger.error(f"🛑 No features could be extracted for {basename}")
            return None

        gc.collect()  # Free up memory
        return all_features

    except FileNotFoundError:
        logger.error(f"❌ File not found: {audio_path}")
        return None
    except Exception as e:
        logger.exception(f"❌ Unexpected error processing {basename}: {e}")
        return None

In [None]:
def save_features(features, output_path):
    """
    Saves the extracted features dictionary to a compressed .npz file.

    Args:
        features (dict): Dictionary of features to save.
        output_path (str): Path to save the compressed .npz file.

    Returns:
        bool: True if saved successfully, False otherwise.
    """
    try:
        # Validate feature contents
        for key, value in features.items():
            if isinstance(value, np.ndarray) and (
                np.isnan(value).any() or np.isinf(value).any()
            ):
                logger.warning(
                    f"⚠️ Feature '{key}' contains NaN or Inf - replacing with zeros"
                )
                features[key] = np.nan_to_num(value, nan=0.0, posinf=0.0, neginf=0.0)

        # Ensure directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Save to compressed .npz file
        np.savez_compressed(output_path, **features)
        logger.info(f"✅ Features saved to {output_path}")
        return True

    except Exception as e:
        logger.exception(f"❌ Error saving features to {output_path}: {e}")
        return False

In [None]:
def process_feature_wrapper(args):
    """
    Wrapper function for multiprocessing that handles feature extraction and saving.

    Args:
        args (tuple): Tuple containing (filename, time_tracker, progress_lock).
    """
    preprocessed_filename, time_tracker, progress_lock = args

    input_path = os.path.join(PREPROCESSED_FOLDER, preprocessed_filename)
    base_name = os.path.splitext(preprocessed_filename)[0]
    original_name = base_name.replace(
        "_preprocessed", ""
    )  # Modify if naming convention changes
    output_filename = f"{original_name}_features.npz"
    output_path = os.path.join(FEATURE_FOLDER, output_filename)

    # Skip if already processed (caching)
    if os.path.exists(output_path):
        with progress_lock:
            time_tracker.update("skipped")
            if time_tracker.should_print_update():
                logger.info(f"⏩ {time_tracker.get_progress_str()}")
        return

    start_time = time.time()
    features = extract_features(input_path)
    elapsed = time.time() - start_time

    with progress_lock:
        if features:
            if save_features(features, output_path):
                # time_tracker.update("completed", elapsed)
                logger.info(f"✅ Processed {preprocessed_filename} in {elapsed:.1f}s")
            else:
                # time_tracker.update("failed")
                logger.error(f"❌ Failed to save features for {preprocessed_filename}")
        else:
            # time_tracker.update("failed")
            logger.error(f"🛑 Feature extraction failed for {preprocessed_filename}")

In [None]:
# def tqdm_process_feature_wrapper(args_with_counter):
#     """Wrapper that processes a file and updates the progress counter for tqdm."""
#     args, counter, lock = args_with_counter
#     try:
#         result = process_feature_wrapper(args)
#     except Exception as e:
#         logger.error(f"Error processing {args[0]}: {e}")
#         result = None

#     with lock:
#         counter.value += 1

#     return result


# def extract_features_for_all(input_folder, output_folder):
#     """Processes all compatible audio files using multiprocessing with comprehensive time tracking."""
#     try:
#         os.makedirs(output_folder, exist_ok=True)
#         logger.info(f"Ensured output directory exists: {output_folder}")
#     except OSError as e:
#         logger.error(f"Error creating output directory {output_folder}: {e}")
#         return

#     try:
#         logger.info(f"Searching for preprocessed files in: {input_folder}")
#         if not os.path.isdir(input_folder):
#             raise FileNotFoundError(f"Input directory not found: {input_folder}")

#         audio_extensions = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
#         all_files = os.listdir(input_folder)
#         preprocessed_files = [
#             f
#             for f in all_files
#             if os.path.isfile(os.path.join(input_folder, f))
#             and f.lower().endswith(audio_extensions)
#         ]

#         # Filter out zero-sized files
#         valid_files = []
#         for f in preprocessed_files:
#             full_path = os.path.join(input_folder, f)
#             if os.path.getsize(full_path) == 0:
#                 logger.warning(f"Skipping zero-sized file: {f}")
#                 continue
#             valid_files.append(f)

#         preprocessed_files = valid_files
#         logger.info(f"Found {len(preprocessed_files)} valid audio files.")

#     except FileNotFoundError as e:
#         logger.error(f"FileNotFoundError: {e}")
#         return
#     except Exception as e:
#         logger.error(f"Error listing files in {input_folder}: {e}")
#         traceback.print_exc()
#         return

#     if not preprocessed_files:
#         logger.info(f"No compatible audio files found in {input_folder} to process.")
#         return

#     # Setup multiprocessing and time tracking
#     manager = Manager()
#     progress_lock = manager.Lock()
#     # time_tracker = TimeTracker(len(preprocessed_files))

#     # Create shared counter for tqdm
#     counter = manager.Value("i", 0)
#     tqdm_lock = manager.Lock()

#     # Prepare process arguments with progress tracking
#     original_args = [(filename, progress_lock) for filename in preprocessed_files]
#     process_args = [(arg, counter, tqdm_lock) for arg in original_args]

#     # Determine optimal number of workers
#     num_processes = min(MAX_WORKERS, len(preprocessed_files))
#     logger.info(f"🚀 Starting feature extraction with {num_processes} processes...")
#     logger.info(
#         f"📊 Advanced progress tracking enabled - will show completion rate and ETA"
#     )

#     try:
#         # Create tqdm progress bar
#         with tqdm(
#             total=len(preprocessed_files), desc="🔄 Processing", unit="file", ncols=100
#         ) as pbar:
#             # Start processing with multiprocessing
#             with Pool(processes=num_processes) as pool:
#                 results = pool.map_async(tqdm_process_feature_wrapper, process_args)

#                 # Update progress bar while waiting for results
#                 while not results.ready():
#                     completed = counter.value
#                     pbar.n = completed
#                     pbar.refresh()
#                     time.sleep(0.1)

#                 # Ensure progress bar is complete
#                 pbar.n = len(preprocessed_files)
#                 pbar.refresh()

#     except KeyboardInterrupt:
#         logger.warning(
#             "⚠️ Feature extraction interrupted by user. Some results may be incomplete."
#         )
#     except Exception as e:
#         logger.error(f"❌ Error during multiprocessing: {e}")
#         traceback.print_exc()

#     # Report final statistics
#     logger.info("\n📊 Feature Extraction Summary")
#     logger.info("=" * 50)
#     # logger.info(time_tracker.get_final_stats())
#     logger.info("=" * 50)

#     # Validate results
#     processed_files = [
#         f for f in os.listdir(output_folder) if f.endswith("_features.npz")
#     ]
#     success_rate = (
#         len(processed_files) / len(preprocessed_files) if preprocessed_files else 0
#     )
#     logger.info(f"Extracted features saved: {len(processed_files)} files")
#     logger.info(f"Overall success rate: {success_rate:.1%}")
#     logger.info(f"Feature extraction completed. Results are in: {output_folder}")

In [None]:
if __name__ == "__main__":
    # Run the feature extraction process
    extract_features_for_all(PREPROCESSED_FOLDER, FEATURE_FOLDER)
    print("Feature extraction script completed.")