In [56]:
import os
import glob
import hashlib
import numpy as np
import pandas as pd
import librosa
import librosa.effects
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")



In [57]:
# --- Locations -------------------------------------------------------------
DATA_DIR = '../data/raw'                        # raw audio, one sub-folder per category
EXPLORATION_CSV = 'audio_data_exploration.csv'  # per-file table read by this notebook
PROCESSED_ROOT = '../data/processed'            # output root for the saved .npy frames

# --- Framing ---------------------------------------------------------------
TARGET_SR = 16000   # sample rate requested when loading audio (Hz)
FRAME_SEC = 0.96    # length of one extracted frame, in seconds
HOP_SEC = 0.48      # step between consecutive frame starts, in seconds

# --- Quality gates ---------------------------------------------------------
# Aggressive silence threshold, chosen from post-processing analysis;
# signals whose RMS is below it are treated as silent / low quality.
SILENCE_RMS_THR = 0.015
# Peak amplitude at or above which a signal is rescaled by handle_clipping.
CLIP_MAX_AMP_THR = 0.98
# Minimum RMS a frame must still have after normalisation to be kept.
MIN_FRAME_RMS = 0.01

# --- Per-category loudness targets (RMS) -----------------------------------
CATEGORY_RMS_TARGETS = dict(
    Glass_Breaking=0.10,
    Alarm_Clock=0.14,
    Car_Horn=0.17,
    Gunshot=0.16,
    Siren=0.19,
)

# --- Per-category output multipliers used to balance the dataset -----------
# Each kept frame yields this many saved frames (the original plus copies).
AUGMENT_MULTIPLIERS = dict(
    Glass_Breaking=4,
    Alarm_Clock=3,
    Siren=2,
    Car_Horn=2,
    Gunshot=2,
)


In [58]:
def md5_hash(filepath: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in 4096-byte blocks, so
    the whole file never has to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:  # empty bytes object signals end of file
                break
            digest.update(block)
    return digest.hexdigest()


def load_audio(filepath: str):
    """Load ``filepath`` with librosa as mono audio at ``TARGET_SR``.

    Returns the ``(samples, sample_rate)`` pair produced by ``librosa.load``.
    """
    return librosa.load(filepath, sr=TARGET_SR, mono=True)


def compute_rms(y: np.ndarray) -> float:
    """Compute the root-mean-square energy of a signal.

    Args:
        y: Audio samples (any numeric dtype, any shape; all elements are used).

    Returns:
        The RMS as a plain Python ``float``. An empty input has no energy and
        yields ``0.0`` (instead of NaN plus a RuntimeWarning).
    """
    # Work in float64: squaring in the input dtype overflows for integer PCM
    # (e.g. int16 30000**2 wraps around) and loses precision for float32.
    samples = np.asarray(y, dtype=np.float64)
    if samples.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(samples))))


def estimate_snr(y: np.ndarray) -> float:
    """
    Rough signal-to-noise estimate in dB, derived from frame-wise RMS.

    The mean of the per-frame RMS values stands in for the signal level and
    the smallest per-frame RMS (plus 1e-6 to avoid dividing by zero) stands
    in for the noise floor. Returns 0 when the mean RMS is not positive.

    NOTE: Not used in current pipeline - SNR filtering removed as it was
    too aggressive and filtered out valid clean audio.
    """
    frame_energy = librosa.feature.rms(y=y)[0]
    mean_energy = np.mean(frame_energy)
    floor_energy = np.min(frame_energy) + 1e-6
    if mean_energy > 0:
        return 20 * np.log10(mean_energy / floor_energy)
    return 0


def is_silent(y: np.ndarray, thr: float = SILENCE_RMS_THR) -> bool:
    """Tell whether ``y`` is silent, i.e. its RMS is strictly below ``thr``."""
    energy = compute_rms(y)
    return energy < thr


def handle_clipping(y: np.ndarray, thr: float = CLIP_MAX_AMP_THR) -> np.ndarray:
    """
    Tame signals whose peak reaches the clipping threshold.

    When the largest absolute sample is at or above ``thr`` the whole signal
    is rescaled so that its peak sits at roughly 0.95. In every case the
    result is finally limited to the range [-1.0, 1.0].
    """
    peak = np.abs(y).max()
    if peak >= thr:
        # 1e-8 keeps the division well-defined
        y = y / (peak + 1e-8) * 0.95
    return np.clip(y, -1.0, 1.0)


def normalise_rms(y: np.ndarray, target: float, max_gain: float = 5.0) -> np.ndarray:
    """
    Bring ``y`` towards the RMS level ``target``.

    The applied gain is ``target / rms(y)`` but never more than ``max_gain``.
    Signals whose RMS is not above 1e-8 are returned unchanged.
    """
    rms_now = compute_rms(y)
    if not rms_now > 1e-8:
        return y
    gain = min(target / rms_now, max_gain)
    return y * gain


def pad_to_length(y: np.ndarray, sr: int, target_sec: float = FRAME_SEC) -> np.ndarray:
    """Append zeros so ``y`` spans at least ``target_sec`` seconds at rate ``sr``.

    Inputs that are already long enough are returned as-is.
    """
    wanted = int(target_sec * sr)
    shortfall = wanted - len(y)
    if shortfall > 0:
        return np.pad(y, (0, shortfall), mode='constant')
    return y


def extract_frames(y: np.ndarray, sr: int,
                   frame_sec: float = FRAME_SEC,
                   hop_sec: float = HOP_SEC) -> list:
    """Slice ``y`` into fixed-length, overlapping frames.

    Frames are ``frame_sec`` seconds long and start every ``hop_sec`` seconds.
    If the regular grid does not end exactly at the end of the signal, one
    extra frame covering the final ``frame_sec`` seconds is appended. A signal
    shorter than one frame produces an empty list.
    """
    win = int(frame_sec * sr)
    hop = int(hop_sec * sr)
    total = len(y)

    frames = [y[begin:begin + win] for begin in range(0, total - win + 1, hop)]

    # Tail frame: only when the hop grid leaves trailing samples uncovered.
    if total > win and (total - win) % hop != 0:
        frames.append(y[-win:])

    return frames


def augment_frame(frame: np.ndarray, sr: int, category: str,
                  n_augmentations: int) -> list:
    """
    Generate augmented versions of a frame.

    The first element of the result is always an untouched copy of ``frame``.
    Each further element is produced by, in order: a random zero-padded time
    shift, an optional small pitch shift (50%), an optional light time stretch
    (50%), optional low-level Gaussian noise (30%), RMS restoration to within
    +/-5% of the input RMS, and a final ``handle_clipping`` pass.

    Args:
        frame: Source frame as a NumPy array.
        sr: Sample rate, forwarded to ``librosa.effects.pitch_shift``.
        category: Unused; kept so existing callers keep working.
        n_augmentations: Total number of frames to return, original included.
            Values <= 1 return just the copy of the original.

    Returns:
        A list of ``n_augmentations`` arrays, each as long as ``frame``.
        Floating-point inputs keep their dtype in every returned array, so
        all frames written to disk share one dtype.

    Randomness comes from NumPy's global ``np.random`` state; seed it in the
    caller for reproducible output.
    """
    augmented = [frame.copy()]

    if n_augmentations <= 1:
        return augmented

    original_rms = compute_rms(frame)
    n_samples = len(frame)
    # np.random.normal yields float64, which would silently upcast float32
    # frames; remember the input dtype so it can be preserved.
    out_dtype = frame.dtype if np.issubdtype(frame.dtype, np.floating) else None

    for _ in range(n_augmentations - 1):
        aug = frame.copy()

        # Time shift with zero padding (not circular). randint(lo, hi) raises
        # when lo == hi (zero-length frame), so skip the shift in that case.
        shift_lo, shift_hi = -n_samples // 8, n_samples // 8
        shift_samples = np.random.randint(shift_lo, shift_hi) if shift_lo < shift_hi else 0
        if shift_samples > 0:
            aug = np.pad(aug[shift_samples:], (0, shift_samples), mode='constant')
        elif shift_samples < 0:
            aug = np.pad(aug[:shift_samples], (-shift_samples, 0), mode='constant')

        # Conservative pitch shift (50% probability)
        if np.random.random() > 0.5:
            n_steps = np.random.uniform(-0.5, 0.5)
            try:
                aug = librosa.effects.pitch_shift(aug, sr=sr, n_steps=n_steps)
            except Exception:
                # Deliberate best effort: if the effect fails, continue with
                # the un-pitched signal rather than losing the frame.
                pass

        # Light time stretching (50% probability)
        if np.random.random() > 0.5:
            rate = np.random.uniform(0.95, 1.05)
            try:
                stretched = librosa.effects.time_stretch(aug, rate=rate)
            except Exception:
                # Deliberate best effort, same as for the pitch shift.
                stretched = None
            if stretched is not None:
                # Trim or pad back to original length
                if len(stretched) > n_samples:
                    aug = stretched[:n_samples]
                else:
                    aug = np.pad(stretched, (0, n_samples - len(stretched)), mode='constant')

        # Minimal noise (30% probability)
        if np.random.random() > 0.7:
            current_rms = compute_rms(aug)
            noise_level = current_rms * 0.005
            noise = np.random.normal(0, noise_level, len(aug))
            if out_dtype is not None:
                noise = noise.astype(out_dtype, copy=False)
            aug = aug + noise

        # Restore RMS to within +/-5% of the input frame's RMS
        aug_rms = compute_rms(aug)
        if aug_rms > 1e-8:
            target_rms = original_rms * np.random.uniform(0.95, 1.05)
            aug = aug * (target_rms / aug_rms)

        # Safety clipping
        aug = handle_clipping(aug)

        # Guarantee a consistent dtype regardless of NumPy's promotion rules.
        if out_dtype is not None and aug.dtype != out_dtype:
            aug = aug.astype(out_dtype)
        augmented.append(aug)

    return augmented


In [59]:
# Read the per-file exploration table; stop immediately if it is missing.
if os.path.exists(EXPLORATION_CSV):
    df_expl = pd.read_csv(EXPLORATION_CSV)
else:
    raise FileNotFoundError(f"Exploration CSV not found: {EXPLORATION_CSV}")

print(f"\nLoaded exploration data: {len(df_expl)} files")



Loaded exploration data: 843 files


In [60]:
# Keep only the first row for every distinct file_hash value.
original_count = df_expl.shape[0]
df_expl = df_expl.drop_duplicates(subset=['file_hash'], keep='first')
duplicates_removed = original_count - df_expl.shape[0]
print(f"Removed {duplicates_removed} duplicate files → {len(df_expl)} unique files")

Removed 40 duplicate files → 803 unique files


In [61]:
# Keep only the rows flagged as successfully loaded
df_expl = df_expl.loc[df_expl['load_success'] == True].copy()
print(f"Using {len(df_expl)} files that loaded successfully")

# Create the output root and one sub-directory per category present
os.makedirs(PROCESSED_ROOT, exist_ok=True)
for category in df_expl['category'].unique():
    os.makedirs(os.path.join(PROCESSED_ROOT, category), exist_ok=True)

# Run-wide bookkeeping: one metadata record per saved frame, counters for
# processed/skipped files and frames, and a readable reason per skipped file.
processed_records = []
skipped_details = []
stats = dict.fromkeys(
    (
        'files_processed',
        'files_skipped_silent',
        'files_skipped_error',
        'frames_original',
        'frames_augmented',
        'frames_skipped_silent',
        'frames_skipped_post_norm',
    ),
    0,
)



Using 803 files that loaded successfully


In [62]:
# Seed NumPy's global RNG, which augment_frame draws from, so that re-running
# this cell (or Restart & Run All) makes the same random draws every time.
AUGMENTATION_SEED = 42
np.random.seed(AUGMENTATION_SEED)

print(f"\nProcessing files...")
print(f"Silence threshold: {SILENCE_RMS_THR}")
print(f"Min frame RMS after normalization: {MIN_FRAME_RMS}\n")

# Process each file: load -> file-level silence check -> clipping -> padding,
# then frame extraction, per-frame normalisation, augmentation and saving.
for _, row in tqdm(df_expl.iterrows(), total=len(df_expl), desc="Processing"):
    category = row['category']
    filename = row['file']
    filepath = os.path.join(DATA_DIR, category, filename)

    if not os.path.exists(filepath):
        stats['files_skipped_error'] += 1
        skipped_details.append(f"{filename}: File not found")
        continue

    try:
        # 1. Load audio (mono, 16kHz)
        y, sr = load_audio(filepath)

        # 2. Check for silence at file level
        if is_silent(y):
            stats['files_skipped_silent'] += 1
            skipped_details.append(f"{filename}: Silent file (RMS={compute_rms(y):.6f})")
            continue

        # REMOVED: SNR filtering was too aggressive for clean AudioSet data
        # It filtered out 94.5% of Siren files based on estimation artifacts

        # 3. Handle clipping
        y = handle_clipping(y)

        # 4. Pad if needed (so short files still yield one full frame)
        y = pad_to_length(y, sr)

        stats['files_processed'] += 1

    except Exception as e:
        # Any failure in steps 1-4 skips the file but keeps the run going;
        # the reason is recorded for the report printed after the loop.
        stats['files_skipped_error'] += 1
        skipped_details.append(f"{filename}: Error loading - {str(e)}")
        continue

    # FRAME-LEVEL PROCESSING
    frames = extract_frames(y, sr)
    augment_factor = AUGMENT_MULTIPLIERS.get(category, 2)
    target_rms = CATEGORY_RMS_TARGETS.get(category, 0.15)

    # Per-file values reused for every saved frame of this file
    file_stem = os.path.splitext(filename)[0]
    cat_out_dir = os.path.join(PROCESSED_ROOT, category)

    for frame_idx, frame in enumerate(frames):
        # Check frame for silence BEFORE normalization.
        # The skip counters advance by augment_factor, i.e. by the number of
        # output frames (original + augmented copies) this frame would have
        # produced, not by 1.
        if is_silent(frame):
            stats['frames_skipped_silent'] += augment_factor
            continue

        # Normalize frame towards the category target, then tame peaks
        frame_normalized = normalise_rms(frame, target_rms, max_gain=4.0)
        frame_normalized = handle_clipping(frame_normalized)

        # Additional check after normalization
        # Ensure frame actually reached a reasonable RMS
        final_rms = compute_rms(frame_normalized)
        if final_rms < MIN_FRAME_RMS:
            stats['frames_skipped_post_norm'] += augment_factor
            continue

        # Generate augmented versions (index 0 is the unmodified frame)
        augmented_frames = augment_frame(frame_normalized, sr, category, augment_factor)

        for aug_idx, aug_frame in enumerate(augmented_frames):
            # Final safety clipping
            aug_frame = handle_clipping(aug_frame)

            # Verify augmented frame quality
            aug_rms = compute_rms(aug_frame)
            if aug_rms < MIN_FRAME_RMS:
                continue  # Skip this augmented frame

            # Save frame
            frame_name = f"{file_stem}_f{frame_idx}_a{aug_idx}.npy"
            frame_path = os.path.join(cat_out_dir, frame_name)
            np.save(frame_path, aug_frame)

            # Record metadata
            processed_records.append({
                'category': category,
                'original_file': filename,
                'frame_idx': frame_idx,
                'aug_idx': aug_idx,
                'is_augmented': aug_idx > 0,
                'frame_path': frame_path,
                'duration_sec': FRAME_SEC,
                'rms': aug_rms
            })

            if aug_idx == 0:
                stats['frames_original'] += 1
            else:
                stats['frames_augmented'] += 1

# Write one CSV row per saved frame next to the processed data
meta_path = os.path.join(PROCESSED_ROOT, 'processed_frames_metadata.csv')
meta_df = pd.DataFrame(processed_records)
meta_df.to_csv(meta_path, index=False)

# Print summary
print("\n" + "="*70)
print("PREPROCESSING COMPLETE")
print("="*70)
print(f"\nSaved {len(meta_df)} total frames")
print(f"  - Original frames: {stats['frames_original']}")
print(f"  - Augmented frames: {stats['frames_augmented']}")
print(f"  - Metadata: {meta_path}")

if meta_df.empty:
    # A DataFrame built from zero records has no columns, so the
    # per-category statistics below would fail with KeyError: 'category'
    # and hide the real problem (e.g. every file skipped).
    print("\nNo frames were saved - skipping per-category statistics.")
else:
    print(f"\nFrames per category:")
    category_counts = meta_df['category'].value_counts().sort_index()
    for cat, count in category_counts.items():
        # aug_idx == 0 marks the un-augmented frame of each group
        orig = len(meta_df[(meta_df['category'] == cat) & (meta_df['aug_idx'] == 0)])
        aug = count - orig
        print(f"  {cat:15s}: {count:4d} ({orig} original + {aug} augmented)")

    # RMS statistics
    print(f"\nRMS Statistics by Category:")
    rms_stats = meta_df.groupby('category')['rms'].agg(['mean', 'std', 'min', 'max'])
    print(rms_stats)

    # Check if targets were reached: mean saved RMS as a percentage of the
    # category target, flagged when it falls below 90%.
    print(f"\nRMS Target Achievement:")
    for cat in category_counts.index:
        target = CATEGORY_RMS_TARGETS.get(cat, 0.15)
        actual = meta_df[meta_df['category'] == cat]['rms'].mean()
        ratio = (actual / target) * 100
        status = "✓" if ratio >= 90 else "⚠"
        print(f"  {status} {cat:15s}: Target={target:.3f}, Actual={actual:.3f} ({ratio:.1f}%)")

    # Class balance: largest category size divided by the smallest
    max_count = category_counts.max()
    min_count = category_counts.min()
    balance_ratio = max_count / min_count if min_count > 0 else float('inf')
    print(f"\nClass balance ratio: {balance_ratio:.2f} (max/min)")
    if balance_ratio > 1.5:
        print(f"Dataset has some imbalance (ratio > 1.5)")
    else:
        print(f"Dataset is well balanced")


Processing files...
Silence threshold: 0.015
Min frame RMS after normalization: 0.01



Processing: 100%|██████████| 803/803 [01:13<00:00, 10.91it/s]


PREPROCESSING COMPLETE

Saved 7100 total frames
  - Original frames: 3030
  - Augmented frames: 4070
  - Metadata: ../data/processed\processed_frames_metadata.csv

Frames per category:
  Alarm_Clock    : 1296 (432 original + 864 augmented)
  Car_Horn       : 1480 (740 original + 740 augmented)
  Glass_Breaking : 1216 (304 original + 912 augmented)
  Gunshot        : 1536 (768 original + 768 augmented)
  Siren          : 1572 (786 original + 786 augmented)

RMS Statistics by Category:
                    mean       std       min       max
category                                              
Alarm_Clock     0.129290  0.020177  0.039896  0.146997
Car_Horn        0.159343  0.024383  0.026693  0.178447
Glass_Breaking  0.088367  0.019063  0.022397  0.104994
Gunshot         0.141693  0.030626  0.032118  0.167980
Siren           0.184815  0.018496  0.063129  0.199414

RMS Target Achievement:
  ✓ Alarm_Clock    : Target=0.140, Actual=0.129 (92.3%)
  ✓ Car_Horn       : Target=0.170, Actual=0.




In [63]:
# File-level outcome counters
print(f"\nFile processing:")
print(f"  - Processed: {stats['files_processed']}")
print(f"  - Skipped (silent): {stats['files_skipped_silent']}")
print(f"  - Skipped (error): {stats['files_skipped_error']}")

# Frame-level filter counters
print(f"\nFrame filtering:")
print(f"  - Frames skipped (silent): {stats['frames_skipped_silent']}")
print(f"  - Frames skipped (post-norm too quiet): {stats['frames_skipped_post_norm']}")

# Show at most the first 15 skip reasons, then how many were left out
if len(skipped_details) > 0:
    print(f"\nSkipped files details (first 15):")
    for detail in skipped_details[:15]:
        print(f"  - {detail}")
    if not len(skipped_details) <= 15:
        print(f"  ... and {len(skipped_details) - 15} more")



File processing:
  - Processed: 773
  - Skipped (silent): 30
  - Skipped (error): 0

Frame filtering:
  - Frames skipped (silent): 1466
  - Frames skipped (post-norm too quiet): 0

Skipped files details (first 15):
  - -74dab6wYqU_30.wav: Silent file (RMS=0.002488)
  - -eQkuW-SGkA_30.wav: Silent file (RMS=0.009133)
  - 5CYK7AKivDo_180.wav: Silent file (RMS=0.007945)
  - 9gEGGTWHOjU_10.wav: Silent file (RMS=0.011665)
  - BldrjFV90H0_50.wav: Silent file (RMS=0.014469)
  - CijrnO-HHGo_10.wav: Silent file (RMS=0.007601)
  - CYgCdPSDuLU_30.wav: Silent file (RMS=0.008393)
  - EDzycaY5D_Y_50.wav: Silent file (RMS=0.011952)
  - FXrlR_YbiNU_30.wav: Silent file (RMS=0.010852)
  - IZH2Npv-TB4_0.wav: Silent file (RMS=0.013739)
  - KlhaNw2GOdI_30.wav: Silent file (RMS=0.012178)
  - Nc2S-9oUB8M_70.wav: Silent file (RMS=0.007312)
  - Ogx6MVJ9gOo_340.wav: Silent file (RMS=0.008449)
  - Glass_Breaking_108.wav: Silent file (RMS=0.009846)
  - Glass_Breaking_11.wav: Silent file (RMS=0.011350)
  ... and 1