In [1]:
import os
import glob
import hashlib
import numpy as np
import pandas as pd
import librosa
import librosa.effects
from tqdm import tqdm
import warnings
from scipy.signal import wiener

try:
    import noisereduce as nr
    NOISE_REDUCE_AVAILABLE = True
except ImportError:
    NOISE_REDUCE_AVAILABLE = False
    print("Warning: noisereduce not available. Using scipy.signal.wiener as fallback.")

warnings.filterwarnings("ignore")

In [2]:
# --- Paths -------------------------------------------------------------------
DATA_DIR            = '../data/raw'                 # raw audio, read as <DATA_DIR>/<category>/<file>
EXPLORATION_CSV     = 'audio_data_exploration.csv'  # per-file table that drives processing
PROCESSED_ROOT      = '../data/processed'           # output root; one sub-folder per category

# --- Framing -----------------------------------------------------------------
TARGET_SR           = 16000   # sample rate (Hz) every file is resampled to on load
FRAME_SEC           = 0.96    # frame length in seconds
HOP_SEC             = 0.48    # distance between frame starts in seconds (half a frame)

# --- Quality thresholds ------------------------------------------------------
SILENCE_RMS_THR     = 0.003   # RMS strictly below this counts as silence
CLIP_MAX_AMP_THR    = 0.99    # peak |amplitude| at or above this triggers rescaling
LOW_SNR_THR         = 10.0    # 'snr_db' below this marks a file as low-SNR

# Target RMS level per category used when normalising frames.
CATEGORY_RMS_TARGETS = {
    'Glass_Breaking': 0.05,
    'Alarm_Clock': 0.08,
    'Car_Horn': 0.10,
    'Gunshot': 0.10,
    'Siren': 0.10
}

# Augmentation configuration: number of versions kept per frame, counting the
# un-augmented original (a value of 6 means 1 original + 5 augmented copies).
AUGMENT_MULTIPLIERS = {
    'Glass_Breaking': 6,
    'Alarm_Clock': 5,
    'Siren': 3,
    'Car_Horn': 3,
    'Gunshot': 3
}

# --- Reproducibility ---------------------------------------------------------
# The augmentation step draws from NumPy's global random state; seeding it here
# makes a Restart Kernel -> Run All produce the same frames every time.
# Re-run this cell before re-running the processing cell to get the same draws.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
def md5_hash(filepath: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in 4096-byte pieces, so it
    is never held in memory as a whole.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while True:
            piece = stream.read(4096)
            if not piece:
                # read() returns b"" at end of file
                break
            digest.update(piece)
    return digest.hexdigest()

def load_audio(filepath: str):
    """Decode ``filepath`` with librosa as mono audio resampled to ``TARGET_SR``.

    Returns:
        The ``(samples, sample_rate)`` pair produced by ``librosa.load``.
    """
    samples, rate = librosa.load(filepath, mono=True, sr=TARGET_SR)
    return samples, rate

def compute_rms(y: np.ndarray) -> float:
    """Compute the root-mean-square (RMS) energy of a signal.

    Args:
        y: Audio samples (any array-like of real numbers).

    Returns:
        The RMS value as a plain Python ``float``. An empty input returns
        ``0.0`` instead of NaN, so threshold comparisons treat it as silence.
    """
    # Work in float64: squaring integer PCM samples in their own dtype can
    # overflow (e.g. int16), and float32 accumulation loses precision.
    samples = np.asarray(y, dtype=np.float64)
    if samples.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(samples))))

def is_silent(y: np.ndarray, thr: float = SILENCE_RMS_THR) -> bool:
    """Tell whether a signal counts as silence.

    A signal is considered silent when its RMS energy, as given by
    ``compute_rms``, is strictly below ``thr``.
    """
    energy = compute_rms(y)
    return energy < thr

def handle_clipping(y: np.ndarray, thr: float = CLIP_MAX_AMP_THR) -> np.ndarray:
    """
    Tame signals whose peak reaches the clipping threshold.

    When the largest absolute sample is at or above ``thr`` the whole signal
    is rescaled so that its peak sits at roughly 0.95. In every case the
    result is finally limited to the [-1, 1] range.
    """
    peak = np.abs(y).max()
    # The small epsilon keeps the division well-defined for tiny peaks.
    rescaled = y / (peak + 1e-8) * 0.95 if peak >= thr else y
    return np.clip(rescaled, -1.0, 1.0)

def normalise_rms(y: np.ndarray, target: float, max_gain: float = 3.0) -> np.ndarray:
    """
    Bring a signal to a target RMS level, with a ceiling on the applied gain.

    Args:
        y: Input samples.
        target: Desired RMS level.
        max_gain: Largest multiplier that may be applied, so that very quiet
            signals are not amplified without bound.

    Returns:
        The scaled signal, or ``y`` itself when its RMS is not above 1e-8.
    """
    level = compute_rms(y)
    if not level > 1e-8:
        # Effectively silent: there is nothing meaningful to scale.
        return y
    gain = min(target / level, max_gain)
    return y * gain

def pad_to_length(y: np.ndarray, sr: int, target_sec: float = FRAME_SEC) -> np.ndarray:
    """Append trailing zeros so the signal lasts at least ``target_sec`` seconds.

    Signals that are already long enough are returned unchanged.
    """
    wanted = int(target_sec * sr)
    shortfall = wanted - len(y)
    if shortfall <= 0:
        return y
    return np.pad(y, (0, shortfall), mode='constant')

def reduce_noise(y: np.ndarray, sr: int) -> np.ndarray:
    """Denoise a signal, preferring ``noisereduce`` and falling back to Wiener.

    ``noisereduce`` is used in stationary mode with ``prop_decrease=0.5`` when
    the package was importable. If it is unavailable, or if it raises, the
    signal is passed through ``scipy.signal.wiener`` instead.
    """
    if not NOISE_REDUCE_AVAILABLE:
        return wiener(y)

    try:
        return nr.reduce_noise(
            y=y,
            sr=sr,
            stationary=True,
            prop_decrease=0.5,
            time_constant_s=2.0,
        )
    except Exception as e:
        # Best-effort: report the failure and fall back on the untouched input.
        print(f"  Warning: noisereduce failed ({str(e)}). Using Wiener filter.")
        return wiener(y)

def extract_frames(y: np.ndarray, sr: int,
                   frame_sec: float = FRAME_SEC,
                   hop_sec: float = HOP_SEC) -> list:
    """Slice a signal into fixed-length, overlapping frames.

    Frames start every ``hop_sec`` seconds and are ``frame_sec`` seconds long.
    When the regular grid does not reach the end of the signal, one extra
    frame aligned to the end of the signal is appended so the tail is covered.
    A signal shorter than one frame yields an empty list.
    """
    win = int(frame_sec * sr)
    step = int(hop_sec * sr)
    total = len(y)

    windows = [y[begin:begin + win] for begin in range(0, total - win + 1, step)]

    # Samples past the last regular frame: cover them with an end-aligned frame.
    leftover = total - win
    if leftover > 0 and leftover % step != 0:
        windows.append(y[-win:])

    return windows

def augment_frame(frame: np.ndarray, sr: int, category: str, n_augmentations: int) -> list:
    """
    Generate augmented versions of a frame.

    The first element of the result is an unmodified copy of ``frame``. Each of
    the remaining ``n_augmentations - 1`` elements is produced by, in order:
    a random circular time shift, a pitch shift with ``n_steps`` drawn
    uniformly from [-1.0, 1.0), a small amount of additive Gaussian noise,
    rescaling back to the RMS of ``frame``, and a pass through
    ``handle_clipping``.

    Args:
        frame: 1-D audio frame to augment.
        sr: Sample rate of ``frame``; forwarded to ``librosa.effects.pitch_shift``.
        category: Class label of the frame. Currently unused; kept so existing
            callers keep working.
        n_augmentations: Total number of frames to return, original included.
            Values below 2 return only the copy of the original.

    Returns:
        List of arrays with the original first. For floating-point input every
        returned array has the same dtype as ``frame``.

    Notes:
        * Randomness comes from NumPy's global random state (``np.random``).
        * RMS is restored before ``handle_clipping`` runs; if that step has to
          rescale a frame, the final RMS of that frame is lower than the
          original's.
    """
    augmented = [frame.copy()]
    original_rms = compute_rms(frame)

    # np.random.normal returns float64, so adding the noise silently upcasts a
    # float32 frame. Remember the input dtype so augmented frames match the
    # un-augmented copy. Integer input is left alone: casting back would truncate.
    restore_dtype = frame.dtype if np.issubdtype(frame.dtype, np.floating) else None

    for _ in range(n_augmentations - 1):
        aug = frame.copy()

        # 1. Time shift (circular): samples rolled off one end re-enter at the other
        shift_samples = np.random.randint(-len(aug)//4, len(aug)//4)
        aug = np.roll(aug, shift_samples)

        # 2. Pitch shift (subtle)
        n_steps = np.random.uniform(-1.0, 1.0)
        try:
            aug = librosa.effects.pitch_shift(aug, sr=sr, n_steps=n_steps)
        except Exception as exc:
            # Best-effort: continue with the time-shifted frame. Report the
            # first failure only, so a systematic problem is visible without
            # flooding the output with one line per frame.
            if not getattr(augment_frame, "_pitch_shift_warned", False):
                print(f"  Warning: pitch shift failed ({exc}). Continuing without pitch shift.")
                augment_frame._pitch_shift_warned = True

        # 3. Add very subtle Gaussian noise (std = 0.1% of the current RMS)
        current_rms = compute_rms(aug)
        noise_level = current_rms * 0.001
        noise = np.random.normal(0, noise_level, len(aug))
        aug = aug + noise

        # 4. Restore the original RMS so augmentation does not change loudness
        aug_rms = compute_rms(aug)
        if aug_rms > 1e-8:
            aug = aug * (original_rms / aug_rms)

        # 5. Safety clipping
        aug = handle_clipping(aug)

        # 6. Return to the input dtype
        if restore_dtype is not None and aug.dtype != restore_dtype:
            aug = aug.astype(restore_dtype)

        augmented.append(aug)

    return augmented
   

In [4]:
# Fail fast with a clear message when the exploration table is missing,
# then load it as the list of files to process.
csv_is_present = os.path.exists(EXPLORATION_CSV)
if not csv_is_present:
    raise FileNotFoundError(f"Exploration CSV not found: {EXPLORATION_CSV}")

df_expl = pd.read_csv(EXPLORATION_CSV)
print(f"\nLoaded exploration data: {len(df_expl)} files")


Loaded exploration data: 843 files


In [5]:
# Rows sharing the same 'file_hash' value are collapsed to their first occurrence.
original_count = len(df_expl)
is_repeat = df_expl.duplicated(subset='file_hash', keep='first')
df_expl = df_expl[~is_repeat]
duplicates_removed = int(is_repeat.sum())
print(f"Removed {duplicates_removed} duplicate files → {len(df_expl)} unique files")

Removed 40 duplicate files → 803 unique files


In [6]:
# Keep only the rows whose 'load_success' flag is True; work on an independent copy.
loaded_ok = df_expl['load_success'] == True
df_expl = df_expl.loc[loaded_ok].copy()
print(f"Using {len(df_expl)} files that loaded successfully")

Using 803 files that loaded successfully


In [None]:
# Bookkeeping filled in by the processing loop
processed_records = []   # one metadata dict per saved frame
skipped_details = []     # human-readable reason for every skipped file
stats = dict.fromkeys(
    (
        'files_processed',
        'files_skipped_silent',
        'files_skipped_error',
        'frames_original',
        'frames_augmented',
        'frames_skipped_silent',
    ),
    0,
)

# Output layout: <PROCESSED_ROOT>/<category>/, one folder per category in the table
os.makedirs(PROCESSED_ROOT, exist_ok=True)
for category in df_expl['category'].unique():
    os.makedirs(os.path.join(PROCESSED_ROOT, category), exist_ok=True)


Processing files...


In [None]:
# Main preprocessing pass.
# For every row of df_expl: load the file, clean it at file level, cut it into
# frames, normalise and augment each non-silent frame, save every resulting
# frame as a .npy file and record one metadata dict per saved frame.
# Side effects: writes under PROCESSED_ROOT, appends to processed_records and
# skipped_details, and updates the counters in stats.
print(f"\nProcessing files...")

# Process each file
for _, row in tqdm(df_expl.iterrows(), total=len(df_expl), desc="Processing"):
    category = row['category']
    filename = row['file']
    filepath = os.path.join(DATA_DIR, category, filename)
    # Rows without an 'snr_db' entry default to +inf, i.e. never treated as low-SNR
    original_snr = row.get('snr_db', np.inf)

    if not os.path.exists(filepath):
        stats['files_skipped_error'] += 1
        skipped_details.append(f"{filename}: File not found")
        continue

    try:
        # 1. Load audio (mono, resampled to TARGET_SR)
        y, sr = load_audio(filepath)

        # 2. Check for silence at file level
        if is_silent(y):
            stats['files_skipped_silent'] += 1
            skipped_details.append(f"{filename}: Silent file (RMS={compute_rms(y):.6f})")
            continue

        # 3. Apply noise reduction for low-SNR Siren files only
        if category == 'Siren' and original_snr < LOW_SNR_THR:
            y = reduce_noise(y, sr)

        # 4. Fix clipping at file level
        y = handle_clipping(y)

        # 5. Pad if needed (before frame extraction), so short files still yield one frame
        y = pad_to_length(y, sr)

        stats['files_processed'] += 1

    except Exception as e:
        # NOTE(review): this handler also covers noise reduction, clipping and
        # padding, so an "Error loading" entry may originate from any of steps 1-5.
        stats['files_skipped_error'] += 1
        skipped_details.append(f"{filename}: Error loading - {str(e)}")
        continue

    # FRAME-LEVEL PROCESSING
    # Categories missing from the config dicts fall back to 3 versions per
    # frame and a target RMS of 0.10.
    frames = extract_frames(y, sr)
    augment_factor = AUGMENT_MULTIPLIERS.get(category, 3)
    target_rms = CATEGORY_RMS_TARGETS.get(category, 0.10)

    for frame_idx, frame in enumerate(frames):
        # Check frame for silence BEFORE normalization
        if is_silent(frame):
            # Counts every version that would have been saved for this frame
            # (original + augmentations), not just the one source frame.
            stats['frames_skipped_silent'] += augment_factor  # Count all would-be augmentations
            continue

        # Normalize frame ONCE before augmentation
        frame_normalized = normalise_rms(frame, target_rms, max_gain=3.0)
        frame_normalized = handle_clipping(frame_normalized)

        # Double-check after normalization (catches edge cases)
        if is_silent(frame_normalized):
            stats['frames_skipped_silent'] += augment_factor
            continue

        # Generate augmented versions (index 0 is the un-augmented frame)
        augmented_frames = augment_frame(frame_normalized, sr, category, augment_factor)

        for aug_idx, aug_frame in enumerate(augmented_frames):
            # No silence check needed - already verified before augmentation
            # and augment_frame rescales each version back to the source RMS

            # Final safety clipping
            aug_frame = handle_clipping(aug_frame)

            # Save frame
            # NOTE(review): the name is built from the file stem only, so two
            # files in one category that differ only by extension would
            # overwrite each other's frames - confirm this cannot occur.
            frame_name = f"{os.path.splitext(filename)[0]}_f{frame_idx}_a{aug_idx}.npy"
            cat_out_dir = os.path.join(PROCESSED_ROOT, category)
            frame_path = os.path.join(cat_out_dir, frame_name)

            np.save(frame_path, aug_frame)

            # Record metadata
            processed_records.append({
                'category': category,
                'original_file': filename,
                'frame_idx': frame_idx,
                'aug_idx': aug_idx,
                'is_augmented': aug_idx > 0,
                'frame_path': frame_path,
                'duration_sec': FRAME_SEC
            })

            if aug_idx == 0:
                stats['frames_original'] += 1
            else:
                stats['frames_augmented'] += 1

Processing: 100%|██████████| 803/803 [02:35<00:00,  5.16it/s]


In [9]:
 # SAVE METADATA 
# Build the metadata table with an explicit schema so that it has the expected
# columns even when no frame was saved (e.g. DATA_DIR points to the wrong
# place); otherwise the summary below would fail with a KeyError before the
# skipped-file details that explain the problem are printed.
meta_columns = ['category', 'original_file', 'frame_idx', 'aug_idx',
                'is_augmented', 'frame_path', 'duration_sec']
meta_df = pd.DataFrame(processed_records, columns=meta_columns)
meta_path = os.path.join(PROCESSED_ROOT, 'processed_frames_metadata.csv')
meta_df.to_csv(meta_path, index=False)

# PRINT SUMMARY
print("PREPROCESSING COMPLETE")
print(f"\nSaved {len(meta_df)} total frames")
print(f"  - Original frames: {stats['frames_original']}")
print(f"  - Augmented frames: {stats['frames_augmented']}")
print(f"  - Metadata: {meta_path}")
print(f"\nFrames per category:")
category_counts = meta_df['category'].value_counts().sort_index()
# Per-category count of un-augmented frames, computed once instead of
# re-filtering the whole table for every category.
original_counts = meta_df.loc[meta_df['aug_idx'] == 0, 'category'].value_counts()
for cat, count in category_counts.items():
    orig = int(original_counts.get(cat, 0))
    aug = count - orig
    print(f"  {cat:15s}: {count:4d} ({orig} original + {aug} augmented)")

# Calculate balance (largest class size divided by smallest)
if category_counts.empty:
    print("\nNo frames were saved; class balance not computed.")
else:
    max_count = category_counts.max()
    min_count = category_counts.min()
    balance_ratio = max_count / min_count if min_count > 0 else float('inf')
    print(f"\nClass balance ratio: {balance_ratio:.2f} (max/min)")
    if balance_ratio > 2.0:
        print(f"  Dataset is imbalanced (ratio > 2.0)")
    else:
        print(f"  Dataset is reasonably balanced")

print(f"\nFile processing:")
print(f"  - Processed: {stats['files_processed']}")
print(f"  - Skipped (silent): {stats['files_skipped_silent']}")
print(f"  - Skipped (error): {stats['files_skipped_error']}")

print(f"\nFrame filtering:")
print(f"  - Frames skipped (silent): {stats['frames_skipped_silent']}")

if skipped_details:
    print(f"\nSkipped files details (first 10):")
    for detail in skipped_details[:10]:
        print(f"  - {detail}")
    if len(skipped_details) > 10:
        print(f"  ... and {len(skipped_details) - 10} more")
    

PREPROCESSING COMPLETE

Saved 11906 total frames
  - Original frames: 3243
  - Augmented frames: 8663
  - Metadata: ../data/processed\processed_frames_metadata.csv

Frames per category:
  Alarm_Clock    : 2450 (490 original + 1960 augmented)
  Car_Horn       : 2325 (775 original + 1550 augmented)
  Glass_Breaking : 2394 (399 original + 1995 augmented)
  Gunshot        : 2355 (785 original + 1570 augmented)
  Siren          : 2382 (794 original + 1588 augmented)

Class balance ratio: 1.05 (max/min)
  Dataset is reasonably balanced

File processing:
  - Processed: 802
  - Skipped (silent): 1
  - Skipped (error): 0

Frame filtering:
  - Frames skipped (silent): 2095

Skipped files details (first 10):
  - -74dab6wYqU_30.wav: Silent file (RMS=0.002488)
