# Audio Preprocessing Experiments

**Goal:** Test different preprocessing techniques to improve STT accuracy on degraded archival audio.

**Methods to test:**
1. Baseline (no preprocessing)
2. Noise reduction (spectral gating, Wiener filter)
3. Loudness normalization (-23 to -18 LUFS)
4. High-pass filtering (remove low-frequency rumble)
5. Resampling (16kHz vs 48kHz)
6. Combinations of above

**Sample audio:** VHP (Veterans History Project) audio, which the previous quality analysis found to be bandwidth-limited with low noise

In [None]:
# Imports
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import soundfile as sf
from IPython.display import Audio, display
import pyloudnorm as pyln
from scipy import signal
import noisereduce as nr

# Set plot style
plt.style.use('default')
%matplotlib inline

## 1. Load Sample Audio

Using VHP sample from previous quality analysis.

In [None]:
# Location of the sample recording, relative to the notebook directory
audio_path = Path("../data/loc_veteran/sample_audio.mp3")

# Decode to a mono signal; sr=None asks librosa to keep the file's own sample rate
y, sr = librosa.load(audio_path, mono=True, sr=None)

# Report the basic properties of the decoded signal
print(f"Sample rate: {sr} Hz")
print(f"Duration: {len(y) / sr:.2f} seconds")
print(f"Audio shape: {y.shape}")

# Inline player for a quick listening check
print("\nOriginal audio:")
original_player = Audio(y, rate=sr)
display(original_player)

In [None]:
# Three stacked views of the unprocessed signal:
# waveform, linear-frequency spectrogram, mel spectrogram
fig, axes = plt.subplots(3, 1, figsize=(14, 10))
ax_wave, ax_spec, ax_mel = axes

# Time-domain view
librosa.display.waveshow(y, sr=sr, ax=ax_wave)
ax_wave.set_title("Original Waveform")
ax_wave.set_xlabel("Time (s)")
ax_wave.set_ylabel("Amplitude")

# STFT magnitude converted to dB, referenced to the maximum magnitude
stft_magnitude = np.abs(librosa.stft(y))
D = librosa.amplitude_to_db(stft_magnitude, ref=np.max)
librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='hz', ax=ax_spec)
ax_spec.set_title("Spectrogram (Original)")
ax_spec.set_ylabel("Frequency (Hz)")

# Mel-scaled power spectrogram converted to dB, referenced to the maximum power
M = librosa.feature.melspectrogram(y=y, sr=sr)
M_db = librosa.power_to_db(M, ref=np.max)
librosa.display.specshow(M_db, sr=sr, x_axis='time', y_axis='mel', ax=ax_mel)
ax_mel.set_title("Mel Spectrogram (Original)")
ax_mel.set_ylabel("Mel Frequency")

plt.tight_layout()
plt.show()

## 2. Baseline Audio Quality Metrics

Measure the original audio quality before preprocessing.

In [None]:
def calculate_audio_metrics(y, sr):
    """
    Summarize a signal with a handful of scalar quality descriptors.

    Each frame-wise librosa feature is averaged over all of its values;
    integrated loudness is measured with a pyloudnorm meter.

    Args:
        y: Audio signal
        sr: Sample rate

    Returns:
        dict: Metrics keyed by 'centroid_hz', 'rolloff_hz', 'flatness',
        'zcr', 'rms' and 'loudness_lufs' (in that order)
    """
    # Frame-wise descriptors, keyed by the name they are reported under
    framewise = {
        # Spectral centroid (brightness)
        'centroid_hz': librosa.feature.spectral_centroid(y=y, sr=sr),
        # Spectral roll-off at 85% (bandwidth)
        'rolloff_hz': librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85),
        # Spectral flatness (noise vs tonal)
        'flatness': librosa.feature.spectral_flatness(y=y),
        # Zero crossing rate (noisiness)
        'zcr': librosa.feature.zero_crossing_rate(y),
        # RMS energy
        'rms': librosa.feature.rms(y=y),
    }

    # Collapse every descriptor to a single mean value
    metrics = {name: np.mean(frames) for name, frames in framewise.items()}

    # Integrated loudness (LUFS) from pyloudnorm
    metrics['loudness_lufs'] = pyln.Meter(sr).integrated_loudness(y)

    return metrics

# Measure the unprocessed signal; later sections compare against these values
baseline_metrics = calculate_audio_metrics(y, sr)

print("Baseline Audio Quality Metrics:")
print("=" * 50)
for metric_name, metric_value in baseline_metrics.items():
    print(f"{metric_name:20s}: {metric_value:.4f}")

## 3. Preprocessing Method 1: Noise Reduction

Use spectral gating to remove background noise.

**How it works:**
- Analyzes noise profile from audio
- Applies spectral gate to reduce noise below threshold
- Preserves speech signal

In [None]:
# Spectral gating with the noisereduce library, in stationary mode.
# prop_decrease=1.0 is the value passed for the proportion of noise to remove.
y_reduced = nr.reduce_noise(
    y=y,
    sr=sr,
    stationary=True,
    prop_decrease=1.0,
)

print("Noise reduction applied.")
print("\nNoise-reduced audio:")
denoised_player = Audio(y_reduced, rate=sr)
display(denoised_player)

In [None]:
# Side-by-side comparison: left column = original, right column = noise-reduced
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# dB spectrograms, each referenced to its own maximum magnitude
D_orig = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
D_reduced = librosa.amplitude_to_db(np.abs(librosa.stft(y_reduced)), ref=np.max)

# (signal, spectrogram, waveform title, spectrogram title) for each column
columns = [
    (y, D_orig, "Original Waveform", "Original Spectrogram"),
    (y_reduced, D_reduced, "Noise-Reduced Waveform", "Noise-Reduced Spectrogram"),
]

for col, (clip, spec_db, wave_title, spec_title) in enumerate(columns):
    # Top row: waveform
    wave_ax = axes[0, col]
    librosa.display.waveshow(clip, sr=sr, ax=wave_ax)
    wave_ax.set_title(wave_title)
    wave_ax.set_ylabel("Amplitude")

    # Bottom row: spectrogram
    spec_ax = axes[1, col]
    librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='hz', ax=spec_ax)
    spec_ax.set_title(spec_title)
    spec_ax.set_ylabel("Frequency (Hz)")

plt.tight_layout()
plt.show()

In [None]:
# Compare metrics before and after noise reduction
noise_reduced_metrics = calculate_audio_metrics(y_reduced, sr)

print("Metrics Comparison: Original vs Noise-Reduced")
print("="*70)
print(f"{'Metric':<20} {'Original':>15} {'Noise-Reduced':>15} {'Change':>15}")
print("="*70)
for key, orig in baseline_metrics.items():
    reduced = noise_reduced_metrics[key]
    # Divide by |orig| rather than orig: with a negative baseline (LUFS values
    # are typically negative) a signed divisor flips the sign of the result,
    # so a drop in loudness would be printed as a positive change.
    # A zero baseline is reported as 0% to avoid a division by zero.
    change = ((reduced - orig) / abs(orig) * 100) if orig != 0 else 0
    print(f"{key:<20} {orig:>15.4f} {reduced:>15.4f} {change:>14.2f}%")

## 4. Preprocessing Method 2: Loudness Normalization

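Normalize audio to a target integrated loudness, measured according to ITU-R BS.1770-4 (the standard defines how loudness is measured; the target level is chosen separately).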

**Target:** -23 LUFS (broadcasting standard) to -18 LUFS (podcast standard)

**Why:** Consistent loudness helps STT models (trained on normalized audio).

In [None]:
# Loudness normalization
def normalize_loudness(y, sr, target_lufs=-23.0):
    """
    Normalize audio to target loudness (LUFS).

    The integrated loudness is measured with a pyloudnorm meter, the signal
    is scaled to the target, and the result is measured again as a check.
    Progress is reported with print().

    Args:
        y: Audio signal
        sr: Sample rate
        target_lufs: Target loudness in LUFS (default: -23.0 for broadcasting)

    Returns:
        Normalized audio signal. If the measured loudness is not finite
        (for example digital silence), an unmodified copy of the input is
        returned instead, because no finite gain can reach the target.
    """
    meter = pyln.Meter(sr)
    loudness = meter.integrated_loudness(y)

    print(f"Current loudness: {loudness:.2f} LUFS")
    print(f"Target loudness: {target_lufs:.2f} LUFS")

    # A non-finite measurement would turn into an infinite/NaN gain and fill
    # the output with inf/NaN samples, so leave such input untouched.
    if not np.isfinite(loudness):
        print("Measured loudness is not finite; skipping normalization.")
        return np.copy(y)

    # Normalize
    y_normalized = pyln.normalize.loudness(y, loudness, target_lufs)

    # Verify
    new_loudness = meter.integrated_loudness(y_normalized)
    print(f"New loudness: {new_loudness:.2f} LUFS")

    # Raising the level can push peaks past full scale; report it so the
    # caller can pick a lower target or limit the signal before saving.
    peak = np.max(np.abs(y_normalized)) if np.size(y_normalized) else 0.0
    if peak > 1.0:
        print(f"Warning: peak amplitude {peak:.3f} exceeds 1.0 after normalization")

    return y_normalized

# First target: -23 LUFS (broadcasting standard)
print("Normalizing to -23 LUFS (broadcasting standard)...")
y_norm_23 = normalize_loudness(
    y,
    sr,
    target_lufs=-23.0,
)

print("\nLoudness-normalized audio (-23 LUFS):")
player_23 = Audio(y_norm_23, rate=sr)
display(player_23)

In [None]:
# Second target: -18 LUFS (podcast standard)
print("Normalizing to -18 LUFS (podcast standard)...")
y_norm_18 = normalize_loudness(
    y,
    sr,
    target_lufs=-18.0,
)

print("\nLoudness-normalized audio (-18 LUFS):")
player_18 = Audio(y_norm_18, rate=sr)
display(player_18)

In [None]:
# Stack the original and both normalized variants so amplitudes can be compared
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

# (signal, panel title) from top to bottom
waveform_panels = [
    (y, f"Original (Loudness: {baseline_metrics['loudness_lufs']:.2f} LUFS)"),
    (y_norm_23, "Normalized to -23 LUFS (Broadcasting Standard)"),
    (y_norm_18, "Normalized to -18 LUFS (Podcast Standard)"),
]

for panel_ax, (clip, panel_title) in zip(axes, waveform_panels):
    librosa.display.waveshow(clip, sr=sr, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel("Amplitude")

plt.tight_layout()
plt.show()

## 5. Preprocessing Method 3: High-Pass Filtering

Remove low-frequency rumble (< 80 Hz) often present in analog recordings.

**Why:** Low-frequency noise (rumble, hum) doesn't contribute to speech but can interfere with STT.

In [None]:
# High-pass filter
def highpass_filter(y, sr, cutoff_freq=80):
    """
    Apply high-pass filter to remove low-frequency rumble.

    A 4th-order Butterworth high-pass is designed as second-order sections
    and applied forward and backward, so the result has no phase shift.
    Second-order sections are used instead of (b, a) coefficients because
    the transfer-function form loses precision when the cutoff is a tiny
    fraction of the Nyquist frequency (e.g. 80 Hz at 44.1/48 kHz).

    Args:
        y: Audio signal
        sr: Sample rate
        cutoff_freq: Cutoff frequency in Hz (default: 80 Hz)

    Returns:
        Filtered audio signal

    Raises:
        ValueError: If cutoff_freq is not strictly between 0 and sr / 2.
    """
    nyquist = sr / 2
    # Fail early with a message in Hz instead of scipy's normalized-frequency error
    if not 0 < cutoff_freq < nyquist:
        raise ValueError(
            f"cutoff_freq must be between 0 and the Nyquist frequency "
            f"({nyquist} Hz), got {cutoff_freq}"
        )
    normalized_cutoff = cutoff_freq / nyquist

    # Design Butterworth high-pass filter (4th order) as second-order sections
    sos = signal.butter(4, normalized_cutoff, btype='high', analog=False, output='sos')

    # Apply filter forward and backward (zero phase)
    y_filtered = signal.sosfiltfilt(sos, y)

    print(f"High-pass filter applied: cutoff = {cutoff_freq} Hz")

    return y_filtered

# Filter the original signal with an 80 Hz cutoff
y_highpass = highpass_filter(
    y,
    sr,
    cutoff_freq=80,
)

print("\nHigh-pass filtered audio:")
highpass_player = Audio(y_highpass, rate=sr)
display(highpass_player)

In [None]:
# Compare waveforms and low-frequency spectra before and after high-pass filtering
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# STFT magnitudes (linear scale) and the frequency of each STFT bin
D_orig = np.abs(librosa.stft(y))
D_highpass = np.abs(librosa.stft(y_highpass))
freqs_orig = librosa.fft_frequencies(sr=sr)
low_freq_mask = freqs_orig < 500  # Focus on low frequencies

# (signal, STFT magnitude, waveform title, spectrum title) for each column
columns = [
    (y, D_orig, "Original Waveform", "Original Spectrum (0-500 Hz)"),
    (y_highpass, D_highpass, "High-Pass Filtered Waveform", "High-Pass Filtered Spectrum (0-500 Hz)"),
]

for col, (clip, magnitude, wave_title, spectrum_title) in enumerate(columns):
    # Top row: waveform
    wave_ax = axes[0, col]
    librosa.display.waveshow(clip, sr=sr, ax=wave_ax)
    wave_ax.set_title(wave_title)
    wave_ax.set_ylabel("Amplitude")

    # Bottom row: magnitude averaged over time, restricted to bins below 500 Hz
    spectrum_ax = axes[1, col]
    mean_magnitude = np.mean(magnitude, axis=1)
    spectrum_ax.plot(freqs_orig[low_freq_mask], mean_magnitude[low_freq_mask])
    spectrum_ax.set_title(spectrum_title)
    spectrum_ax.set_xlabel("Frequency (Hz)")
    spectrum_ax.set_ylabel("Magnitude")
    # Dashed marker at the filter cutoff
    spectrum_ax.axvline(x=80, color='r', linestyle='--', label='Cutoff (80 Hz)')
    spectrum_ax.legend()

plt.tight_layout()
plt.show()

## 6. Preprocessing Method 4: Resampling

Test different sample rates for STT models.

**Common rates:**
- 16 kHz: the input sample rate expected by Whisper and Wav2Vec2
- 48 kHz: High-quality preservation

In [None]:
# Convert the original signal to 16 kHz (Whisper/Wav2Vec2 standard)
y_16k = librosa.resample(
    y,
    orig_sr=sr,
    target_sr=16000,
)

# Duration should be unchanged by resampling; print both as a sanity check
print(f"Original sample rate: {sr} Hz")
print(f"Resampled to: 16000 Hz")
print(f"Original duration: {len(y) / sr:.2f} seconds")
print(f"Resampled duration: {len(y_16k) / 16000:.2f} seconds")

print("\nResampled audio (16 kHz):")
resampled_player = Audio(y_16k, rate=16000)
display(resampled_player)

In [None]:
# Spectrograms of the same recording at its original rate and at 16 kHz
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# dB spectrograms, each referenced to its own maximum magnitude
D_orig = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
D_16k = librosa.amplitude_to_db(np.abs(librosa.stft(y_16k)), ref=np.max)

# (spectrogram, sample rate used for the axes, panel title) from top to bottom
spectrogram_rows = [
    (D_orig, sr, f"Original Spectrogram ({sr} Hz)"),
    (D_16k, 16000, "Resampled Spectrogram (16 kHz)"),
]

for row_ax, (spec_db, rate, row_title) in zip(axes, spectrogram_rows):
    librosa.display.specshow(spec_db, sr=rate, x_axis='time', y_axis='hz', ax=row_ax)
    row_ax.set_title(row_title)
    row_ax.set_ylabel("Frequency (Hz)")

plt.tight_layout()
plt.show()

## 7. Combination Method: Full Preprocessing Pipeline

Combine multiple preprocessing steps:
1. High-pass filter (remove rumble)
2. Noise reduction (remove background noise)
3. Loudness normalization (consistent levels)
4. Resample to 16 kHz (model input)

In [None]:
def full_preprocessing_pipeline(y, sr,
                                 highpass_cutoff=80,
                                 noise_reduce=True,
                                 target_lufs=-23.0,
                                 target_sr=16000):
    """
    Apply full preprocessing pipeline.

    Steps run in a fixed order: high-pass filter, noise reduction,
    loudness normalization, resampling. Every step can be switched off,
    so one function covers any combination of methods. Progress is
    reported with print().

    Args:
        y: Audio signal
        sr: Sample rate
        highpass_cutoff: High-pass filter cutoff frequency (Hz);
            None skips the filter
        noise_reduce: Whether to apply noise reduction
        target_lufs: Target loudness (LUFS); None skips normalization
        target_sr: Target sample rate (Hz); None keeps the input rate

    Returns:
        Preprocessed audio signal, new sample rate (target_sr, or sr when
        target_sr is None)
    """
    print("Starting preprocessing pipeline...")
    y_proc = y

    # Step 1: High-pass filter
    if highpass_cutoff is not None:
        print("\n[1/4] Applying high-pass filter...")
        y_proc = highpass_filter(y_proc, sr, cutoff_freq=highpass_cutoff)
    else:
        print("\n[1/4] Skipping high-pass filter")

    # Step 2: Noise reduction
    if noise_reduce:
        print("\n[2/4] Applying noise reduction...")
        y_proc = nr.reduce_noise(y=y_proc, sr=sr, stationary=True, prop_decrease=1.0)
    else:
        print("\n[2/4] Skipping noise reduction")

    # Step 3: Loudness normalization
    if target_lufs is not None:
        print("\n[3/4] Normalizing loudness...")
        y_proc = normalize_loudness(y_proc, sr, target_lufs=target_lufs)
    else:
        print("\n[3/4] Skipping loudness normalization")

    # Step 4: Resample
    if target_sr is None:
        print("\n[4/4] Skipping resample")
        output_sr = sr
    else:
        print(f"\n[4/4] Resampling to {target_sr} Hz...")
        if sr != target_sr:
            y_proc = librosa.resample(y_proc, orig_sr=sr, target_sr=target_sr)
            print(f"Resampled: {sr} Hz → {target_sr} Hz")
        else:
            print(f"Already at {target_sr} Hz, skipping resample")
        output_sr = target_sr

    print("\nPreprocessing pipeline complete!")
    return y_proc, output_sr

# Settings for the combined run: every step enabled, 16 kHz output
pipeline_settings = dict(
    highpass_cutoff=80,
    noise_reduce=True,
    target_lufs=-23.0,
    target_sr=16000,
)
y_full_preproc, sr_full_preproc = full_preprocessing_pipeline(y, sr, **pipeline_settings)

print("\nFully preprocessed audio:")
pipeline_player = Audio(y_full_preproc, rate=sr_full_preproc)
display(pipeline_player)

In [None]:
# Left column = original signal, right column = output of the full pipeline
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# dB spectrograms, each referenced to its own maximum magnitude
D_orig = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
D_preproc = librosa.amplitude_to_db(np.abs(librosa.stft(y_full_preproc)), ref=np.max)

# (signal, its sample rate, spectrogram, waveform title, spectrogram title)
columns = [
    (y, sr, D_orig,
     "Original Waveform", f"Original Spectrogram ({sr} Hz)"),
    (y_full_preproc, sr_full_preproc, D_preproc,
     "Fully Preprocessed Waveform", f"Preprocessed Spectrogram ({sr_full_preproc} Hz)"),
]

for col, (clip, rate, spec_db, wave_title, spec_title) in enumerate(columns):
    # Top row: waveform, drawn with the signal's own sample rate
    wave_ax = axes[0, col]
    librosa.display.waveshow(clip, sr=rate, ax=wave_ax)
    wave_ax.set_title(wave_title)
    wave_ax.set_ylabel("Amplitude")

    # Bottom row: spectrogram
    spec_ax = axes[1, col]
    librosa.display.specshow(spec_db, sr=rate, x_axis='time', y_axis='hz', ax=spec_ax)
    spec_ax.set_title(spec_title)
    spec_ax.set_ylabel("Frequency (Hz)")

plt.tight_layout()
plt.show()

## 8. Save Preprocessed Audio Files

Save different preprocessing variants for testing with STT models.

In [None]:
# Create output directory.
# parents=True so a missing ../data directory does not abort the save.
output_dir = Path("../data/preprocessed_samples")
output_dir.mkdir(parents=True, exist_ok=True)

# Each variant maps a file-name suffix to (signal, sample rate of that signal)
variants = {
    "baseline": (y, sr),
    "noise_reduced": (y_reduced, sr),
    "loudness_norm_23lufs": (y_norm_23, sr),
    "loudness_norm_18lufs": (y_norm_18, sr),
    "highpass_filtered": (y_highpass, sr),
    "resampled_16khz": (y_16k, 16000),
    "full_pipeline": (y_full_preproc, sr_full_preproc),
}

for variant_name, (audio, sample_rate) in variants.items():
    output_path = output_dir / f"sample_{variant_name}.wav"

    # Samples beyond +/-1.0 may be clipped when the file is stored as integer
    # PCM (NOTE(review): confirm the WAV subtype soundfile picks by default),
    # so flag such variants instead of degrading them silently.
    peak = float(np.max(np.abs(audio))) if len(audio) else 0.0
    if peak > 1.0:
        print(f"Warning: {variant_name} peaks at {peak:.3f} (> 1.0) and may clip when saved")

    sf.write(output_path, audio, sample_rate)
    print(f"Saved: {output_path}")

print(f"\nAll variants saved to {output_dir}")

## 9. Summary and Next Steps

**Preprocessing methods tested:**
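1. ✅ Noise reduction (spectral gating; the Wiener filter listed under the methods to test has not been tried yet)
2. ✅ Loudness normalization (-23 LUFS, -18 LUFS)
3. ✅ High-pass filtering (80 Hz cutoff)
4. ✅ Resampling (16 kHz; no 48 kHz variant was generated)
5. ✅ Full pipeline (high-pass filter → noise reduction → loudness normalization → 16 kHz resampling)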

**Next steps:**
1. Create `preprocess_audio.py` script based on notebook code
2. Test script locally on VHP samples
3. Run preprocessing on cloud for full dataset
4. Run STT inference on preprocessed variants
5. Compare WER across preprocessing methods

**Key insights for script design:**
- Need command-line args for preprocessing method selection
- Config YAML for parameterization (cutoff frequencies, target LUFS, etc.)
- Azure Blob integration for input/output
- Parallel processing support for batch operations