# Dog Vocalization EDA
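Exploratory analysis of the collected audio data: per-file waveform, spectrogram, MFCC, and spectral-centroid plots, plus a count of samples per label.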

In [None]:
import sys
sys.path.append('../src')

import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from pathlib import Path
from preprocessing.audio_utils import extract_features, create_spectrogram

In [None]:
# Load and analyze sample audio
def analyze_audio_file(file_path):
    """Print basic stats for one saved recording and plot a 2x2 overview.

    The file is loaded with ``np.load`` and must contain a pickled dict with
    the keys ``'audio'`` (the samples) and ``'sr'`` (the sample rate). The
    figure shows the waveform, the STFT magnitude spectrogram in dB, the
    MFCCs, and the spectral centroid over time.

    Args:
        file_path: Path of the ``.npy`` file to load.

    Returns:
        The ``(audio, sr)`` pair read from the file.
    """
    # SECURITY: allow_pickle=True unpickles arbitrary objects, which can
    # execute code on load. Only open files produced by this project.
    data = np.load(file_path, allow_pickle=True).item()
    audio, sr = data['audio'], data['sr']

    # Basic stats
    # NOTE(review): len(audio) is the sample count only for 1-D (mono)
    # audio - confirm against whatever writes these files.
    print(f"Duration: {len(audio)/sr:.2f}s")
    print(f"Sample rate: {sr}Hz")

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # Waveform
    librosa.display.waveshow(audio, sr=sr, ax=axes[0, 0])
    axes[0, 0].set_title('Waveform')

    # Spectrogram: pass axis types so ticks are in seconds / Hz instead of
    # being left blank.
    S = librosa.stft(audio)
    librosa.display.specshow(
        librosa.amplitude_to_db(np.abs(S)),
        sr=sr,
        x_axis='time',
        y_axis='hz',
        ax=axes[0, 1],
    )
    axes[0, 1].set_title('Spectrogram')

    # MFCC
    mfccs = librosa.feature.mfcc(y=audio, sr=sr)
    librosa.display.specshow(mfccs, sr=sr, x_axis='time', ax=axes[1, 0])
    axes[1, 0].set_title('MFCC')
    axes[1, 0].set_ylabel('MFCC coefficient')

    # Spectral centroid, plotted against time (not frame index) so it lines
    # up with the other panels.
    cent = librosa.feature.spectral_centroid(y=audio, sr=sr)
    times = librosa.times_like(cent, sr=sr)
    axes[1, 1].plot(times, cent[0])
    axes[1, 1].set_title('Spectral Centroid')
    axes[1, 1].set_xlabel('Time (s)')
    axes[1, 1].set_ylabel('Frequency (Hz)')

    plt.tight_layout()
    return audio, sr

In [None]:
# Dataset overview: report how many .npy samples each label directory holds.
data_path = Path('../data/raw')
if not data_path.exists():
    # Nothing has been recorded yet; point the user at the collection tool.
    print("No data collected yet. Use AudioCollector to add samples.")
else:
    for label_dir in data_path.iterdir():
        # Stray files directly under the raw-data folder are not labels.
        if not label_dir.is_dir():
            continue
        count = sum(1 for _ in label_dir.glob('*.npy'))
        print(f"{label_dir.name}: {count} samples")