# EDA and Baseline Audio Features for Egyptian Fruit Bat 10k Dataset

This notebook performs exploratory data analysis (EDA), optional resampling of the audio to a lower sample rate (e.g., 48 kHz), and computation of log-mel-spectrogram features for the ~10k-recording subset listed in `data/annotations.csv`.

**Important:** This notebook does **not** modify files under `data/`; all derived artifacts are written to new folders under `derived/`.

In [None]:
from __future__ import annotations

from pathlib import Path

import librosa
import librosa.display  # explicit: `import librosa` alone does not load this submodule on librosa < 0.10
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import soundfile as sf

sns.set_theme(context="notebook", style="whitegrid")

# Project root. `__file__` only exists when this code runs as a script (for
# example after exporting the notebook); inside a notebook kernel it raises
# NameError, so fall back to the current working directory, which is normally
# the notebook's own folder.
try:
    ROOT = Path(__file__).resolve().parent
except NameError:
    ROOT = Path.cwd().resolve()

DATA_DIR = ROOT / "data"                       # inputs; never written to by this notebook
DERIVED_DIR = ROOT / "derived"                 # all generated artifacts live below here
AUDIO_DIR = DATA_DIR / "audio"
DERIVED_AUDIO_48K = DERIVED_DIR / "audio_48k"  # downsampled copies of the audio
DERIVED_MELS_48K = DERIVED_DIR / "mels_48k"    # saved log-mel features (.npy)

# Create the output folders up front so later cells can write without checks.
for _out_dir in (DERIVED_DIR, DERIVED_AUDIO_48K, DERIVED_MELS_48K):
    _out_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Load annotations: the 10k subset plus the full table that carries richer labels
ann_small = pd.read_csv(DATA_DIR / "annotations.csv")
ann_full = pd.read_csv(ROOT / "annotations_filenames.csv")

print("ann_small shape:", ann_small.shape)
print("ann_full shape :", ann_full.shape)

# Left-join on (Emitter, File Name): every subset row is kept, and the extra
# columns of the full table are attached where a matching key exists.
ann = ann_small.merge(
    ann_full,
    on=["Emitter", "File Name"],
    how="left",
)

print("Merged shape (10k with full labels):", ann.shape)

# A left join returns more rows than its left table only when one key matches
# several rows on the right. That would double-count recordings in the plots
# below, so surface it instead of letting it pass silently.
n_extra_rows = len(ann) - len(ann_small)
if n_extra_rows > 0:
    print(
        f"WARNING: merge added {n_extra_rows} rows; "
        "(Emitter, File Name) keys are duplicated in ann_full"
    )

# Rows without a match in the full table have NaN in the columns it contributed.
missing_fileid = ann['FileID'].isna().sum() if 'FileID' in ann.columns else 0
print("Rows with missing FileID after merge:", missing_fileid)
ann.head()


In [None]:
# Label overview for the 10k subset: bar charts of the 20 most frequent values.

# Emitters (the column is required, so a missing column raises here).
emitter_counts = ann['Emitter'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(10, 4))
emitter_counts.plot(kind='bar', ax=ax)
ax.set_title('Top emitters in 10k subset')
ax.set_xlabel('Emitter')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

# Contexts are only plotted when the merge actually provided that column.
if 'Context' in ann.columns:
    context_counts = ann['Context'].value_counts().head(20)
    fig, ax = plt.subplots(figsize=(10, 4))
    context_counts.plot(kind='bar', ax=ax)
    ax.set_title('Top contexts in 10k subset')
    ax.set_xlabel('Context')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()


In [None]:
# Inspect audio durations (using a sample to keep it fast)
from typing import Tuple

def get_duration_and_sr(path: Path) -> Tuple[float, int]:
    """Return ``(duration_in_seconds, sample_rate)`` for the audio file at ``path``.

    The frame count and sample rate are read from the file header with
    ``soundfile.info``, so no samples are decoded. If soundfile cannot open
    the file, the function falls back to decoding it with ``librosa.load`` at
    the native sample rate, as the previous implementation always did.

    Parameters
    ----------
    path : Path
        Location of the audio file.

    Returns
    -------
    Tuple[float, int]
        Duration in seconds and the file's native sample rate in Hz.
    """
    try:
        info = sf.info(str(path))
    except RuntimeError:
        # soundfile reports unreadable/unsupported files with a RuntimeError
        # (subclass); decode the file instead and measure the signal length.
        y, sr = librosa.load(path, sr=None)
        return float(len(y)) / float(sr), int(sr)
    return float(info.frames) / float(info.samplerate), int(info.samplerate)

sample_n = 256  # change to None or larger to scan more files
paths = [AUDIO_DIR / fn for fn in ann_small['File Name'][:sample_n]]

durations = []
srs = []
n_missing = 0  # annotated files that are not present under AUDIO_DIR
for p in paths:
    if not p.exists():
        n_missing += 1
        continue
    d, sr = get_duration_and_sr(p)
    durations.append(d)
    srs.append(sr)

# Say how many files the statistics are really based on; otherwise a mostly
# missing audio folder would look like a small but valid sample.
print(f"Scanned {len(durations)} of {len(paths)} sampled files ({n_missing} missing on disk)")
print("Unique sample rates in sample:", sorted(set(srs)))

if durations:
    print("Duration stats (seconds) in sample:")
    print(pd.Series(durations).describe())

    plt.figure(figsize=(8, 4))
    sns.histplot(durations, bins=50, kde=False)
    plt.xlabel('Duration (s)')
    plt.ylabel('Count')
    plt.title('Distribution of call durations (sample)')
    plt.tight_layout()
    plt.show()
else:
    # Nothing to summarise or plot.
    print("No audio files from the sample were found under", AUDIO_DIR)


## Resample audio to 48 kHz (optional)

The code below creates a *separate* downsampled copy in `derived/audio_48k/` and leaves `data/audio/` untouched.
Run this cell when you are ready; it can take a while for all ~10k files.

In [None]:
TARGET_SR = 48_000  # Hz; sample rate of the derived audio copies

def resample_file(src: Path, dst: Path, target_sr: int = TARGET_SR) -> None:
    """Write a copy of ``src`` resampled to ``target_sr`` at ``dst``.

    Nothing happens when ``dst`` already exists, so the surrounding loop can
    be re-run to resume. The output is first written to a temporary sibling
    file and only renamed to ``dst`` once the write has finished; an
    interrupted run therefore never leaves a truncated ``dst`` that a later
    run would mistake for a completed file.

    Parameters
    ----------
    src : Path
        Audio file to read (loaded at its native sample rate).
    dst : Path
        Destination file; its parent directory is created if needed.
    target_sr : int
        Sample rate of the written file, in Hz.

    Raises
    ------
    Exception
        Any error raised by ``librosa`` or ``soundfile`` while reading,
        resampling or writing is propagated; the temporary file is removed.
    """
    if dst.exists():
        return

    y, sr = librosa.load(src, sr=None)  # sr=None: do not resample while loading
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    dst.parent.mkdir(parents=True, exist_ok=True)
    # Keep the original suffix last so soundfile can still infer the format.
    tmp = dst.with_name(f"{dst.stem}.partial{dst.suffix}")
    try:
        sf.write(str(tmp), y, target_sr)
        tmp.replace(dst)  # rename only after the write completed
    finally:
        # Still present only if the write or the rename failed.
        if tmp.exists():
            tmp.unlink()

# Resample every annotated recording that is present on disk; recordings
# without a source file are skipped, and existing outputs are left alone by
# resample_file itself.
for file_name in ann_small['File Name']:
    source_path = AUDIO_DIR / file_name
    target_path = DERIVED_AUDIO_48K / file_name
    if source_path.exists():
        resample_file(source_path, target_path)

# Cell output: number of WAV files now present in the derived audio folder.
len([*DERIVED_AUDIO_48K.glob('*.wav')])


## Compute mel-spectrograms for baseline features

We compute log-mel spectrograms from the (optionally) downsampled 48 kHz audio and store them as `.npy` files under `derived/mels_48k/`.

In [None]:
N_MELS = 80
HOP_LENGTH = 512
N_FFT = 2048
# Sample rate at which every stored feature is computed. Defined here, not
# taken from the resampling cell, so this cell also runs when that optional
# cell was skipped.
MEL_SR = 48_000

def compute_mel(path: Path, sr: "int | None" = None) -> np.ndarray:
    """Compute a log-mel spectrogram (in dB) for the audio file at ``path``.

    Parameters
    ----------
    path : Path
        Audio file to analyse.
    sr : int or None
        Sample rate to load the audio at. ``None`` (the default) keeps the
        file's native rate; any other value makes ``librosa.load`` resample
        the signal to that rate first.

    Returns
    -------
    np.ndarray
        ``float32`` array produced by ``librosa.power_to_db`` with
        ``ref=np.max``, i.e. dB values relative to the file's own peak.
    """
    y, sr = librosa.load(path, sr=sr)
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS, fmax=sr // 2
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)
    return mel_db.astype(np.float32)

for fn in ann_small['File Name']:
    # Prefer the downsampled copy; otherwise fall back to the original recording.
    resampled = DERIVED_AUDIO_48K / fn
    src = resampled if resampled.exists() else AUDIO_DIR / fn
    if not src.exists():
        continue
    out_path = DERIVED_MELS_48K / (Path(fn).stem + '.npy')
    if out_path.exists():
        continue
    # Always load at MEL_SR. Without this, features taken from the original
    # recordings would be computed at their native rate, with a different
    # time/frequency resolution for the same N_FFT and HOP_LENGTH, yet still
    # be saved under mels_48k.
    mel = compute_mel(src, sr=MEL_SR)
    np.save(out_path, mel)

len(list(DERIVED_MELS_48K.glob('*.npy')))


In [None]:
# Quick sanity check: visualize one mel-spectrogram
example_fn = ann_small['File Name'].iloc[0]
mel_path = DERIVED_MELS_48K / (Path(example_fn).stem + '.npy')

# The feature loop skips recordings whose audio is missing, so the first
# annotated file may have no saved mel. Use any available one in that case.
if not mel_path.exists():
    available_mels = sorted(DERIVED_MELS_48K.glob('*.npy'))
    mel_path = available_mels[0] if available_mels else None

if mel_path is None:
    print("No mel-spectrograms found under", DERIVED_MELS_48K)
else:
    mel = np.load(mel_path)

    plt.figure(figsize=(10, 4))
    # sr and hop_length are passed so specshow can label the time and mel axes.
    librosa.display.specshow(mel, sr=TARGET_SR, hop_length=HOP_LENGTH, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Log-mel spectrogram example (48 kHz)')
    plt.tight_layout()
    plt.show()
