# BirdCLEF 2025 Data Exploration

In [1]:
# 0.2.1: List all audio files
import sys
import os
from pathlib import Path

# Locate the project root so that 'src/' (code) and 'data/' (inputs) can be found.
# The notebook normally runs from 'notebooks/', making the root the parent of the
# cwd. When the kernel is started from the project root itself, 'src/' sits
# directly under the cwd instead. The check must be an existence test: comparing
# the *name* of a path we just built as '<something>/src' can never fail.
cwd = Path(os.getcwd())
if (cwd.parent / 'src').is_dir():
    project_root = cwd.parent
elif (cwd / 'src').is_dir():
    project_root = cwd
else:
    # Neither candidate exists; keep the historical default so any resulting
    # import/path error points at the expected location.
    project_root = cwd.parent

# Add src directory to path to import data_inventory.
# Guarded so that re-running this cell does not append duplicate entries.
src_path = project_root / 'src'
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from data_inventory import list_audio_files

# Data directory lives next to 'src/' under the project root.
data_dir = project_root / 'data'
train_audio_path = data_dir / "train_audio"

print(f"Looking for audio files in: {train_audio_path}")

# List audio files (result is reused by the following cells)
audio_files = list_audio_files(train_audio_path)

print(f"Found {len(audio_files)} audio files.")
# Optionally print the first few files
# print("First 5 files:", audio_files[:5])

Looking for audio files in: /Users/param/birdclef-2025/data/train_audio
Found 28564 audio files.


In [2]:
# 0.2.2: Count files per group
import pandas as pd
from data_inventory import count_files_per_group

# Read the taxonomy table stored in the data directory defined in cell 1
taxonomy_path = data_dir / "taxonomy.csv"
taxonomy_df = pd.read_csv(taxonomy_path)
print(f"Loaded taxonomy data from: {taxonomy_path}")

# Tally the files listed in cell 1 ('audio_files'); the helper returns a pair:
# per-species counts first, per-taxonomic-group counts second
counts = count_files_per_group(audio_files, taxonomy_df)
species_counts, group_counts = counts

n_species = len(species_counts)
print(f"\nFound {n_species} unique species.")
# print("Top 5 species counts:", dict(list(species_counts.items())[:5])) # Example

n_groups = len(group_counts)
print(f"\nFound {n_groups} unique taxonomic groups (classes).")
print("Files per taxonomic group:", group_counts)

Loaded taxonomy data from: /Users/param/birdclef-2025/data/taxonomy.csv

Found 206 unique species.

Found 4 unique taxonomic groups (classes).
Files per taxonomic group: {'Aves': 27648, 'Insecta': 155, 'Amphibia': 583, 'Mammalia': 178}


In [3]:
# 0.2.3: Extract file metadata
from tqdm.auto import tqdm
from data_inventory import extract_file_metadata

print("Extracting metadata for all audio files (this might take a while)...")

# Assumes 'audio_files' list is available from cell 1.
# One metadata record per file; tqdm renders a progress bar while iterating.
metadata_list = [extract_file_metadata(f) for f in tqdm(audio_files)]

print(f"\nExtracted metadata for {len(metadata_list)} files.")

# Check for errors.
# Collect the failed records once, using a single predicate ('error' is not None)
# for both the count and the examples, so the two can never disagree (a falsy but
# non-None error such as "" is both counted and shown).
# NOTE(review): assumes each record is a dict-like with 'error' and 'file_path'
# keys, as implied by the accesses below; confirm against extract_file_metadata.
failed_metadata = [meta for meta in metadata_list if meta.get('error') is not None]
error_count = len(failed_metadata)
print(f"\nEncountered {error_count} errors during metadata extraction.")
if error_count > 0:
    print("Example errors:")
    # Show at most five examples; slicing avoids scanning the rest of the list.
    for meta in failed_metadata[:5]:
        print(f"  - {meta['file_path']}: {meta['error']}")

Extracting metadata for all audio files (this might take a while)...


  0%|          | 0/28564 [00:00<?, ?it/s]


Extracted metadata for 28564 files.

Encountered 0 errors during metadata extraction.


In [4]:
# 0.2.4: Create metadata DataFrame
from data_inventory import create_metadata_dataframe
import pandas as pd # Ensure pandas is imported if not already

print("Creating metadata DataFrame...")

# Inputs come from earlier cells: 'metadata_list' (cell 3) and 'taxonomy_df' (cell 2)
metadata_df = create_metadata_dataframe(metadata_list, taxonomy_df)

print("\nDataFrame created successfully.")

# Structural overview: columns, non-null counts, dtypes
print("\nDataFrame Info:")
metadata_df.info()

# First rows, rendered with the notebook's rich display
print("\nDataFrame Head:")
head_preview = metadata_df.head()
display(head_preview)

# Rows whose 'class_name' is null are counted as lacking taxonomy info
class_is_missing = metadata_df['class_name'].isna()
missing_taxonomy = class_is_missing.sum()
print(f"\nNumber of files with missing taxonomy info (if any): {missing_taxonomy}")

Creating metadata DataFrame...

DataFrame created successfully.

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28564 entries, 0 to 28563
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   file_path      28564 non-null  object 
 1   duration       28564 non-null  float64
 2   sampling_rate  28564 non-null  int64  
 3   format         28564 non-null  object 
 4   error          0 non-null      object 
 5   primary_label  28564 non-null  object 
 6   class_name     28564 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.5+ MB

DataFrame Head:


Unnamed: 0,file_path,duration,sampling_rate,format,error,primary_label,class_name
0,/Users/param/birdclef-2025/data/train_audio/cr...,63.94775,32000,.ogg,,crbtan1,Aves
1,/Users/param/birdclef-2025/data/train_audio/cr...,20.610625,32000,.ogg,,crbtan1,Aves
2,/Users/param/birdclef-2025/data/train_audio/cr...,12.042438,32000,.ogg,,crbtan1,Aves
3,/Users/param/birdclef-2025/data/train_audio/cr...,11.859594,32000,.ogg,,crbtan1,Aves
4,/Users/param/birdclef-2025/data/train_audio/cr...,45.384,32000,.ogg,,crbtan1,Aves



Number of files with missing taxonomy info (if any): 0


In [5]:
# 0.2.5: Generate summary statistics
from data_inventory import generate_summary_statistics
import json # For pretty printing the summary

print("Generating summary statistics...")

# Built from 'metadata_df', which the previous cell created
summary_stats = generate_summary_statistics(metadata_df)

print("\nSummary statistics generated successfully.")

# Serialize first, then print: values json cannot encode natively are
# passed through str() via default=str
print("\n--- Summary Statistics ---")
summary_json = json.dumps(summary_stats, indent=4, default=str)
print(summary_json)

Generating summary statistics...

Summary statistics generated successfully.

--- Summary Statistics ---
{
    "total_files": 28564,
    "total_species": 206,
    "total_taxonomic_groups": 4,
    "files_per_species": {
        "grekis": 990,
        "compau": 808,
        "trokin": 787,
        "roahaw": 709,
        "banana": 610,
        "whtdov": 572,
        "socfly1": 543,
        "yeofly1": 525,
        "bobfly1": 514,
        "wbwwre1": 499,
        "soulap1": 487,
        "sobtyr1": 478,
        "trsowl": 470,
        "laufal1": 467,
        "strcuc1": 431,
        "bbwduc": 424,
        "saffin": 419,
        "amekes": 409,
        "tropar": 397,
        "compot1": 383,
        "blbgra1": 380,
        "bubwre1": 379,
        "strfly1": 377,
        "gycwor1": 365,
        "greegr": 340,
        "linwoo1": 330,
        "pirfly1": 324,
        "littin1": 323,
        "bkmtou1": 311,
        "yercac1": 302,
        "butsal1": 298,
        "smbani": 287,
        "bugtan": 280,
   