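Este notebook toma el CSV con los metadatos de todas las especies, lo filtra (región, calificación, segmentos utilizables y cantidad de autores) y genera los CSVs recortados que se usan para procesar y entrenar.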

In [None]:
# Selection hyperparameters.
#
# Geographic bounding box, in decimal degrees. Widening it (lower MIN_*,
# higher MAX_*) lets in more samples and more species.
MIN_LAT, MAX_LAT = -57.0, -15.0
MIN_LON, MAX_LON = -77.0, -48.0

# Per-species / per-sample cutoffs. Lowering any of them is more permissive.
SAMPLES_CUTOFF = 18      # minimum recordings per species (lower -> more species)
SEGMENTS_CUTOFF = 72     # minimum usable segments per species (lower -> more species)
MIN_RATING_CUTOFF = 1    # minimum rating for a recording (lower -> more samples)
MIN_AUTHORS_CUTOFF = 9   # minimum distinct authors per species (lower -> more species)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import librosa
import numpy as np
from collections import defaultdict
from tabulate import tabulate

In [None]:
from sklearn.preprocessing import LabelEncoder

# Inclusive (min, max) pairs of the selection box, built from the
# MIN_/MAX_ LAT/LON hyperparameters.
latin_america_bounds = dict(
    lat=(MIN_LAT, MAX_LAT),
    lon=(MIN_LON, MAX_LON),
)

metadata_path = os.path.join('..', 'database', 'train_metadata.csv')
df = pd.read_csv(metadata_path)
print(f"DataFrame shape: {df.shape}")

# A row is usable only if it has coordinates, a non-empty filename and at
# least one of the two species names.
has_coords = df['latitude'].notna() & df['longitude'].notna()
has_filename = df['filename'].notna() & (df['filename'] != '')
has_species_name = df['scientific_name'].notna() | df['common_name'].notna()

df = df[has_coords & has_filename & has_species_name]
print(f"DataFrame shape after 'cuts': {df.shape}")

In [None]:
def in_region(lat, lon):
    """Return whether the point (lat, lon) falls inside the selection box.

    Both bounds of ``latin_america_bounds`` are inclusive. A NaN coordinate
    yields a falsy result because every comparison against NaN is False.
    """
    lat_min, lat_max = latin_america_bounds['lat']
    lon_min, lon_max = latin_america_bounds['lon']
    return lat_min <= lat <= lat_max and lon_min <= lon <= lon_max


# Vectorised equivalent of calling in_region() on every row: `between` is
# inclusive on both ends by default, matching the chained comparisons above.
lat_min, lat_max = latin_america_bounds['lat']
lon_min, lon_max = latin_america_bounds['lon']
inside_box = (
    df['latitude'].between(lat_min, lat_max)
    & df['longitude'].between(lon_min, lon_max)
)

# Tag rows with their region ('Latin America' or None). `assign` builds a new
# frame instead of writing into a filtered slice of the metadata.
df = df.assign(region=None)
df.loc[inside_box, 'region'] = 'Latin America'

region_columns = [
    'primary_label', 'latitude', 'longitude', 'scientific_name',
    'common_name', 'region', 'filename', 'author',
]
# Explicit copy: columns are added below, and writing into a chained slice
# of `df` raises SettingWithCopyWarning.
region_df = df.loc[inside_box, region_columns].copy()

# Add sample count per species. Note these counts cover every in-region row,
# i.e. they are taken before the author and rating filters applied below.
species_counts = region_df['scientific_name'].value_counts().to_dict()
region_df['amount_of_samples'] = region_df['scientific_name'].map(species_counts)

# Drop rows where 'author' is NaN (they cannot be label-encoded)
region_df = region_df.dropna(subset=['author'])

# Replace author names with integer ids
author_encoder = LabelEncoder()
region_df['author'] = author_encoder.fit_transform(region_df['author'])

# 'rating' is not among region_columns, so look it up in `df` by index label.
# Rows with a missing rating compare False and are dropped.
ratings = df.loc[region_df.index, 'rating']
rated_df = region_df[ratings >= MIN_RATING_CUTOFF].copy()
rated_species_counts = rated_df['scientific_name'].value_counts()

In [None]:
# Requires rated_df with 'primary_label' and 'filename' columns.
segment_sec = 5.0                            # segment length in seconds
sr = 32000                                   # sample rate audio is resampled to on load
samples_per_segment = int(sr * segment_sec)
frame_len = 2048                             # RMS analysis frame length (samples)
hoplen = 512                                 # RMS hop length (samples)
THRESH = 0.75                                # segment must reach this fraction of the clip's mean RMS


def count_usable_segments(y, segment_len=samples_per_segment, rel_threshold=THRESH,
                          frame_length=frame_len, hop_length=hoplen):
    """Count the non-overlapping segments of `y` that are loud enough.

    The clip is cut into consecutive windows of `segment_len` samples (a
    trailing remainder shorter than one window is ignored). A window is
    skipped when its mean frame RMS is below `rel_threshold` times the mean
    frame RMS of the whole clip.

    Parameters
    ----------
    y : 1-D audio signal, as returned by ``librosa.load(..., mono=True)``.
    segment_len : window length in samples.
    rel_threshold : fraction of the clip-level mean RMS a window must reach.
    frame_length, hop_length : framing used for every RMS computation.

    Returns
    -------
    int : number of windows that pass the energy gate (0 for clips shorter
    than one window).
    """
    # A clip shorter than one window cannot produce a segment; skip the RMS
    # work (and any issue with framing an empty/very short signal).
    if len(y) < segment_len:
        return 0

    clip_rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
    threshold = rel_threshold * np.mean(clip_rms)

    count = 0
    for start in range(0, len(y) - segment_len + 1, segment_len):
        segment = y[start:start + segment_len]
        # Same framing as the clip-level RMS so both means are comparable.
        seg_rms = np.mean(
            librosa.feature.rms(y=segment, frame_length=frame_length, hop_length=hop_length)[0]
        )
        if seg_rms < threshold:
            continue
        count += 1
    return count


usable_segments = []

# Only two columns are needed, so iterate over them directly instead of
# materialising a Series per row with iterrows().
for primary_label, filename in zip(rated_df['primary_label'], rated_df['filename']):
    audio_path = os.path.join('../data/birdclef-2021/train_short_audio', primary_label, filename)

    try:
        y, _ = librosa.load(audio_path, sr=sr, mono=True)
    except Exception as e:
        # Best effort: an unreadable file contributes zero segments so the
        # list stays aligned with rated_df's rows.
        print(f"Error loading audio file {audio_path}: {e}")
        usable_segments.append(0)
        continue

    usable_segments.append(count_usable_segments(y))

segs_df = rated_df.copy()
segs_df['usable_segments'] = usable_segments

In [None]:
# Per-species totals: number of recordings and number of usable segments.
species_sample_counts = segs_df['scientific_name'].value_counts()
species_segment_counts = segs_df.groupby('scientific_name')['usable_segments'].sum()

# A species is kept only when it clears both cutoffs. The segment totals are
# reordered to follow species_sample_counts so the kept list preserves that order.
enough_samples = species_sample_counts >= SAMPLES_CUTOFF
enough_segments = species_segment_counts.reindex(species_sample_counts.index) >= SEGMENTS_CUTOFF
passes_both = (enough_samples & enough_segments).to_numpy()
species_to_keep = species_sample_counts.index[passes_both].tolist()

is_kept_species = segs_df['scientific_name'].isin(species_to_keep)
minsmps_df = segs_df.loc[is_kept_species].copy()

# One row per surviving species, least usable segments first.
summary_table = (
    minsmps_df
    .groupby('scientific_name')
    .agg(
        samples_count=('scientific_name', 'count'),
        unique_segments_count=('usable_segments', 'sum'),
    )
    .reset_index()
    .sort_values('unique_segments_count', ascending=True)
)

# Render as a GitHub-flavoured markdown table.
print(tabulate(summary_table, headers='keys', tablefmt='github', showindex=False))

In [None]:
# Number of distinct authors contributing recordings to each species
author_counts_per_species = minsmps_df.groupby('scientific_name')['author'].nunique()

# Split the species on the MIN_AUTHORS_CUTOFF threshold
has_enough_authors = author_counts_per_species >= MIN_AUTHORS_CUTOFF
species_with_few_author = author_counts_per_species[~has_enough_authors]
species_to_keep_multiauthor = author_counts_per_species[has_enough_authors].index

# Report the species that are about to be dropped
print(f"Species with less than {MIN_AUTHORS_CUTOFF} unique authors ({len(species_with_few_author)} species):")
print("=" * 60)
for name, n_authors in species_with_few_author.items():
    print(f"{name}: {n_authors} author(s)")

# Keep only the recordings of species with enough distinct authors
in_multiauthor_species = minsmps_df['scientific_name'].isin(species_to_keep_multiauthor)
authors_df = minsmps_df.loc[in_multiauthor_species]

print(f"\nAfter filtering species with less than {MIN_AUTHORS_CUTOFF} unique authors:")
print(f"Remaining samples: {len(authors_df)}")
print(f"Remaining species: {authors_df['scientific_name'].nunique()}")

In [None]:
# Assign integer class ids to the surviving species and export the final
# sample list plus the class-id -> species mapping.
from sklearn.preprocessing import LabelEncoder

export_columns = ['primary_label', 'scientific_name', 'common_name', 'filename', 'author', 'usable_segments']
final_df = authors_df[export_columns].copy()

# class_id is the label-encoded scientific name
le = LabelEncoder()
final_df['class_id'] = le.fit_transform(final_df['scientific_name'])

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
meta_dir = os.path.join('..', 'database', 'meta')
os.makedirs(meta_dir, exist_ok=True)

# Save species to CSV
final_df.to_csv(os.path.join(meta_dir, 'final_species.csv'), index=False)

# Build mapping with class_id, scientific_name, common_name.
# NOTE(review): if one species appears with several common_name spellings,
# drop_duplicates keeps one row per spelling, i.e. repeated class_ids — confirm
# the metadata is consistent.
mapping_df = (
    final_df[['class_id', 'scientific_name', 'common_name']]
    .drop_duplicates()
    .sort_values('class_id')
)
mapping_df.to_csv(os.path.join(meta_dir, 'class_mapping.csv'), index=False)

print(f"Final DataFrame shape: {final_df.shape}")
print(f"Final number of samples: {len(final_df)}")
print(f"Final number of usable segments: {final_df['usable_segments'].sum()}")
print(f"Final number of species: {final_df['scientific_name'].nunique()}")