This notebook reads the metadata CSV that covers all species and writes the trimmed CSVs used for processing and training.

### Modify these bounds and values for Data Selection

In [None]:
# Geographic window used to select recordings.
# Each entry is a (min, max) pair; both ends are treated as inclusive when the
# latitude / longitude of a recording is compared against it.
latin_america_bounds = dict(
    lat=(-55.0, -20),
    lon=(-73, -53),
)

# A species is kept only when it has MORE than this many samples (strict comparison).
MIN_SAMPLES_CUTOFF = 12

# A recording is kept only when its rating is at least this value.
MIN_RATING_CUTOFF = 3.0

### The rest of the code below should remain unchanged.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
from collections import defaultdict

In [None]:
# One-time preprocessing, already applied: the CSV on disk is the processed version. The code below is kept only as a record.
# # Load original CSV
# df = pd.read_csv('data/birdclef-2021/train_metadata.csv')

# # Drop rows without coordinates
# df = df.dropna(subset=['latitude', 'longitude', 'filename'])

# # Keep only specified columns
# cols = ['primary_label', 'latitude', 'longitude', 'scientific_name', 'common_name', 'filename', 'rating']
# df = df[cols]

# # Overwrite original file
# df.to_csv('data/birdclef-2021/train_metadata.csv', index=False)
# df.to_csv('database/train_metadata.csv', index=False)

# # Preview
# df.head(n=10)

In [None]:
# Read the trimmed metadata table and report its size before any filtering.
metadata_path = os.path.join('..', 'database', 'train_metadata.csv')
df = pd.read_csv(metadata_path)
print(f"DataFrame shape: {df.shape}")

# A row is usable only if it has both coordinates, a non-empty filename,
# and at least one of the two species names.
has_coordinates = df[['latitude', 'longitude']].notna().all(axis=1)
has_filename = df['filename'].notna() & df['filename'].ne('')
has_any_name = df['scientific_name'].notna() | df['common_name'].notna()

df = df[has_coordinates & has_filename & has_any_name]
print(f"DataFrame shape after 'cuts': {df.shape}")

In [None]:
# Preview the first ten cleaned rows
df.head(10)

In [None]:
def in_region(lat, lon, bounds=None):
    """Return whether the point (lat, lon) lies inside a bounding box.

    Parameters
    ----------
    lat, lon : number
        Coordinates of the point to test.
    bounds : mapping, optional
        Mapping with keys 'lat' and 'lon', each holding a (min, max) pair.
        When omitted, the module-level ``latin_america_bounds`` is used, which
        is the behaviour of the original two-argument call.

    Returns
    -------
    bool
        True when both coordinates fall within their (min, max) pair,
        with both ends inclusive.
    """
    if bounds is None:
        # Looked up at call time so edits to the config cell are picked up
        # without redefining this function.
        bounds = latin_america_bounds

    lat_min, lat_max = bounds['lat']
    lon_min, lon_max = bounds['lon']
    return lat_min <= lat <= lat_max and lon_min <= lon <= lon_max

# Tag every recording that falls inside the configured bounding box.
# Iterating over the two coordinate columns avoids building a Series per row
# (as DataFrame.apply(axis=1) does) while still using in_region for the test.
inside_box = [
    in_region(lat, lon)
    for lat, lon in zip(df['latitude'], df['longitude'])
]
df['region'] = ['Latin America' if hit else None for hit in inside_box]

# .copy() makes region_df an independent frame, so the column added below is
# written to region_df itself and pandas has no reason to emit SettingWithCopyWarning.
region_columns = ['primary_label', 'latitude', 'longitude', 'scientific_name', 'common_name', 'region', 'filename']
region_df = df.loc[df['region'].notna(), region_columns].copy()

# Add sample count per species
species_counts = region_df['scientific_name'].value_counts().to_dict()
region_df['amount_of_samples'] = region_df['scientific_name'].map(species_counts)

total_samples = len(region_df)
total_species = region_df['scientific_name'].nunique()

print(f"Total samples: {total_samples}")
print(f"Total distinct species: {total_species}")

In [None]:
# Number of in-region recordings for each species (kept as a Series for later cells)
species_counts = region_df['scientific_name'].value_counts()

# Histogram: how many species have a given number of samples
fig, ax = plt.subplots(figsize=(12, 6))
species_counts.hist(ax=ax, bins=50, edgecolor='black')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('Number of Species')
ax.set_title('Distribution of Samples per Species')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# region_df does not carry the rating column, so the ratings are looked up in df
# through the shared row index.
ratings_in_region = df.loc[region_df.index, 'rating']
rated_df = region_df[ratings_in_region >= MIN_RATING_CUTOFF]
rated_species_counts = rated_df['scientific_name'].value_counts()

# Histogram of samples per species once low-rated recordings are removed
fig, ax = plt.subplots(figsize=(12, 6))
rated_species_counts.hist(ax=ax, bins=50, edgecolor='black')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('Number of Species')
ax.set_title('Distribution of Samples per Species after Rating Filter')
ax.grid(True)
fig.tight_layout()
plt.show()

print(f"Total samples: {len(rated_df)}")
print(f"Total distinct species: {rated_df['scientific_name'].nunique()}")

In [None]:
# Keep only species that still have enough samples AFTER the rating filter.
# The counts must come from rated_df (rated_species_counts): counting on the
# unrated region_df would let through species whose good-quality recordings
# fall below the cutoff, and the histogram would not describe minsmps_df.
# NOTE(review): the comparison is strict, so a species with exactly
# MIN_SAMPLES_CUTOFF samples is dropped — confirm this is the intended meaning.
species_counts_filtered = rated_species_counts[rated_species_counts > MIN_SAMPLES_CUTOFF]

# Build the mask from rated_df itself so its index matches the frame being filtered
# (a mask taken from region_df has extra rows and makes pandas reindex it with a warning).
minsmps_df = rated_df[rated_df['scientific_name'].isin(species_counts_filtered.index)]

plt.figure(figsize=(12, 6))
species_counts_filtered.hist(bins=100, edgecolor='black')
plt.xlabel('Number of Samples')
plt.ylabel('Number of Species')
# Title follows the configured cutoff instead of a hard-coded number
plt.title(f'Distribution of Samples per Species with more than {MIN_SAMPLES_CUTOFF} Samples')
plt.grid(True)
plt.tight_layout()
plt.show()

print(f"Total samples: {len(minsmps_df)}")
print(f"Total distinct species: {minsmps_df['scientific_name'].nunique()}")

In [None]:
# Size of the dataset held in minsmps_df: row count and distinct species
print(
    f"Final amount of samples: {len(minsmps_df)}",
    f"Final amount of species: {minsmps_df['scientific_name'].nunique()}",
    sep="\n",
)

In [None]:
# Build the training table: one row per recording, with only the label columns
# and the filename. .copy() so the class_id column is added to an independent frame.
final_df = minsmps_df[['primary_label', 'scientific_name', 'common_name', 'filename']].copy()

# Encode each scientific name as an integer class id
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
final_df['class_id'] = le.fit_transform(final_df['scientific_name'])

# Make sure the output directory exists; to_csv does not create missing directories
meta_dir = os.path.join('..', 'database', 'meta')
os.makedirs(meta_dir, exist_ok=True)

# Save species to CSV
final_df.to_csv(os.path.join(meta_dir, 'final_species.csv'), index=False)

# Build mapping with class_id, scientific_name, common_name
mapping_df = (
    final_df[['class_id', 'scientific_name', 'common_name']]
    .drop_duplicates()
    .sort_values('class_id')
)
mapping_df.to_csv(os.path.join(meta_dir, 'class_mapping.csv'), index=False)