In [1]:
# Imports
import os
import random
import logging


import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

# Data processing and scientific computing
from scipy.io import wavfile
from scipy.signal import butter, lfilter
from scipy.spatial.distance import cosine

# Audio processing
import librosa

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px

# Set up logging
logging.basicConfig(level=logging.INFO)

import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

In [2]:
# Load the recording metadata table; the path is relative to the notebook's working directory
data = pd.read_csv('ohio_bird_recordings_metadata.csv')

## Data Cleaning

In [3]:
# Summarise column dtypes and non-null counts to see which columns have missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745 entries, 0 to 744
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           745 non-null    int64  
 1   genus        745 non-null    object 
 2   species      745 non-null    object 
 3   common_name  745 non-null    object 
 4   latitude     745 non-null    float64
 5   longitude    745 non-null    float64
 6   type         738 non-null    object 
 7   date         745 non-null    object 
 8   time         745 non-null    object 
 9   length       745 non-null    object 
 10  quality      745 non-null    object 
 11  remarks      515 non-null    object 
 12  sex          318 non-null    object 
 13  stage        304 non-null    object 
 14  also         422 non-null    object 
 15  file_name    745 non-null    object 
 16  local_file   744 non-null    object 
dtypes: float64(2), int64(1), object(14)
memory usage: 99.1+ KB


In [4]:
# Replace every missing value in the frame with the placeholder string 'None'
data = data.fillna(value='None')

# Confirm that no column still contains nulls
remaining_nulls = data.isna().sum()
print(remaining_nulls)

id             0
genus          0
species        0
common_name    0
latitude       0
longitude      0
type           0
date           0
time           0
length         0
quality        0
remarks        0
sex            0
stage          0
also           0
file_name      0
local_file     0
dtype: int64


In [5]:
# Number of recordings per common name, then per species epithet
common_name_counts = data['common_name'].value_counts()
species_epithet_counts = data['species'].value_counts()

print(common_name_counts)
print('\n', species_epithet_counts)

common_name
Identity unknown        64
Red-winged Blackbird    18
Baltimore Oriole        18
Warbling Vireo          17
Carolina Wren           14
                        ..
Grey-cheeked Thrush      1
Common Pheasant          1
Dunlin                   1
Common Starling          1
Dark-eyed Junco          1
Name: count, Length: 147, dtype: int64

 species
mystery         64
carolinensis    28
ludovicianus    22
bicolor         19
phoeniceus      18
                ..
neglecta         1
celata           1
solitaria        1
colchicus        1
vulgaris         1
Name: count, Length: 128, dtype: int64


In [7]:
# Distribution of the 'sex' label; rows that were missing show up under the 'None' placeholder
data['sex'].value_counts()

sex
None            365
male            173
uncertain       119
female, male     17
female            7
Name: count, dtype: int64

In [8]:


# Recording counts for the 25 most frequently recorded common names
name_frequencies = data['common_name'].value_counts()
top_25_birds = name_frequencies.nlargest(25)

# Bar chart: one bar per species, coloured by its recording count
fig = px.bar(
    x=top_25_birds.index,
    y=top_25_birds.values,
    color=top_25_birds.values,
    color_continuous_scale='viridis',
    title='Top 25 Most Frequent Bird Species',
    labels={'x': 'Bird Species', 'y': 'Frequency'},
)

# Both axes share the same font sizes and grid colour; only x gets rotated tick labels
shared_axis_style = dict(
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor='lightgray',
)

fig.update_layout(
    font=dict(family="Arial", size=14),
    plot_bgcolor='white',
    xaxis=dict(tickangle=45, **shared_axis_style),
    yaxis=dict(**shared_axis_style),
    coloraxis_colorbar=dict(title='Frequency'),
    title=dict(font=dict(size=24)),
    width=1200,
    height=800,
)

fig.show()

In [9]:
# Re-check missing values per column; all zeros are expected once NAs have been replaced with 'None'
data.isna().sum()

id             0
genus          0
species        0
common_name    0
latitude       0
longitude      0
type           0
date           0
time           0
length         0
quality        0
remarks        0
sex            0
stage          0
also           0
file_name      0
local_file     0
dtype: int64

In [10]:
# Re-inspect the frame's size and dtypes.
# NOTE(review): the saved output lists 681 rows versus 745 at load time, which looks like the
# unidentified-entry filter at the end of the notebook was executed before this cell —
# confirm the notebook reproduces under Restart Kernel -> Run All.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 681 entries, 0 to 680
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           681 non-null    int64  
 1   genus        681 non-null    object 
 2   species      681 non-null    object 
 3   common_name  681 non-null    object 
 4   latitude     681 non-null    float64
 5   longitude    681 non-null    float64
 6   type         681 non-null    object 
 7   date         681 non-null    object 
 8   time         681 non-null    object 
 9   length       681 non-null    object 
 10  quality      681 non-null    object 
 11  remarks      681 non-null    object 
 12  sex          681 non-null    object 
 13  stage        681 non-null    object 
 14  also         681 non-null    object 
 15  file_name    681 non-null    object 
 16  local_file   681 non-null    object 
dtypes: float64(2), int64(1), object(14)
memory usage: 95.8+ KB


In [11]:
# Preview the first rows of the metadata table
data.head()

Unnamed: 0,id,genus,species,common_name,latitude,longitude,type,date,time,length,quality,remarks,sex,stage,also,file_name,local_file
0,726750,Branta,canadensis,Canada Goose,39.2095,-84.7821,flight call,2022-05-11,11:22,0:14,A,Flock of 5 landing on lake.,uncertain,adult,"Agelaius phoeniceus, Melospiza melodia, Cardin...",Branta_canadensis_Whitewater_Township_near__Ha...,Original Recordings\Branta_canadensis_Whitewat...
1,418000,Branta,canadensis,Canada Goose,40.5652,-83.6255,flight call,2018-05-06,13:10,0:31,A,Natural vocalizations as the birds flew overhe...,,,"Myiarchus crinitus, Quiscalus quiscula",Branta_canadensis_Lawrence_Woods_SNP_418000.mp3,Original Recordings\Branta_canadensis_Lawrence...
2,691528,Branta,canadensis,Canada Goose,39.283,-84.7459,flight call,2021-12-14,07:48,0:19,B,Part of large flock (total count 124 birds) ta...,,,,Branta_canadensis_Miami_Whitewater_Forest_Park...,Original Recordings\Branta_canadensis_Miami_Wh...
3,713788,Cygnus,buccinator,Trumpeter Swan,40.7095,-83.3032,call,2021-12-03,09:00,2:12,B,About 100m away in marshy pond. Numerous indiv...,,,,Cygnus_buccinator_Killdeer_Plains_Wildlife_Man...,Original Recordings\Cygnus_buccinator_Killdeer...
4,815809,Aix,sponsa,Wood Duck,41.6275,-83.1897,call,2023-05-16,08:29,0:04,C,calls from two birds in flight;,uncertain,adult,,Aix_sponsa_Magee_Marsh_-_boardwalk_Lucas_Count...,Original Recordings\Aix_sponsa_Magee_Marsh_-_b...


In [12]:
# Frequency of each vocalisation 'type' label; combined labels such as 'call, song' are
# counted as their own categories because the column is compared as a whole string
data['type'].value_counts()

type
song                                                                                       407
call                                                                                       144
call, song                                                                                  46
flight call                                                                                 13
alarm call                                                                                   8
song, atypical                                                                               6
None                                                                                         5
duet, song                                                                                   5
flight call, song                                                                            3
call, flight call                                                                            3
imitation, song                              

In [13]:
def time_to_seconds(time_str):
    """Convert a colon-separated duration string to a whole number of seconds.

    Accepts ``"SS"``, ``"M:SS"`` and ``"H:MM:SS"`` so that recordings of an
    hour or longer do not break the conversion.

    Parameters
    ----------
    time_str : str
        Duration such as ``"0:14"``, ``"2:12"`` or ``"1:02:03"``.

    Returns
    -------
    int
        Total duration in seconds.

    Raises
    ------
    ValueError
        If the string has more than three fields or a field is not an integer.
    """
    fields = time_str.strip().split(':')
    if len(fields) > 3:
        raise ValueError(f"Unsupported duration format: {time_str!r}")

    # Fold left to right in base 60: ((hours * 60) + minutes) * 60 + seconds
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds

# Derive a numeric duration column (in seconds) from the textual 'length' column
data['length_seconds'] = data['length'].map(time_to_seconds)

# Show the original and converted values side by side as a sanity check
preview_columns = ['length', 'length_seconds']
print(data[preview_columns].head())

  length  length_seconds
0   0:14              14
1   0:31              31
2   0:19              19
3   2:12             132
4   0:04               4


In [14]:
# Show the ten longest recordings, ordered by duration in seconds (longest first)
data.sort_values(by='length_seconds',ascending=False).head(10)

Unnamed: 0,id,genus,species,common_name,latitude,longitude,type,date,time,length,quality,remarks,sex,stage,also,file_name,local_file,length_seconds
660,302932,Cardinalis,cardinalis,Northern Cardinal,39.192,-84.555,"song, several different song types appear in t...",2007-03-18,11:20,9:36,A,Filtering: Bass roll-off from about 600 Hz\r\n...,,,,Cardinalis_cardinalis_Tanglewood_Lane_Cincinna...,Original Recordings\Cardinalis_cardinalis_Tang...,576
439,814755,Agelaius,phoeniceus,Red-winged Blackbird,41.6275,-83.1897,"call, song",2023-05-16,10:02,7:30,A,calls of several types and song; interval shor...,"female, male",adult,"Setophaga aestiva, Branta canadensis, Charadri...",Agelaius_phoeniceus_Magee_Marsh_-_boardwalk_Lu...,Original Recordings\Agelaius_phoeniceus_Magee_...,450
557,825241,Setophaga,ruticilla,American Redstart,41.6275,-83.1897,song,2023-05-15,08:10,4:45,A,adult male; singing while foraging; migrant;,male,adult,"Agelaius phoeniceus, Zenaida macroura",Setophaga_ruticilla_Magee_Marsh_-_boardwalk_Lu...,Original Recordings\Setophaga_ruticilla_Magee_...,285
308,646400,Mimus,polyglottos,Northern Mockingbird,39.1237,-84.5904,"imitation, song, mimicry/imitation",2021-04-28,15:42,4:32,B,"Bird sallying upward from treetop, coming down...",male,adult,"Chaetura pelagica, Turdus migratorius, Spizell...",Mimus_polyglottos_Dunham_Recreation_Complex_ne...,Original Recordings\Mimus_polyglottos_Dunham_R...,272
615,827845,Setophaga,pensylvanica,Chestnut-sided Warbler,41.6275,-83.1897,song,2023-05-15,10:35,4:07,B,"singing (intermittently while foraging, about ...",male,adult,"Agelaius phoeniceus, Setophaga aestiva, Melosp...",Setophaga_pensylvanica_Magee_Marsh_-_boardwalk...,Original Recordings\Setophaga_pensylvanica_Mag...,247
639,179676,Cardellina,canadensis,Canada Warbler,40.0168,-83.0435,song,2014-05-24,09:00,3:58,B,Recorded along the southeast margin of the woo...,,,"Melospiza melodia, Catharus ustulatus, Geothly...",Cardellina_canadensis_OSU's_Waterman_Farm_wood...,Original Recordings\Cardellina_canadensis_OSU'...,238
223,827054,Tachycineta,bicolor,Tree Swallow,41.6275,-83.1897,"call, song",2023-05-15,08:53,3:54,A,"interval shortened at 1:01, after which a perc...",male,adult,"Agelaius phoeniceus, Icterus galbula",Tachycineta_bicolor_Magee_Marsh_-_boardwalk_Lu...,Original Recordings\Tachycineta_bicolor_Magee_...,234
563,825240,Setophaga,ruticilla,American Redstart,41.6275,-83.1897,song,2023-05-15,06:51,3:48,B,adult male; migrant;,male,adult,"Agelaius phoeniceus, Setophaga aestiva, Geothl...",Setophaga_ruticilla_Magee_Marsh_-_boardwalk_Lu...,Original Recordings\Setophaga_ruticilla_Magee_...,228
423,818926,Icterus,galbula,Baltimore Oriole,41.6275,-83.1897,"call, song",2023-05-16,06:29,3:45,B,"from the canopy, interval shortened at 1:29; i...",male,adult,"Agelaius phoeniceus, Setophaga aestiva, Tachyc...",Icterus_galbula_Magee_Marsh_-_boardwalk_Lucas_...,Original Recordings\Icterus_galbula_Magee_Mars...,225
638,179679,Cardellina,canadensis,Canada Warbler,40.0168,-83.0435,song,2014-05-24,09:05,3:45,A,Recorded along the southeast margin of the woo...,,,"Corvus brachyrhynchos, Melospiza melodia, Geot...",Cardellina_canadensis_OSU's_Waterman_Farm_wood...,Original Recordings\Cardellina_canadensis_OSU'...,225


In [15]:
# Plot every recording as a point on an interactive map, one colour per species
fig = px.scatter_mapbox(
    data,
    lat='latitude',
    lon='longitude',
    color='common_name',             # one colour per bird species
    hover_name='common_name',
    hover_data=['length_seconds'],   # extra field shown in the hover box
    zoom=7,                          # zoom level chosen to fit Ohio
    width=1000,
    height=800,
    title='Bird Recording Locations in Ohio',
)

# Centred title placed near the top edge of the figure
map_title = dict(
    text="Bird Recording Locations in Ohio",
    x=0.5,
    y=0.98,
    xanchor='center',
    yanchor='top',
    font=dict(size=24),
)

# Legend anchored to the top-left corner of the plot area
map_legend = dict(yanchor="top", y=0.99, xanchor="left", x=0.01)

fig.update_layout(
    mapbox_style="open-street-map",
    mapbox_center=dict(lat=40.4173, lon=-82.9071),  # Center of Ohio
    margin=dict(r=0, t=50, l=0, b=0),
    title=map_title,
    legend_title_text='Bird Species',
    legend=map_legend,
)

# Render the map
fig.show()

Apply NLP to the 'remarks' column to try to extract useful information.

In [16]:

class BERTFeatureExtractor:
    """Turn a piece of text into a fixed-length vector using a pretrained BERT model.

    The tokenizer and model are loaded once at construction time and the model
    is switched to evaluation mode.
    """

    def __init__(self, model_name='bert-base-uncased'):
        """Load the pretrained tokenizer and model identified by ``model_name``."""
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        # Evaluation mode: inference only, no training-time behaviour
        self.model.eval()

    def extract_features(self, text, max_length=512):
        """Return the first-token ([CLS]) embedding of ``text`` as a 1-D NumPy array.

        Parameters
        ----------
        text : str
            Text to embed. Only the first sequence of the encoded batch is
            returned.
        max_length : int, optional
            Token limit; longer inputs are truncated.
        """
        encoded = self.tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )

        # Forward pass without gradient tracking
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state

        # Position 0 of every sequence holds the [CLS] token
        cls_vectors = hidden_states[:, 0, :].numpy()

        # Drop the batch dimension
        return cls_vectors[0]

def process_remarks_with_bert(df, pca_components=50, random_state=None):
    """Append PCA-reduced BERT embeddings of the 'remarks' column to ``df``.

    Each remark is embedded with ``BERTFeatureExtractor``, the embeddings are
    reduced with PCA, and the components are added as columns named
    ``bert_feature_1`` ... ``bert_feature_k``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'remarks' column of strings.
    pca_components : int, optional
        Value passed to ``PCA(n_components=...)``.
    random_state : int or None, optional
        Passed to ``PCA`` so that runs using a randomized solver can be
        reproduced. ``None`` keeps the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the BERT feature
        columns, row-aligned with ``df``.
    """
    # Initialize the BERT feature extractor (loads the pretrained model)
    bert_extractor = BERTFeatureExtractor()

    # One embedding vector per remark
    bert_features = df['remarks'].apply(bert_extractor.extract_features)

    # Stack the per-row vectors into a 2-D (rows x embedding) array
    bert_features_array = np.stack(bert_features.values)

    # Reduce dimensionality with PCA
    pca = PCA(n_components=pca_components, random_state=random_state)
    bert_features_pca = pca.fit_transform(bert_features_array)

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and add NaN rows) whenever df has been
    # filtered or otherwise lacks a 0..n-1 index.
    bert_features_df = pd.DataFrame(
        bert_features_pca,
        index=df.index,
        columns=[f'bert_feature_{i+1}' for i in range(bert_features_pca.shape[1])]
    )

    # Combine with the original dataframe
    result_df = pd.concat([df, bert_features_df], axis=1)

    return result_df

# Usage: add PCA-reduced BERT features of 'remarks' to the metadata frame.
# This instantiates the pretrained model (downloaded on first use, per the saved output).
processed_df = process_remarks_with_bert(data)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [17]:
# Preview the metadata together with the appended bert_feature_* columns.
# NOTE(review): in the saved output rows 7-9 have almost identical feature values;
# presumably their remarks are the 'None' placeholder — confirm.
processed_df.head(10)

Unnamed: 0,id,genus,species,common_name,latitude,longitude,type,date,time,length,...,bert_feature_41,bert_feature_42,bert_feature_43,bert_feature_44,bert_feature_45,bert_feature_46,bert_feature_47,bert_feature_48,bert_feature_49,bert_feature_50
0,726750,Branta,canadensis,Canada Goose,39.2095,-84.7821,flight call,2022-05-11,11:22,0:14,...,0.717311,-0.281597,0.104254,-0.674174,0.072663,0.016954,0.800495,-0.228412,-0.078024,-0.443113
1,418000,Branta,canadensis,Canada Goose,40.5652,-83.6255,flight call,2018-05-06,13:10,0:31,...,0.477382,0.27958,0.189457,0.625589,-0.100937,-0.309759,0.575987,0.230533,0.246648,0.531847
2,691528,Branta,canadensis,Canada Goose,39.283,-84.7459,flight call,2021-12-14,07:48,0:19,...,0.649731,-0.324535,0.054672,-1.0307,-0.202203,-0.408179,0.853107,-0.203342,0.230101,-1.089392
3,713788,Cygnus,buccinator,Trumpeter Swan,40.7095,-83.3032,call,2021-12-03,09:00,2:12,...,1.45144,-0.873512,0.075764,-0.632438,0.728325,-0.348187,0.132433,0.537158,-0.077179,-0.07767
4,815809,Aix,sponsa,Wood Duck,41.6275,-83.1897,call,2023-05-16,08:29,0:04,...,0.4195,-0.565904,0.296022,-0.357175,0.106553,-0.195627,0.306617,0.145971,0.051418,0.332721
5,855312,Anas,acuta,Northern Pintail,39.176,-84.528,call,2023-12-13,16:45,0:17,...,0.968439,-0.370287,-0.289275,1.165928,-0.261743,0.095985,0.466899,0.47529,0.37148,-0.029545
6,805814,Colinus,virginianus,Northern Bobwhite,39.1833,-83.7177,"song, crickets",2023-05-27,17:34,0:59,...,0.528334,-0.666164,-0.46252,-0.516705,0.307032,-0.590384,-0.919369,-0.773565,-0.06458,-0.040714
7,17130,Colinus,virginianus,Northern Bobwhite,41.93338,-83.54994,song,2007-07-18,?,0:20,...,-0.003471,-0.003516,0.007376,0.000698,0.002809,-0.001706,-0.004156,-0.002353,-0.011926,0.002294
8,17061,Colinus,virginianus,Northern Bobwhite,41.93338,-83.54994,song,2007-06-05,?,0:03,...,-0.003471,-0.003515,0.007375,0.000698,0.002809,-0.001705,-0.004154,-0.002353,-0.011926,0.002296
9,17060,Colinus,virginianus,Northern Bobwhite,41.93338,-83.54994,song,2007-06-05,?,0:02,...,-0.003473,-0.003517,0.007376,0.000699,0.002812,-0.001707,-0.004153,-0.002352,-0.011925,0.002295


The dataset contains entries labelled 'Identity unknown' in common_name and 'mystery' in species. Let's remove these, since we cannot use them for training without labels. Once the model is trained, we can return to these unknown recordings and try to identify them.

In [None]:
# Flag the entries that have no usable species label (computed once and reused below)
unidentified_mask = (data['common_name'] == 'Identity unknown') | (data['species'] == 'mystery')
unidentified = data[unidentified_mask]

# Save the unidentified entries to a CSV file.
# Skip the write when nothing was found and the file already exists, so that
# re-running this cell after the filter has been applied does not overwrite
# the saved entries with an empty table.
unidentified_csv = 'unidentified_preprocessed.csv'
if not unidentified.empty or not os.path.exists(unidentified_csv):
    unidentified.to_csv(unidentified_csv, index=False)

# Keep only the labelled entries; .copy() makes the result an independent frame
# so later column assignments do not trigger SettingWithCopyWarning
data = data[~unidentified_mask].copy()

# Check for remaining 'Identity unknown' or 'mystery' entries
unknown_common = data['common_name'] == 'Identity unknown'
mystery_species = data['species'] == 'mystery'

# Count the occurrences (both should be zero after filtering)
unknown_common_count = unknown_common.sum()
mystery_species_count = mystery_species.sum()

print(f"Entries with 'Identity unknown' common name: {unknown_common_count}")
print(f"Entries with 'mystery' species: {mystery_species_count}")
print(f"Number of unidentified entries saved to CSV: {len(unidentified)}")