In [1]:
# Imports
import os
import random
import logging


import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

# Data processing and scientific computing
from scipy.io import wavfile
from scipy.signal import butter, lfilter
from scipy.spatial.distance import cosine

# Audio processing
import librosa

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px

# Set up logging
logging.basicConfig(level=logging.INFO)

import torch
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv('ohio_bird_recordings_metadata.csv')

## Data Cleaning

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745 entries, 0 to 744
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           745 non-null    int64  
 1   genus        745 non-null    object 
 2   species      745 non-null    object 
 3   common_name  745 non-null    object 
 4   latitude     745 non-null    float64
 5   longitude    745 non-null    float64
 6   type         738 non-null    object 
 7   date         745 non-null    object 
 8   time         745 non-null    object 
 9   length       745 non-null    object 
 10  quality      745 non-null    object 
 11  remarks      515 non-null    object 
 12  sex          318 non-null    object 
 13  stage        304 non-null    object 
 14  also         422 non-null    object 
 15  file_name    745 non-null    object 
 16  local_file   744 non-null    object 
dtypes: float64(2), int64(1), object(14)
memory usage: 99.1+ KB


In [8]:
data.head(10)

Unnamed: 0,id,genus,species,common_name,latitude,longitude,type,date,time,length,quality,remarks,sex,stage,also,file_name,local_file
0,726750,Branta,canadensis,Canada Goose,39.2095,-84.7821,flight call,2022-05-11,11:22,0:14,A,Flock of 5 landing on lake.,uncertain,adult,"Agelaius phoeniceus, Melospiza melodia, Cardin...",Branta_canadensis_Whitewater_Township_near__Ha...,Original Recordings\Branta_canadensis_Whitewat...
1,418000,Branta,canadensis,Canada Goose,40.5652,-83.6255,flight call,2018-05-06,13:10,0:31,A,Natural vocalizations as the birds flew overhe...,,,"Myiarchus crinitus, Quiscalus quiscula",Branta_canadensis_Lawrence_Woods_SNP_418000.mp3,Original Recordings\Branta_canadensis_Lawrence...
2,691528,Branta,canadensis,Canada Goose,39.283,-84.7459,flight call,2021-12-14,07:48,0:19,B,Part of large flock (total count 124 birds) ta...,,,,Branta_canadensis_Miami_Whitewater_Forest_Park...,Original Recordings\Branta_canadensis_Miami_Wh...
3,713788,Cygnus,buccinator,Trumpeter Swan,40.7095,-83.3032,call,2021-12-03,09:00,2:12,B,About 100m away in marshy pond. Numerous indiv...,,,,Cygnus_buccinator_Killdeer_Plains_Wildlife_Man...,Original Recordings\Cygnus_buccinator_Killdeer...
4,815809,Aix,sponsa,Wood Duck,41.6275,-83.1897,call,2023-05-16,08:29,0:04,C,calls from two birds in flight;,uncertain,adult,,Aix_sponsa_Magee_Marsh_-_boardwalk_Lucas_Count...,Original Recordings\Aix_sponsa_Magee_Marsh_-_b...
5,855312,Anas,acuta,Northern Pintail,39.176,-84.528,call,2023-12-13,16:45,0:17,B,Bird feeding and calling among Mallards (Anas ...,male,adult,Cyanocitta cristata,Anas_acuta_Cincinnati_Hamilton_County_Ohio_855...,Original Recordings\Anas_acuta_Cincinnati_Hami...
6,805814,Colinus,virginianus,Northern Bobwhite,39.1833,-83.7177,"song, crickets",2023-05-27,17:34,0:59,A,Bird singing from cover in a short grass field...,male,adult,"Agelaius phoeniceus, Turdus migratorius, Quisc...",Colinus_virginianus_New_Market_Township_near__...,Original Recordings\Colinus_virginianus_New_Ma...
7,17130,Colinus,virginianus,Northern Bobwhite,41.93338,-83.54994,song,2007-07-18,?,0:20,B,,,,"Spinus tristis, Passerina cyanea, Turdus migra...",Colinus_virginianus_Michigan_Monroe_County_171...,Original Recordings\Colinus_virginianus_Michig...
8,17061,Colinus,virginianus,Northern Bobwhite,41.93338,-83.54994,song,2007-06-05,?,0:03,C,,,,,Colinus_virginianus_Michigan_Monroe_County_170...,Original Recordings\Colinus_virginianus_Michig...
9,17060,Colinus,virginianus,Northern Bobwhite,41.93338,-83.54994,song,2007-06-05,?,0:02,C,,,,,Colinus_virginianus_Michigan_Monroe_County_170...,Original Recordings\Colinus_virginianus_Michig...


In [4]:
# Replace every missing value with the literal string 'None'
# (a text sentinel, not Python's None object)
data = data.fillna(value='None')

# Confirm that no column reports missing values any more
print(data.isna().sum())

id             0
genus          0
species        0
common_name    0
latitude       0
longitude      0
type           0
date           0
time           0
length         0
quality        0
remarks        0
sex            0
stage          0
also           0
file_name      0
local_file     0
dtype: int64


### Checking for duplicates

In [7]:
# keep=False flags every member of a duplicated group, not only the repeats
duplicate_mask = data.duplicated(keep=False)
duplicate_count = duplicate_mask.sum()

print(f"Number of duplicate entries: {duplicate_count}")

if duplicate_count == 0:
    print("No duplicate entries found.")
else:
    print("\nDuplicate entries:")
    # Sort on all columns so identical rows end up next to each other
    duplicate_entries = data.loc[duplicate_mask].sort_values(by=list(data.columns))
    print(duplicate_entries)

Number of duplicate entries: 0
No duplicate entries found.


In [15]:
data['genus'].nunique()

100

### Analyze the 'type' column
Group the many free-text values into a few broad categories.

In [16]:
data['type'].value_counts()

type
song                                                                                       423
call                                                                                       166
call, song                                                                                  52
flight call                                                                                 25
alarm call                                                                                   8
None                                                                                         7
song, atypical                                                                               6
duet, song                                                                                   5
uncertain                                                                                    4
drumming                                                                                     3
subsong                                      

In [18]:
def simplify_type(type_str):
    """Collapse a free-text recording type into one of four coarse labels.

    Args:
        type_str: Raw value from the 'type' column (a string, or a missing value).

    Returns:
        'Unknown' for missing values and for the exact strings 'None' or
        'uncertain'; 'Song' when the text contains "song" (case-insensitive);
        'Call' when it contains "call" but not "song"; otherwise 'Other'.
    """
    if pd.isna(type_str) or type_str in ('None', 'uncertain'):
        return 'Unknown'

    lowered = type_str.lower()
    # Order matters: "song" is checked first, so "call, song" maps to 'Song'
    for keyword, label in (('song', 'Song'), ('call', 'Call')):
        if keyword in lowered:
            return label
    return 'Other'

# Derive the coarse category for every recording from its raw 'type' text
data['simplified_type'] = data['type'].map(simplify_type)

# Show how many recordings fall into each coarse category
print(data['simplified_type'].value_counts())

simplified_type
Song       509
Call       218
Unknown     11
Other        7
Name: count, dtype: int64


### Date and Time
Convert the `date` column into seasons.
Convert the `time` column into time-of-day categories.

In [20]:
data['date'].value_counts()

date
2023-05-15    49
2018-05-06    34
2023-05-16    24
2007-05-02    23
2018-05-05    15
              ..
2020-05-03     1
2021-04-04     1
2018-03-29     1
2011-06-06     1
2014-10-27     1
Name: count, Length: 258, dtype: int64

In [21]:
# First, let's convert the 'date' column to datetime if it's not already
data['date'] = pd.to_datetime(data['date'])

# Now, let's define a function to convert date to season
def get_season(date):
    """Map a date to a meteorological season name.

    Args:
        date: Any object exposing a ``month`` attribute (e.g. a pandas Timestamp).

    Returns:
        'Spring' for March-May, 'Summer' for June-August, 'Fall' for
        September-November, and 'Winter' for every other month value.
    """
    months_by_season = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    month = date.month
    for season, months in months_by_season:
        if month in months:
            return season
    # Anything not matched above (December, January, February) is winter
    return 'Winter'

# Label each recording with the season its date falls in
data['season'] = data['date'].map(get_season)

# Inspect how the recordings are spread across seasons
print(data['season'].value_counts())

season
Spring    561
Summer    110
Fall       41
Winter     33
Name: count, dtype: int64


In [23]:
data['time'].value_counts()

time
?        88
10:00    33
09:00    20
11:00    20
08:30    16
         ..
23:30     1
05:48     1
09:54     1
04:30     1
18:36     1
Name: count, Length: 266, dtype: int64

In [24]:
def categorize_time(time_str):
    """Bucket a clock-time string into a coarse time-of-day label.

    Args:
        time_str: Recording time such as '11:22'. The placeholder '?' and
            missing values are treated as unknown.

    Returns:
        'Morning' for hours 5-11, 'Afternoon' for 12-16, 'Evening' for 17-20,
        'Night' for all remaining hours, or 'Unknown' when the value is the
        '?' placeholder, missing, or cannot be parsed as a time.
    """
    if time_str == '?' or pd.isna(time_str):
        return 'Unknown'

    try:
        hour = pd.to_datetime(time_str).time().hour
    except (ValueError, TypeError, OverflowError):
        # Unparseable values stay best-effort 'Unknown'. Only parse-related
        # errors are caught, so KeyboardInterrupt / SystemExit are no longer
        # swallowed the way a bare `except:` would.
        return 'Unknown'

    if 5 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 17:
        return 'Afternoon'
    if 17 <= hour < 21:
        return 'Evening'
    return 'Night'

# Label each recording with its time-of-day bucket
data['time_of_day'] = data['time'].map(categorize_time)

# Inspect how the recordings are spread across the buckets
print(data['time_of_day'].value_counts())

time_of_day
Morning      412
Afternoon    171
Unknown       88
Night         43
Evening       31
Name: count, dtype: int64


In [14]:
# Convert 'length' from mm:ss to seconds

def time_to_seconds(time_str):
    """Convert a duration string to a whole number of seconds.

    Accepts 'm:ss' (e.g. '2:12' -> 132) and also 'h:mm:ss'
    (e.g. '1:02:03' -> 3723), so recordings of an hour or longer no longer
    fail with an opaque unpacking error.

    Args:
        time_str: Duration text with colon-separated integer fields.

    Returns:
        The duration in seconds as an int.

    Raises:
        ValueError: If a field is not an integer, or the string does not have
            exactly two or three colon-separated fields.
    """
    fields = [int(field) for field in time_str.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError(
            f"Expected a duration like 'm:ss' or 'h:mm:ss', got {time_str!r}"
        )

    total_seconds = 0
    # Each field is worth 60x the one to its right (hours -> minutes -> seconds)
    for value in fields:
        total_seconds = total_seconds * 60 + value
    return total_seconds

# Add the recording length expressed in seconds
data['length_seconds'] = data['length'].map(time_to_seconds)

# Spot-check the conversion on the first few rows
print(data[['length', 'length_seconds']].head())

  length  length_seconds
0   0:14              14
1   0:31              31
2   0:19              19
3   2:12             132
4   0:04               4


Apply NLP (BERT sentence embeddings reduced with PCA) to the `remarks` column to try to extract useful information from the free-text notes.

In [None]:

class BERTFeatureExtractor:
    """Turn a piece of text into a fixed-size vector using a pretrained BERT model.

    Attributes:
        tokenizer: Tokenizer loaded via ``BertTokenizer.from_pretrained``.
        model: Model loaded via ``BertModel.from_pretrained``, switched to
            evaluation mode.
    """

    def __init__(self, model_name='bert-base-uncased'):
        """Load the tokenizer and model identified by ``model_name``."""
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        bert = BertModel.from_pretrained(model_name)
        bert.eval()  # evaluation mode for inference
        self.model = bert

    def extract_features(self, text, max_length=512):
        """Embed ``text`` and return its sentence vector.

        Args:
            text: The text to embed.
            max_length: Maximum token count; longer input is truncated.

        Returns:
            A 1D numpy array: the last-layer hidden state at sequence
            position 0 (the [CLS] token) for the first item in the batch.
        """
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
            padding=True,
        )

        # Inference only, so skip gradient tracking
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state

        # Position 0 of every sequence holds the [CLS] embedding
        cls_vectors = hidden_states[:, 0, :]
        return cls_vectors.numpy()[0]

def process_remarks_with_bert(df, pca_components=50, random_state=42):
    """Append PCA-reduced BERT embeddings of the 'remarks' column to ``df``.

    Args:
        df: DataFrame with a 'remarks' column of text.
        pca_components: Passed to ``PCA(n_components=...)``.
        random_state: Seed passed to PCA so repeated runs give the same
            components; pass ``None`` to leave PCA unseeded.

    Returns:
        A new DataFrame: the columns of ``df`` followed by one
        'bert_feature_<k>' column per PCA component, row-aligned with ``df``.
    """
    # Initialize BERT feature extractor
    bert_extractor = BERTFeatureExtractor()

    # One embedding vector per remark
    bert_features = df['remarks'].apply(bert_extractor.extract_features)

    # Stack the per-row vectors into a 2D (rows x embedding size) array
    bert_features_array = np.stack(bert_features.values)

    # Reduce dimensionality; seeded so the result is reproducible
    pca = PCA(n_components=pca_components, random_state=random_state)
    bert_features_pca = pca.fit_transform(bert_features_array)

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df has been filtered or re-indexed.
    # Column names follow the actual output width rather than pca_components.
    bert_features_df = pd.DataFrame(
        bert_features_pca,
        index=df.index,
        columns=[f'bert_feature_{i+1}' for i in range(bert_features_pca.shape[1])]
    )

    # Combine with original dataframe
    return pd.concat([df, bert_features_df], axis=1)

# Usage
processed_df = process_remarks_with_bert(data)

### Analyze 'sex'
Too many values are missing and too few recordings are labelled female, so this column will be dropped.

In [28]:
data['sex'].value_counts()

sex
None            427
male            173
uncertain       121
female, male     17
female            7
Name: count, dtype: int64

### Analyze 'stage' 
Too many values are missing and the rest are heavily imbalanced towards adults, so this column will be dropped as well.

In [31]:
data['stage'].value_counts()

stage
None               441
adult              256
uncertain           40
juvenile             7
adult, juvenile      1
Name: count, dtype: int64

### Analyze 'also' 
This column contains data that indicates what other species' noises are present in the recording

### Leave the 'also' column to be handled with the audio segmentation

In [33]:
data['also'].value_counts()

also
None                                                                                                                                                                    323
Agelaius phoeniceus                                                                                                                                                      29
Turdus migratorius                                                                                                                                                       10
Agelaius phoeniceus, Setophaga aestiva                                                                                                                                    7
Pipilo erythrophthalmus                                                                                                                                                   5
                                                                                                                                       

In [36]:
data.head()

Unnamed: 0,id,genus,species,common_name,latitude,longitude,type,date,time,length,quality,remarks,sex,stage,also,file_name,local_file,simplified_type,season,time_of_day
0,726750,Branta,canadensis,Canada Goose,39.2095,-84.7821,flight call,2022-05-11,11:22,0:14,A,Flock of 5 landing on lake.,uncertain,adult,"Agelaius phoeniceus, Melospiza melodia, Cardin...",Branta_canadensis_Whitewater_Township_near__Ha...,Original Recordings\Branta_canadensis_Whitewat...,Call,Spring,Morning
1,418000,Branta,canadensis,Canada Goose,40.5652,-83.6255,flight call,2018-05-06,13:10,0:31,A,Natural vocalizations as the birds flew overhe...,,,"Myiarchus crinitus, Quiscalus quiscula",Branta_canadensis_Lawrence_Woods_SNP_418000.mp3,Original Recordings\Branta_canadensis_Lawrence...,Call,Spring,Afternoon
2,691528,Branta,canadensis,Canada Goose,39.283,-84.7459,flight call,2021-12-14,07:48,0:19,B,Part of large flock (total count 124 birds) ta...,,,,Branta_canadensis_Miami_Whitewater_Forest_Park...,Original Recordings\Branta_canadensis_Miami_Wh...,Call,Winter,Morning
3,713788,Cygnus,buccinator,Trumpeter Swan,40.7095,-83.3032,call,2021-12-03,09:00,2:12,B,About 100m away in marshy pond. Numerous indiv...,,,,Cygnus_buccinator_Killdeer_Plains_Wildlife_Man...,Original Recordings\Cygnus_buccinator_Killdeer...,Call,Winter,Morning
4,815809,Aix,sponsa,Wood Duck,41.6275,-83.1897,call,2023-05-16,08:29,0:04,C,calls from two birds in flight;,uncertain,adult,,Aix_sponsa_Magee_Marsh_-_boardwalk_Lucas_Count...,Original Recordings\Aix_sponsa_Magee_Marsh_-_b...,Call,Spring,Morning


Some rows have 'Identity unknown' in `common_name` and 'mystery' in `species`. Let's remove them, since we cannot use them for training without labels. However, we can keep these unidentified files and run our final model on them to make predictions.

In [None]:
# Rows carrying both placeholder labels are the unidentified recordings
unidentified_mask = data['common_name'].eq('Identity unknown') & data['species'].eq('mystery')

# Set the unidentified rows aside on disk.
# NOTE: re-running this cell after the filter below has been applied
# overwrites the CSV with an empty table, because no rows match any more.
unidentified_data = data.loc[unidentified_mask]
unidentified_data.to_csv('unidentified_data.csv', index=False)

# Keep only the identified recordings in the working frame
data = data.loc[~unidentified_mask]

# Sanity check: look for rows that still carry either placeholder label
unknown_common = data['common_name'].eq('Identity unknown')
mystery_species = data['species'].eq('mystery')

# Tally what is left and what was exported
unknown_common_count = unknown_common.sum()
mystery_species_count = mystery_species.sum()

print(f"Entries with 'Identity unknown' common name: {unknown_common_count}")
print(f"Entries with 'mystery' species: {mystery_species_count}")
print(f"Number of unidentified entries saved to CSV: {len(unidentified_data)}")

In [None]:


# Count recordings per species and keep the 25 most frequent
top_25_birds = data['common_name'].value_counts().nlargest(25)

# One bar per species; bar colour follows the recording count
fig = px.bar(
    x=top_25_birds.index,
    y=top_25_birds.values,
    color=top_25_birds.values,
    color_continuous_scale='viridis',
    title='Top 25 Most Frequent Bird Species',
    labels={'x': 'Bird Species', 'y': 'Frequency'},
)

# Cosmetics: fonts, white background, light grid, slanted species labels
fig.update_layout(
    width=1200,
    height=800,
    plot_bgcolor='white',
    font={'family': "Arial", 'size': 14},
    title={'font': {'size': 24}},
    coloraxis_colorbar={'title': 'Frequency'},
    xaxis={
        'tickangle': 45,
        'title_font': {'size': 16},
        'tickfont': {'size': 12},
        'gridcolor': 'lightgray',
    },
    yaxis={
        'title_font': {'size': 16},
        'tickfont': {'size': 12},
        'gridcolor': 'lightgray',
    },
)

fig.show()

In [None]:
# Plot every recording on an interactive map, one colour per species.
# NOTE(review): newer Plotly releases reportedly deprecate scatter_mapbox in
# favour of scatter_map — confirm against the installed version before switching.
fig = px.scatter_mapbox(
    data,
    lat='latitude',
    lon='longitude',
    color='common_name',  # one legend entry / colour per species
    hover_name='common_name',
    hover_data=['length_seconds'],  # extra field shown in the hover tooltip
    zoom=7,  # zoom level chosen to fit Ohio
    height=800,
    width=1000,
    title='Bird Recording Locations in Ohio',
)

# Basemap, centring, margins, title placement and legend position
fig.update_layout(
    mapbox_style="open-street-map",
    mapbox_center=dict(lat=40.4173, lon=-82.9071),  # Center of Ohio
    margin=dict(r=0, t=50, l=0, b=0),
    title=dict(
        text="Bird Recording Locations in Ohio",
        x=0.5,
        y=0.98,
        xanchor='center',
        yanchor='top',
        font={'size': 24},
    ),
    legend_title_text='Bird Species',
    legend={
        'yanchor': "top",
        'y': 0.99,
        'xanchor': "left",
        'x': 0.01,
    },
)

# Render the figure
fig.show()