In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
# Collect the Raven selection-table exports (.txt) from the annotations folder.
# os.listdir() returns entries in arbitrary order, so sort them: the order of
# the files determines the row order of the combined table, and the seeded
# resampling further down is only reproducible if that order is stable.
files = sorted(f for f in os.listdir('raven_annotations') if f.endswith('.txt'))

In [3]:
# Column names under which the species ID may appear, in priority order.
ID_COLUMN_CANDIDATES = ('ID', 'Annotation', 'Bird ID', 'fBirdID')

# One normalised table per annotation file; concatenated once after the loop
# (growing a DataFrame with pd.concat inside the loop copies all rows each time).
annotation_tables = []

for file in files:
    # The recording name is the first three underscore-separated fields of the
    # annotation file name, with a '.wav' extension.
    recording_name = f"{'_'.join(file.split('_')[0:3])}.wav"
    table = pd.read_table(f'raven_annotations/{file}')
    table['filename'] = recording_name

    # Pick the first candidate ID column that is present in this table.
    id_column = next((col for col in ID_COLUMN_CANDIDATES if col in table.columns), None)
    if id_column is None:
        raise ValueError(f'No ID column found, cols: {table.columns}')

    # .copy() makes `out` an independent frame, so the assignments below do not
    # write into a slice of `table` (avoids SettingWithCopyWarning).
    out = table[['filename', 'Begin Time (s)', 'End Time (s)', id_column]].copy()
    out.columns = ['filename', 'begin_time_s', 'end_time_s', 'id']
    out['id'] = out['id'].str.lower()
    annotation_tables.append(out)

if annotation_tables:
    annotations = pd.concat(annotation_tables, ignore_index=True)
else:
    # No annotation files: keep the expected schema so later cells still work.
    annotations = pd.DataFrame(columns=['filename', 'begin_time_s', 'end_time_s', 'id'])

# Rows without an ID cannot be mapped to a species label.
annotations = annotations.dropna(subset=['id'])

In [4]:
# Read the species code table and lower-case its 'four_code' column.
bird_codes = (
    pd.read_csv('bird_codes.csv')
    .assign(four_code=lambda codes: codes['four_code'].str.lower())
)

In [5]:
# Lookup from lower-cased four-letter code to the value in the 'code' column.
# If a four-letter code appears more than once, the last row wins.
codes_dict = {
    four_code: code
    for four_code, code in zip(bird_codes['four_code'], bird_codes['code'])
}

In [6]:
def _label_for_id(annotation_id, codes):
    """Return the label for one annotation ID, or NaN when none applies.

    Parameters
    ----------
    annotation_id : str
        Annotation ID text; may contain extra characters around the
        four-letter code.
    codes : dict
        Mapping of four-letter code -> label.

    Returns
    -------
    The label of the matching code, or ``np.nan`` if the ID contains ``'*'``
    or no code occurs in it. Matching is by substring; when several codes
    occur in the ID, the last one in ``codes`` iteration order wins.
    """
    # IDs containing '*' are deliberately left unlabeled.
    if '*' in annotation_id:
        return np.nan
    label = np.nan
    for four_code, code in codes.items():
        if four_code in annotation_id:
            label = code
    return label


# Assign the whole column in one step so that 'label' always exists, even when
# nothing matches or the table is empty (the following dropna relies on it).
annotations['label'] = annotations['id'].map(
    lambda annotation_id: _label_for_id(annotation_id, codes_dict),
    na_action='ignore',
)

In [7]:
# Keep only the rows that received a label, and renumber the index from 0.
has_label = annotations['label'].notna()
annotations = annotations[has_label].reset_index(drop=True)

In [8]:
# Midpoint of each annotation, rounded down to a multiple of 5 seconds.
midpoint_s = (annotations['begin_time_s'] + annotations['end_time_s']) / 2
annotations['timestamp_s'] = midpoint_s - (midpoint_s % 5)

In [9]:
def _pad_third_field(filename):
    """Prefix the third underscore-separated field with '0' when it is 9 characters long.

    Presumably this restores a dropped leading zero in the time part of the
    recording name (5 digits + '.wav' = 9 characters) -- confirm against the
    recorder's naming scheme. Filenames with fewer than three fields, or whose
    third field has any other length, are returned unchanged.
    """
    parts = filename.split('_')
    if len(parts) > 2 and len(parts[2]) == 9:
        return f'{parts[0]}_{parts[1]}_0{parts[2]}'
    return filename


# Applied column-wise instead of row-by-row; re-running is harmless because a
# padded field is 10 characters long and no longer matches.
annotations['filename'] = annotations['filename'].map(_pad_third_field)


In [22]:
species_counts = annotations['label'].value_counts()
max_count = species_counts.max()

# Oversample (with replacement) every species up to the size of the most
# frequent one. The per-species frames are collected in a list and concatenated
# once, rather than re-copying a growing DataFrame on every iteration.
resampled_parts = [
    resample(
        annotations[annotations['label'] == species],
        replace=True,
        n_samples=max_count,
        random_state=42,
    )
    for species in species_counts.index
]
if resampled_parts:
    df_resampled = pd.concat(resampled_parts)
else:
    # No labeled rows: keep an empty frame with the same columns.
    df_resampled = annotations.iloc[0:0]

# Shuffle the oversampled DataFrame
df_shuffled = df_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Take a random sample of the original DataFrame size
df_final = df_shuffled.sample(n=len(annotations), random_state=42).reset_index(drop=True)

In [24]:
# Label counts over index labels 0 through 1000 (label slicing is inclusive,
# so this covers 1001 rows).
df_final.loc[0:1000, 'label'].value_counts()

label
sonspa     31
gnttow     30
olsfly     29
westan     28
dusfly     27
herthr     27
bkhgro     26
comrav     26
swathr     25
gockin     25
mouqua     25
haiwoo     24
herwar     24
norfli     24
stejay     24
macwar     24
chispa     23
rebsap     23
evegro     23
warvir     23
amerob     22
casvir     22
naswar     22
whbnut     22
whhwoo     22
towsol     21
pilwoo     21
yerwar     21
mouchi     21
unknown    20
purfin     20
wewpew     19
pasfly     19
btywar     19
rebnut     19
comnig     18
wilsap     17
spotow     17
moublu     17
pacwre1    16
casfin     15
daejun     15
clanut     15
brncre     14
foxspa     14
annhum     12
linspa     10
Name: count, dtype: int64

In [25]:
# Write the class-balanced sample to CSV, without the index column.
df_final.to_csv('raven_annotations_even.csv', index=False)

In [40]:
# Write the labeled (unbalanced) annotations to CSV, without the index column.
annotations.to_csv('raven_annotations.csv', index=False)