# eBird Data

## explore the publicly available data from eBird

In [1]:
import pandas as pd
import numpy as np
import re
import os

All eBird data from Bexar county in 2023

In [108]:
df = pd.read_csv('data/ebd_US-TX-029_2023/ebd_US-TX-029_202301_202312_smp_relDec-2023.txt', sep='\t')

In [109]:
df.shape

(254085, 50)

In [110]:
df.columns

Index(['GLOBAL UNIQUE IDENTIFIER', 'LAST EDITED DATE', 'TAXONOMIC ORDER',
       'CATEGORY', 'TAXON CONCEPT ID', 'COMMON NAME', 'SCIENTIFIC NAME',
       'SUBSPECIES COMMON NAME', 'SUBSPECIES SCIENTIFIC NAME', 'EXOTIC CODE',
       'OBSERVATION COUNT', 'BREEDING CODE', 'BREEDING CATEGORY',
       'BEHAVIOR CODE', 'AGE/SEX', 'COUNTRY', 'COUNTRY CODE', 'STATE',
       'STATE CODE', 'COUNTY', 'COUNTY CODE', 'IBA CODE', 'BCR CODE',
       'USFWS CODE', 'ATLAS BLOCK', 'LOCALITY', 'LOCALITY ID', 'LOCALITY TYPE',
       'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE',
       'TIME OBSERVATIONS STARTED', 'OBSERVER ID', 'SAMPLING EVENT IDENTIFIER',
       'PROTOCOL TYPE', 'PROTOCOL CODE', 'PROJECT CODE', 'DURATION MINUTES',
       'EFFORT DISTANCE KM', 'EFFORT AREA HA', 'NUMBER OBSERVERS',
       'ALL SPECIES REPORTED', 'GROUP IDENTIFIER', 'HAS MEDIA', 'APPROVED',
       'REVIEWED', 'REASON', 'TRIP COMMENTS', 'SPECIES COMMENTS',
       'Unnamed: 49'],
      dtype='object')

In [112]:
df.columns = df.columns.str.lower().str.replace('\W+','_',regex=True)

In [113]:
df.head()

Unnamed: 0,global_unique_identifier,last_edited_date,taxonomic_order,category,taxon_concept_id,common_name,scientific_name,subspecies_common_name,subspecies_scientific_name,exotic_code,...,number_observers,all_species_reported,group_identifier,has_media,approved,reviewed,reason,trip_comments,species_comments,unnamed_49
0,URN:CornellLabOfOrnithology:EBIRD:OBS1604293624,2023-01-11 23:54:00.580061,5751,species,avibase-8EB0C1FA,American Avocet,Recurvirostra americana,,,,...,1.0,1,,0,1,0,,,Seen and heard in flight over Bird Pond; indiv...,
1,URN:CornellLabOfOrnithology:EBIRD:OBS1595602863,2023-10-25 00:17:23.078205,7335,species,avibase-13E9F9B4,American Bittern,Botaurus lentiginosus,,,,...,2.0,1,,0,1,0,,,,
2,URN:CornellLabOfOrnithology:EBIRD:OBS1607571036,2023-01-15 21:40:16.314384,21168,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,...,1.0,0,,0,1,0,,,,
3,URN:CornellLabOfOrnithology:EBIRD:OBS1611944707,2023-01-21 17:57:44.50285,21168,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,...,13.0,1,G9532004,0,1,0,,This was a bird walk for the Bexar Audubon Soc...,,
4,URN:CornellLabOfOrnithology:EBIRD:OBS1610976542,2023-01-20 15:08:11.491595,21168,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,...,1.0,1,,0,1,0,,,,


### global_unique_identifier

In [114]:
df.global_unique_identifier.value_counts()

URN:CornellLabOfOrnithology:EBIRD:OBS1604293624    1
URN:CornellLabOfOrnithology:EBIRD:OBS1815648852    1
URN:CornellLabOfOrnithology:EBIRD:OBS1807882793    1
URN:CornellLabOfOrnithology:EBIRD:OBS1808641027    1
URN:CornellLabOfOrnithology:EBIRD:OBS1805764249    1
                                                  ..
URN:CornellLabOfOrnithology:EBIRD:OBS1701573924    1
URN:CornellLabOfOrnithology:EBIRD:OBS1706648170    1
URN:CornellLabOfOrnithology:EBIRD:OBS1702629678    1
URN:CornellLabOfOrnithology:EBIRD:OBS1702634657    1
URN:CornellLabOfOrnithology:EBIRD:OBS1899726873    1
Name: global_unique_identifier, Length: 254085, dtype: int64

> unique identifier, can drop

In [115]:
df = df.drop(columns='global_unique_identifier')

In [116]:
df.head()

Unnamed: 0,last_edited_date,taxonomic_order,category,taxon_concept_id,common_name,scientific_name,subspecies_common_name,subspecies_scientific_name,exotic_code,observation_count,...,number_observers,all_species_reported,group_identifier,has_media,approved,reviewed,reason,trip_comments,species_comments,unnamed_49
0,2023-01-11 23:54:00.580061,5751,species,avibase-8EB0C1FA,American Avocet,Recurvirostra americana,,,,1,...,1.0,1,,0,1,0,,,Seen and heard in flight over Bird Pond; indiv...,
1,2023-10-25 00:17:23.078205,7335,species,avibase-13E9F9B4,American Bittern,Botaurus lentiginosus,,,,1,...,2.0,1,,0,1,0,,,,
2,2023-01-15 21:40:16.314384,21168,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,4,...,1.0,0,,0,1,0,,,,
3,2023-01-21 17:57:44.50285,21168,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,2,...,13.0,1,G9532004,0,1,0,,This was a bird walk for the Bexar Audubon Soc...,,
4,2023-01-20 15:08:11.491595,21168,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,3,...,1.0,1,,0,1,0,,,,


### last_edited_date

In [117]:
df.last_edited_date.value_counts()

2023-03-04 07:36:07.729744    7387
2023-10-02 21:23:29.101142    6240
2023-10-02 21:23:56.025217    2261
2024-01-04 21:50:18.950508    2113
2023-12-09 08:50:37.86482     1882
                              ... 
2023-02-03 10:38:40.595456       1
2023-02-27 13:23:49.591597       1
2023-02-19 11:02:45.47566        1
2023-02-17 18:29:14.564959       1
2023-12-16 13:53:28.540152       1
Name: last_edited_date, Length: 26981, dtype: int64

> dont need down to the microsecond, just keeping the date

In [118]:
df.last_edited_date = df.last_edited_date.str.split(expand=True)[0]

In [119]:
df.last_edited_date.value_counts()

2023-10-25    22221
2023-10-02    13036
2023-03-04     9003
2024-01-04     4998
2023-03-03     3276
              ...  
2024-01-03       59
2024-01-09       53
2023-08-03       41
2024-01-08       34
2024-01-10       25
Name: last_edited_date, Length: 375, dtype: int64

In [120]:
df.last_edited_date.value_counts().sort_index()

2023-01-01    286
2023-01-02    475
2023-01-03    279
2023-01-04    308
2023-01-05    472
             ... 
2024-01-06    284
2024-01-07    619
2024-01-08     34
2024-01-09     53
2024-01-10     25
Name: last_edited_date, Length: 375, dtype: int64

> is there a bird date? not an edit date

### taxonomic_order

In [121]:
df.taxonomic_order.value_counts()

33967    10985
2404      8884
27652     8332
26897     7878
31018     7070
         ...  
18870        1
10910        1
7340         1
3759         1
26898        1
Name: taxonomic_order, Length: 492, dtype: int64

> numeric value assigned to this taxon in the eBird/Clements taxonomy to arrange the species in the latest taxonomic sequence

> i dont know what this means

### category

In [122]:
df.category.value_counts()

species     242768
issf          4210
domestic      3422
spuh          2017
slash         1399
form           242
hybrid          27
Name: category, dtype: int64

    species = what it is
    issf = subspecies
    domestic = not wild
    spuh = broad species level identification
    slash = indentification narrowed to two possible species
    form = some species have different forms
    hybrid = mix of two species
    intergrade = mix of two subspecies

> imma get rid of hybrids & intergrade, they are so unlikely and rarely verified. theres also so few of them anyway.

In [123]:
df = df [(df.category != 'hybrid') & ((df.category != 'intergrade'))]

### taxon_concept_id

In [124]:
df.taxon_concept_id.value_counts()

avibase-4E74AE22    10985
avibase-BFBC73AF     8884
avibase-7EFF698D     8332
avibase-CB5469E1     7878
avibase-240E3390     7070
                    ...  
avibase-25C6A911        1
avibase-304D8767        1
avibase-693BD192        1
avibase-EB0CD5DC        1
avibase-4880DC55        1
Name: taxon_concept_id, Length: 487, dtype: int64

> unique taxonomic identifier meant to identify a specific taxonomic concept

> this is important if a species gets a name change, as this number is constant

### common_name

In [125]:
df.common_name.value_counts()

Northern Cardinal        11565
White-winged Dove         8884
Northern Mockingbird      8332
Carolina Wren             7879
House Sparrow             7070
                         ...  
Black-bellied Plover         1
Sanderling                   1
Orchard/Hooded Oriole        1
Cassin's Vireo               1
Virginia Rail                1
Name: common_name, Length: 437, dtype: int64

### scientific_name

In [126]:
df.scientific_name.value_counts()

Cardinalis cardinalis         11565
Zenaida asiatica               8884
Mimus polyglottos              8332
Thryothorus ludovicianus       7879
Passer domesticus              7070
                              ...  
Pluvialis squatarola              1
Calidris alba                     1
Icterus spurius/cucullatus        1
Vireo cassinii                    1
Rallus limicola                   1
Name: scientific_name, Length: 437, dtype: int64

In [127]:
df.scientific_name [df.scientific_name.str.contains('\(')].value_counts()

Anatidae (duck sp.)                       97
Accipitridae sp. (hawk sp.)               97
Anser sp. (Domestic type)                 93
Calidris sp. (peep sp.)                   60
Tyrannus sp. (yellow-bellied)             30
Anatidae sp. (dabbling duck sp.)          28
Anatidae sp. (teal sp.)                   12
Corvus sp. (raven sp.)                    10
Scolopacidae sp. (large shorebird sp.)     5
Falco sp. (small falcon sp.)               4
Spinus sp. (goldfinch sp.)                 3
Anatidae (goose sp.)                       2
Rallidae sp. (rail/crake sp.)              1
Name: scientific_name, dtype: int64

In [128]:
#removing the common name in the parenthesis
df.scientific_name = df.scientific_name.str.replace(r'\(.*\)','',regex=True)

In [129]:
df.scientific_name [df.scientific_name.str.contains('\(')].value_counts()

Series([], Name: scientific_name, dtype: int64)

In [130]:
df.scientific_name.str.split(expand=True)

Unnamed: 0,0,1,2
0,Recurvirostra,americana,
1,Botaurus,lentiginosus,
2,Corvus,brachyrhynchos,
3,Corvus,brachyrhynchos,
4,Corvus,brachyrhynchos,
...,...,...,...
254080,Buteo,albonotatus,
254081,Buteo,albonotatus,
254082,Buteo,albonotatus,
254083,Buteo,albonotatus,


In [131]:
df.scientific_name [df.scientific_name.str.split(expand=True)[2].notnull()].value_counts()

Nycticorax nycticorax/Nyctanassa violacea    15
Name: scientific_name, dtype: int64

In [132]:
df.common_name [df.scientific_name == 'Nycticorax nycticorax/Nyctanassa violacea'].value_counts()

Yellow-crowned/Black-crowned Night Heron    15
Name: common_name, dtype: int64

In [133]:
df.scientific_name.value_counts()

Cardinalis cardinalis           11565
Zenaida asiatica                 8884
Mimus polyglottos                8332
Thryothorus ludovicianus         7879
Passer domesticus                7070
                                ...  
Branta hutchinsii/canadensis        1
Ixobrychus exilis                   1
Calidris alba                       1
Sternula antillarum                 1
Rallus limicola                     1
Name: scientific_name, Length: 435, dtype: int64

In [134]:
df.scientific_name.str.split(expand=True)[0].value_counts()

Zenaida           13766
Cardinalis        11955
Mimus              8332
Thryothorus        7879
Baeolophus         7089
                  ...  
Elanoides             1
Phalaenoptilus        1
Cynanthus             1
Rallidae              1
Rallus                1
Name: 0, Length: 228, dtype: int64

### subspecies_common_name

In [135]:
df.subspecies_common_name.value_counts(dropna=False)

NaN                                          246277
Rock Pigeon (Feral Pigeon)                     1550
Mallard (Domestic type)                         899
Muscovy Duck (Domestic type)                    700
Yellow-rumped Warbler (Myrtle)                  683
Northern Cardinal (Common)                      580
Great-tailed Grackle (Great-tailed)             475
Crested Caracara (Northern)                     450
Barn Swallow (American)                         359
Golden-fronted Woodpecker (Northern)            249
American Coot (Red-shielded)                    234
Great Blue Heron (Great Blue)                   221
Red-shouldered Hawk (lineatus Group)            191
Red-winged Blackbird (Red-winged)               170
Great Egret (American)                          159
Black-bellied Whistling-Duck (fulgens)          119
House Wren (Northern)                           109
Swan Goose (Domestic type)                      104
Yellow Warbler (Northern)                        68
Graylag Goos

### subspecies_scientific_name

In [136]:
df.subspecies_scientific_name.value_counts(dropna=False)

NaN                                                246277
Columba livia (Feral Pigeon)                         1550
Anas platyrhynchos (Domestic type)                    899
Cairina moschata (Domestic type)                      700
Setophaga coronata coronata                           683
Cardinalis cardinalis [cardinalis Group]              580
Quiscalus mexicanus [mexicanus Group]                 475
Caracara plancus cheriway                             450
Hirundo rustica erythrogaster                         359
Melanerpes aurifrons aurifrons                        249
Fulica americana (Red-shielded)                       234
Ardea herodias [herodias Group]                       221
Buteo lineatus [lineatus Group]                       191
Agelaius phoeniceus [phoeniceus Group]                170
Ardea alba egretta                                    159
Dendrocygna autumnalis fulgens                        119
Troglodytes aedon [aedon Group]                       109
Anser cygnoide

### exotic_code

In [137]:
df.exotic_code.value_counts()

N    12440
X     2130
P     1534
Name: exotic_code, dtype: int64

    N (Naturalized)
    P (Provisional)
    X (Escapee)

### observation_count

In [138]:
df.observation_count.value_counts()

1       96173
2       51051
3       24182
4       15929
5       11530
        ...  
518         1
141         1
315         1
237         1
3000        1
Name: observation_count, Length: 297, dtype: int64

### breeding_code

In [139]:
df.breeding_code.value_counts(dropna=False)

NaN    250093
H        1252
F        1242
S         475
FL        165
ON        149
P         116
FY        101
C          95
NB         81
CN         77
NY         40
N          35
T          29
S7         27
CF         26
M          25
A           6
DD          6
UN          5
B           4
NE          4
FS          3
PE          2
Name: breeding_code, dtype: int64

In [140]:
df.columns

Index(['last_edited_date', 'taxonomic_order', 'category', 'taxon_concept_id',
       'common_name', 'scientific_name', 'subspecies_common_name',
       'subspecies_scientific_name', 'exotic_code', 'observation_count',
       'breeding_code', 'breeding_category', 'behavior_code', 'age_sex',
       'country', 'country_code', 'state', 'state_code', 'county',
       'county_code', 'iba_code', 'bcr_code', 'usfws_code', 'atlas_block',
       'locality', 'locality_id', 'locality_type', 'latitude', 'longitude',
       'observation_date', 'time_observations_started', 'observer_id',
       'sampling_event_identifier', 'protocol_type', 'protocol_code',
       'project_code', 'duration_minutes', 'effort_distance_km',
       'effort_area_ha', 'number_observers', 'all_species_reported',
       'group_identifier', 'has_media', 'approved', 'reviewed', 'reason',
       'trip_comments', 'species_comments', 'unnamed_49'],
      dtype='object')

In [141]:
df[['breeding_code','behavior_code']].fillna('0').dropna()

Unnamed: 0,breeding_code,behavior_code
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
254080,0,0
254081,0,0
254082,0,0
254083,0,0


In [142]:
(df.fillna('0').breeding_code != df.fillna('0').breeding_code).sum()

0