# North American Bird Records
Using `species.csv`

In [295]:
import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
from fuzzywuzzy import process, fuzz
from itertools import islice

from Functions import process_scientific_names, standardize_common_names, standardize_common_names_subspecies

----
&nbsp;
## Exploratory Analysis

In [296]:
df = pd.read_csv('DATA/species.csv', low_memory=False)
print(f"Shape: {df.shape}")
df.head()

Shape: (119248, 14)


Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


In [297]:
print(f"Type of data:\n\n{df.dtypes}\n")

Type of data:

Species ID             object
Park Name              object
Category               object
Order                  object
Family                 object
Scientific Name        object
Common Names           object
Record Status          object
Occurrence             object
Nativeness             object
Abundance              object
Seasonality            object
Conservation Status    object
Unnamed: 13            object
dtype: object



In [298]:
print(f"Unique values:\n\n{df.nunique()}")

Unique values:

Species ID             119248
Park Name                  56
Category                   14
Order                     554
Family                   2332
Scientific Name         46022
Common Names            35825
Record Status              53
Occurrence                  7
Nativeness                  5
Abundance                   8
Seasonality                24
Conservation Status        11
Unnamed: 13                 3
dtype: int64


In [299]:
df.columns = [col.lower().replace(" ", "_") for col in df.columns]
df.rename(columns={'unnamed:_13': 'unnamed'}, inplace=True)
print(f"Columns: {df.columns.tolist()}\n")

Columns: ['species_id', 'park_name', 'category', 'order', 'family', 'scientific_name', 'common_names', 'record_status', 'occurrence', 'nativeness', 'abundance', 'seasonality', 'conservation_status', 'unnamed']



In [300]:
print(df['unnamed'].unique())

[nan 'Endangered' 'Threatened' 'Species of Concern']


In [301]:
# Filter rows where 'Unnamed: 13' is not NaN
filtered_df = df[df['unnamed'].notna()]
filtered_df

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
6441,BISC-1026,Biscayne National Park,Mammal,Sirenia,Trichechidae,Trichechus manatus,Manatee,Manati,Approved,Present,Unknown,Unknown,,Endangered
31786,EVER-1414,Everglades National Park,Reptile,Crocodilia,Crocodylidae,Crocodylus acutus,American Crocodile,Cocodrilo De Tumbes,Approved,Present,Native,Uncommon,Resident,Threatened
31826,EVER-1454,Everglades National Park,Reptile,Testudines,Cheloniidae,Caretta caretta,Loggerhead,Cabezon,Approved,Present,Native,Rare,Breeder,Threatened
44733,GRSA-1136,Great Sand Dunes National Park and Preserve,Bird,Falconiformes,Falconidae,Falco columbarius,Merlin,Pigeon Hawk,Approved,Present,Native,Rare,Resident,Species of Concern
44944,GRSA-1347,Great Sand Dunes National Park and Preserve,Vascular Plant,Asparagales,Iridaceae,Iris missouriensis,Blue Flag,Wild Iris,Approved,Present,Native,Rare,,Species of Concern


In [302]:
print(df['conservation_status'].unique())

[nan 'Species of Concern' 'Endangered' 'In Recovery' 'Threatened'
 'Under Review' 'Proposed Threatened' 'Extinct' 'Proposed Endangered'
 'Resident' 'Breeder' 'Migratory']


#### *Discrepancies between `conservation_status` and `unnamed` need to be assessed later
It seems as if `unnamed` can be written to `conservation_status`

In [303]:
# Remove all text beyond " National Park" including variations like " and Preserve"
df['park_name'] = df['park_name'].str.replace(r' National Park.*', '', case=False, regex=True)

In [304]:
# Is the species-id unique?
condor_df = df[df['common_names'].str.contains('condor', case=False, na=False)]
condor_df

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
1779,ARCH-1070,Arches,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,
9334,BRCA-1087,Bryce Canyon,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,
10620,CANY-1087,Canyonlands,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
11841,CARE-1085,Capitol Reef,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
26319,DEVA-1234,Death Valley,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (False Report),Native,,,Endangered,
39444,GRBA-1122,Great Basin,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (Historical Report),Native,,,Endangered,
42096,GRCA-1121,Grand Canyon,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Common,Breeder,Endangered,
65258,JOTR-1085,Joshua Tree,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
72896,LAVO-1115,Lassen Volcanic,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (Historical Report),Native,,,Endangered,
77178,MEVE-1101,Mesa Verde,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,


`species_id` numeric is not unique for similar species but it probably should be

In [305]:
# Splitting species_id column at the hyphen to create a new `park_code` public key
df['park_code'] = df['species_id'].str.split('-').str[0]

# Reorder columns to place 'park_code' as the second column
cols = list(df.columns)  # Get the list of current columns
cols.insert(1, cols.pop(cols.index('park_code')))  # Move 'park_code' to the second position
df = df[cols]  # Reorder DataFrame

In [306]:
df.head()

Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
0,ACAD-1000,ACAD,Acadia,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,ACAD,Acadia,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,ACAD,Acadia,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,ACAD,Acadia,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,ACAD,Acadia,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


-----
&nbsp;
## Isolating Birds

In [307]:
birds = df[df.category == 'Bird']
print(f"Shape: {birds.shape}")
birds.head()

Shape: (14601, 15)


Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
55,ACAD-1055,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,,Species of Concern,
56,ACAD-1056,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,,
57,ACAD-1057,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,
58,ACAD-1058,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,
59,ACAD-1059,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,,


In [308]:
print(f"Unique values:\n\n{birds.nunique()}")

Unique values:

species_id             14601
park_code                 56
park_name                 56
category                   1
order                     24
family                    86
scientific_name         1436
common_names            1550
record_status             10
occurrence                 6
nativeness                 4
abundance                  7
seasonality               23
conservation_status        8
unnamed                    1
dtype: int64


In [309]:
print(birds['conservation_status'].unique())

['Species of Concern' nan 'In Recovery' 'Threatened' 'Endangered'
 'Under Review' 'Breeder' 'Resident' 'Proposed Endangered']


In [310]:
print(f"NaN count: {birds.conservation_status.isna().sum()}\n")
birds.groupby("conservation_status").size()

NaN count: 11970



conservation_status
Breeder                   4
Endangered               64
In Recovery              65
Proposed Endangered       2
Resident                  4
Species of Concern     2371
Threatened               51
Under Review             70
dtype: int64

### `conservation_status`

- `Endangered`: seriously at risk of extinction
- `In Recovery`: formerly `Endangered`, but currently not in danger of extinction throughout all or a significant portion of its range
- `Threatened`: vulnerable to endangerment in the near future
- `Species of Concern`: declining or appear to be in need of conservation
- `Under Review`: ?
- `Resident`: ?
- `Breeder`: ?

In [311]:
# Does resident and breeder come under `seasonality`?
print(birds.seasonality.unique())

[nan 'Breeder' 'Vagrant' 'Resident' 'Migratory' 'Breeder, Migratory'
 'Breeder, Summer' 'Migratory, Vagrant' 'Migratory, Winter'
 'Breeder, Winter' 'Winter' 'Resident, Winter' 'Summer, Vagrant'
 'Breeder, Resident, Summer' 'Resident, Summer' 'Summer'
 'Breeder, Resident, Migratory, Summer'
 'Breeder, Resident, Summer, Winter' 'Migratory, Summer' 'Winter, Vagrant'
 'Rare' 'Uncommon' 'Common' 'Breeder, Resident']


#### `Resident` and `Breeder` belong in `seasonality`. 
Seems wrong as a `conservation_status` and could be the result of Human Error. Investigating the unique values in the other columns

In [312]:
keywords = ['Breeder', 'Resident']
pattern = '|'.join(keywords)  

# Filter the DataFrame for rows where column contains any of the keywords
matching_df = birds[birds['conservation_status'].str.contains(pattern, case=False, na=False)]
matching_df

Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
44666,GRSA-1069,GRSA,Great Sand Dunes,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,Goshawk,Northern Goshawk,Approved,Present,Native,Rare,Breeder,
44678,GRSA-1081,GRSA,Great Sand Dunes,Bird,Anseriformes,Anatidae,Anas acuta,Pintail,Northern Pintail,Approved,Present,Native,Rare,Resident,
44705,GRSA-1108,GRSA,Great Sand Dunes,Bird,Caprimulgiformes,Caprimulgidae,Phalaenoptilus nuttallii,Poor-Will,Common Poorwill,Approved,Present,Native,Uncommon,Breeder,
44733,GRSA-1136,GRSA,Great Sand Dunes,Bird,Falconiformes,Falconidae,Falco columbarius,Merlin,Pigeon Hawk,Approved,Present,Native,Rare,Resident,Species of Concern
44744,GRSA-1147,GRSA,Great Sand Dunes,Bird,Passeriformes,Aegithalidae,Psaltriparus minimus,Common Bushtit,Bushtit,Approved,Present,Native,Uncommon,Breeder,
44759,GRSA-1162,GRSA,Great Sand Dunes,Bird,Passeriformes,Corvidae,Corvus brachyrhynchos,Common Crow,American Crow,Approved,Present,Native,Rare,Resident,
44818,GRSA-1221,GRSA,Great Sand Dunes,Bird,Passeriformes,Mimidae,Dumetella carolinensis,Gray Catbird,Catbird,Approved,Present,Native,Rare,Resident,
44859,GRSA-1262,GRSA,Great Sand Dunes,Bird,Passeriformes,Turdidae,Turdus migratorius,American Robin,Robin,Approved,Present,Native,Common,Breeder,


In [313]:
print(birds.abundance.unique())

['Uncommon' 'Common' 'Occasional' nan 'Rare' 'Unknown' 'Abundant' 'Native']


In [314]:
# Count unique values where 'abundance' is 'Native'
unique_native_abundance = birds[birds['abundance'] == 'Native']['species_id'].nunique()
print(f"Unique records where abundance is 'Native': {unique_native_abundance}")

Unique records where abundance is 'Native': 8


In [315]:
print(birds.nativeness.unique())

['Native' 'Unknown' 'Not Native' nan 'Present']


In [316]:
# Count unique values where 'nativeness' is 'Present'
unique_present_nativeness = birds[birds['nativeness'] == 'Present']['species_id'].nunique()
print(f"Unique records where nativeness is 'Present': {unique_present_nativeness}")

Unique records where nativeness is 'Present': 8


In [317]:
print(birds.record_status.unique())

['Approved' 'In Review' ' Northern Goshawk' ' Northern Pintail'
 ' Common Poorwill' ' Pigeon Hawk' ' Bushtit' ' American Crow' ' Catbird'
 ' Robin']


#### Discrepancies in these records seem to be the result of human error
We need to remove the common_name values in `record_status` and shift all columns to the right, one column left

In [318]:
# indices that need correction are based on 'matching_df'
indices_to_shift = matching_df.index

# Define the column start position for shifting
start_pos = matching_df.columns.get_loc('record_status')

# Create a shifted version of the relevant subset of matching_df
# Shifting is done by slicing the DataFrame to remove the start position column, and appending NaN at the end
shifted_df = matching_df.loc[indices_to_shift].apply(
    lambda row: pd.Series(np.append(row[start_pos + 1:].values, pd.NA), index=row[start_pos:].index), axis=1
)

# Combine the unchanged part of the rows with the shifted part
matching_df.loc[indices_to_shift, matching_df.columns[start_pos:]] = shifted_df

# Apply the same changes to the parent and child DataFrames using indices
df.loc[indices_to_shift] = matching_df.loc[indices_to_shift]
birds.loc[indices_to_shift] = matching_df.loc[indices_to_shift]

In [319]:
columns = ['conservation_status', 'abundance', 'nativeness', 'record_status', 'occurrence', 'unnamed']
for column in columns:
    print(f"{column.title()}:\n{birds[column].unique()}")

Conservation_Status:
['Species of Concern' nan 'In Recovery' 'Threatened' 'Endangered'
 'Under Review' 'Proposed Endangered']
Abundance:
['Uncommon' 'Common' 'Occasional' nan 'Rare' 'Unknown' 'Abundant']
Nativeness:
['Native' 'Unknown' 'Not Native' nan]
Record_Status:
['Approved' 'In Review']
Occurrence:
['Present' 'Not Confirmed' 'Not Present (Historical Report)'
 'Not Present (False Report)' nan 'Not Present']
Unnamed:
[nan <NA>]


In [320]:
# Dropping unnamed
birds = birds.drop(columns=['unnamed'])

#### Dealing with `seasonality` column

In [321]:
print(birds.seasonality.unique())

[nan 'Breeder' 'Vagrant' 'Resident' 'Migratory' 'Breeder, Migratory'
 'Breeder, Summer' 'Migratory, Vagrant' 'Migratory, Winter'
 'Breeder, Winter' 'Winter' 'Resident, Winter' 'Summer, Vagrant'
 'Breeder, Resident, Summer' 'Resident, Summer' 'Summer'
 'Breeder, Resident, Migratory, Summer'
 'Breeder, Resident, Summer, Winter' 'Migratory, Summer' 'Winter, Vagrant'
 'Breeder, Resident']


In [322]:
print(f"NaN count: {birds.seasonality.isna().sum()}\n")
birds.groupby("seasonality").size()

NaN count: 3502



seasonality
Breeder                                 5005
Breeder, Migratory                         4
Breeder, Resident                          9
Breeder, Resident, Migratory, Summer       1
Breeder, Resident, Summer                  3
Breeder, Resident, Summer, Winter          1
Breeder, Summer                            2
Breeder, Winter                           64
Migratory                               2726
Migratory, Summer                          2
Migratory, Vagrant                         4
Migratory, Winter                         23
Resident                                1632
Resident, Summer                           2
Resident, Winter                           9
Summer                                    25
Summer, Vagrant                            3
Vagrant                                 1563
Winter                                    20
Winter, Vagrant                            1
dtype: int64

There's not enough records to warrant this variation. Change to `Breeder`, `Migratory`, `Unknown`

In [323]:
# Fill NaN values with 'Unknown'
birds['seasonality'] = birds['seasonality'].fillna('Unknown')

# Define keywords in the order of priority
priority_keywords = ['Winter', 'Summer', 'Breeder', 'Migratory', 'Resident', 'Vagrant']

def simplify_seasonality(value):
    for keyword in priority_keywords:
        if keyword in value:
            return keyword
    return 'Unknown'

# Apply the classification function to simplify seasonality to one word
birds['seasonality'] = birds['seasonality'].apply(simplify_seasonality)

# Display the updated counts for verification
seasonality_counts = birds['seasonality'].value_counts()
print(seasonality_counts)

seasonality
Breeder      5018
Unknown      3502
Migratory    2730
Resident     1632
Vagrant      1563
Winter        118
Summer         38
Name: count, dtype: int64


----
&nbsp;
### Dealing with `NaN` values and conversion to categorical columns

#### `conservation_status` as per [IUCN](https://en.wikipedia.org/wiki/IUCN_Red_List) Red List
We fill nan with `Least Concern` meaning 'non-protected' where all other values mean 'protected'

In [324]:
# Define the fill values for each column
fill_values = {
    'conservation_status': 'Least Concern',
    'abundance': 'Unknown',
    'nativeness': 'Unknown',
    'occurrence': 'Not Confirmed'
}

# Fill NaN values in the specified columns
birds['conservation_status'] = birds['conservation_status'].fillna(fill_values['conservation_status'])
birds['abundance'] = birds['abundance'].fillna(fill_values['abundance'])
birds['nativeness'] = birds['nativeness'].fillna(fill_values['nativeness'])
birds['occurrence'] = birds['occurrence'].fillna(fill_values['occurrence'])

In [325]:
# Define the ordered categories for each column
conservation_status_order = ['Least Concern', 'Species of Concern', 'In Recovery', 'Under Review', 'Threatened', 'Proposed Endangered', 'Endangered']
abundance_order = ['Rare', 'Uncommon', 'Unknown', 'Occasional', 'Common', 'Abundant']
nativeness_order = ['Not Native', 'Unknown', 'Native']
record_status_order = ['In Review', 'Approved']
occurrence_order = ['Not Present (False Report)', 'Not Present (Historical Report)', 'Not Present', 'Not Confirmed', 'Present']

In [326]:
# Covert columns to categorical
birds['record_status'] = pd.Categorical(birds['record_status'], categories=record_status_order, ordered=True)
birds['occurrence'] = pd.Categorical(birds['occurrence'], categories=occurrence_order, ordered=True)
birds['nativeness'] = pd.Categorical(birds['nativeness'], categories=nativeness_order, ordered=True)
birds['abundance'] = pd.Categorical(birds['abundance'], categories=abundance_order, ordered=True)
birds['conservation_status'] = pd.Categorical(birds['conservation_status'], categories=conservation_status_order, ordered=True)

# add boolean column 'is_protected'
birds['is_protected'] = birds.conservation_status != 'Least Concern'

# Dropping category as all birds
birds = birds.drop(columns=['category'])

In [327]:
print(birds.dtypes) 
birds.head() 

species_id               object
park_code                object
park_name                object
order                    object
family                   object
scientific_name          object
common_names             object
record_status          category
occurrence             category
nativeness             category
abundance              category
seasonality              object
conservation_status    category
is_protected               bool
dtype: object


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,Least Concern,False
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False


----
&nbsp;
## Classifying Species Records


In [328]:
print(f"There are {birds.species_id.nunique()} records")
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 14601 records
There are 1550 unique common names
There are 1436 unique scientific names
There are 86 unique families


In [329]:
# This dataset is well formatted with zero lower case values
lowercase_count = (birds.map(lambda x: isinstance(x, str) and x.islower())).sum().sum()
print(lowercase_count)

0


In [330]:
# Searching for punctuation in scientific name
punctuation_pattern = r"[^\w\s,]"
punctuation_matches = birds[birds['scientific_name'].str.contains(punctuation_pattern, na=False)]

# Display the results
print(f"Found {len(punctuation_matches)} scientific names with punctuation:")
punctuation_matches[['scientific_name', 'common_names']]

Found 1 scientific names with punctuation:


Unnamed: 0,scientific_name,common_names
30773,Tyrannus melancholicus/couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati..."


In [331]:
def strip_punctuation(text):
    return re.sub(punctuation_pattern, ' ', text)

# Apply the function to the 'scientific_name' column
birds['scientific_name'] = birds['scientific_name'].apply(strip_punctuation)

In [332]:
print(f"Scientific name NaN count: {birds.scientific_name.isna().sum()}")
print(f"Common Name NaN count: {birds.common_names.isna().sum()}")

Scientific name NaN count: 0
Common Name NaN count: 280


We aim to:

Extract Unique Scientific Names into two groups:

    - Single Names: Scientific names listing only Genus
    - Standard Names: Scientific names with two words (genus and species).
    - Extended Names: Scientific names with more than two words (includes subspecies).

Attach Common Name Counts to Each Scientific Name:

    - Create a dictionary that maps each scientific name to the counts of its associated common names.

Determine an ‘Ultimate Common Name’:

    - Based on the frequency of common names and possible ambiguities, establish a preferred common name for each scientific name.

In [333]:
sci_name_set = set(birds.scientific_name)

# Separate scientific names into standard and extended based on word count
single_sci_names = {name for name in sci_name_set if len(name.split()) == 1}
standard_sci_names = {name for name in sci_name_set if len(name.split()) == 2}
extended_sci_names = {name for name in sci_name_set if len(name.split()) > 2}

print("Single Scientific Name Count (1 word):", len(single_sci_names))
print("Standard Scientific Names Count (2 words):", len(standard_sci_names))
print("Extended Scientific Names Count (> 2 words):", len(extended_sci_names))

Single Scientific Name Count (1 word): 64
Standard Scientific Names Count (2 words): 998
Extended Scientific Names Count (> 2 words): 374


----
&nbsp;
### Single `scientific_name`

In [334]:
results = process_scientific_names(birds, condition=1)

Single Scientific Name Count (1 word): 64
Scientific names with no associated common names: 7
Scientific names with multiple associated common names: 9
Scientific names with a single associated common name: 48


In [335]:
# Filter the birds DataFrame to get records with a single scientific name
single_sci_name_records = birds[birds['scientific_name'].isin(single_sci_names)]

print(f"Shape: {single_sci_name_records.shape}")
print(single_sci_name_records['record_status'].value_counts())

single_sci_name_records_common = single_sci_name_records.common_names.tolist()
print(f"Assoc common names: {single_sci_name_records_common}")
single_sci_name_records.head()

Shape: (100, 14)
record_status
In Review    99
Approved      1
Name: count, dtype: int64
Assoc common names: ['Nighthawks', 'Meadowlarks', 'Warbler', 'Starling', 'Woodpecker', 'Coots, Rails, Waterhens', 'Empidonax Sp.', 'Eagle', 'Hummingbird', 'Falconiforms, Falcons', 'Barn Owl', nan, 'Bird Hawks', 'Dabbling Ducks', 'Diving Ducks', 'Rufous Hummingbirds', 'Nighthawks', "Gulls, Ivory Gulls, Kittiwakes, Ross' Gulls, Sabine's Gulls", 'Stints', 'Dowitchers', 'Coots', 'Scrub Jays', 'Crows', 'Rough-Winged Swallow', 'Cowbirds', 'Mockingbirds', 'Pipits', 'Phainopeplas', 'Empidonax Flycatchers', 'Kingbirds', 'Diving Ducks', 'Goldeneyes', 'Scoters', 'Greater Mergansers', "Gulls, Ivory Gulls, Kittiwakes, Ross' Gulls, Sabine's Gulls", 'Western Grebe', 'Jaegers', 'Saw-Whet Owls', 'Barn-Owls', 'Dowitcher', 'Rufous Hummingbirds', nan, nan, 'Diving Ducks', 'Goldeneyes', 'Swans', 'Alcids, Auks, Gulls, Oystercatchers, Plovers, Shore Birds', 'Dowitcher', 'Jaegers', 'Falconiforms, Falcons', 'Ptarmigans', '

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
4301,BIBE-1155,BIBE,Big Bend,Caprimulgiformes,Caprimulgidae,Chordeiles,Nighthawks,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
4459,BIBE-1313,BIBE,Big Bend,Passeriformes,Icteridae,Sturnella,Meadowlarks,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
6579,BISC-1164,BISC,Biscayne,Passeriformes,Parulidae,Dendroica,Warbler,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
6615,BISC-1200,BISC,Biscayne,Passeriformes,Sturnidae,Sturnus,Starling,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
6660,BISC-1245,BISC,Biscayne,Piciformes,Picidae,Picoides,Woodpecker,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


It seems reasonable to drop the single name records for several reasons:

1. **Generic Common Names:** The common names listed are very generic and often refer to groups or types rather than specific species, which can lead to ambiguity and confusion.

2. **Review Status:** The fact that all but one of these records are marked as “In Review” suggests that these records are not finalized and are potentially under investigation or pending confirmation.

3. **Data Quality and Relevance:** The goal is to create a well-defined OLAP database with precise species information.


In [336]:
# Save the single scientific name records to a CSV for backup
single_sci_name_records.to_csv('DATA/Backups/single_sci_name_birds.csv', index=True)

# Drop the records with single scientific names from the birds DataFrame in place
birds.drop(birds[birds['scientific_name'].isin(single_sci_names)].index, inplace=True)

In [337]:
sci_name_set = set(birds.scientific_name)
single_sci_names = {name for name in sci_name_set if len(name.split()) == 1}
assert single_sci_names == set(), "single_sci_names is not an empty set as required"

----
&nbsp;
### Standard `scientific_name`

In [338]:
results2 = process_scientific_names(birds, condition=2)
no_common_names = results2['no_common_names']
multiple_common_names = results2['multiple_common_names']
single_common_names = results2['single_common_names']

Standard Scientific Names Count (2 words): 998
Scientific names with no associated common names: 36
Scientific names with multiple associated common names: 371
Scientific names with a single associated common name: 591


In [339]:
# Extract all records from birds where scientific_name is in no_common_names
no_common_names_records = birds[birds['scientific_name'].isin(no_common_names)]
print(f"Shape: {no_common_names_records.shape}")
print(no_common_names_records['record_status'].value_counts())
no_common_names_records.head()

Shape: (41, 14)
record_status
In Review    38
Approved      3
Name: count, dtype: int64


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
26320,DEVA-1235,DEVA,Death Valley,Accipitriformes,Cathartidae,Pseudogryphus californianus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26367,DEVA-1282,DEVA,Death Valley,Apodiformes,Apodidae,Nephoecetes niger,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26472,DEVA-1387,DEVA,Death Valley,Gaviiformes,Gaviidae,Urinator imber,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26473,DEVA-1388,DEVA,Death Valley,Gaviiformes,Gaviidae,Urinator lumme,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26474,DEVA-1389,DEVA,Death Valley,Gaviiformes,Gaviidae,Urinator pacificus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


In [340]:
# Save the no_common_names_records to a CSV for backup
no_common_names_records.to_csv('DATA/Backups/no_common_names_birds.csv', index=True)

In [341]:
# Filter records with 'record_status' == 'Approved'
approved_records = no_common_names_records[no_common_names_records['record_status'] == 'Approved']
print(approved_records.scientific_name.unique())
approved_records

['Eromophila alpestris' 'Glaucidium californicum' 'Geothlypis tolomiei']


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
37569,GLBA-1204,GLBA,Glacier Bay,Passeriformes,Alaudidae,Eromophila alpestris,,Approved,Present,Native,Rare,Migratory,Least Concern,False
86453,PINN-1226,PINN,Pinnacles,Strigiformes,Strigidae,Glaucidium californicum,,Approved,Not Confirmed,Native,Unknown,Unknown,Least Concern,False
111476,YELL-1078,YELL,Yellowstone,Passeriformes,Parulidae,Geothlypis tolomiei,,Approved,Present,Native,Uncommon,Breeder,Least Concern,False


In [342]:
# Extract genera of approved records
approved_genera = approved_records['scientific_name'].apply(lambda x: x.split()[0]).unique()

# Create separate DataFrames for each genus using a dictionary comprehension
genus_dataframes = {genus: birds[birds['scientific_name'].str.startswith(genus)] for genus in approved_genera}

# Accessing each DataFrame by genus name
glaucidium_df = genus_dataframes.get('Glaucidium')
geothlypis_df = genus_dataframes.get('Geothlypis')
eromophila_df = genus_dataframes.get('Eromophila')

- *Geothlypis tolomiei* not being assigned a `common_name` seems to be the result of a typo
- *Glaucidium californicum* could be the result of ambiguous naming of the [`Northern/Mountain Pygmy Owl`](https://en.wikipedia.org/wiki/Northern_pygmy_owl)

In [343]:
geothlypis_df = geothlypis_df[geothlypis_df['scientific_name'] == 'Geothlypis tolmiei']
geothlypis_df[['scientific_name', 'common_names', 'record_status']].head()

Unnamed: 0,scientific_name,common_names,record_status
8329,Geothlypis tolmiei,Macgillivray's Warbler,Approved
26637,Geothlypis tolmiei,Macgillivray's Warbler,Approved
42370,Geothlypis tolmiei,Macgillivray's Warbler,Approved
65428,Geothlypis tolmiei,Macgillivray's Warbler,Approved
115621,Geothlypis tolmiei,Macgillivray's Warbler,Approved


In [344]:
glaucidium_df = glaucidium_df[glaucidium_df['common_names'] == 'Mountain Pygmy Owl, Northern Pygmy-Owl']
glaucidium_df[['scientific_name', 'common_names', 'record_status']]

Unnamed: 0,scientific_name,common_names,record_status
52505,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
73119,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
77396,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
99240,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
111801,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved


In [345]:
# Also possibly a typo
eromophila_df

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
37569,GLBA-1204,GLBA,Glacier Bay,Passeriformes,Alaudidae,Eromophila alpestris,,Approved,Present,Native,Rare,Migratory,Least Concern,False
39528,GRBA-1206,GRBA,Great Basin,Passeriformes,Alaudidae,Eromophila alpestris,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


In [346]:
all_sci_names = birds['scientific_name'].unique()
potential_matches = {}

for sci_name in no_common_names:
    # Use fuzzy matching to find the best matches in the list of all scientific names
    matches = process.extract(sci_name, all_sci_names, scorer=fuzz.ratio, limit=5)  # Adjust limit as needed
    
    # Filter matches to include only those with a similarity score over 90
    high_quality_matches = [match for match in matches if match[1] > 90]
    if len(high_quality_matches) > 1:
        # Retrieve common names for the matching scientific names
        common_names = birds[birds['scientific_name'].isin([match[0] for match in high_quality_matches])]['common_names'].dropna()

        common_names_counter = Counter([name.strip() for names in common_names for name in names.split(',')])
        most_common_name = common_names_counter.most_common(1)
        
        potential_matches[sci_name] = {
            'matches': high_quality_matches,
            'most_common_name': most_common_name[0][0] if most_common_name else 'No common name found'
        }

for sci_name, info in potential_matches.items():
    print(f"\nPotential matches for '{sci_name}':")
    for match in info['matches']:
        print(f"  - {match[0]} (Similarity Score: {match[1]})")
    print(f"Most common common name: {info['most_common_name']}")


Potential matches for 'Geothlypis tolomiei':
  - Geothlypis tolomiei (Similarity Score: 100)
  - Geothlypis tolmiei (Similarity Score: 97)
Most common common name: Macgillivray's Warbler

Potential matches for 'Peucaea cassini':
  - Peucaea cassini (Similarity Score: 100)
  - Peucaea cassinii (Similarity Score: 97)
  - Peucaea casinii (Similarity Score: 93)
Most common common name: Cassin's Sparrow

Potential matches for 'Lophortyx californica':
  - Lophortyx californica (Similarity Score: 100)
  - Lophortyx californicus (Similarity Score: 93)
Most common common name: California Quail

Potential matches for 'Eromophila alpestris':
  - Eromophila alpestris (Similarity Score: 100)
  - Eremophila alpestris (Similarity Score: 95)
Most common common name: Horned Lark


In [347]:
peucaea = birds[birds['scientific_name'].isin(['Peucaea casinii', 'Peucaea cassinii', 'Peucaea cassini'])]
peucaea

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
26551,DEVA-1466,DEVA,Death Valley,Passeriformes,Emberizidae,Peucaea cassini,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
42281,GRCA-1306,GRCA,Grand Canyon,Passeriformes,Emberizidae,Peucaea cassinii,Cassin's Sparrow,Approved,Not Present,Native,Unknown,Vagrant,Least Concern,False
65240,JOTR-1067,JOTR,Joshua Tree,Passeriformes,Emberizidae,Peucaea casinii,Cassin'a Sparrow,Approved,Not Confirmed,Native,Unknown,Unknown,Least Concern,False


In [348]:
# Corrected scientific names mapping based on investigation
corrected_names = {
    'Lophortyx californica': 'Lophortyx californicus',  
    'Eromophila alpestris': 'Eremophila alpestris',
    'Peucaea cassini': 'Peucaea cassinii', 
    'Peucaea casinii': 'Peucaea cassinii',
    'Geothlypis tolomiei': 'Geothlypis tolmiei',
    'Glaucidium californicum': 'Glaucidium gnoma'
}

# Corresponding common names for the corrected scientific names
common_name_updates = {
    'Lophortyx californicus': 'California Quail',
    'Eremophila alpestris': 'Horned Lark',
    'Peucaea cassinii': "Cassin's Sparrow",  # Ensure this is correctly assigned
    'Geothlypis tolmiei': "Macgillivray's Warbler",
    'Glaucidium gnoma': 'Mountain Pygmy Owl, Northern Pygmy-Owl'
}

# Update scientific names in the birds DataFrame
for old_name, new_name in corrected_names.items():
    birds.loc[birds['scientific_name'] == old_name, 'scientific_name'] = new_name

# Update common names based on corrected scientific names
for sci_name, common_name in common_name_updates.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

birds_updated = birds[birds['scientific_name'].isin(corrected_names.values())]
birds_updated[['scientific_name', 'common_names', 'record_status']].head()

Unnamed: 0,scientific_name,common_names,record_status
207,Eremophila alpestris,Horned Lark,Approved
1825,Eremophila alpestris,Horned Lark,Approved
1969,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
2912,Eremophila alpestris,Horned Lark,Approved
4369,Eremophila alpestris,Horned Lark,Approved


In [349]:
results2 = process_scientific_names(birds, condition=2)
no_common_names = results2['no_common_names']
multiple_common_names = results2['multiple_common_names']
single_common_names = results2['single_common_names']

Standard Scientific Names Count (2 words): 992
Scientific names with no associated common names: 31
Scientific names with multiple associated common names: 369
Scientific names with a single associated common name: 592


In [350]:
# Extract all records from birds where scientific_name is in no_common_names
no_common_names_records = birds[birds['scientific_name'].isin(no_common_names)]
print(f"Shape: {no_common_names_records.shape}")
print(no_common_names_records['record_status'].value_counts())

Shape: (34, 14)
record_status
In Review    34
Approved      0
Name: count, dtype: int64


In [351]:
# Drop the records with no common names from the birds DataFrame in place
birds.drop(birds[birds['scientific_name'].isin(no_common_names)].index, inplace=True)

In [352]:
results2 = process_scientific_names(birds, condition=2)

Standard Scientific Names Count (2 words): 961
Scientific names with no associated common names: 0
Scientific names with multiple associated common names: 369
Scientific names with a single associated common name: 592


----
&nbsp;
#### Multiple `common_names`

In [353]:
print(f"Scientific names with multiple associated common names: {len(multiple_common_names)}\n")

# Display subset of scientific names with multiple common names
for sci_name, counts in multiple_common_names[:5]:
    print(f"{sci_name}: {counts}")

Scientific names with multiple associated common names: 369

Gavia adamsii: Counter({'Yellow-Billed Loon': 9, 'White-Billed Diver, Yellow-Billed Loon': 1})
Melanitta nigra: Counter({'Black Scoter': 13, 'Black Scoter, Common Scoter': 2, 'American Common Scoter, American Scoter, Black Scoter, Butter-Bill, Common Scoter': 1})
Branta bernicla: Counter({'Brant': 11, 'American Brant, Brant': 2, 'Brant, Brant Goose, Brent Goose': 2})
Myioborus pictus: Counter({'Painted Redstart': 9, 'Painted Redstart, Painted Whitestart': 1})
Stercorarius pomarinus: Counter({'Pomarine Jaeger': 15, 'Pomarine Jaeger, Pomarine Skua': 2})


In [354]:
# Apply the function to the multiple_common_names
standardized_name_mapping = standardize_common_names(multiple_common_names)

Total ties: 31


In [355]:
print(f"Length: {len(standardized_name_mapping)}, Type: {type(standardized_name_mapping)}")

Length: 369, Type: <class 'dict'>


In [356]:
# Extract the first elements (scientific names) from the list of tuples
multi_sci_names_list = [sci_name for sci_name, counts in multiple_common_names]
print(multi_sci_names_list[:5])

['Gavia adamsii', 'Melanitta nigra', 'Branta bernicla', 'Myioborus pictus', 'Stercorarius pomarinus']


In [357]:
# Return a subset of the birds DataFrame where scientific_name matches any in the list
multi_common_names_records = birds[birds['scientific_name'].isin(multi_sci_names_list)]
print(f"Shape: {multi_common_names_records.shape}")
multi_common_names_records.head()

Shape: (8650, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,Least Concern,False
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False
60,ACAD-1060,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo lagopus,"American Rough-Legged Hawk, Rough-Legged Hawk",Approved,Present,Native,Uncommon,Resident,Species of Concern,True


In [358]:
# Save the no_common_names_records to a CSV for backup
multi_common_names_records.to_csv('DATA/Backups/multi_common_names_birds.csv', index=True)

In [359]:
# Update the birds DataFrame with the standardized common names
for sci_name, common_name in standardized_name_mapping.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

In [360]:
results2 = process_scientific_names(birds, condition=2)

Standard Scientific Names Count (2 words): 961
Scientific names with no associated common names: 0
Scientific names with multiple associated common names: 0
Scientific names with a single associated common name: 961


In [361]:
print(f"There are {birds.species_id.nunique()} records")
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 14467 records
There are 927 unique common names
There are 1335 unique scientific names
There are 86 unique families


----
&nbsp;
### Extended `scientific_name`
Potential the most complicated records to fixed

In [362]:
results3 = process_scientific_names(birds, condition=3)
no_common_names = results3['no_common_names']
multiple_common_names = results3['multiple_common_names']
single_common_names = results3['single_common_names']

Extended Scientific Names Count (> 2 words): 374
Scientific names with no associated common names: 160
Scientific names with multiple associated common names: 29
Scientific names with a single associated common name: 185


In [363]:
# Extract all records from birds where scientific_name is in no_common_names
no_common_names_records = birds[birds['scientific_name'].isin(no_common_names)]
print(f"Shape: {no_common_names_records.shape}")
print(no_common_names_records['record_status'].value_counts())
no_common_names_records.head()

Shape: (163, 14)
record_status
Approved     96
In Review    67
Name: count, dtype: int64


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
19294,CRLA-1232,CRLA,Crater Lake,Passeriformes,Fringillidae,Coccothraustes vespertinus brooksi,,Approved,Not Confirmed,Native,Unknown,Unknown,Least Concern,False
19350,CRLA-1288,CRLA,Crater Lake,Passeriformes,Troglodytidae,Troglodytes troglodytes pacificus,,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
26298,DEVA-1213,DEVA,Death Valley,Passeriformes,Paridae,Parus inornatus griseus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26308,DEVA-1223,DEVA,Death Valley,Accipitriformes,Accipitridae,Buteo lineatus elegans,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26342,DEVA-1257,DEVA,Death Valley,Anseriformes,Anatidae,Branta canadensis occidentalis,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


In [364]:
# Save the no_common_names_records to a CSV for backup
no_common_names_records.to_csv('DATA/Backups/no_common_names_subspecies_birds.csv', index=True)

#### Handling [Subspecies](https://en.wikipedia.org/wiki/Subspecies) in Scientific Names

A subspecies is a taxonomic classification below species, representing populations of a species that are genetically distinct due to geographic or ecological factors. Subspecies names are often included in scientific naming as a third part, following the genus and species (e.g., *Buteo jamaicensis borealis*).

- **Matching on Genus and Species**: We extracted the genus and species parts of the scientific names and used fuzzy matching to find existing records with associated common names.
- **Updating Common Names**: For each match, the most appropriate common name was selected and updated to include the subspecies designation in parentheses (e.g., *Red-Tailed Hawk (borealis subspecies)*).

In [365]:
all_sci_names = birds['scientific_name'].unique()
potential_matches = {}

for sci_name in no_common_names:
    genus_species = ' '.join(sci_name.split()[:2])

    # Use fuzzy matching to find best matches in the list of all scientific names
    matches = process.extract(genus_species, all_sci_names, scorer=fuzz.ratio, limit=5)
    high_quality_matches = [match for match in matches if match[1] > 90]

    if high_quality_matches:
        # Retrieve common names for the matching scientific names
        matched_sci_names = [match[0] for match in high_quality_matches]
        common_names = birds[birds['scientific_name'].isin(matched_sci_names)]['common_names'].dropna()

        # Count occurrences of each common name
        common_names_counter = Counter([name.strip() for names in common_names for name in names.split(',')])
        most_common_name = common_names_counter.most_common(1)

        # Store potential matches and the most common name found
        potential_matches[sci_name] = {
            'matches': high_quality_matches,
            'most_common_name': most_common_name[0][0] if most_common_name else 'No common name found'
        }

print(f"Potential matches: {len(potential_matches)}")

# Subset the first 5 items from the potential_matches dictionary
matches_subset = dict(islice(potential_matches.items(), 5))
for sci_name, info in matches_subset.items():
    print(f"\nPotential matches for '{sci_name}':")
    for match in info['matches']:
        print(f"  - {match[0]} (Similarity Score: {match[1]})")
    print(f"Most common common name: {info['most_common_name']}")

Potential matches: 145

Potential matches for 'Poecile gambeli inyoensis':
  - Poecile gambeli (Similarity Score: 100)
Most common common name: Mountain Chickadee

Potential matches for 'Mimus polyglottos polyglottos':
  - Mimus polyglottos (Similarity Score: 100)
Most common common name: Northern Mockingbird

Potential matches for 'Vireo griseus griseus':
  - Vireo griseus (Similarity Score: 100)
Most common common name: White-Eyed Vireo

Potential matches for 'Agelaius phoeniceus phoeniceus':
  - Agelaius phoeniceus (Similarity Score: 100)
Most common common name: Red-Winged Blackbird

Potential matches for 'Carpodacus mexicanus solitudinus':
  - Carpodacus mexicanus (Similarity Score: 100)
Most common common name: House Finch


In [366]:
updated_common_names = {}

for sci_name, info in potential_matches.items():
    most_common_name = info['most_common_name']
    subspecies = sci_name.split()[2]  
    updated_name = f"{most_common_name} ({subspecies} subspecies)"
    updated_common_names[sci_name] = updated_name

# Update the birds DataFrame with the new common names
for sci_name, common_name in updated_common_names.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

In [367]:
results3 = process_scientific_names(birds, condition=3)
no_common_names = results3['no_common_names']

Extended Scientific Names Count (> 2 words): 374
Scientific names with no associated common names: 15
Scientific names with multiple associated common names: 29
Scientific names with a single associated common name: 330


In [368]:
no_common_names_records = birds[birds['scientific_name'].isin(no_common_names)]
print(f"Shape: {no_common_names_records.shape}")
print(no_common_names_records['record_status'].value_counts())
no_common_names_records

Shape: (15, 14)
record_status
In Review    14
Approved      1
Name: count, dtype: int64


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
26433,DEVA-1348,DEVA,Death Valley,Charadriiformes,Scolopacidae,Symphemia semipalmata inornata,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26489,DEVA-1404,DEVA,Death Valley,Passeriformes,Alaudidae,Otocoris alpestris arenicola,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26490,DEVA-1405,DEVA,Death Valley,Passeriformes,Alaudidae,Otocoris alpestris chrysolaema,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26564,DEVA-1479,DEVA,Death Valley,Passeriformes,Emberizidae,Spizella monticola ochracea,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26568,DEVA-1483,DEVA,Death Valley,Passeriformes,Emberizidae,Spizella socialis arizonae,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26675,DEVA-1590,DEVA,Death Valley,Passeriformes,Parulidae,Sylvania pusilla pileolata,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26703,DEVA-1618,DEVA,Death Valley,Passeriformes,Troglodytidae,Thryothorus bewickii spilurus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26718,DEVA-1633,DEVA,Death Valley,Passeriformes,Turdidae,Turdus aonalaschkae auduboni,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26722,DEVA-1637,DEVA,Death Valley,Passeriformes,Turdidae,Turdus ustulatus swainsonii,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26759,DEVA-1674,DEVA,Death Valley,Passeriformes,Vireonidae,Vireosylva gilva swainsonii,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


In [369]:
# Dropping the remaining unnamed subspecies
birds.drop(birds[birds['scientific_name'].isin(no_common_names)].index, inplace=True)

In [370]:
results3 = process_scientific_names(birds, condition=3)
multiple_common_names = results3['multiple_common_names']

Extended Scientific Names Count (> 2 words): 359
Scientific names with no associated common names: 0
Scientific names with multiple associated common names: 29
Scientific names with a single associated common name: 330


----
&nbsp;
#### Multiple `common_names`

In [371]:
print(f"Scientific names with multiple associated common names: {len(multiple_common_names)}\n")

Scientific names with multiple associated common names: 29



In [372]:
standardized_names_mapping = standardize_common_names_subspecies(multiple_common_names)

Total ties: 11


In [373]:
print(f"Length: {len(standardized_names_mapping)}, Type: {type(standardized_names_mapping)}")

# Extract the first elements (scientific names) from the list of tuples
multi_sci_names_list = [sci_name for sci_name, counts in multiple_common_names]
print(multi_sci_names_list[:5])

Length: 29, Type: <class 'dict'>
['Junco hyemalis caniceps', 'Vermivora pinus X chrysoptera', 'Cinclus mexicanus unicolor', 'Bonasa umbellus sabini', 'Junco hyemalis mearnsi']


In [374]:
# Return a subset of the birds DataFrame where scientific_name matches any in the list
multi_common_names_records = birds[birds['scientific_name'].isin(multi_sci_names_list)]
print(f"Shape: {multi_common_names_records.shape}")
multi_common_names_records.head()

Shape: (123, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
1847,ARCH-1138,ARCH,Arches,Passeriformes,Emberizidae,Junco hyemalis caniceps,Dark-Eyed Junco (Gray-Headed),Approved,Present,Native,Common,Migratory,Least Concern,False
1848,ARCH-1139,ARCH,Arches,Passeriformes,Emberizidae,Junco hyemalis hyemalis,Dark-Eyed Junco (Slate-Colored),Approved,Present,Native,Rare,Resident,Least Concern,False
1849,ARCH-1140,ARCH,Arches,Passeriformes,Emberizidae,Junco hyemalis mearnsi,Dark-Eyed Junco (Pink-Sided),Approved,Present,Native,Common,Resident,Least Concern,False
1850,ARCH-1141,ARCH,Arches,Passeriformes,Emberizidae,Junco hyemalis oreganus,Dark-Eyed Junco (Oregon),Approved,Present,Native,Common,Resident,Least Concern,False
1935,ARCH-1226,ARCH,Arches,Passeriformes,Tyrannidae,Empidonax traillii extimus,Southwest Willow Flycatcher,Approved,Not Confirmed,Native,Unknown,Unknown,Endangered,True


In [375]:
# Update the birds DataFrame with the standardized common names
for sci_name, common_name in standardized_names_mapping.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

In [376]:
results3 = process_scientific_names(birds, condition=3)
single_common_names = results3['single_common_names']

Extended Scientific Names Count (> 2 words): 359
Scientific names with no associated common names: 0
Scientific names with multiple associated common names: 0
Scientific names with a single associated common name: 359


----
&nbsp;
#### Remaining NaN records in `birds`

In [377]:
print(f"Scientific name NaN count: {birds.scientific_name.isna().sum()}")
print(f"Common Name NaN count: {birds.common_names.isna().sum()}")

Scientific name NaN count: 0
Common Name NaN count: 41


In [378]:
nan_common = birds[birds.common_names.isna()]
print(f"Shape: {nan_common.shape}")
nan_common.head()

Shape: (41, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
19285,CRLA-1223,CRLA,Crater Lake,Passeriformes,Emberizidae,Zonotrichia leucophrys pugetensis,,Approved,Present,Native,Uncommon,Migratory,Least Concern,False
19308,CRLA-1246,CRLA,Crater Lake,Passeriformes,Icteridae,Agelaius phoeniceus caurinus,,Approved,Not Confirmed,Native,Unknown,Unknown,Least Concern,False
19348,CRLA-1286,CRLA,Crater Lake,Passeriformes,Troglodytidae,Troglodytes aedon parkmanii,,Approved,Present,Native,Rare,Breeder,Least Concern,False
19355,CRLA-1293,CRLA,Crater Lake,Passeriformes,Turdidae,Myadestes townsendi townsendi,,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
19379,CRLA-1317,CRLA,Crater Lake,Piciformes,Picidae,Colaptes auratus collaris,,Approved,Present,Native,Common,Breeder,Least Concern,False


In [379]:
# Save unique scientific name in above df
nan_sci_names = nan_common.scientific_name.unique().tolist()

In [380]:
all_sci_names = birds['scientific_name'].unique()
potential_matches = {}

for sci_name in nan_sci_names:
    genus_species = ' '.join(sci_name.split()[:2])

    # Use fuzzy matching to find best matches in the list of all scientific names
    matches = process.extract(genus_species, all_sci_names, scorer=fuzz.ratio, limit=5)
    high_quality_matches = [match for match in matches if match[1] > 90]

    if high_quality_matches:
        # Retrieve common names for the matching scientific names
        matched_sci_names = [match[0] for match in high_quality_matches]
        common_names = birds[birds['scientific_name'].isin(matched_sci_names)]['common_names'].dropna()

        # Count occurrences of each common name
        common_names_counter = Counter([name.strip() for names in common_names for name in names.split(',')])
        most_common_name = common_names_counter.most_common(1)

        # Store potential matches and the most common name found
        potential_matches[sci_name] = {
            'matches': high_quality_matches,
            'most_common_name': most_common_name[0][0] if most_common_name else 'No common name found'
        }

print(f"Potential matches: {len(potential_matches)}")

# Subset the first 5 items from the potential_matches dictionary
matches_subset = dict(islice(potential_matches.items(), 5))
for sci_name, info in matches_subset.items():
    print(f"\nPotential matches for '{sci_name}':")
    for match in info['matches']:
        print(f"  - {match[0]} (Similarity Score: {match[1]})")
    print(f"Most common common name: {info['most_common_name']}")

Potential matches: 37

Potential matches for 'Zonotrichia leucophrys pugetensis':
  - Zonotrichia leucophrys (Similarity Score: 100)
Most common common name: White-Crowned Sparrow

Potential matches for 'Agelaius phoeniceus caurinus':
  - Agelaius phoeniceus (Similarity Score: 100)
Most common common name: Red-Winged Blackbird

Potential matches for 'Troglodytes aedon parkmanii':
  - Troglodytes aedon (Similarity Score: 100)
Most common common name: House Wren

Potential matches for 'Myadestes townsendi townsendi':
  - Myadestes townsendi (Similarity Score: 100)
Most common common name: Townsend's Solitaire

Potential matches for 'Colaptes auratus collaris':
  - Colaptes auratus (Similarity Score: 100)
  - Calaptes auratus (Similarity Score: 94)
Most common common name: Northern Flicker


In [381]:
updated_common_names = {}

for sci_name, info in potential_matches.items():
    most_common_name = info['most_common_name']
    
    # Split the scientific name into parts
    sci_name_parts = sci_name.split()
    
    # Check if the scientific name includes a subspecies
    if len(sci_name_parts) > 2:
        subspecies = sci_name_parts[2]  # Get the subspecies part
        updated_name = f"{most_common_name} ({subspecies} subspecies)"
    else:
        # If no subspecies, just use the most common name
        updated_name = most_common_name

    updated_common_names[sci_name] = updated_name

# Update the birds DataFrame with the new common names
for sci_name, common_name in updated_common_names.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

In [382]:
# Check if there are still any NaN values in 'common_names'
print(f"Common Name NaN count: {birds.common_names.isna().sum()}")

Common Name NaN count: 0


----
&nbsp;
#### Single `common_names`
Searching for 'unusual' records

In [383]:
# Define thresholds and patterns for identifying unusual names
MAX_LENGTH = 50  # Define a threshold for very long common names
MAX_WORD_COUNT = 6  # Define a word count threshold
UNUSUAL_PUNCTUATION_PATTERN = r"[^a-zA-Z\s,\'\-\(\)/`´]"  # Allows letters, spaces, commas, apostrophes, and hyphens

long_names = birds[birds['common_names'].str.len() > MAX_LENGTH]
high_word_count = birds[birds['common_names'].str.split().apply(len) > MAX_WORD_COUNT]
unusual_punctuation = birds[birds['common_names'].str.contains(UNUSUAL_PUNCTUATION_PATTERN, na=False)]

unusual_common_names = pd.concat([long_names, high_word_count, unusual_punctuation]).drop_duplicates()

In [384]:
unusual_common_names

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
111,ACAD-1111,ACAD,Acadia,Charadriiformes,Alcidae,Cepphus grylle,"Black Guillemot, Sea Pigeon, Southern Black Gu...",Approved,Present,Native,Common,Breeder,Least Concern,False
26297,DEVA-1212,DEVA,Death Valley,Passeriformes,Parulidae,Dendroica aestiva,"Golden Warbler, Northern Yellow Warbler, Summe...",In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26777,DEVA-1692,DEVA,Death Valley,Piciformes,Picidae,Ceophloeus pileatus,"Black Woodcock, Logcock, Pileated Woodpecker, ...",In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
30773,DRTO-1249,DRTO,Dry Tortugas,Passeriformes,Tyrannidae,Tyrannus melancholicus couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati...",Approved,Present,Native,Uncommon,Migratory,Least Concern,False
39491,GRBA-1169,GRBA,Great Basin,Charadriiformes,Scolopacidae,Gallinago gallinago delicata,"Common Snipe, English Snipe, Jacksnipe, Wilson...",Approved,Present,Native,Uncommon,Unknown,Least Concern,False
63823,ISRO-1047,ISRO,Isle Royale,Anseriformes,Anatidae,Anas rubripes X platyrhynchos,"American Black Mallard, Mallard X Black Duck H...",Approved,Present,Native,Occasional,Unknown,Least Concern,False
19387,CRLA-1325,CRLA,Crater Lake,Piciformes,Picidae,Picoides tridactylus fasciatus,Northern 3-Toed Woodpecker,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
42377,GRCA-1402,GRCA,Grand Canyon,Passeriformes,Parulidae,Leiothlypis luciae,LucyS Warbler,Approved,Present,Native,Common,Breeder,Least Concern,False
58589,HAVO-1061,HAVO,Hawaii Volcanoes,Passeriformes,Fringillidae,Hemignathus wilsoni,´Akiap_L_´Au,Approved,Present,Native,Rare,Breeder,Least Concern,False
58605,HAVO-1077,HAVO,Hawaii Volcanoes,Passeriformes,Turdidae,Myadestes obscurus,"´_Ma´_, Hawai´I Thrush",Approved,Present,Native,Common,Breeder,Least Concern,False


In [385]:
# Change manually for genus species records and numeric/strange characters
birds.loc[111, 'common_names'] = 'Black Guillemot'
birds.loc[26297, 'common_names'] = "Yellow Warbler"
birds.loc[26777, 'common_names'] = "Pileated Woodpecker"
birds.loc[19387, 'common_names'] = 'Northern Three-Toed Woodpecker'
birds.loc[42377, 'common_names'] = "Lucy's Warbler"

In [386]:
tyrannus_mix = birds[birds['scientific_name'].str.contains("Tyrannus melancholicus couchii", na=False)]
tyrannus_mix

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
30773,DRTO-1249,DRTO,Dry Tortugas,Passeriformes,Tyrannidae,Tyrannus melancholicus couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati...",Approved,Present,Native,Uncommon,Migratory,Least Concern,False


The above record is an absolute mess

In [387]:
tyrannus1 = birds[birds['scientific_name'].str.contains("Tyrannus melancholicus", na=False)]
tyrannus1

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
4588,BIBE-1442,BIBE,Big Bend,Passeriformes,Tyrannidae,Tyrannus melancholicus,Tropical Kingbird,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
26742,DEVA-1657,DEVA,Death Valley,Passeriformes,Tyrannidae,Tyrannus melancholicus,Tropical Kingbird,Approved,Present,Native,Occasional,Vagrant,Least Concern,False
30773,DRTO-1249,DRTO,Dry Tortugas,Passeriformes,Tyrannidae,Tyrannus melancholicus couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati...",Approved,Present,Native,Uncommon,Migratory,Least Concern,False
88216,REDW-1574,REDW,Redwood,Passeriformes,Tyrannidae,Tyrannus melancholicus,Tropical Kingbird,Approved,Present,Not Native,Occasional,Vagrant,Least Concern,False


In [388]:
tyrannus2 = birds[birds['scientific_name'].str.contains("Tyrannus couchii", na=False)]
tyrannus2

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
4585,BIBE-1439,BIBE,Big Bend,Passeriformes,Tyrannidae,Tyrannus couchii,Couch's Kingbird,Approved,Present,Native,Rare,Breeder,Least Concern,False


In [389]:
# Drop the record with index 30773
birds.drop(index=30773, inplace=True)

# Verify the record has been dropped
assert not 30773 in birds.index, "30773 exists in birds df"

In [390]:
print(f"There are {birds.species_id.nunique()} records")
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 14451 records
There are 1062 unique common names
There are 1319 unique scientific names
There are 86 unique families


----
&nbsp;
### Finding typos under `scientific_name`, normalising subspecies

In [391]:
# Step 1: Generate the common name counts for each scientific name
sci_name_common_name_counts = {}

for sci_name in birds['scientific_name'].unique():
    # Extract common names associated with the current scientific name
    common_names = birds[birds['scientific_name'] == sci_name]['common_names']
    # Count occurrences of each common name
    all_common_names = [name.strip() for names in common_names.dropna() for name in names.split(',')]
    sci_name_common_name_counts[sci_name] = Counter(all_common_names)

In [392]:
print(len(sci_name_common_name_counts))

1319


##### Cross-referencing potential typos with the common name counts will provide a data-driven way to determine which scientific name is likely correct. 
We do not have domain expertise in taxonomic nomenclature or ornithology...

In [393]:
# DataFrame ordered by scientific name in ascending order
birds_sorted = birds.sort_values(by='scientific_name', ascending=True)

two_word_sci_names = birds_sorted[birds_sorted['scientific_name'].str.split().str.len() == 2]
three_or_more_word_sci_names = birds_sorted[birds_sorted['scientific_name'].str.split().str.len() >= 3]

In [394]:
print(f"Shape: {two_word_sci_names.shape}")
two_word_sci_names.head()

Shape: (13925, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
24922,DENA-1157,DENA,Denali,Passeriformes,Fringillidae,Acanthis flammea,Common Redpoll,Approved,Not Present (False Report),Native,Unknown,Unknown,Least Concern,False
26573,DEVA-1488,DEVA,Death Valley,Passeriformes,Fringillidae,Acanthis flammea,Common Redpoll,Approved,Not Present (False Report),Native,Unknown,Unknown,Least Concern,False
42299,GRCA-1324,GRCA,Grand Canyon,Passeriformes,Fringillidae,Acanthis flammea,Common Redpoll,Approved,Present,Native,Occasional,Vagrant,Least Concern,False
111658,YELL-1260,YELL,Yellowstone,Passeriformes,Fringillidae,Acanthus flammea,"Common Redpoll, Redpoll",Approved,Present,Native,Uncommon,Winter,Least Concern,False
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True


In [395]:
print(f"Shape: {three_or_more_word_sci_names.shape}")
three_or_more_word_sci_names.head()

Shape: (526, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
78390,MORA-1070,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Western Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
78391,MORA-1071,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Goshawk,Approved,Not Present (False Report),Native,Unknown,Unknown,Least Concern,False
61880,HOSP-1054,HOSP,Hot Springs,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Approved,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
39433,GRBA-1111,GRBA,Great Basin,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Approved,Present,Native,Unknown,Unknown,Least Concern,False
42133,GRCA-1158,GRCA,Grand Canyon,Apodiformes,Apodidae,Aeronautes saxatalis saxatalis,White-Throated Swift,Approved,Present,Native,Common,Breeder,Least Concern,False


----
&nbsp;
#### Reviewing `scientific_name` for multiple comma-separated entries under `common_names`

In [396]:
# Isolate records with comma-separated common names
comma_separated_common_names = two_word_sci_names[two_word_sci_names['common_names'].str.contains(',', na=False)]

print(f"Shape: {comma_separated_common_names.shape}")
comma_separated_common_names.head()

Shape: (153, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
111658,YELL-1260,YELL,Yellowstone,Passeriformes,Fringillidae,Acanthus flammea,"Common Redpoll, Redpoll",Approved,Present,Native,Uncommon,Winter,Least Concern,False
109,ACAD-1109,ACAD,Acadia,Charadriiformes,Alcidae,Alca torda,"Northern Razor-Billed Auk, Razorbill",Approved,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
31592,EVER-1220,EVER,Everglades,Passeriformes,Emberizidae,Ammodramus caudacutus,"Saltmarsh Sharp-Tailed Sparrow, Saltmarsh Sparrow",Approved,Present,Native,Uncommon,Resident,Least Concern,False
62041,HOSP-1215,HOSP,Hot Springs,Passeriformes,Emberizidae,Ammodramus caudacutus,"Saltmarsh Sharp-Tailed Sparrow, Saltmarsh Sparrow",Approved,Not Present (False Report),Unknown,Unknown,Unknown,Least Concern,False
227,ACAD-1227,ACAD,Acadia,Passeriformes,Emberizidae,Ammodramus caudacutus,"Saltmarsh Sharp-Tailed Sparrow, Saltmarsh Sparrow",Approved,Not Present (False Report),Unknown,Unknown,Unknown,Least Concern,False


In [397]:
subset_sci_names = comma_separated_common_names['scientific_name'].unique()
potential_matches = {}

for sci_name in subset_sci_names:
    # Extract genus and species from the scientific name
    genus_species = ' '.join(sci_name.split()[:2])

    # Use fuzzy matching to find best matches in the list of all scientific names
    matches = process.extract(genus_species, all_sci_names, scorer=fuzz.ratio, limit=5)
    high_quality_matches = [match for match in matches if match[1] > 90]

    if high_quality_matches:
        # Retrieve common names for the matching scientific names
        matched_sci_names = [match[0] for match in high_quality_matches]
        common_names = birds[birds['scientific_name'].isin(matched_sci_names)]['common_names'].dropna()

        # Count occurrences of each full common name as a whole string
        common_names_counter = Counter(common_names)

        # Only consider potential matches with multiple distinct common names
        if len(common_names_counter) > 1:
            most_common_name = common_names_counter.most_common(1)

            # Store potential matches and the most common name found, along with the counter
            potential_matches[sci_name] = {
                'matches': high_quality_matches,
                'most_common_name': most_common_name[0][0] if most_common_name else 'No common name found',
                'common_name_counts': common_names_counter  # Include the Counter object with counts of full strings
            }

In [398]:
print(f"Potential matches: {len(potential_matches)}")
for sci_name, info in potential_matches.items():
    print(f"\nPotential matches for '{sci_name}':")
    for match in info['matches']:
        print(f"  - {match[0]} (Similarity Score: {match[1]})")
    print(f"Most common common name: {info['most_common_name']}")
    print(f"Common name counts: {info['common_name_counts']}") 

Potential matches: 3

Potential matches for 'Acanthus flammea':
  - Acanthus flammea (Similarity Score: 100)
  - Acanthis flammea (Similarity Score: 94)
Most common common name: Common Redpoll
Common name counts: Counter({'Common Redpoll': 3, 'Common Redpoll, Redpoll': 1})

Potential matches for 'Butorides striata':
  - Butorides striata (Similarity Score: 100)
  - Butorides striatus (Similarity Score: 91)
Most common common name: Green-Backed Heron
Common name counts: Counter({'Green-Backed Heron': 5, 'Green-Backed Heron, Striated Heron': 1})

Potential matches for 'Charadrius vociferous':
  - Charadrius vociferous (Similarity Score: 100)
  - Charadrius vociferus (Similarity Score: 98)
Most common common name: Killdeer
Common name counts: Counter({'Killdeer': 50, 'Killdeer, Killdeer Plover': 1})


In [399]:
# Create mappings for scientific names and common names based on potential matches
scientific_name_mapping = {}
common_name_mapping = {}

for sci_name, info in potential_matches.items():
    # Assume the best match is the one with the highest count
    best_match_sci_name = info['matches'][1][0]  
    best_common_name = info['most_common_name']
    
    # Add to the mapping dictionaries
    scientific_name_mapping[sci_name] = best_match_sci_name
    common_name_mapping[sci_name] = best_common_name

print(f"Scientific Name Mapping: {scientific_name_mapping}")
print(f"Common Name Mapping: {common_name_mapping}")

Scientific Name Mapping: {'Acanthus flammea': 'Acanthis flammea', 'Butorides striata': 'Butorides striatus', 'Charadrius vociferous': 'Charadrius vociferus'}
Common Name Mapping: {'Acanthus flammea': 'Common Redpoll', 'Butorides striata': 'Green-Backed Heron', 'Charadrius vociferous': 'Killdeer'}


In [400]:
# Update the common_names
birds['common_names'] = birds.apply(
    lambda row: common_name_mapping.get(row['scientific_name'], row['common_names']), axis=1
)

# Update the scientific_name
birds['scientific_name'] = birds.apply(
    lambda row: scientific_name_mapping.get(row['scientific_name'], row['scientific_name']), axis=1
)

----
&nbsp;
#### Looking for typos/ambiguities under matching common names where `scientific_name` is of form: *Genus species*

In [401]:
# Initialize dictionary to store common names and their associated scientific names with counts
common_name_to_sci_names = {}

for common_name in two_word_sci_names['common_names'].unique():
    # Extract associated scientific names and their record counts
    associated_sci_names = two_word_sci_names[two_word_sci_names['common_names'] == common_name]['scientific_name']
    sci_name_counts = Counter(associated_sci_names)
    common_name_to_sci_names[common_name] = sci_name_counts

# Step 2: Analyze counts to identify potential typos or inconsistencies
potential_issues = {}
for common_name, sci_name_counts in common_name_to_sci_names.items():
    if len(sci_name_counts) > 1:  # If multiple scientific names are associated with the same common name
        potential_issues[common_name] = sci_name_counts

In [402]:
# Convert potential issues to a DataFrame
potential_issues_df = pd.DataFrame([
    {'common_names': common_name, 
     'scientific_name': sci_name, 
     'record_count': count
     } 
    for common_name, sci_name_counts in potential_issues.items()
    for sci_name, count in sci_name_counts.items()
])

print(f"Shape: {potential_issues_df.shape}")
potential_issues_df.head(10)

Shape: (265, 3)


Unnamed: 0,common_names,scientific_name,record_count
0,Common Redpoll,Acanthis flammea,3
1,Common Redpoll,Carduelis flammea,28
2,Spotted Sandpiper,Actitis macularia,25
3,Spotted Sandpiper,Actitis macularius,30
4,Spotted Sandpiper,Tringa macularia,1
5,Cassin's Sparrow,Aimophila cassinii,6
6,Cassin's Sparrow,Peucaea cassinii,3
7,Roseate Spoonbill,Ajaia ajaja,1
8,Roseate Spoonbill,Platalea ajaja,4
9,Le Conte's Sparrow,Ammodramus leconteii,11


#### We identify the affected indices in `birds` to create a backup

In [403]:
# Create a dictionary to hold indices and affected records for each common name
affected_records = {}
issues_common_names = set(potential_issues_df.common_names)

for common_name in issues_common_names:
    records = birds[(birds['common_names'] == common_name) & 
                    (birds['scientific_name'].str.split().apply(len) == 2)]
    indices = records.index.tolist()
    if indices:  
        affected_records[common_name] = records

# Combine all affected records into a single DataFrame for exporting
affected_records_df = pd.concat(affected_records.values())
print(f"Shape: {affected_records_df.shape}")
assert set(potential_issues_df.common_names) == set(affected_records_df.common_names), "The sets are not equal as required"
assert all(affected_records_df['scientific_name'].str.split().apply(len) == 2), "There are subspecies or incorrect scientific names present"
affected_records_df.head()

Shape: (3270, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
383,ACAD-1383,ACAD,Acadia,Piciformes,Picidae,Colaptes auratus,Northern Flicker,Approved,Present,Native,Common,Breeder,Least Concern,False
1954,ARCH-1245,ARCH,Arches,Piciformes,Picidae,Colaptes auratus,Northern Flicker,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
3041,BADL-1284,BADL,Badlands,Piciformes,Picidae,Calaptes auratus,Northern Flicker,Approved,Present,Native,Uncommon,Unknown,Least Concern,False
3042,BADL-1285,BADL,Badlands,Piciformes,Picidae,Colaptes auratus,Northern Flicker,Approved,Present,Native,Common,Breeder,Least Concern,False
4622,BIBE-1476,BIBE,Big Bend,Piciformes,Picidae,Colaptes auratus,Northern Flicker,Approved,Present,Native,Common,Breeder,Least Concern,False


In [404]:
# Save the affected records to a CSV file for backup before making changes
affected_records_df.to_csv('DATA/Backups/multi_sci_names_birds.csv', index=True)

In [405]:
# Create a mapping of common names to the selected scientific name based on highest count, and flag ties for manual review
selected_sci_names = {}
ties_for_review = {}

for common_name, group in potential_issues_df.groupby('common_names'):
    max_count = group['record_count'].max()
    max_count_names = group[group['record_count'] == max_count]
    
    if len(max_count_names) > 1:
        # Flag for manual review due to tie
        ties_for_review[common_name] = max_count_names[['scientific_name', 'record_count']].to_dict(orient='records')
        print(f"Tie detected for '{common_name}':\n{ties_for_review[common_name]}\n")
    else:
        chosen_name = max_count_names.iloc[0]['scientific_name']
        selected_sci_names[common_name] = chosen_name

Tie detected for 'Abert's Towhee':
[{'scientific_name': 'Melozone aberti', 'record_count': 2}, {'scientific_name': 'Pipilo aberti', 'record_count': 2}]

Tie detected for 'Black-Footed Albatross':
[{'scientific_name': 'Diomedea nigripes', 'record_count': 3}, {'scientific_name': 'Phoebastria nigripes', 'record_count': 3}]

Tie detected for 'Far Eastern Curlew':
[{'scientific_name': 'Numenius arquata', 'record_count': 1}, {'scientific_name': 'Numenius madagascariensis', 'record_count': 1}]

Tie detected for 'Gray-Headed Chickadee':
[{'scientific_name': 'Poecile cincta', 'record_count': 1}, {'scientific_name': 'Poecile cinctus', 'record_count': 1}]

Tie detected for 'Lawrence's Warbler':
[{'scientific_name': 'Vermivora lawrencei', 'record_count': 1}, {'scientific_name': 'Vermivora lawrencii', 'record_count': 1}]

Tie detected for 'Wandering Tattler':
[{'scientific_name': 'Heteroscelus incanus', 'record_count': 9}, {'scientific_name': 'Tringa incana', 'record_count': 9}]



In [406]:
# The above records, tied on count will be reviewed manually on wikipedia
manual_choices = {
    "Abert's Towhee": "Melozone aberti",
    "Black-Footed Albatross": "Phoebastria nigripes",
    "Far Eastern Curlew": "Numenius madagascariensis",
    "Gray-Headed Chickadee": "Poecile cinctus",
    "Lawrence's Warbler": "Vermivora lawrencei",  # Example choice
    "Wandering Tattler": "Tringa incana"
}

# Append manual choices to selected_sci_names
selected_sci_names.update(manual_choices)
print(f"Number of amendments: {len(selected_sci_names)}")

Number of amendments: 126


In [407]:
# Create a mapping of common names to selected scientific names
selected_sci_names_mapping = {common_name: sci_name for common_name, sci_name in selected_sci_names.items()}

# Update the birds DataFrame in place, only affecting scientific names with exactly two words
birds['scientific_name'] = birds.apply(
    lambda row: selected_sci_names_mapping.get(row['common_names'], row['scientific_name']) 
    if len(row['scientific_name'].split()) == 2 else row['scientific_name'], 
    axis=1
)

In [408]:
print(f"There are {birds.species_id.nunique()} records")
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 14451 records
There are 1059 unique common names
There are 1177 unique scientific names
There are 86 unique families


In [409]:
two_word_sci_names = birds_sorted[birds_sorted['scientific_name'].str.split().str.len() == 2]
sci_name_to_common_names = defaultdict(list)

for index, row in two_word_sci_names.iterrows():
    sci_name = row['scientific_name']
    common_name = row['common_names']
    sci_name_to_common_names[sci_name].append(common_name)

sci_name_to_common_names = {sci_name: list(set(common_names)) for sci_name, common_names in sci_name_to_common_names.items()}
print(f"Length of genus species sci names: {len(sci_name_to_common_names)}")

comma_separated_names = {
    sci_name: list(set(common_names)) 
    for sci_name, common_names in sci_name_to_common_names.items() 
    if any(',' in name for name in common_names)
}
print(f"Length comma separated common names: {len(comma_separated_names)}")

Length of genus species sci names: 961
Length comma separated common names: 36


#### Hawaiian names

In [410]:
# Check for Hawaiian birds based on the presence of an acute accent character in common names
special_char_birds = two_word_sci_names[two_word_sci_names['common_names'].str.contains('´', na=False)]

# Display the results
print(f"Shape: {special_char_birds.shape}")
special_char_birds.head()

Shape: (20, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
58544,HAVO-1016,HAVO,Hawaii Volcanoes,Accipitriformes,Accipitridae,Buteo solitarius,"´Io, Hawaiian Hawk",Approved,Present,Native,Uncommon,Breeder,Endangered,True
58600,HAVO-1072,HAVO,Hawaii Volcanoes,Passeriformes,Monarchidae,Chasiempis sandwichensis,Hawai´I ´Elepaio,Approved,Present,Native,Common,Breeder,Least Concern,False
58584,HAVO-1056,HAVO,Hawaii Volcanoes,Passeriformes,Fringillidae,Ciridops anna,´Ula-´Ai-Hawane,Approved,Not Present (Historical Report),Native,Unknown,Unknown,Least Concern,False
58577,HAVO-1049,HAVO,Hawaii Volcanoes,Passeriformes,Corvidae,Corvus hawaiiensis,´Alala,Approved,Not Present (Historical Report),Native,Unknown,Unknown,Endangered,True
58587,HAVO-1059,HAVO,Hawaii Volcanoes,Passeriformes,Fringillidae,Drepanis pacifica,Hawai´I Mamo,Approved,Not Present (Historical Report),Native,Unknown,Unknown,Least Concern,False


In [411]:
# Define correction mappings for common misformatted Hawaiian characters
correction_mappings = {
    '´': 'ʻ',  # Replace acute accent with okina
    '_': ' ',  # Replace underscores with spaces
    'L': 'l'   # Correct capital L if needed
}

def correct_hawaiian_names(name):
    for incorrect, correct in correction_mappings.items():
        name = name.replace(incorrect, correct)
    return name

# Apply corrections to the Hawaiian bird names
birds['common_names'] = birds.apply(
    lambda row: correct_hawaiian_names(row['common_names']) if row['scientific_name'] in special_char_birds['scientific_name'].values else row['common_names'], 
    axis=1
)

# Display the corrected DataFrame
corrected_hawaiian_birds = birds[birds['scientific_name'].isin(special_char_birds['scientific_name'])]
print(f"Shape: {corrected_hawaiian_birds.shape}")

Shape: (20, 14)


----
&nbsp;
### Reviewing subspecies of form *Genus species subspecies*

In [412]:
# DataFrame ordered by scientific name in ascending order
birds_sorted = birds.sort_values(by='scientific_name', ascending=True)

two_word_sci_names = birds_sorted[birds_sorted['scientific_name'].str.split().str.len() == 2].copy()
three_word_sci_names = birds_sorted[birds_sorted['scientific_name'].str.split().str.len() >= 3].copy()

In [413]:
# Group by genus and species to identify subspecies variations
three_word_sci_names.loc[:, 'genus_species'] = three_word_sci_names['scientific_name'].apply(lambda x: ' '.join(x.split()[:2]))

In [414]:
# Create a mapping of genus_species to common names from two_word_sci_names
genus_species_to_common_name = two_word_sci_names.set_index('scientific_name')['common_names'].to_dict()
three_word_sci_names.loc[:, 'matched_common_name'] = three_word_sci_names['genus_species'].map(genus_species_to_common_name)

In [415]:
# Exclude specific columns from three_word_sci_names 
columns_to_exclude = ['record_status', 'occurrence', 'nativeness', 'abundance', 'seasonality', 'conservation_status', 'is_protected'] 
three_word_sci_names_filtered = three_word_sci_names.drop(columns=columns_to_exclude, errors='ignore') 

print(f"Shape: {three_word_sci_names_filtered.shape}")
three_word_sci_names_filtered.head()

Shape: (526, 9)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,genus_species,matched_common_name
78390,MORA-1070,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Western Goshawk,Accipiter gentilis,Northern Goshawk
78391,MORA-1071,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Goshawk,Accipiter gentilis,Northern Goshawk
61880,HOSP-1054,HOSP,Hot Springs,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Accipiter striatus,Sharp-Shinned Hawk
39433,GRBA-1111,GRBA,Great Basin,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Accipiter striatus,Sharp-Shinned Hawk
42133,GRCA-1158,GRCA,Grand Canyon,Apodiformes,Apodidae,Aeronautes saxatalis saxatalis,White-Throated Swift,Aeronautes saxatalis,White-Throated Swift


In [416]:
# list of common names to exclude to inspection
common_names_to_exclude = ["Mountain Pygmy Owl, Northern Pygmy-Owl", 
                           "Baltimore Oriole, Northern Oriole", 
                           "Western Gull",
                           "Whimbrel",
                           "Lazuli Bunting",
                           "American Three-Toed Woodpecker",
                           "Snail Kite",
                           "Great Gray Owl",
                           "Spotted Owl",
                           "Prairie Chicken",
                           "Golden-Winged Warbler",
                           "Blue-Winged Warbler"
                           ]  

# Filter out rows where 'matched_common_name' is in the list of values to exclude
three_word_sci_names = three_word_sci_names[~three_word_sci_names['matched_common_name'].isin(common_names_to_exclude)]

columns_to_exclude = ['record_status', 'occurrence', 'nativeness', 'abundance', 'seasonality', 'conservation_status', 'is_protected'] 
three_word_sci_names = three_word_sci_names.drop(columns=columns_to_exclude, errors='ignore') 

In [417]:
# Filter out rows with NaN in 'matched_common_name'
three_word_sci_names_filtered = three_word_sci_names.dropna(subset=['matched_common_name'])

# Update the 'common_names' column to include subspecies information if not already present
def standardize_common_names(row):
    subspecies = row['scientific_name'].split()[-1]
    
    # Check if the 'common_names' already contains brackets
    if '(' not in row['common_names']:
        new_common_name = f"{row['matched_common_name']} ({subspecies} subspecies)"
        return new_common_name
    else:
        # If brackets already present, keep the original common name
        return row['common_names']

three_word_sci_names_filtered.loc[:, 'common_names'] = three_word_sci_names_filtered.apply(standardize_common_names, axis=1)
print(f"Shape after updating common names: {three_word_sci_names_filtered.shape}")
three_word_sci_names_filtered.head()

Shape after updating common names: (468, 9)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,genus_species,matched_common_name
78390,MORA-1070,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Northern Goshawk (atricapillus subspecies),Accipiter gentilis,Northern Goshawk
78391,MORA-1071,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Northern Goshawk (laingi subspecies),Accipiter gentilis,Northern Goshawk
61880,HOSP-1054,HOSP,Hot Springs,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Accipiter striatus,Sharp-Shinned Hawk
39433,GRBA-1111,GRBA,Great Basin,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Accipiter striatus,Sharp-Shinned Hawk
42133,GRCA-1158,GRCA,Grand Canyon,Apodiformes,Apodidae,Aeronautes saxatalis saxatalis,White-Throated Swift (saxatalis subspecies),Aeronautes saxatalis,White-Throated Swift


In [418]:
# We now update birds by indices filtered above
filtered_indices = three_word_sci_names_filtered.index
birds_filtered_subset = birds.loc[filtered_indices]
print(f"Shape of the subset from birds DataFrame: {birds_filtered_subset.shape}")
birds_filtered_subset.head()

Shape of the subset from birds DataFrame: (468, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
78390,MORA-1070,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Western Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
78391,MORA-1071,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Goshawk,Approved,Not Present (False Report),Native,Unknown,Unknown,Least Concern,False
61880,HOSP-1054,HOSP,Hot Springs,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Approved,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
39433,GRBA-1111,GRBA,Great Basin,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Approved,Present,Native,Unknown,Unknown,Least Concern,False
42133,GRCA-1158,GRCA,Grand Canyon,Apodiformes,Apodidae,Aeronautes saxatalis saxatalis,White-Throated Swift,Approved,Present,Native,Common,Breeder,Least Concern,False


In [419]:
# Save the subset to a CSV file for records
birds_filtered_subset.to_csv('DATA/Backups/subspecies_sci_name_birds.csv', index=False)

In [420]:
# Update the 'common_names' in the original birds DataFrame using the indices from three_word_sci_names_filtered
birds.loc[filtered_indices, 'common_names'] = three_word_sci_names_filtered['common_names']

In [421]:
# Checking change made
birds.loc[[78390]]

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
78390,MORA-1070,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Northern Goshawk (atricapillus subspecies),Approved,Present,Native,Uncommon,Breeder,Least Concern,False


In [422]:
print(f"There are {birds.species_id.nunique()} records")
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 14451 records
There are 1159 unique common names
There are 1177 unique scientific names
There are 86 unique families


#### Identify `common_names` with multiple `scientific_name`:

In [423]:
# Group by common name and count unique scientific names
common_name_sci_count = birds.groupby('common_names')['scientific_name'].nunique()

# Filter to find common names with more than one associated scientific name
multi_sci_common_names = common_name_sci_count[common_name_sci_count > 1]

# Display the results
print("Common names with multiple associated scientific names:")
print(multi_sci_common_names)

Common names with multiple associated scientific names:
common_names
American Three-Toed Woodpecker                  2
Black-Throated Gray Warbler                     2
Brewster's Warbler                              3
Dark-Eyed Junco (Oregon Race)                   2
Evening Grosbeak                                3
Great Gray Owl                                  2
House Finch                                     2
Lesser Goldfinch                                2
Mountain Chickadee (inyoensis subspecies)       2
Orange-Crowned Warbler (orestera subspecies)    2
Peregrine Falcon                                2
Pine Siskin                                     2
Purple Finch                                    2
Wilson's Warbler                                3
Yellow-Rumped Warbler                           2
Name: scientific_name, dtype: int64


In [424]:
multi_sci_common_names_list = multi_sci_common_names.index.tolist()
multi_sci_common_name_records = birds[birds['common_names'].isin(multi_sci_common_names_list)]

print(f"Shape: {multi_sci_common_name_records.shape}")
multi_sci_common_name_records.head()

Shape: (450, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
186,ACAD-1186,ACAD,Acadia,Falconiformes,Falconidae,Falco peregrinus,Peregrine Falcon,Approved,Present,Not Native,Uncommon,Breeder,Species of Concern,True
253,ACAD-1253,ACAD,Acadia,Passeriformes,Fringillidae,Carduelis pinus,Pine Siskin,Approved,Present,Native,Abundant,Breeder,Least Concern,False
255,ACAD-1255,ACAD,Acadia,Passeriformes,Fringillidae,Carpodacus mexicanus,House Finch,Approved,Present,Not Native,Common,Unknown,Least Concern,False
256,ACAD-1256,ACAD,Acadia,Passeriformes,Fringillidae,Carpodacus purpureus,Purple Finch,Approved,Present,Native,Abundant,Breeder,Least Concern,False
257,ACAD-1257,ACAD,Acadia,Passeriformes,Fringillidae,Coccothraustes vespertinus,Evening Grosbeak,Approved,Not Present (Historical Report),Native,Unknown,Unknown,Least Concern,False


In [425]:
common_name_to_sci_names_counts = {}

for common_name, group in multi_sci_common_name_records.groupby('common_names'):
    sci_names_counts = Counter(group['scientific_name'])
    common_name_to_sci_names_counts[common_name] = sci_names_counts

for common_name, sci_names_counts in common_name_to_sci_names_counts.items():
    print(f"Common Name: {common_name}")
    print("Associated Scientific Names and Counts:")
    for sci_name, count in sci_names_counts.items():
        print(f"  - {sci_name}: {count} records")
    print()

Common Name: American Three-Toed Woodpecker
Associated Scientific Names and Counts:
  - Picoides dorsalis: 12 records
  - Picoides dorsalis fasciatus: 2 records

Common Name: Black-Throated Gray Warbler
Associated Scientific Names and Counts:
  - Dendroica nigrescens: 32 records
  - Setophaga nigrescens halseii: 1 records

Common Name: Brewster's Warbler
Associated Scientific Names and Counts:
  - Vermivora pinus X chrysoptera: 2 records
  - Vermivora chrysoptera X pinus: 2 records
  - Vermivora leucobronchialis: 1 records

Common Name: Dark-Eyed Junco (Oregon Race)
Associated Scientific Names and Counts:
  - Junco hyemalis shufeldti: 4 records
  - Junco hyemalis thurberi: 1 records

Common Name: Evening Grosbeak
Associated Scientific Names and Counts:
  - Coccothraustes vespertinus: 44 records
  - Hesperiphona vespertina brooksi: 1 records
  - Hesperiphona vespertina montana: 1 records

Common Name: Great Gray Owl
Associated Scientific Names and Counts:
  - Strix nebulosa: 21 records


In [426]:
# Define the updates for common names with the subspecies information in parentheses
updates = {
    'Picoides dorsalis fasciatus': "American Three-Toed Woodpecker (fasciatus subspecies)",
    'Junco hyemalis shufeldti': "Dark-Eyed Junco (shufeldti subspecies)",
    'Junco hyemalis thurberi': "Dark-Eyed Junco (thurberi subspecies)",
    'Strix nebulosa nebulosa': "Great Gray Owl (nebulosa subspecies)",
    'Falco peregrinum anatum': "Peregrine Falcon (anatum subspecies)"
}

# Apply the updates to the birds DataFrame
for sci_name, new_common_name in updates.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = new_common_name

In [427]:
print(f"There are {birds.species_id.nunique()} records")
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 14451 records
There are 1162 unique common names
There are 1177 unique scientific names
There are 86 unique families


In [428]:
# Count the number of NaN values in each column
nan_counts_per_column = birds.isna().sum()
print(nan_counts_per_column)

species_id              0
park_code               0
park_name               0
order                   0
family                 16
scientific_name         0
common_names            0
record_status           0
occurrence              0
nativeness              0
abundance               0
seasonality             0
conservation_status     0
is_protected            0
dtype: int64


In [429]:
# Filter records with NaN in the 'family' column
nan_family_records = birds[birds['family'].isna()]
print(f"Shape: {nan_family_records.shape}")
nan_family_records.head()

Shape: (16, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
26362,DEVA-1277,DEVA,Death Valley,Apodiformes,,Stellula calliope,Calliope Hummingbird,Approved,Present,Native,Rare,Migratory,Least Concern,False
26381,DEVA-1296,DEVA,Death Valley,Charadriiformes,,Tryngites subruficollis,Buff-Breasted Sandpiper,Approved,Not Present (False Report),Native,Unknown,Unknown,Least Concern,False
26483,DEVA-1398,DEVA,Death Valley,Passeriformes,,Carpodacus mexicanus,House Finch,Approved,Present,Native,Abundant,Breeder,Least Concern,False
26484,DEVA-1399,DEVA,Death Valley,Passeriformes,,Carpodacus purpureus,Purple Finch,Approved,Present,Native,Rare,Vagrant,Least Concern,False
26485,DEVA-1400,DEVA,Death Valley,Passeriformes,,Calcarius mccownii,Mccown's Longspur,Approved,Present,Native,Occasional,Vagrant,Least Concern,False


In [430]:
# Create a mapping of ('order', 'scientific_name') to 'family' for known values
known_families = birds.dropna(subset=['family']).set_index(['order', 'scientific_name'])['family'].to_dict()

# Attempt to fill missing 'family' values by matching 'order' and 'scientific_name'
for index, row in nan_family_records.iterrows():
    order = row['order']
    scientific_name = row['scientific_name']
    family = known_families.get((order, scientific_name))
    if family:
        birds.at[index, 'family'] = family

In [431]:
assert birds['family'].isna().sum() == 0, f"NaN records present under 'family': {birds['family'].isna().sum()} records missing"

----
&nbsp;
## Isolating Birds of Prey
Isolating birds of prey: We create a list of birds of prey 'groups' to search under `common_name`

In [434]:
# Define the list of bird of prey keywords
birds_of_prey = ["Eagle", "Hawk", "Falcon", "Buzzard", "Harrier", "Kite", "Owl", "Osprey", 
                 "Vulture", "Condor", "Kestrel", 'Buteo', 'Accipiter', 'Caracara']

# Create a regex pattern from the list
pattern = '|'.join(birds_of_prey)

# Function to find and append only the matching keywords
def find_raptors(common_names):
    # Convert to string and handle NaN values
    if pd.isna(common_names):
        return ''
    # Find keywords that are present in the common_names
    matches = set()  # Use a set to avoid duplicates
    for keyword in birds_of_prey:
        if keyword in common_names:
            matches.add(keyword)
    return ', '.join(matches)

birds['raptor_group'] = birds['common_names'].apply(find_raptors)
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk


In [435]:
total_raptors_comm = birds['raptor_group'].loc[birds['raptor_group'] != ''].count()
print(f"Under common name: {total_raptors_comm} raptor records")
diff_com_types = birds.raptor_group.unique().tolist()
print(diff_com_types)

Under common name: 1341 raptor records
['Hawk', '', 'Eagle', 'Harrier', 'Vulture', 'Osprey', 'Falcon', 'Kestrel', 'Owl', 'Hawk, Owl', 'Condor', 'Kite', 'Caracara']


There's some ambiguity in the above list so we cross-reference `family`

#### Birds of Prey Scientific Families and Genera

According to OpenAI's language model ChatGPT 4o (2024):

- Accipitridae (Hawks, Eagles, and relatives)
- Falconidae (Falcons)
- Harpagiidae (Harriers)
- Pandionidae (Ospreys)
- Accipitridae (Kites)
- Cathartidae (New World Vultures)
- Buteo (Buzzards and Buteos)
- Accipiter (Goshawks and Accipiters)
- Tytonidae (Barn Owls)
- Strigidae (Typical Owls)

*Caveat emptor*: This list may not be comprehensive.

In [284]:
# Define the list of bird of prey scientific families and genera
birds_of_prey_sci = [
    "Accipitridae", "Falconidae", "Harpagiidae", 
    "Pandionidae", "Cathartidae", "Buteo", "Accipiter", 
    "Tytonidae", "Strigidae"
]

# Create a regex pattern from the list
pattern = '|'.join(birds_of_prey_sci)
birds['raptor_sci_fam'] = birds['family'].str.findall(f'({pattern})')
birds['raptor_sci_fam'] = birds['raptor_sci_fam'].apply(lambda x: ', '.join(x) if isinstance(x, list) and x else '')
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk,Accipitridae
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,,Accipitridae
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk,Accipitridae
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle,Accipitridae
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk,Accipitridae


In [285]:
total_raptors_sci = birds['raptor_sci_fam'].loc[birds['raptor_sci_fam'] != ''].count()
print(f"Under scientific family: {total_raptors_sci} raptor records")
diff_sci_types = birds.raptor_sci_fam.unique().tolist()
print(diff_sci_types)

Under scientific family: 1470 raptor records
['Accipitridae', 'Cathartidae', 'Pandionidae', '', 'Falconidae', 'Strigidae', 'Tytonidae']


In [286]:
raptors_df = (birds[(birds.raptor_group != '') | (birds.raptor_sci_fam != '')])
print(raptors_df.shape)

(1470, 16)


In [290]:
raptors_df = raptors_df.copy()
mask = raptors_df['raptor_group'] == ''
raptors_df.loc[mask, 'ambiguous'] = True
result = raptors_df[raptors_df['ambiguous'] == True]
print(f"Shape: {result.shape}")
print(f"Unique common names:\n{result.common_names.unique().tolist()}")

Shape: (129, 17)
Unique common names:
['Northern Goshawk', 'Merlin', 'Gyrfalcon', 'Merlin (Coastal Forest Subspecies)', 'Merlin (richardsonii subspecies)', 'Merlin (Tundra Subspecies)', 'Northern Goshawk (atricapillus subspecies)', 'Northern Goshawk (laingi subspecies)', 'Merlin (bendirei subspecies)', 'Merlin (richardsoni subspecies)']


*Accipiter gentilis* is `Northern Goshawk`, a type of `Hawk`

*Falco columbarius* is `Merlin`, a type of `Falcon`

In [292]:
raptors_df = raptors_df.copy()
merlin_gyrfalcon_mask = raptors_df['common_names'].str.contains(r'Merlin|Gyrfalcon', case=False, regex=True)
northern_goshawk_mask = raptors_df['common_names'].str.contains(r'Northern Goshawk', case=False, regex=True)

raptors_df.loc[merlin_gyrfalcon_mask, 'raptor_common'] = 'Falcon'
raptors_df.loc[northern_goshawk_mask, 'raptor_common'] = 'Hawk'

# Verify the changes
result = raptors_df[raptors_df['ambiguous'] == True]
print(f"Updated shape: {result.shape}")
print(f"Updated 'raptor_common' values:\n{result[['common_names', 'raptor_common']].drop_duplicates()}")

Updated shape: (129, 18)
Updated 'raptor_common' values:
                                     common_names raptor_common
56                               Northern Goshawk          Hawk
185                                        Merlin        Falcon
187                                     Gyrfalcon        Falcon
19228          Merlin (Coastal Forest Subspecies)        Falcon
26456            Merlin (richardsonii subspecies)        Falcon
42199                  Merlin (Tundra Subspecies)        Falcon
78390  Northern Goshawk (atricapillus subspecies)          Hawk
78391        Northern Goshawk (laingi subspecies)          Hawk
87990                Merlin (bendirei subspecies)        Falcon
87992             Merlin (richardsoni subspecies)        Falcon


In [438]:
# Extract indices from the result DataFrame
updated_indices = result.index
birds.loc[updated_indices, 'raptor_group'] = result['raptor_common']
assert not birds.loc[updated_indices, 'raptor_group'].isna().any(), "NaN values present in 'raptor_common' for the specified indices"

In [439]:
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,Hawk
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk


In [441]:
# Filter the DataFrame to find records where 'raptor_common' matches 'Hawk, Owl'
hawk_owl_records = birds[birds['raptor_group'] == 'Hawk, Owl']
print(f"Number of records matching 'Hawk, Owl': {hawk_owl_records.shape[0]}")
print(hawk_owl_records.common_names.unique().tolist())

Number of records matching 'Hawk, Owl': 12
['Northern Hawk Owl']


In [447]:
hawk_owl_indices = birds[birds['raptor_group'] == 'Hawk, Owl'].index
birds.loc[hawk_owl_indices, 'raptor_group'] = "Owl"
assert birds[birds['raptor_group'] == 'Hawk, Owl'].empty, "There are records with 'raptor_group' set to 'Hawk, Owl'."

In [448]:
# Change the empty string in 'raptor_group' to "None"
birds['raptor_group'] = birds['raptor_group'].replace('', 'None')
print(birds.raptor_group.unique().tolist())

['Hawk', 'Eagle', 'Harrier', 'Vulture', 'Osprey', 'None', 'Falcon', 'Kestrel', 'Owl', 'Condor', 'Kite', 'Caracara']


In [450]:
# Create a new boolean column 'is_raptor' based on 'raptor_group'
birds['is_raptor'] = birds['raptor_group'].apply(lambda x: x != 'None')
assert birds['is_raptor'].dtype == bool, "is_raptor is not a boolean column"

In [452]:
# Reorder the columns to place 'raptor_group' next to 'common_names'
columns_order = birds.columns.tolist()  # Get the current order of columns
common_names_index = columns_order.index('common_names')
columns_order.insert(common_names_index + 1, columns_order.pop(columns_order.index('raptor_group')))
birds = birds[columns_order] 
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,raptor_group,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,is_raptor
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,True
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,True
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,True
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,True
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,True


In [454]:
print(f"total raptors: {birds[birds['is_raptor'] == True].shape[0]}")

total raptors: 1470


In [458]:
# Export the 'birds' DataFrame to a CSV file
birds.to_csv('DATA/birds.csv', index=False)
birds.to_pickle('DATA/birds.pkl')

----