# North American Birds
Using `species.csv`

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from fuzzywuzzy import process, fuzz
from itertools import islice

from Functions import process_scientific_names, standardize_common_names, standardize_common_names_subspecies

----
&nbsp;
## Exploratory Analysis

In [2]:
df = pd.read_csv('DATA/species.csv', low_memory=False)
print(f"Shape: {df.shape}")
df.head()

Shape: (119248, 14)


Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


In [3]:
print(f"Type of data:\n\n{df.dtypes}\n")

Type of data:

Species ID             object
Park Name              object
Category               object
Order                  object
Family                 object
Scientific Name        object
Common Names           object
Record Status          object
Occurrence             object
Nativeness             object
Abundance              object
Seasonality            object
Conservation Status    object
Unnamed: 13            object
dtype: object



In [4]:
print(f"Unique values:\n\n{df.nunique()}")

Unique values:

Species ID             119248
Park Name                  56
Category                   14
Order                     554
Family                   2332
Scientific Name         46022
Common Names            35825
Record Status              53
Occurrence                  7
Nativeness                  5
Abundance                   8
Seasonality                24
Conservation Status        11
Unnamed: 13                 3
dtype: int64


In [5]:
df.columns = [col.lower().replace(" ", "_") for col in df.columns]
df.rename(columns={'unnamed:_13': 'unnamed'}, inplace=True)
print(f"Columns: {df.columns.tolist()}\n")

Columns: ['species_id', 'park_name', 'category', 'order', 'family', 'scientific_name', 'common_names', 'record_status', 'occurrence', 'nativeness', 'abundance', 'seasonality', 'conservation_status', 'unnamed']



In [6]:
print(df['unnamed'].unique())

[nan 'Endangered' 'Threatened' 'Species of Concern']


In [7]:
# Filter rows where 'Unnamed: 13' is not NaN
filtered_df = df[df['unnamed'].notna()]
filtered_df

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
6441,BISC-1026,Biscayne National Park,Mammal,Sirenia,Trichechidae,Trichechus manatus,Manatee,Manati,Approved,Present,Unknown,Unknown,,Endangered
31786,EVER-1414,Everglades National Park,Reptile,Crocodilia,Crocodylidae,Crocodylus acutus,American Crocodile,Cocodrilo De Tumbes,Approved,Present,Native,Uncommon,Resident,Threatened
31826,EVER-1454,Everglades National Park,Reptile,Testudines,Cheloniidae,Caretta caretta,Loggerhead,Cabezon,Approved,Present,Native,Rare,Breeder,Threatened
44733,GRSA-1136,Great Sand Dunes National Park and Preserve,Bird,Falconiformes,Falconidae,Falco columbarius,Merlin,Pigeon Hawk,Approved,Present,Native,Rare,Resident,Species of Concern
44944,GRSA-1347,Great Sand Dunes National Park and Preserve,Vascular Plant,Asparagales,Iridaceae,Iris missouriensis,Blue Flag,Wild Iris,Approved,Present,Native,Rare,,Species of Concern


In [8]:
print(df['conservation_status'].unique())

[nan 'Species of Concern' 'Endangered' 'In Recovery' 'Threatened'
 'Under Review' 'Proposed Threatened' 'Extinct' 'Proposed Endangered'
 'Resident' 'Breeder' 'Migratory']


#### *Discrepancies between `conservation_status` and `unnamed` need to be assessed later
It seems as if `unnamed` can be written to `conservation_status`

In [9]:
# Remove all text beyond " National Park" including variations like " and Preserve"
df['park_name'] = df['park_name'].str.replace(r' National Park.*', '', case=False, regex=True)

In [10]:
# Is the species-id unique?
condor_df = df[df['common_names'].str.contains('condor', case=False, na=False)]
condor_df

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
1779,ARCH-1070,Arches,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,
9334,BRCA-1087,Bryce Canyon,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,
10620,CANY-1087,Canyonlands,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
11841,CARE-1085,Capitol Reef,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
26319,DEVA-1234,Death Valley,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (False Report),Native,,,Endangered,
39444,GRBA-1122,Great Basin,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (Historical Report),Native,,,Endangered,
42096,GRCA-1121,Grand Canyon,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Common,Breeder,Endangered,
65258,JOTR-1085,Joshua Tree,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
72896,LAVO-1115,Lassen Volcanic,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (Historical Report),Native,,,Endangered,
77178,MEVE-1101,Mesa Verde,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,


`species_id` numeric is not unique for similar species but it probably should be

In [11]:
# Splitting species_id column at the hyphen to create a new `park_code` public key
df['park_code'] = df['species_id'].str.split('-').str[0]

# Reorder columns to place 'park_code' as the second column
cols = list(df.columns)  # Get the list of current columns
cols.insert(1, cols.pop(cols.index('park_code')))  # Move 'park_code' to the second position
df = df[cols]  # Reorder DataFrame

In [12]:
df.head()

Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
0,ACAD-1000,ACAD,Acadia,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,ACAD,Acadia,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,ACAD,Acadia,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,ACAD,Acadia,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,ACAD,Acadia,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


-----
&nbsp;
## Isolating Birds

In [13]:
birds = df[df.category == 'Bird']
print(f"Shape: {birds.shape}")
birds.head()

Shape: (14601, 15)


Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
55,ACAD-1055,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,,Species of Concern,
56,ACAD-1056,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,,
57,ACAD-1057,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,
58,ACAD-1058,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,
59,ACAD-1059,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,,


In [14]:
print(f"Unique values:\n\n{birds.nunique()}")

Unique values:

species_id             14601
park_code                 56
park_name                 56
category                   1
order                     24
family                    86
scientific_name         1436
common_names            1550
record_status             10
occurrence                 6
nativeness                 4
abundance                  7
seasonality               23
conservation_status        8
unnamed                    1
dtype: int64


In [15]:
print(birds['conservation_status'].unique())

['Species of Concern' nan 'In Recovery' 'Threatened' 'Endangered'
 'Under Review' 'Breeder' 'Resident' 'Proposed Endangered']


In [16]:
print(f"NaN count: {birds.conservation_status.isna().sum()}\n")
birds.groupby("conservation_status").size()

NaN count: 11970



conservation_status
Breeder                   4
Endangered               64
In Recovery              65
Proposed Endangered       2
Resident                  4
Species of Concern     2371
Threatened               51
Under Review             70
dtype: int64

### `conservation_status`

- `Endangered`: seriously at risk of extinction
- `In Recovery`: formerly `Endangered`, but currently not in danger of extinction throughout all or a significant portion of its range
- `Threatened`: vulnerable to endangerment in the near future
- `Species of Concern`: declining or appear to be in need of conservation
- `Under Review`: ?
- `Resident`: ?
- `Breeder`: ?

In [17]:
# Does resident and breeder come under `seasonality`?
print(birds.seasonality.unique())

[nan 'Breeder' 'Vagrant' 'Resident' 'Migratory' 'Breeder, Migratory'
 'Breeder, Summer' 'Migratory, Vagrant' 'Migratory, Winter'
 'Breeder, Winter' 'Winter' 'Resident, Winter' 'Summer, Vagrant'
 'Breeder, Resident, Summer' 'Resident, Summer' 'Summer'
 'Breeder, Resident, Migratory, Summer'
 'Breeder, Resident, Summer, Winter' 'Migratory, Summer' 'Winter, Vagrant'
 'Rare' 'Uncommon' 'Common' 'Breeder, Resident']


#### `Resident` and `Breeder` belong in `seasonality`. 
Seems wrong as a `conservation_status` and could be the result of Human Error. Investigating the unique values in the other columns

In [18]:
keywords = ['Breeder', 'Resident']
pattern = '|'.join(keywords)  

# Filter the DataFrame for rows where column contains any of the keywords
matching_df = birds[birds['conservation_status'].str.contains(pattern, case=False, na=False)]
matching_df

Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
44666,GRSA-1069,GRSA,Great Sand Dunes,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,Goshawk,Northern Goshawk,Approved,Present,Native,Rare,Breeder,
44678,GRSA-1081,GRSA,Great Sand Dunes,Bird,Anseriformes,Anatidae,Anas acuta,Pintail,Northern Pintail,Approved,Present,Native,Rare,Resident,
44705,GRSA-1108,GRSA,Great Sand Dunes,Bird,Caprimulgiformes,Caprimulgidae,Phalaenoptilus nuttallii,Poor-Will,Common Poorwill,Approved,Present,Native,Uncommon,Breeder,
44733,GRSA-1136,GRSA,Great Sand Dunes,Bird,Falconiformes,Falconidae,Falco columbarius,Merlin,Pigeon Hawk,Approved,Present,Native,Rare,Resident,Species of Concern
44744,GRSA-1147,GRSA,Great Sand Dunes,Bird,Passeriformes,Aegithalidae,Psaltriparus minimus,Common Bushtit,Bushtit,Approved,Present,Native,Uncommon,Breeder,
44759,GRSA-1162,GRSA,Great Sand Dunes,Bird,Passeriformes,Corvidae,Corvus brachyrhynchos,Common Crow,American Crow,Approved,Present,Native,Rare,Resident,
44818,GRSA-1221,GRSA,Great Sand Dunes,Bird,Passeriformes,Mimidae,Dumetella carolinensis,Gray Catbird,Catbird,Approved,Present,Native,Rare,Resident,
44859,GRSA-1262,GRSA,Great Sand Dunes,Bird,Passeriformes,Turdidae,Turdus migratorius,American Robin,Robin,Approved,Present,Native,Common,Breeder,


In [19]:
print(birds.abundance.unique())

['Uncommon' 'Common' 'Occasional' nan 'Rare' 'Unknown' 'Abundant' 'Native']


In [20]:
# Count unique values where 'abundance' is 'Native'
unique_native_abundance = birds[birds['abundance'] == 'Native']['species_id'].nunique()
print(f"Unique records where abundance is 'Native': {unique_native_abundance}")

Unique records where abundance is 'Native': 8


In [21]:
print(birds.nativeness.unique())

['Native' 'Unknown' 'Not Native' nan 'Present']


In [22]:
# Count unique values where 'nativeness' is 'Present'
unique_present_nativeness = birds[birds['nativeness'] == 'Present']['species_id'].nunique()
print(f"Unique records where nativeness is 'Present': {unique_present_nativeness}")

Unique records where nativeness is 'Present': 8


In [23]:
print(birds.record_status.unique())

['Approved' 'In Review' ' Northern Goshawk' ' Northern Pintail'
 ' Common Poorwill' ' Pigeon Hawk' ' Bushtit' ' American Crow' ' Catbird'
 ' Robin']


#### Discrepancies in these records seem to be the result of human error
We need to remove the common_name values in `record_status` and shift all columns to the right, one column left

In [24]:
# indices that need correction are based on 'matching_df'
indices_to_shift = matching_df.index

# Define the column start position for shifting
start_pos = matching_df.columns.get_loc('record_status')

# Create a shifted version of the relevant subset of matching_df
# Shifting is done by slicing the DataFrame to remove the start position column, and appending NaN at the end
shifted_df = matching_df.loc[indices_to_shift].apply(
    lambda row: pd.Series(np.append(row[start_pos + 1:].values, pd.NA), index=row[start_pos:].index), axis=1
)

# Combine the unchanged part of the rows with the shifted part
matching_df.loc[indices_to_shift, matching_df.columns[start_pos:]] = shifted_df

# Apply the same changes to the parent and child DataFrames using indices
df.loc[indices_to_shift] = matching_df.loc[indices_to_shift]
birds.loc[indices_to_shift] = matching_df.loc[indices_to_shift]

In [25]:
columns = ['conservation_status', 'abundance', 'nativeness', 'record_status', 'occurrence', 'unnamed']
for column in columns:
    print(f"{column.title()}:\n{birds[column].unique()}")

Conservation_Status:
['Species of Concern' nan 'In Recovery' 'Threatened' 'Endangered'
 'Under Review' 'Proposed Endangered']
Abundance:
['Uncommon' 'Common' 'Occasional' nan 'Rare' 'Unknown' 'Abundant']
Nativeness:
['Native' 'Unknown' 'Not Native' nan]
Record_Status:
['Approved' 'In Review']
Occurrence:
['Present' 'Not Confirmed' 'Not Present (Historical Report)'
 'Not Present (False Report)' nan 'Not Present']
Unnamed:
[nan <NA>]


In [26]:
# Dropping unnamed
birds = birds.drop(columns=['unnamed'])

#### Dealing with `seasonality` column

In [27]:
print(birds.seasonality.unique())

[nan 'Breeder' 'Vagrant' 'Resident' 'Migratory' 'Breeder, Migratory'
 'Breeder, Summer' 'Migratory, Vagrant' 'Migratory, Winter'
 'Breeder, Winter' 'Winter' 'Resident, Winter' 'Summer, Vagrant'
 'Breeder, Resident, Summer' 'Resident, Summer' 'Summer'
 'Breeder, Resident, Migratory, Summer'
 'Breeder, Resident, Summer, Winter' 'Migratory, Summer' 'Winter, Vagrant'
 'Breeder, Resident']


In [28]:
print(f"NaN count: {birds.seasonality.isna().sum()}\n")
birds.groupby("seasonality").size()

NaN count: 3502



seasonality
Breeder                                 5005
Breeder, Migratory                         4
Breeder, Resident                          9
Breeder, Resident, Migratory, Summer       1
Breeder, Resident, Summer                  3
Breeder, Resident, Summer, Winter          1
Breeder, Summer                            2
Breeder, Winter                           64
Migratory                               2726
Migratory, Summer                          2
Migratory, Vagrant                         4
Migratory, Winter                         23
Resident                                1632
Resident, Summer                           2
Resident, Winter                           9
Summer                                    25
Summer, Vagrant                            3
Vagrant                                 1563
Winter                                    20
Winter, Vagrant                            1
dtype: int64

There's not enough records to warrant this variation. Change to `Breeder`, `Migratory`, `Unknown`

In [29]:
# Fill NaN values with 'Unknown'
birds['seasonality'] = birds['seasonality'].fillna('Unknown')

# Define keywords in the order of priority
priority_keywords = ['Winter', 'Summer', 'Breeder', 'Migratory', 'Resident', 'Vagrant']

def simplify_seasonality(value):
    for keyword in priority_keywords:
        if keyword in value:
            return keyword
    return 'Unknown'

# Apply the classification function to simplify seasonality to one word
birds['seasonality'] = birds['seasonality'].apply(simplify_seasonality)

# Display the updated counts for verification
seasonality_counts = birds['seasonality'].value_counts()
print(seasonality_counts)

seasonality
Breeder      5018
Unknown      3502
Migratory    2730
Resident     1632
Vagrant      1563
Winter        118
Summer         38
Name: count, dtype: int64


----
&nbsp;
### Dealing with `NaN` values and conversion to categorical columns

#### `conservation_status` as per [IUCN](https://en.wikipedia.org/wiki/IUCN_Red_List) Red List
We fill nan with `Least Concern` meaning 'non-protected' where all other values mean 'protected'

In [30]:
# Define the fill values for each column
fill_values = {
    'conservation_status': 'Least Concern',
    'abundance': 'Unknown',
    'nativeness': 'Unknown',
    'occurrence': 'Not Confirmed'
}

# Fill NaN values in the specified columns
birds['conservation_status'] = birds['conservation_status'].fillna(fill_values['conservation_status'])
birds['abundance'] = birds['abundance'].fillna(fill_values['abundance'])
birds['nativeness'] = birds['nativeness'].fillna(fill_values['nativeness'])
birds['occurrence'] = birds['occurrence'].fillna(fill_values['occurrence'])

In [31]:
# Define the ordered categories for each column
conservation_status_order = ['Least Concern', 'Species of Concern', 'In Recovery', 'Under Review', 'Threatened', 'Proposed Endangered', 'Endangered']
abundance_order = ['Rare', 'Uncommon', 'Unknown', 'Occasional', 'Common', 'Abundant']
nativeness_order = ['Not Native', 'Unknown', 'Native']
record_status_order = ['In Review', 'Approved']
occurrence_order = ['Not Present (False Report)', 'Not Present (Historical Report)', 'Not Present', 'Not Confirmed', 'Present']

In [32]:
# Covert columns to categorical
birds['record_status'] = pd.Categorical(birds['record_status'], categories=record_status_order, ordered=True)
birds['occurrence'] = pd.Categorical(birds['occurrence'], categories=occurrence_order, ordered=True)
birds['nativeness'] = pd.Categorical(birds['nativeness'], categories=nativeness_order, ordered=True)
birds['abundance'] = pd.Categorical(birds['abundance'], categories=abundance_order, ordered=True)
birds['conservation_status'] = pd.Categorical(birds['conservation_status'], categories=conservation_status_order, ordered=True)

# add boolean column 'is_protected'
birds['is_protected'] = birds.conservation_status != 'Least Concern'

# Dropping category as all birds
birds = birds.drop(columns=['category'])

In [33]:
# Display the DataFrame to confirm the changes
print(birds.dtypes)  # Check that columns are categorical
birds.head()  # Check the first few rows of the DataFrame

species_id               object
park_code                object
park_name                object
order                    object
family                   object
scientific_name          object
common_names             object
record_status          category
occurrence             category
nativeness             category
abundance              category
seasonality              object
conservation_status    category
is_protected               bool
dtype: object


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,Least Concern,False
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False


----
&nbsp;
## Classifying Species Records


In [34]:
print(f"There are {birds.species_id.nunique()} records")
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 14601 records
There are 1550 unique common names
There are 1436 unique scientific names
There are 86 unique families


In [35]:
# This dataset is well formatted with zero lower case values
lowercase_count = (birds.map(lambda x: isinstance(x, str) and x.islower())).sum().sum()
print(lowercase_count)

0


In [36]:
# Searching for punctuation in scientific name
punctuation_pattern = r"[^\w\s,]"
punctuation_matches = birds[birds['scientific_name'].str.contains(punctuation_pattern, na=False)]

# Display the results
print(f"Found {len(punctuation_matches)} scientific names with punctuation:")
punctuation_matches[['scientific_name', 'common_names']]

Found 1 scientific names with punctuation:


Unnamed: 0,scientific_name,common_names
30773,Tyrannus melancholicus/couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati..."


In [37]:
def strip_punctuation(text):
    return re.sub(punctuation_pattern, ' ', text)

# Apply the function to the 'scientific_name' column
birds['scientific_name'] = birds['scientific_name'].apply(strip_punctuation)

In [38]:
print(f"Scientific name NaN count: {birds.scientific_name.isna().sum()}")
print(f"Common Name NaN count: {birds.common_names.isna().sum()}")

Scientific name NaN count: 0
Common Name NaN count: 280


We aim to:

Extract Unique Scientific Names into two groups:

    - Single Names: Scientific names listing only Genus
    - Standard Names: Scientific names with two words (genus and species).
    - Extended Names: Scientific names with more than two words (includes subspecies).

Attach Common Name Counts to Each Scientific Name:

    - Create a dictionary that maps each scientific name to the counts of its associated common names.

Determine an ‘Ultimate Common Name’:

    - Based on the frequency of common names and possible ambiguities, establish a preferred common name for each scientific name.

In [39]:
sci_name_set = set(birds.scientific_name)

# Separate scientific names into standard and extended based on word count
single_sci_names = {name for name in sci_name_set if len(name.split()) == 1}
standard_sci_names = {name for name in sci_name_set if len(name.split()) == 2}
extended_sci_names = {name for name in sci_name_set if len(name.split()) > 2}

print("Single Scientific Name Count (1 word):", len(single_sci_names))
print("Standard Scientific Names Count (2 words):", len(standard_sci_names))
print("Extended Scientific Names Count (> 2 words):", len(extended_sci_names))

Single Scientific Name Count (1 word): 64
Standard Scientific Names Count (2 words): 998
Extended Scientific Names Count (> 2 words): 374


----
&nbsp;
### Single `scientific_name`

In [40]:
results = process_scientific_names(birds, condition=1)

Single Scientific Name Count (1 word): 64
Scientific names with no associated common names: 7
Scientific names with multiple associated common names: 9
Scientific names with a single associated common name: 48


In [41]:
# Filter the birds DataFrame to get records with a single scientific name
single_sci_name_records = birds[birds['scientific_name'].isin(single_sci_names)]

print(f"Shape: {single_sci_name_records.shape}")
print(single_sci_name_records['record_status'].value_counts())

single_sci_name_records_common = single_sci_name_records.common_names.tolist()
print(f"Assoc common names: {single_sci_name_records_common}")
single_sci_name_records.head()

Shape: (100, 14)
record_status
In Review    99
Approved      1
Name: count, dtype: int64
Assoc common names: ['Nighthawks', 'Meadowlarks', 'Warbler', 'Starling', 'Woodpecker', 'Coots, Rails, Waterhens', 'Empidonax Sp.', 'Eagle', 'Hummingbird', 'Falconiforms, Falcons', 'Barn Owl', nan, 'Bird Hawks', 'Dabbling Ducks', 'Diving Ducks', 'Rufous Hummingbirds', 'Nighthawks', "Gulls, Ivory Gulls, Kittiwakes, Ross' Gulls, Sabine's Gulls", 'Stints', 'Dowitchers', 'Coots', 'Scrub Jays', 'Crows', 'Rough-Winged Swallow', 'Cowbirds', 'Mockingbirds', 'Pipits', 'Phainopeplas', 'Empidonax Flycatchers', 'Kingbirds', 'Diving Ducks', 'Goldeneyes', 'Scoters', 'Greater Mergansers', "Gulls, Ivory Gulls, Kittiwakes, Ross' Gulls, Sabine's Gulls", 'Western Grebe', 'Jaegers', 'Saw-Whet Owls', 'Barn-Owls', 'Dowitcher', 'Rufous Hummingbirds', nan, nan, 'Diving Ducks', 'Goldeneyes', 'Swans', 'Alcids, Auks, Gulls, Oystercatchers, Plovers, Shore Birds', 'Dowitcher', 'Jaegers', 'Falconiforms, Falcons', 'Ptarmigans', '

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
4301,BIBE-1155,BIBE,Big Bend,Caprimulgiformes,Caprimulgidae,Chordeiles,Nighthawks,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
4459,BIBE-1313,BIBE,Big Bend,Passeriformes,Icteridae,Sturnella,Meadowlarks,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
6579,BISC-1164,BISC,Biscayne,Passeriformes,Parulidae,Dendroica,Warbler,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
6615,BISC-1200,BISC,Biscayne,Passeriformes,Sturnidae,Sturnus,Starling,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
6660,BISC-1245,BISC,Biscayne,Piciformes,Picidae,Picoides,Woodpecker,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


It seems reasonable to drop the single name records for several reasons:

1. **Generic Common Names:** The common names listed are very generic and often refer to groups or types rather than specific species, which can lead to ambiguity and confusion.

2. **Review Status:** The fact that all but one of these records are marked as “In Review” suggests that these records are not finalized and are potentially under investigation or pending confirmation.

3. **Data Quality and Relevance:** The goal is to create a well-defined OLAP database with precise species information.


In [42]:
# Save the single scientific name records to a CSV for backup
single_sci_name_records.to_csv('DATA/Backups/single_sci_name_birds.csv', index=True)

# Drop the records with single scientific names from the birds DataFrame in place
birds.drop(birds[birds['scientific_name'].isin(single_sci_names)].index, inplace=True)

In [43]:
sci_name_set = set(birds.scientific_name)
single_sci_names = {name for name in sci_name_set if len(name.split()) == 1}
assert single_sci_names == set(), "single_sci_names is not an empty set as required"

----
&nbsp;
### Standard `scientific_name`

In [44]:
results2 = process_scientific_names(birds, condition=2)
no_common_names = results2['no_common_names']
multiple_common_names = results2['multiple_common_names']
single_common_names = results2['single_common_names']

Standard Scientific Names Count (2 words): 998
Scientific names with no associated common names: 36
Scientific names with multiple associated common names: 371
Scientific names with a single associated common name: 591


In [45]:
# Extract all records from birds where scientific_name is in no_common_names
no_common_names_records = birds[birds['scientific_name'].isin(no_common_names)]
print(f"Shape: {no_common_names_records.shape}")
print(no_common_names_records['record_status'].value_counts())
no_common_names_records.head()

Shape: (41, 14)
record_status
In Review    38
Approved      3
Name: count, dtype: int64


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
26320,DEVA-1235,DEVA,Death Valley,Accipitriformes,Cathartidae,Pseudogryphus californianus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26367,DEVA-1282,DEVA,Death Valley,Apodiformes,Apodidae,Nephoecetes niger,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26472,DEVA-1387,DEVA,Death Valley,Gaviiformes,Gaviidae,Urinator imber,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26473,DEVA-1388,DEVA,Death Valley,Gaviiformes,Gaviidae,Urinator lumme,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26474,DEVA-1389,DEVA,Death Valley,Gaviiformes,Gaviidae,Urinator pacificus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


In [46]:
# Save the no_common_names_records to a CSV for backup
no_common_names_records.to_csv('DATA/Backups/no_common_names_birds.csv', index=True)

In [47]:
# Filter records with 'record_status' == 'Approved'
approved_records = no_common_names_records[no_common_names_records['record_status'] == 'Approved']
print(approved_records.scientific_name.unique())
approved_records

['Eromophila alpestris' 'Glaucidium californicum' 'Geothlypis tolomiei']


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
37569,GLBA-1204,GLBA,Glacier Bay,Passeriformes,Alaudidae,Eromophila alpestris,,Approved,Present,Native,Rare,Migratory,Least Concern,False
86453,PINN-1226,PINN,Pinnacles,Strigiformes,Strigidae,Glaucidium californicum,,Approved,Not Confirmed,Native,Unknown,Unknown,Least Concern,False
111476,YELL-1078,YELL,Yellowstone,Passeriformes,Parulidae,Geothlypis tolomiei,,Approved,Present,Native,Uncommon,Breeder,Least Concern,False


In [48]:
# Extract genera of approved records
approved_genera = approved_records['scientific_name'].apply(lambda x: x.split()[0]).unique()

# Create separate DataFrames for each genus using a dictionary comprehension
genus_dataframes = {genus: birds[birds['scientific_name'].str.startswith(genus)] for genus in approved_genera}

# Accessing each DataFrame by genus name
glaucidium_df = genus_dataframes.get('Glaucidium')
geothlypis_df = genus_dataframes.get('Geothlypis')
eromophila_df = genus_dataframes.get('Eromophila')

- *Geothlypis tolomiei* not being assigned a `common_name` seems to be the result of a typo
- *Glaucidium californicum* could be the result of ambiguous naming of the [`Northern/Mountain Pygmy Owl`](https://en.wikipedia.org/wiki/Northern_pygmy_owl)

In [49]:
geothlypis_df = geothlypis_df[geothlypis_df['scientific_name'] == 'Geothlypis tolmiei']
geothlypis_df[['scientific_name', 'common_names', 'record_status']].head()

Unnamed: 0,scientific_name,common_names,record_status
8329,Geothlypis tolmiei,Macgillivray's Warbler,Approved
26637,Geothlypis tolmiei,Macgillivray's Warbler,Approved
42370,Geothlypis tolmiei,Macgillivray's Warbler,Approved
65428,Geothlypis tolmiei,Macgillivray's Warbler,Approved
115621,Geothlypis tolmiei,Macgillivray's Warbler,Approved


In [50]:
glaucidium_df = glaucidium_df[glaucidium_df['common_names'] == 'Mountain Pygmy Owl, Northern Pygmy-Owl']
glaucidium_df[['scientific_name', 'common_names', 'record_status']]

Unnamed: 0,scientific_name,common_names,record_status
52505,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
73119,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
77396,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
99240,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
111801,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved


In [51]:
# Also possibly a typo
eromophila_df

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
37569,GLBA-1204,GLBA,Glacier Bay,Passeriformes,Alaudidae,Eromophila alpestris,,Approved,Present,Native,Rare,Migratory,Least Concern,False
39528,GRBA-1206,GRBA,Great Basin,Passeriformes,Alaudidae,Eromophila alpestris,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


In [52]:
all_sci_names = birds['scientific_name'].unique()
potential_matches = {}

for sci_name in no_common_names:
    # Use fuzzy matching to find the best matches in the list of all scientific names
    matches = process.extract(sci_name, all_sci_names, scorer=fuzz.ratio, limit=5)  # Adjust limit as needed
    
    # Filter matches to include only those with a similarity score over 90
    high_quality_matches = [match for match in matches if match[1] > 90]
    if len(high_quality_matches) > 1:
        # Retrieve common names for the matching scientific names
        common_names = birds[birds['scientific_name'].isin([match[0] for match in high_quality_matches])]['common_names'].dropna()

        common_names_counter = Counter([name.strip() for names in common_names for name in names.split(',')])
        most_common_name = common_names_counter.most_common(1)
        
        potential_matches[sci_name] = {
            'matches': high_quality_matches,
            'most_common_name': most_common_name[0][0] if most_common_name else 'No common name found'
        }

for sci_name, info in potential_matches.items():
    print(f"\nPotential matches for '{sci_name}':")
    for match in info['matches']:
        print(f"  - {match[0]} (Similarity Score: {match[1]})")
    print(f"Most common common name: {info['most_common_name']}")


Potential matches for 'Eromophila alpestris':
  - Eromophila alpestris (Similarity Score: 100)
  - Eremophila alpestris (Similarity Score: 95)
Most common common name: Horned Lark

Potential matches for 'Peucaea cassini':
  - Peucaea cassini (Similarity Score: 100)
  - Peucaea cassinii (Similarity Score: 97)
  - Peucaea casinii (Similarity Score: 93)
Most common common name: Cassin's Sparrow

Potential matches for 'Lophortyx californica':
  - Lophortyx californica (Similarity Score: 100)
  - Lophortyx californicus (Similarity Score: 93)
Most common common name: California Quail

Potential matches for 'Geothlypis tolomiei':
  - Geothlypis tolomiei (Similarity Score: 100)
  - Geothlypis tolmiei (Similarity Score: 97)
Most common common name: Macgillivray's Warbler


In [53]:
peucaea = birds[birds['scientific_name'].isin(['Peucaea casinii', 'Peucaea cassinii', 'Peucaea cassini'])]
peucaea

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
26551,DEVA-1466,DEVA,Death Valley,Passeriformes,Emberizidae,Peucaea cassini,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
42281,GRCA-1306,GRCA,Grand Canyon,Passeriformes,Emberizidae,Peucaea cassinii,Cassin's Sparrow,Approved,Not Present,Native,Unknown,Vagrant,Least Concern,False
65240,JOTR-1067,JOTR,Joshua Tree,Passeriformes,Emberizidae,Peucaea casinii,Cassin'a Sparrow,Approved,Not Confirmed,Native,Unknown,Unknown,Least Concern,False


In [54]:
# Corrected scientific names mapping based on investigation
corrected_names = {
    'Lophortyx californica': 'Lophortyx californicus',  
    'Eromophila alpestris': 'Eremophila alpestris',
    'Peucaea cassini': 'Peucaea cassinii', 
    'Peucaea casinii': 'Peucaea cassinii',
    'Geothlypis tolomiei': 'Geothlypis tolmiei',
    'Glaucidium californicum': 'Glaucidium gnoma'
}

# Corresponding common names for the corrected scientific names
common_name_updates = {
    'Lophortyx californicus': 'California Quail',
    'Eremophila alpestris': 'Horned Lark',
    'Peucaea cassinii': "Cassin's Sparrow",  # Ensure this is correctly assigned
    'Geothlypis tolmiei': "Macgillivray's Warbler",
    'Glaucidium gnoma': 'Mountain Pygmy Owl, Northern Pygmy-Owl'
}

# Update scientific names in the birds DataFrame
for old_name, new_name in corrected_names.items():
    birds.loc[birds['scientific_name'] == old_name, 'scientific_name'] = new_name

# Update common names based on corrected scientific names
for sci_name, common_name in common_name_updates.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

birds_updated = birds[birds['scientific_name'].isin(corrected_names.values())]
birds_updated[['scientific_name', 'common_names', 'record_status']].head()

Unnamed: 0,scientific_name,common_names,record_status
207,Eremophila alpestris,Horned Lark,Approved
1825,Eremophila alpestris,Horned Lark,Approved
1969,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy-Owl",Approved
2912,Eremophila alpestris,Horned Lark,Approved
4369,Eremophila alpestris,Horned Lark,Approved


In [55]:
results2 = process_scientific_names(birds, condition=2)
no_common_names = results2['no_common_names']
multiple_common_names = results2['multiple_common_names']
single_common_names = results2['single_common_names']

Standard Scientific Names Count (2 words): 992
Scientific names with no associated common names: 31
Scientific names with multiple associated common names: 369
Scientific names with a single associated common name: 592


In [56]:
# Extract all records from birds where scientific_name is in no_common_names
no_common_names_records = birds[birds['scientific_name'].isin(no_common_names)]
print(f"Shape: {no_common_names_records.shape}")
print(no_common_names_records['record_status'].value_counts())

Shape: (34, 14)
record_status
In Review    34
Approved      0
Name: count, dtype: int64


In [57]:
# Drop the records with no common names from the birds DataFrame in place
birds.drop(birds[birds['scientific_name'].isin(no_common_names)].index, inplace=True)

In [58]:
results2 = process_scientific_names(birds, condition=2)

Standard Scientific Names Count (2 words): 961
Scientific names with no associated common names: 0
Scientific names with multiple associated common names: 369
Scientific names with a single associated common name: 592


----
&nbsp;
#### Multiple `common_names`

In [59]:
print(f"Scientific names with multiple associated common names: {len(multiple_common_names)}\n")

# Display subset of scientific names with multiple common names
for sci_name, counts in multiple_common_names[:5]:
    print(f"{sci_name}: {counts}")

Scientific names with multiple associated common names: 369

Charadrius vociferus: Counter({'Killdeer': 49, 'Killdeer, Northern Killdeer': 1})
Melanitta fusca: Counter({'White-Winged Scoter': 21, 'Velvet Scoter, White-Winged Scoter': 2, 'Eastern White-Winged Scoter, White-Winged Coot, White-Winged Scoter': 1, 'White-Winged Scoter, Velvet Scoter': 1})
Aquila chrysaetos: Counter({'Golden Eagle': 48, 'American Golden Eagle, Golden Eagle': 1})
Catharus fuscescens: Counter({'Veery': 24, "Eastern Veery, Veery, Wilson's Thrush": 1})
Tringa solitaria: Counter({'Solitary Sandpiper': 45, 'Eastern Solitary Sandpiper, Solitary Sandpiper': 1, 'Solitairy Sandpiper': 1})


In [60]:
# Apply the function to the multiple_common_names
standardized_name_mapping = standardize_common_names(multiple_common_names)

Total ties: 31


In [61]:
print(f"Length: {len(standardized_name_mapping)}, Type: {type(standardized_name_mapping)}")

Length: 369, Type: <class 'dict'>


In [62]:
# Extract the first elements (scientific names) from the list of tuples
multi_sci_names_list = [sci_name for sci_name, counts in multiple_common_names]
print(multi_sci_names_list[:5])

['Charadrius vociferus', 'Melanitta fusca', 'Aquila chrysaetos', 'Catharus fuscescens', 'Tringa solitaria']


In [63]:
# Return a subset of the birds DataFrame where scientific_name matches any in the list
multi_common_names_records = birds[birds['scientific_name'].isin(multi_sci_names_list)]
print(f"Shape: {multi_common_names_records.shape}")
multi_common_names_records.head()

Shape: (8650, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,Least Concern,False
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False
60,ACAD-1060,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo lagopus,"American Rough-Legged Hawk, Rough-Legged Hawk",Approved,Present,Native,Uncommon,Resident,Species of Concern,True


In [64]:
# Save the no_common_names_records to a CSV for backup
multi_common_names_records.to_csv('DATA/Backups/multi_common_names_birds.csv', index=True)

In [65]:
# Update the birds DataFrame with the standardized common names
for sci_name, common_name in standardized_name_mapping.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

In [66]:
results2 = process_scientific_names(birds, condition=2)

Standard Scientific Names Count (2 words): 961
Scientific names with no associated common names: 0
Scientific names with multiple associated common names: 0
Scientific names with a single associated common name: 961


In [67]:
print(f"There are {birds.species_id.nunique()} records")
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 14467 records
There are 927 unique common names
There are 1335 unique scientific names
There are 86 unique families


----
&nbsp;
### Extended `scientific_name`
Potential the most complicated records to fixed

In [68]:
results3 = process_scientific_names(birds, condition=3)
no_common_names = results3['no_common_names']
multiple_common_names = results3['multiple_common_names']
single_common_names = results3['single_common_names']

Extended Scientific Names Count (> 2 words): 374
Scientific names with no associated common names: 160
Scientific names with multiple associated common names: 29
Scientific names with a single associated common name: 185


In [69]:
# Extract all records from birds where scientific_name is in no_common_names
no_common_names_records = birds[birds['scientific_name'].isin(no_common_names)]
print(f"Shape: {no_common_names_records.shape}")
print(no_common_names_records['record_status'].value_counts())
no_common_names_records.head()

Shape: (163, 14)
record_status
Approved     96
In Review    67
Name: count, dtype: int64


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
19294,CRLA-1232,CRLA,Crater Lake,Passeriformes,Fringillidae,Coccothraustes vespertinus brooksi,,Approved,Not Confirmed,Native,Unknown,Unknown,Least Concern,False
19350,CRLA-1288,CRLA,Crater Lake,Passeriformes,Troglodytidae,Troglodytes troglodytes pacificus,,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
26298,DEVA-1213,DEVA,Death Valley,Passeriformes,Paridae,Parus inornatus griseus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26308,DEVA-1223,DEVA,Death Valley,Accipitriformes,Accipitridae,Buteo lineatus elegans,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26342,DEVA-1257,DEVA,Death Valley,Anseriformes,Anatidae,Branta canadensis occidentalis,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


In [70]:
# Save the no_common_names_records to a CSV for backup
no_common_names_records.to_csv('DATA/Backups/no_common_names_subspecies_birds.csv', index=True)

#### Handling [Subspecies](https://en.wikipedia.org/wiki/Subspecies) in Scientific Names

A subspecies is a taxonomic classification below species, representing populations of a species that are genetically distinct due to geographic or ecological factors. Subspecies names are often included in scientific naming as a third part, following the genus and species (e.g., *Buteo jamaicensis borealis*).

- **Matching on Genus and Species**: We extracted the genus and species parts of the scientific names and used fuzzy matching to find existing records with associated common names.
- **Updating Common Names**: For each match, the most appropriate common name was selected and updated to include the subspecies designation in parentheses (e.g., *Red-Tailed Hawk (borealis subspecies)*).

In [71]:
all_sci_names = birds['scientific_name'].unique()
potential_matches = {}

for sci_name in no_common_names:
    genus_species = ' '.join(sci_name.split()[:2])

    # Use fuzzy matching to find best matches in the list of all scientific names
    matches = process.extract(genus_species, all_sci_names, scorer=fuzz.ratio, limit=5)
    high_quality_matches = [match for match in matches if match[1] > 90]

    if high_quality_matches:
        # Retrieve common names for the matching scientific names
        matched_sci_names = [match[0] for match in high_quality_matches]
        common_names = birds[birds['scientific_name'].isin(matched_sci_names)]['common_names'].dropna()

        # Count occurrences of each common name
        common_names_counter = Counter([name.strip() for names in common_names for name in names.split(',')])
        most_common_name = common_names_counter.most_common(1)

        # Store potential matches and the most common name found
        potential_matches[sci_name] = {
            'matches': high_quality_matches,
            'most_common_name': most_common_name[0][0] if most_common_name else 'No common name found'
        }

print(f"Potential matches: {len(potential_matches)}")

# Subset the first 5 items from the potential_matches dictionary
matches_subset = dict(islice(potential_matches.items(), 5))
for sci_name, info in matches_subset.items():
    print(f"\nPotential matches for '{sci_name}':")
    for match in info['matches']:
        print(f"  - {match[0]} (Similarity Score: {match[1]})")
    print(f"Most common common name: {info['most_common_name']}")

Potential matches: 145

Potential matches for 'Melanerpes formicivorus bairdi':
  - Melanerpes formicivorus (Similarity Score: 100)
Most common common name: Acorn Woodpecker

Potential matches for 'Geothlypis trichas brachidactylus':
  - Geothlypis trichas (Similarity Score: 100)
Most common common name: Common Yellowthroat

Potential matches for 'Vireo solitarius plumbeus':
  - Vireo solitarius (Similarity Score: 100)
Most common common name: Blue-Headed Vireo

Potential matches for 'Leiothlypis celata orestera':
  - Leiothlypis celata (Similarity Score: 100)
Most common common name: Orange-Crowned Warbler

Potential matches for 'Parus inornatus griseus':
  - Parus inornatus (Similarity Score: 100)
Most common common name: Plain Titmouse


In [72]:
updated_common_names = {}

for sci_name, info in potential_matches.items():
    most_common_name = info['most_common_name']
    subspecies = sci_name.split()[2]  
    updated_name = f"{most_common_name} ({subspecies} subspecies)"
    updated_common_names[sci_name] = updated_name

# Update the birds DataFrame with the new common names
for sci_name, common_name in updated_common_names.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

In [73]:
results3 = process_scientific_names(birds, condition=3)
no_common_names = results3['no_common_names']

Extended Scientific Names Count (> 2 words): 374
Scientific names with no associated common names: 15
Scientific names with multiple associated common names: 29
Scientific names with a single associated common name: 330


In [74]:
no_common_names_records = birds[birds['scientific_name'].isin(no_common_names)]
print(f"Shape: {no_common_names_records.shape}")
print(no_common_names_records['record_status'].value_counts())
no_common_names_records

Shape: (15, 14)
record_status
In Review    14
Approved      1
Name: count, dtype: int64


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
26433,DEVA-1348,DEVA,Death Valley,Charadriiformes,Scolopacidae,Symphemia semipalmata inornata,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26489,DEVA-1404,DEVA,Death Valley,Passeriformes,Alaudidae,Otocoris alpestris arenicola,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26490,DEVA-1405,DEVA,Death Valley,Passeriformes,Alaudidae,Otocoris alpestris chrysolaema,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26564,DEVA-1479,DEVA,Death Valley,Passeriformes,Emberizidae,Spizella monticola ochracea,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26568,DEVA-1483,DEVA,Death Valley,Passeriformes,Emberizidae,Spizella socialis arizonae,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26675,DEVA-1590,DEVA,Death Valley,Passeriformes,Parulidae,Sylvania pusilla pileolata,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26703,DEVA-1618,DEVA,Death Valley,Passeriformes,Troglodytidae,Thryothorus bewickii spilurus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26718,DEVA-1633,DEVA,Death Valley,Passeriformes,Turdidae,Turdus aonalaschkae auduboni,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26722,DEVA-1637,DEVA,Death Valley,Passeriformes,Turdidae,Turdus ustulatus swainsonii,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False
26759,DEVA-1674,DEVA,Death Valley,Passeriformes,Vireonidae,Vireosylva gilva swainsonii,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False


In [75]:
# Dropping the remaining unnamed subspecies
birds.drop(birds[birds['scientific_name'].isin(no_common_names)].index, inplace=True)

In [76]:
results3 = process_scientific_names(birds, condition=3)
multiple_common_names = results3['multiple_common_names']

Extended Scientific Names Count (> 2 words): 359
Scientific names with no associated common names: 0
Scientific names with multiple associated common names: 29
Scientific names with a single associated common name: 330


----
&nbsp;
#### Multiple `common_names`

In [77]:
print(f"Scientific names with multiple associated common names: {len(multiple_common_names)}\n")

Scientific names with multiple associated common names: 29



In [79]:
standardized_names_mapping = standardize_common_names_subspecies(multiple_common_names)

Total ties: 11


In [80]:
print(f"Length: {len(standardized_names_mapping)}, Type: {type(standardized_names_mapping)}")

# Extract the first elements (scientific names) from the list of tuples
multi_sci_names_list = [sci_name for sci_name, counts in multiple_common_names]
print(multi_sci_names_list[:5])

Length: 29, Type: <class 'dict'>
['Vermivora pinus X chrysoptera', 'Sitta pygmaea melanotis', 'Picoides dorsalis fasciatus', 'Empidonax traillii brewsteri', 'Tringa solitaria cinnamomea']


In [83]:
# Return a subset of the birds DataFrame where scientific_name matches any in the list
multi_common_names_records = birds[birds['scientific_name'].isin(multi_sci_names_list)]
print(f"Shape: {multi_common_names_records.shape}")
multi_common_names_records.head()

Shape: (123, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
1847,ARCH-1138,ARCH,Arches,Passeriformes,Emberizidae,Junco hyemalis caniceps,Dark-Eyed Junco (Gray-Headed),Approved,Present,Native,Common,Migratory,Least Concern,False
1848,ARCH-1139,ARCH,Arches,Passeriformes,Emberizidae,Junco hyemalis hyemalis,Dark-Eyed Junco (Slate-Colored),Approved,Present,Native,Rare,Resident,Least Concern,False
1849,ARCH-1140,ARCH,Arches,Passeriformes,Emberizidae,Junco hyemalis mearnsi,Dark-Eyed Junco (Pink-Sided),Approved,Present,Native,Common,Resident,Least Concern,False
1850,ARCH-1141,ARCH,Arches,Passeriformes,Emberizidae,Junco hyemalis oreganus,Dark-Eyed Junco (Oregon),Approved,Present,Native,Common,Resident,Least Concern,False
1935,ARCH-1226,ARCH,Arches,Passeriformes,Tyrannidae,Empidonax traillii extimus,Southwest Willow Flycatcher,Approved,Not Confirmed,Native,Unknown,Unknown,Endangered,True


In [84]:
# Update the birds DataFrame with the standardized common names
for sci_name, common_name in standardized_names_mapping.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

In [85]:
results3 = process_scientific_names(birds, condition=3)
single_common_names = results3['single_common_names']

Extended Scientific Names Count (> 2 words): 359
Scientific names with no associated common names: 0
Scientific names with multiple associated common names: 0
Scientific names with a single associated common name: 359


----
&nbsp;
#### Remaining NaN records in `birds`

In [93]:
print(f"Scientific name NaN count: {birds.scientific_name.isna().sum()}")
print(f"Common Name NaN count: {birds.common_names.isna().sum()}")

Scientific name NaN count: 0
Common Name NaN count: 41


In [95]:
nan_common = birds[birds.common_names.isna()]
print(f"Shape: {nan_common.shape}")
nan_common.head()

Shape: (41, 14)


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
19285,CRLA-1223,CRLA,Crater Lake,Passeriformes,Emberizidae,Zonotrichia leucophrys pugetensis,,Approved,Present,Native,Uncommon,Migratory,Least Concern,False
19308,CRLA-1246,CRLA,Crater Lake,Passeriformes,Icteridae,Agelaius phoeniceus caurinus,,Approved,Not Confirmed,Native,Unknown,Unknown,Least Concern,False
19348,CRLA-1286,CRLA,Crater Lake,Passeriformes,Troglodytidae,Troglodytes aedon parkmanii,,Approved,Present,Native,Rare,Breeder,Least Concern,False
19355,CRLA-1293,CRLA,Crater Lake,Passeriformes,Turdidae,Myadestes townsendi townsendi,,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
19379,CRLA-1317,CRLA,Crater Lake,Piciformes,Picidae,Colaptes auratus collaris,,Approved,Present,Native,Common,Breeder,Least Concern,False


In [96]:
# Save unique scientific name in above df
nan_sci_names = nan_common.scientific_name.unique().tolist()

In [100]:
all_sci_names = birds['scientific_name'].unique()
potential_matches = {}

for sci_name in nan_sci_names:
    genus_species = ' '.join(sci_name.split()[:2])

    # Use fuzzy matching to find best matches in the list of all scientific names
    matches = process.extract(genus_species, all_sci_names, scorer=fuzz.ratio, limit=5)
    high_quality_matches = [match for match in matches if match[1] > 90]

    if high_quality_matches:
        # Retrieve common names for the matching scientific names
        matched_sci_names = [match[0] for match in high_quality_matches]
        common_names = birds[birds['scientific_name'].isin(matched_sci_names)]['common_names'].dropna()

        # Count occurrences of each common name
        common_names_counter = Counter([name.strip() for names in common_names for name in names.split(',')])
        most_common_name = common_names_counter.most_common(1)

        # Store potential matches and the most common name found
        potential_matches[sci_name] = {
            'matches': high_quality_matches,
            'most_common_name': most_common_name[0][0] if most_common_name else 'No common name found'
        }

print(f"Potential matches: {len(potential_matches)}")

# Subset the first 5 items from the potential_matches dictionary
matches_subset = dict(islice(potential_matches.items(), 5))
for sci_name, info in matches_subset.items():
    print(f"\nPotential matches for '{sci_name}':")
    for match in info['matches']:
        print(f"  - {match[0]} (Similarity Score: {match[1]})")
    print(f"Most common common name: {info['most_common_name']}")

Potential matches: 37

Potential matches for 'Zonotrichia leucophrys pugetensis':
  - Zonotrichia leucophrys (Similarity Score: 100)
Most common common name: White-Crowned Sparrow

Potential matches for 'Agelaius phoeniceus caurinus':
  - Agelaius phoeniceus (Similarity Score: 100)
Most common common name: Red-Winged Blackbird

Potential matches for 'Troglodytes aedon parkmanii':
  - Troglodytes aedon (Similarity Score: 100)
Most common common name: House Wren

Potential matches for 'Myadestes townsendi townsendi':
  - Myadestes townsendi (Similarity Score: 100)
Most common common name: Townsend's Solitaire

Potential matches for 'Colaptes auratus collaris':
  - Colaptes auratus (Similarity Score: 100)
  - Calaptes auratus (Similarity Score: 94)
Most common common name: Northern Flicker


In [101]:
updated_common_names = {}

for sci_name, info in potential_matches.items():
    most_common_name = info['most_common_name']
    
    # Split the scientific name into parts
    sci_name_parts = sci_name.split()
    
    # Check if the scientific name includes a subspecies
    if len(sci_name_parts) > 2:
        subspecies = sci_name_parts[2]  # Get the subspecies part
        updated_name = f"{most_common_name} ({subspecies} subspecies)"
    else:
        # If no subspecies, just use the most common name
        updated_name = most_common_name

    updated_common_names[sci_name] = updated_name

# Update the birds DataFrame with the new common names
for sci_name, common_name in updated_common_names.items():
    birds.loc[birds['scientific_name'] == sci_name, 'common_names'] = common_name

In [103]:
# Check if there are still any NaN values in 'common_names'
print(f"Number of NaN common names: {len(nan_common)}")

Number of NaN common names: 0


----
&nbsp;
#### Single `common_names`
Searching for 'unusual' records

In [141]:
# Define thresholds and patterns for identifying unusual names
MAX_LENGTH = 50  # Define a threshold for very long common names
MAX_WORD_COUNT = 6  # Define a word count threshold
UNUSUAL_PUNCTUATION_PATTERN = r"[^a-zA-Z\s,\'\-\(\)/`´]"  # Allows letters, spaces, commas, apostrophes, and hyphens

long_names = birds[birds['common_names'].str.len() > MAX_LENGTH]
high_word_count = birds[birds['common_names'].str.split().apply(len) > MAX_WORD_COUNT]
unusual_punctuation = birds[birds['common_names'].str.contains(UNUSUAL_PUNCTUATION_PATTERN, na=False)]

unusual_common_names = pd.concat([long_names, high_word_count, unusual_punctuation])

In [142]:
unusual_common_names

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam
111,ACAD-1111,ACAD,Acadia,Charadriiformes,Alcidae,Cepphus grylle,"Black Guillemot, Sea Pigeon, Southern Black Gu...",Approved,Present,Native,Common,Breeder,Least Concern,False,,
26297,DEVA-1212,DEVA,Death Valley,Passeriformes,Parulidae,Dendroica aestiva,"Golden Warbler, Northern Yellow Warbler, Summe...",In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,
26777,DEVA-1692,DEVA,Death Valley,Piciformes,Picidae,Ceophloeus pileatus,"Black Woodcock, Logcock, Pileated Woodpecker, ...",In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,
30773,DRTO-1249,DRTO,Dry Tortugas,Passeriformes,Tyrannidae,Tyrannus melancholicus couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati...",Approved,Present,Native,Uncommon,Migratory,Least Concern,False,,
39491,GRBA-1169,GRBA,Great Basin,Charadriiformes,Scolopacidae,Gallinago gallinago delicata,"Common Snipe, English Snipe, Jacksnipe, Wilson...",Approved,Present,Native,Uncommon,Unknown,Least Concern,False,,
63823,ISRO-1047,ISRO,Isle Royale,Anseriformes,Anatidae,Anas rubripes X platyrhynchos,"American Black Mallard, Mallard X Black Duck H...",Approved,Present,Native,Occasional,Unknown,Least Concern,False,,
111,ACAD-1111,ACAD,Acadia,Charadriiformes,Alcidae,Cepphus grylle,"Black Guillemot, Sea Pigeon, Southern Black Gu...",Approved,Present,Native,Common,Breeder,Least Concern,False,,
26297,DEVA-1212,DEVA,Death Valley,Passeriformes,Parulidae,Dendroica aestiva,"Golden Warbler, Northern Yellow Warbler, Summe...",In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,
30773,DRTO-1249,DRTO,Dry Tortugas,Passeriformes,Tyrannidae,Tyrannus melancholicus couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati...",Approved,Present,Native,Uncommon,Migratory,Least Concern,False,,
39491,GRBA-1169,GRBA,Great Basin,Charadriiformes,Scolopacidae,Gallinago gallinago delicata,"Common Snipe, English Snipe, Jacksnipe, Wilson...",Approved,Present,Native,Uncommon,Unknown,Least Concern,False,,


In [105]:
tyrannus_mix = birds[birds['scientific_name'].str.contains("Tyrannus melancholicus couchii", na=False)]
tyrannus_mix

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
30773,DRTO-1249,DRTO,Dry Tortugas,Passeriformes,Tyrannidae,Tyrannus melancholicus couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati...",Approved,Present,Native,Uncommon,Migratory,Least Concern,False


The above record is an absolute mess

In [106]:
tyrannus1 = birds[birds['scientific_name'].str.contains("Tyrannus melancholicus", na=False)]
tyrannus1

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
4588,BIBE-1442,BIBE,Big Bend,Passeriformes,Tyrannidae,Tyrannus melancholicus,Tropical Kingbird,Approved,Present,Native,Uncommon,Breeder,Least Concern,False
26742,DEVA-1657,DEVA,Death Valley,Passeriformes,Tyrannidae,Tyrannus melancholicus,Tropical Kingbird,Approved,Present,Native,Occasional,Vagrant,Least Concern,False
30773,DRTO-1249,DRTO,Dry Tortugas,Passeriformes,Tyrannidae,Tyrannus melancholicus couchii,"""Tropical"" Kingbird,In Review,Present,Not Nati...",Approved,Present,Native,Uncommon,Migratory,Least Concern,False
88216,REDW-1574,REDW,Redwood,Passeriformes,Tyrannidae,Tyrannus melancholicus,Tropical Kingbird,Approved,Present,Not Native,Occasional,Vagrant,Least Concern,False


In [107]:
tyrannus2 = birds[birds['scientific_name'].str.contains("Tyrannus couchii", na=False)]
tyrannus2

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
4585,BIBE-1439,BIBE,Big Bend,Passeriformes,Tyrannidae,Tyrannus couchii,Couch's Kingbird,Approved,Present,Native,Rare,Breeder,Least Concern,False


In [108]:
# Drop index 30773


----
&nbsp;
### Isolating Birds of Prey
Isolating birds of prey: We create a list of birds of prey 'groups' to search under `common_name`

In [109]:
# Define the list of bird of prey keywords
birds_of_prey = ["Eagle", "Hawk", "Falcon", "Buzzard", "Harrier", "Kite", "Owl", "Osprey", 
                 "Vulture", "Condor", "Kestrel", 'Buteo', 'Accipiter', 'Caracara']

# Create a regex pattern from the list
pattern = '|'.join(birds_of_prey)

# Function to find and append only the matching keywords
def find_raptors(common_names):
    # Convert to string and handle NaN values
    if pd.isna(common_names):
        return ''
    # Find keywords that are present in the common_names
    matches = set()  # Use a set to avoid duplicates
    for keyword in birds_of_prey:
        if keyword in common_names:
            matches.add(keyword)
    return ', '.join(matches)

# Apply the function to the common_names column
birds['raptor_group'] = birds['common_names'].apply(find_raptors)

# Display the first few rows to verify
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk


In [110]:
total_raptors_comm = birds['raptor_group'].loc[birds['raptor_group'] != ''].count()
print(f"Under common name:\nWe have a total of {total_raptors_comm} Raptors\n")
diff_com_types = birds.raptor_group.unique().tolist()
print(diff_com_types)

Under common name:
We have a total of 1341 Raptors

['Hawk', '', 'Eagle', 'Harrier', 'Vulture', 'Osprey', 'Falcon', 'Kestrel', 'Owl', 'Hawk, Owl', 'Condor', 'Kite', 'Caracara']


There's some ambiguity in the above list so we cross-reference `family`

#### Birds of Prey Scientific Families and Genera

According to OpenAI's language model ChatGPT 4o (2024):

- Accipitridae (Hawks, Eagles, and relatives)
- Falconidae (Falcons)
- Harpagiidae (Harriers)
- Pandionidae (Ospreys)
- Accipitridae (Kites)
- Cathartidae (New World Vultures)
- Buteo (Buzzards and Buteos)
- Accipiter (Goshawks and Accipiters)
- Tytonidae (Barn Owls)
- Strigidae (Typical Owls)

*Caveat emptor*: This list may not be comprehensive.

In [111]:
print(f"NaN count: {birds.family.isna().sum()}\n")

NaN count: 16



In [112]:
# Define the list of bird of prey scientific families and genera
birds_of_prey_sci = [
    "Accipitridae", "Falconidae", "Harpagiidae", 
    "Pandionidae", "Cathartidae", "Buteo", "Accipiter", 
    "Tytonidae", "Strigidae"
]

# Create a regex pattern from the list
pattern = '|'.join(birds_of_prey_sci)

# Find and join matching scientific families and genera in the 'family' column
birds['raptor_sci_fam'] = birds['family'].str.findall(f'({pattern})')

# Convert the lists of matches into a comma-separated string, handling non-iterables safely
birds['raptor_sci_fam'] = birds['raptor_sci_fam'].apply(lambda x: ', '.join(x) if isinstance(x, list) and x else '')

# Display the first few rows to verify
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk,Accipitridae
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,,Accipitridae
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk,Accipitridae
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle,Accipitridae
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk,Accipitridae


In [113]:
total_raptors_sci = birds['raptor_sci_fam'].loc[birds['raptor_sci_fam'] != ''].count()
print(f"Under scientific family:\nWe have a total of {total_raptors_sci} Raptors\n")
diff_sci_types = birds.raptor_sci_fam.unique().tolist()
print(diff_sci_types)

Under scientific family:
We have a total of 1467 Raptors

['Accipitridae', 'Cathartidae', 'Pandionidae', '', 'Falconidae', 'Strigidae', 'Tytonidae']


In [114]:
# Declare a new dataframe to analyse the discrepancies
raptors_df = (birds[(birds.raptor_group != '')
                                | (birds.raptor_sci_fam != '')])
print(raptors_df.shape)

(1470, 16)


In [115]:
# We work with a copy of the original DataFrame and not a view of it
raptors_df = raptors_df.copy()
mask = raptors_df['raptor_group'] == ''
raptors_df.loc[mask, 'ambiguous'] = True
result = raptors_df[raptors_df['ambiguous'] == True]
result

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam,ambiguous
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,,Accipitridae,True
185,ACAD-1185,ACAD,Acadia,Falconiformes,Falconidae,Falco columbarius,Merlin,Approved,Present,Native,Common,Unknown,Species of Concern,True,,Falconidae,True
187,ACAD-1187,ACAD,Acadia,Falconiformes,Falconidae,Falco rusticolus,Gyrfalcon,Approved,Present,Not Native,Unknown,Vagrant,Least Concern,False,,Falconidae,True
1769,ARCH-1060,ARCH,Arches,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Rare,Migratory,Least Concern,False,,Accipitridae,True
1816,ARCH-1107,ARCH,Arches,Falconiformes,Falconidae,Falco columbarius,Merlin,Approved,Present,Native,Occasional,Migratory,Species of Concern,True,,Falconidae,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111590,YELL-1192,YELL,Yellowstone,Falconiformes,Falconidae,Falco columbarius,Merlin,Approved,Present,Native,Rare,Migratory,Species of Concern,True,,Falconidae,True
115453,YOSE-1089,YOSE,Yosemite,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,,Accipitridae,True
115521,YOSE-1157,YOSE,Yosemite,Falconiformes,Falconidae,Falco columbarius,Merlin,Approved,Present,Native,Rare,Breeder,Species of Concern,True,,Falconidae,True
117534,ZION-1082,ZION,Zion,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Rare,Breeder,Least Concern,False,,Accipitridae,True


*Accipiter gentilis* is `Northern Goshawk`, a type of `Hawk`

*Falco columbarius* is `Merlin`, a type of `Falcon`

In [116]:
# Update values for 'Accipiter gentilis'
birds.loc[birds['scientific_name'] == 'Accipiter gentilis', 
          ['common_names', 'raptor_group']] = ['Northern Goshawk', 'Hawk']

# Update values for 'Falco columbarius'
birds.loc[birds['scientific_name'] == 'Falco columbarius', 
          ['common_names', 'raptor_group']] = ['Merlin', 'Falcon']

# Update values for 'Falco rusticolus'
birds.loc[birds['scientific_name'] == 'Falco rusticolus', 
          ['common_names', 'raptor_group']] = ['Gyrfalcon', 'Falcon']

# Update values for 'Falco columbarius columbarius'
birds.loc[birds['scientific_name'] == 'Falco columbarius columbarius', 
          ['common_names', 'raptor_group']] = ['Merlin (Tundra Subspecies)', 'Falcon']

# Update values for 'Falco columbarius suckleyi'
birds.loc[birds['scientific_name'] == 'Falco columbarius suckleyi', 
          ['common_names', 'raptor_group']] = ['Merlin (Coastal Forest Subspecies)', 'Falcon']


In [117]:
raptors_df = (birds[(birds.raptor_group != '')
                                | (birds.raptor_sci_fam != '')])

raptors_df = raptors_df.copy()
mask = raptors_df['raptor_group'] == ''
raptors_df.loc[mask, 'ambiguous'] = True
result = raptors_df[raptors_df['ambiguous'] == True]
result

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam,ambiguous
26456,DEVA-1371,DEVA,Death Valley,Falconiformes,Falconidae,Falco columbarius richardsonii,Merlin (richardsonii subspecies),In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Falconidae,True
78390,MORA-1070,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Western Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,,Accipitridae,True
78391,MORA-1071,MORA,Mount Rainier,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Goshawk,Approved,Not Present (False Report),Native,Unknown,Unknown,Least Concern,False,,Accipitridae,True
87990,REDW-1348,REDW,Redwood,Falconiformes,Falconidae,Falco columbarius bendirei,Merlin (bendirei subspecies),In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Falconidae,True
87992,REDW-1350,REDW,Redwood,Falconiformes,Falconidae,Falco columbarius richardsoni,Merlin (richardsoni subspecies),In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Falconidae,True


In [118]:
print(f"Shape of birds: {birds.shape}")

Shape of birds: (14452, 16)


In [119]:
buteos = birds[(birds.common_names == 'Buteo') | (birds.scientific_name == "Buteo")]
buteos

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam


In [120]:
# Count the number of NaN values in each column
nan_counts_per_column = birds.isna().sum()
print(nan_counts_per_column)

species_id              0
park_code               0
park_name               0
order                   0
family                 16
scientific_name         0
common_names            0
record_status           0
occurrence              0
nativeness              0
abundance               0
seasonality             0
conservation_status     0
is_protected            0
raptor_group            0
raptor_sci_fam          0
dtype: int64


----
&nbsp;
## Create a set of `scientific_name` and cross-ref in birds where `common_name` != nan

In [121]:
# Step 1: Identify rows with NaN in 'common_names' and extract unique scientific names
scientific_names_with_nan = set(birds.loc[birds['common_names'].isna(), 'scientific_name'])

# Step 2: Cross-reference to find matching 'common_names' for these scientific names
# Create a dictionary mapping scientific names to common names where 'common_names' is not NaN
common_name_mapping = birds.dropna(subset=['common_names']).set_index('scientific_name')['common_names'].to_dict()

In [122]:
common_name_mapping

{'Accipiter cooperii': "Cooper's Hawk",
 'Accipiter gentilis': 'Northern Goshawk',
 'Accipiter striatus': 'Sharp-Shinned Hawk',
 'Aquila chrysaetos': 'Golden Eagle',
 'Buteo jamaicensis': 'Red-Tailed Hawk',
 'Buteo lagopus': 'Rough-Legged Hawk',
 'Buteo lineatus': 'Red-Shouldered Hawk',
 'Buteo platypterus': 'Broad-Winged Hawk',
 'Buteo swainsoni': "Swainson's Hawk",
 'Circus cyaneus': 'Northern Harrier',
 'Haliaeetus leucocephalus': 'Bald Eagle',
 'Cathartes aura': 'Turkey Vulture',
 'Pandion haliaetus': 'Osprey',
 'Aix sponsa': 'Wood Duck',
 'Anas acuta': 'Northern Pintail',
 'Anas americana': 'American Wigeon',
 'Anas clypeata': 'Northern Shoveler',
 'Anas crecca': 'Green-Winged Teal',
 'Anas discors': 'Blue-Winged Teal',
 'Anas penelope': 'Eurasian Wigeon',
 'Anas platyrhynchos': 'Mallard',
 'Anas rubripes': 'American Black Duck',
 'Anas strepera': 'Gadwall',
 'Anser albifrons': 'Greater White-Fronted Goose',
 'Aythya affinis': 'Lesser Scaup',
 'Aythya americana': 'Redhead',
 'Ayth

In [123]:
# Define the keys of interest
keys_of_interest = [
    'Accipiter cooperii', 'Accipiter gentilis', 'Accipiter striatus', 
    'Aquila chrysaetos', 'Buteo jamaicensis', 'Buteo lagopus', 
    'Buteo lineatus', 'Buteo platypterus', 'Buteo swainsoni', 
    'Circus cyaneus', 'Haliaeetus leucocephalus', 'Cathartes aura', 
    'Pandion haliaetus'
]

# Create a subset dictionary using dictionary comprehension
subset_common_name_mapping = {k: common_name_mapping[k] for k in keys_of_interest if k in common_name_mapping}

In [124]:
# Filter the birds DataFrame to include only records where 'scientific_name' is in keys_of_interest
subset_birds = birds[birds['scientific_name'].isin(keys_of_interest)]
subset_birds

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk,Accipitridae
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,Hawk,Accipitridae
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk,Accipitridae
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle,Accipitridae
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk,Accipitridae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117543,ZION-1091,ZION,Zion,Accipitriformes,Accipitridae,Buteo swainsoni,Swainson's Hawk,Approved,Present,Native,Rare,Migratory,Least Concern,False,Hawk,Accipitridae
117545,ZION-1093,ZION,Zion,Accipitriformes,Accipitridae,Circus cyaneus,Northern Harrier,Approved,Present,Native,Uncommon,Migratory,Species of Concern,True,Harrier,Accipitridae
117546,ZION-1094,ZION,Zion,Accipitriformes,Accipitridae,Haliaeetus leucocephalus,Bald Eagle,Approved,Present,Native,Uncommon,Resident,In Recovery,True,Eagle,Accipitridae
117547,ZION-1095,ZION,Zion,Accipitriformes,Cathartidae,Cathartes aura,Turkey Vulture,Approved,Present,Native,Common,Breeder,Species of Concern,True,Vulture,Cathartidae


In [125]:
# Initialize empty lists to collect matched and unmatched indices
matched_indices = []
unmatched_indices = []

# Iterate over the dictionary to check values in the birds DataFrame
for sci_name, common_name in subset_common_name_mapping.items():
    # Find rows where scientific_name matches the key
    matching_rows = birds[birds['scientific_name'] == sci_name]
    
    # Separate matched and unmatched based on the common_names
    matched_rows = matching_rows[matching_rows['common_names'] == common_name]
    unmatched_rows = matching_rows[matching_rows['common_names'] != common_name]
    
    # Append indices to respective lists
    matched_indices.extend(matched_rows.index)
    unmatched_indices.extend(unmatched_rows.index)

# Create the matched and unmatched DataFrames
matched_df = birds.loc[matched_indices]
unmatched_df = birds.loc[unmatched_indices]

In [126]:
matched_df

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk,Accipitridae
1768,ARCH-1059,ARCH,Arches,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Breeder,Species of Concern,True,Hawk,Accipitridae
2821,BADL-1064,BADL,Badlands,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Migratory,Species of Concern,True,Hawk,Accipitridae
4235,BIBE-1089,BIBE,Big Bend,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk,Accipitridae
6443,BISC-1028,BISC,Biscayne,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Migratory,Species of Concern,True,Hawk,Accipitridae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108309,WICA-1102,WICA,Wind Cave,Accipitriformes,Pandionidae,Pandion haliaetus,Osprey,Approved,Present,Native,Occasional,Migratory,Species of Concern,True,Osprey,Pandionidae
109672,WRST-1070,WRST,Wrangell - St Elias,Accipitriformes,Pandionidae,Pandion haliaetus,Osprey,Approved,Present,Native,Unknown,Breeder,Species of Concern,True,Osprey,Pandionidae
111491,YELL-1093,YELL,Yellowstone,Accipitriformes,Pandionidae,Pandion haliaetus,Osprey,Approved,Present,Native,Common,Breeder,Species of Concern,True,Osprey,Pandionidae
115466,YOSE-1102,YOSE,Yosemite,Accipitriformes,Pandionidae,Pandion haliaetus,Osprey,Approved,Present,Native,Rare,Migratory,Species of Concern,True,Osprey,Pandionidae


In [127]:
unmatched_df

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_group,raptor_sci_fam


In [128]:
# Step 2a: Cross-reference to verify exact matches on both 'scientific_name' and 'common_names'
verified_dict = {}
for sci_name in scientific_names_with_nan:
    # Check for exact matches across the DataFrame
    matches = birds[(birds['scientific_name'] == sci_name) & 
                    (birds['common_names'].notna()) & 
                    (birds['scientific_name'].map(common_name_mapping) == birds['common_names'])]
    
    # If verified matches exist, add to the verified_dict
    if not matches.empty:
        verified_dict[sci_name] = common_name_mapping[sci_name]

In [129]:
verified_dict

{}

In [130]:
# # Step 3: Fill NaN values in 'common_names' using the verified matches
# birds['common_names'] = birds.apply(
#     lambda row: verified_dict.get(row['scientific_name'], row['common_names']) 
#     if pd.isna(row['common_names']) else row['common_names'], axis=1
# )