# North American Birds
Using `species.csv`

In [435]:
import pandas as pd
import numpy as np

In [436]:
df = pd.read_csv('DATA/species.csv', low_memory=False)
print(f"Shape: {df.shape}")
df.head()

Shape: (119248, 14)


Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


In [437]:
print(f"Type of data:\n\n{df.dtypes}\n")

Type of data:

Species ID             object
Park Name              object
Category               object
Order                  object
Family                 object
Scientific Name        object
Common Names           object
Record Status          object
Occurrence             object
Nativeness             object
Abundance              object
Seasonality            object
Conservation Status    object
Unnamed: 13            object
dtype: object



In [438]:
print(f"Unique values:\n\n{df.nunique()}")

Unique values:

Species ID             119248
Park Name                  56
Category                   14
Order                     554
Family                   2332
Scientific Name         46022
Common Names            35825
Record Status              53
Occurrence                  7
Nativeness                  5
Abundance                   8
Seasonality                24
Conservation Status        11
Unnamed: 13                 3
dtype: int64


In [439]:
df.columns = [col.lower().replace(" ", "_") for col in df.columns]
df.rename(columns={'unnamed:_13': 'unnamed'}, inplace=True)
print(f"Columns: {df.columns.tolist()}\n")

Columns: ['species_id', 'park_name', 'category', 'order', 'family', 'scientific_name', 'common_names', 'record_status', 'occurrence', 'nativeness', 'abundance', 'seasonality', 'conservation_status', 'unnamed']



In [440]:
print(df['unnamed'].unique())

[nan 'Endangered' 'Threatened' 'Species of Concern']


In [441]:
# Filter rows where 'Unnamed: 13' is not NaN
filtered_df = df[df['unnamed'].notna()]
filtered_df

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
6441,BISC-1026,Biscayne National Park,Mammal,Sirenia,Trichechidae,Trichechus manatus,Manatee,Manati,Approved,Present,Unknown,Unknown,,Endangered
31786,EVER-1414,Everglades National Park,Reptile,Crocodilia,Crocodylidae,Crocodylus acutus,American Crocodile,Cocodrilo De Tumbes,Approved,Present,Native,Uncommon,Resident,Threatened
31826,EVER-1454,Everglades National Park,Reptile,Testudines,Cheloniidae,Caretta caretta,Loggerhead,Cabezon,Approved,Present,Native,Rare,Breeder,Threatened
44733,GRSA-1136,Great Sand Dunes National Park and Preserve,Bird,Falconiformes,Falconidae,Falco columbarius,Merlin,Pigeon Hawk,Approved,Present,Native,Rare,Resident,Species of Concern
44944,GRSA-1347,Great Sand Dunes National Park and Preserve,Vascular Plant,Asparagales,Iridaceae,Iris missouriensis,Blue Flag,Wild Iris,Approved,Present,Native,Rare,,Species of Concern


In [442]:
print(df['conservation_status'].unique())

[nan 'Species of Concern' 'Endangered' 'In Recovery' 'Threatened'
 'Under Review' 'Proposed Threatened' 'Extinct' 'Proposed Endangered'
 'Resident' 'Breeder' 'Migratory']


#### *Discrepancies between `conservation_status` and `unnamed` need to be assessed later
It seems as if `unnamed` can be written to `conservation_status`

In [443]:
# Remove all text beyond " National Park" including variations like " and Preserve"
df['park_name'] = df['park_name'].str.replace(r' National Park.*', '', case=False, regex=True)

In [444]:
# Is the species-id unique?
condor_df = df[df['common_names'].str.contains('condor', case=False, na=False)]
condor_df

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
1779,ARCH-1070,Arches,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,
9334,BRCA-1087,Bryce Canyon,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,
10620,CANY-1087,Canyonlands,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
11841,CARE-1085,Capitol Reef,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
26319,DEVA-1234,Death Valley,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (False Report),Native,,,Endangered,
39444,GRBA-1122,Great Basin,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (Historical Report),Native,,,Endangered,
42096,GRCA-1121,Grand Canyon,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Common,Breeder,Endangered,
65258,JOTR-1085,Joshua Tree,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Confirmed,Native,,,Endangered,
72896,LAVO-1115,Lassen Volcanic,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Not Present (Historical Report),Native,,,Endangered,
77178,MEVE-1101,Mesa Verde,Bird,Accipitriformes,Cathartidae,Gymnogyps californianus,California Condor,Approved,Present,Native,Occasional,Vagrant,Endangered,


`species_id` numeric is not unique for similar species but it probably should be

In [445]:
# Splitting species_id column at the hyphen to create a new `park_code` public key
df['park_code'] = df['species_id'].str.split('-').str[0]

# Reorder columns to place 'park_code' as the second column
cols = list(df.columns)  # Get the list of current columns
cols.insert(1, cols.pop(cols.index('park_code')))  # Move 'park_code' to the second position
df = df[cols]  # Reorder DataFrame

In [446]:
df.head()

Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
0,ACAD-1000,ACAD,Acadia,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,ACAD,Acadia,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,ACAD,Acadia,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,ACAD,Acadia,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,ACAD,Acadia,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


#### Isolating Birds

In [447]:
birds = df[df.category == 'Bird']
print(f"Shape: {birds.shape}")
birds.head()

Shape: (14601, 15)


Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
55,ACAD-1055,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,,Species of Concern,
56,ACAD-1056,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,,
57,ACAD-1057,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,
58,ACAD-1058,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,
59,ACAD-1059,ACAD,Acadia,Bird,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,,


In [448]:
print(f"Unique values:\n\n{birds.nunique()}")

Unique values:

species_id             14601
park_code                 56
park_name                 56
category                   1
order                     24
family                    86
scientific_name         1436
common_names            1550
record_status             10
occurrence                 6
nativeness                 4
abundance                  7
seasonality               23
conservation_status        8
unnamed                    1
dtype: int64


In [449]:
print(birds['conservation_status'].unique())

['Species of Concern' nan 'In Recovery' 'Threatened' 'Endangered'
 'Under Review' 'Breeder' 'Resident' 'Proposed Endangered']


In [450]:
print(f"NaN count: {birds.conservation_status.isna().sum()}\n")
birds.groupby("conservation_status").size()

NaN count: 11970



conservation_status
Breeder                   4
Endangered               64
In Recovery              65
Proposed Endangered       2
Resident                  4
Species of Concern     2371
Threatened               51
Under Review             70
dtype: int64

### `conservation_status`

- `Endangered`: seriously at risk of extinction
- `In Recovery`: formerly `Endangered`, but currently not in danger of extinction throughout all or a significant portion of its range
- `Threatened`: vulnerable to endangerment in the near future
- `Species of Concern`: declining or appear to be in need of conservation
- `Under Review`: ?
- `Resident`: ?
- `Breeder`: ?

In [451]:
# Does resident and breeder come under `seasonality`?
print(birds.seasonality.unique())

[nan 'Breeder' 'Vagrant' 'Resident' 'Migratory' 'Breeder, Migratory'
 'Breeder, Summer' 'Migratory, Vagrant' 'Migratory, Winter'
 'Breeder, Winter' 'Winter' 'Resident, Winter' 'Summer, Vagrant'
 'Breeder, Resident, Summer' 'Resident, Summer' 'Summer'
 'Breeder, Resident, Migratory, Summer'
 'Breeder, Resident, Summer, Winter' 'Migratory, Summer' 'Winter, Vagrant'
 'Rare' 'Uncommon' 'Common' 'Breeder, Resident']


#### `Resident` and `Breeder` belong in `seasonality`. 
Seems wrong as a `conservation_status` and could be the result of Human Error. Investigating the unique values in the other columns

In [452]:
keywords = ['Breeder', 'Resident']
pattern = '|'.join(keywords)  

# Filter the DataFrame for rows where column contains any of the keywords
matching_df = birds[birds['conservation_status'].str.contains(pattern, case=False, na=False)]
matching_df

Unnamed: 0,species_id,park_code,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
44666,GRSA-1069,GRSA,Great Sand Dunes,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,Goshawk,Northern Goshawk,Approved,Present,Native,Rare,Breeder,
44678,GRSA-1081,GRSA,Great Sand Dunes,Bird,Anseriformes,Anatidae,Anas acuta,Pintail,Northern Pintail,Approved,Present,Native,Rare,Resident,
44705,GRSA-1108,GRSA,Great Sand Dunes,Bird,Caprimulgiformes,Caprimulgidae,Phalaenoptilus nuttallii,Poor-Will,Common Poorwill,Approved,Present,Native,Uncommon,Breeder,
44733,GRSA-1136,GRSA,Great Sand Dunes,Bird,Falconiformes,Falconidae,Falco columbarius,Merlin,Pigeon Hawk,Approved,Present,Native,Rare,Resident,Species of Concern
44744,GRSA-1147,GRSA,Great Sand Dunes,Bird,Passeriformes,Aegithalidae,Psaltriparus minimus,Common Bushtit,Bushtit,Approved,Present,Native,Uncommon,Breeder,
44759,GRSA-1162,GRSA,Great Sand Dunes,Bird,Passeriformes,Corvidae,Corvus brachyrhynchos,Common Crow,American Crow,Approved,Present,Native,Rare,Resident,
44818,GRSA-1221,GRSA,Great Sand Dunes,Bird,Passeriformes,Mimidae,Dumetella carolinensis,Gray Catbird,Catbird,Approved,Present,Native,Rare,Resident,
44859,GRSA-1262,GRSA,Great Sand Dunes,Bird,Passeriformes,Turdidae,Turdus migratorius,American Robin,Robin,Approved,Present,Native,Common,Breeder,


In [453]:
print(birds.abundance.unique())

['Uncommon' 'Common' 'Occasional' nan 'Rare' 'Unknown' 'Abundant' 'Native']


In [454]:
# Count unique values where 'abundance' is 'Native'
unique_native_abundance = birds[birds['abundance'] == 'Native']['species_id'].nunique()
print(f"Unique records where abundance is 'Native': {unique_native_abundance}")

Unique records where abundance is 'Native': 8


In [455]:
print(birds.nativeness.unique())

['Native' 'Unknown' 'Not Native' nan 'Present']


In [456]:
# Count unique values where 'nativeness' is 'Present'
unique_present_nativeness = birds[birds['nativeness'] == 'Present']['species_id'].nunique()
print(f"Unique records where nativeness is 'Present': {unique_present_nativeness}")

Unique records where nativeness is 'Present': 8


In [457]:
print(birds.record_status.unique())

['Approved' 'In Review' ' Northern Goshawk' ' Northern Pintail'
 ' Common Poorwill' ' Pigeon Hawk' ' Bushtit' ' American Crow' ' Catbird'
 ' Robin']


#### Discrepancies in these records seem to be the result of human error
We need to remove the common_name values in `record_status` and shift all columns to the right, one column left

In [458]:
# indices that need correction are based on 'matching_df'
indices_to_shift = matching_df.index

# Define the column start position for shifting
start_pos = matching_df.columns.get_loc('record_status')

# Create a shifted version of the relevant subset of matching_df
# Shifting is done by slicing the DataFrame to remove the start position column, and appending NaN at the end
shifted_df = matching_df.loc[indices_to_shift].apply(
    lambda row: pd.Series(np.append(row[start_pos + 1:].values, pd.NA), index=row[start_pos:].index), axis=1
)

# Combine the unchanged part of the rows with the shifted part
matching_df.loc[indices_to_shift, matching_df.columns[start_pos:]] = shifted_df

# Apply the same changes to the parent and child DataFrames using indices
df.loc[indices_to_shift] = matching_df.loc[indices_to_shift]
birds.loc[indices_to_shift] = matching_df.loc[indices_to_shift]

In [459]:
columns = ['conservation_status', 'abundance', 'nativeness', 'record_status', 'occurrence', 'unnamed']
for column in columns:
    print(f"{column.title()}:\n{birds[column].unique()}")

Conservation_Status:
['Species of Concern' nan 'In Recovery' 'Threatened' 'Endangered'
 'Under Review' 'Proposed Endangered']
Abundance:
['Uncommon' 'Common' 'Occasional' nan 'Rare' 'Unknown' 'Abundant']
Nativeness:
['Native' 'Unknown' 'Not Native' nan]
Record_Status:
['Approved' 'In Review']
Occurrence:
['Present' 'Not Confirmed' 'Not Present (Historical Report)'
 'Not Present (False Report)' nan 'Not Present']
Unnamed:
[nan <NA>]


In [460]:
# Dropping unnamed
birds = birds.drop(columns=['unnamed'])

#### Dealing with `seasonality` column

In [461]:
print(birds.seasonality.unique())

[nan 'Breeder' 'Vagrant' 'Resident' 'Migratory' 'Breeder, Migratory'
 'Breeder, Summer' 'Migratory, Vagrant' 'Migratory, Winter'
 'Breeder, Winter' 'Winter' 'Resident, Winter' 'Summer, Vagrant'
 'Breeder, Resident, Summer' 'Resident, Summer' 'Summer'
 'Breeder, Resident, Migratory, Summer'
 'Breeder, Resident, Summer, Winter' 'Migratory, Summer' 'Winter, Vagrant'
 'Breeder, Resident']


In [462]:
print(f"NaN count: {birds.seasonality.isna().sum()}\n")
birds.groupby("seasonality").size()

NaN count: 3502



seasonality
Breeder                                 5005
Breeder, Migratory                         4
Breeder, Resident                          9
Breeder, Resident, Migratory, Summer       1
Breeder, Resident, Summer                  3
Breeder, Resident, Summer, Winter          1
Breeder, Summer                            2
Breeder, Winter                           64
Migratory                               2726
Migratory, Summer                          2
Migratory, Vagrant                         4
Migratory, Winter                         23
Resident                                1632
Resident, Summer                           2
Resident, Winter                           9
Summer                                    25
Summer, Vagrant                            3
Vagrant                                 1563
Winter                                    20
Winter, Vagrant                            1
dtype: int64

There's not enough records to warrant this variation. Change to `Breeder`, `Migratory`, `Unknown`

In [463]:
# Fill NaN values with 'Unknown'
birds['seasonality'] = birds['seasonality'].fillna('Unknown')

# Define keywords in the order of priority
priority_keywords = ['Winter', 'Summer', 'Breeder', 'Migratory', 'Resident', 'Vagrant']

def simplify_seasonality(value):
    for keyword in priority_keywords:
        if keyword in value:
            return keyword
    return 'Unknown'

# Apply the classification function to simplify seasonality to one word
birds['seasonality'] = birds['seasonality'].apply(simplify_seasonality)

# Display the updated counts for verification
seasonality_counts = birds['seasonality'].value_counts()
print(seasonality_counts)

seasonality
Breeder      5018
Unknown      3502
Migratory    2730
Resident     1632
Vagrant      1563
Winter        118
Summer         38
Name: count, dtype: int64


----
&nbsp;
### Dealing with `NaN` values and conversion to categorical columns

#### `conservation_status` as per [IUCN](https://en.wikipedia.org/wiki/IUCN_Red_List) Red List
We fill nan with `Least Concern` meaning 'non-protected' where all other values mean 'protected'

In [464]:
# Define the fill values for each column
fill_values = {
    'conservation_status': 'Least Concern',
    'abundance': 'Unknown',
    'nativeness': 'Unknown',
    'occurrence': 'Not Confirmed'
}

# Fill NaN values in the specified columns
birds['conservation_status'] = birds['conservation_status'].fillna(fill_values['conservation_status'])
birds['abundance'] = birds['abundance'].fillna(fill_values['abundance'])
birds['nativeness'] = birds['nativeness'].fillna(fill_values['nativeness'])
birds['occurrence'] = birds['occurrence'].fillna(fill_values['occurrence'])

In [465]:
# Define the ordered categories for each column
conservation_status_order = ['Least Concern', 'Species of Concern', 'In Recovery', 'Under Review', 'Threatened', 'Proposed Endangered', 'Endangered']
abundance_order = ['Rare', 'Uncommon', 'Unknown', 'Occasional', 'Common', 'Abundant']
nativeness_order = ['Not Native', 'Unknown', 'Native']
record_status_order = ['In Review', 'Approved']
occurrence_order = ['Not Present (False Report)', 'Not Present (Historical Report)', 'Not Present', 'Not Confirmed', 'Present']

In [466]:
# Covert columns to categorical
birds['record_status'] = pd.Categorical(birds['record_status'], categories=record_status_order, ordered=True)
birds['occurrence'] = pd.Categorical(birds['occurrence'], categories=occurrence_order, ordered=True)
birds['nativeness'] = pd.Categorical(birds['nativeness'], categories=nativeness_order, ordered=True)
birds['abundance'] = pd.Categorical(birds['abundance'], categories=abundance_order, ordered=True)
birds['conservation_status'] = pd.Categorical(birds['conservation_status'], categories=conservation_status_order, ordered=True)

# add boolean column 'is_protected'
birds['is_protected'] = birds.conservation_status != 'Least Concern'

# Dropping category as all birds
birds = birds.drop(columns=['category'])

In [467]:
# Display the DataFrame to confirm the changes
print(birds.dtypes)  # Check that columns are categorical
birds.head()  # Check the first few rows of the DataFrame

species_id               object
park_code                object
park_name                object
order                    object
family                   object
scientific_name          object
common_names             object
record_status          category
occurrence             category
nativeness             category
abundance              category
seasonality              object
conservation_status    category
is_protected               bool
dtype: object


Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,Least Concern,False
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False


----
&nbsp;
## Classifying Species Records
Isolating birds of prey

In [468]:
print(f"There are {birds.common_names.nunique()} unique common names")
print(f"There are {birds.scientific_name.nunique()} unique scientific names")
print(f"There are {birds.family.nunique()} unique families")

There are 1550 unique common names
There are 1436 unique scientific names
There are 86 unique families


In [469]:
# This dataset is well formatted with zero lower case values
lowercase_count = (birds.map(lambda x: isinstance(x, str) and x.islower())).sum().sum()
print(lowercase_count)

0


We create a list of common birds of prey to search `common_name`

In [470]:
print(f"NaN count: {birds.common_names.isna().sum()}\n")

NaN count: 280



In [471]:
# Define the list of bird of prey keywords
birds_of_prey = ["Eagle", "Hawk", "Falcon", "Buzzard", "Harrier", "Kite", "Owl", "Osprey", 
                 "Vulture", "Condor", "Kestrel", 'Buteos', 'Accipiters', 'Caracara']

# Create a regex pattern from the list
pattern = '|'.join(birds_of_prey)

# Function to find and append only the matching keywords
def find_raptors(common_names):
    # Convert to string and handle NaN values
    if pd.isna(common_names):
        return ''
    # Find keywords that are present in the common_names
    matches = set()  # Use a set to avoid duplicates
    for keyword in birds_of_prey:
        if keyword in common_names:
            matches.add(keyword)
    return ', '.join(matches)

# Apply the function to the common_names column
birds['raptor_common'] = birds['common_names'].apply(find_raptors)

# Display the first few rows to verify
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_common
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,Least Concern,False,
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk


In [472]:
total_raptors_comm = birds['raptor_common'].loc[birds['raptor_common'] != ''].count()
print(f"Under common name:\nWe have a total of {total_raptors_comm} Raptors\n")
diff_com_types = birds.raptor_common.unique().tolist()
print(diff_com_types)

Under common name:
We have a total of 1331 Raptors

['Hawk', '', 'Eagle', 'Hawk, Harrier', 'Vulture', 'Osprey, Hawk', 'Hawk, Falcon', 'Hawk, Kestrel', 'Owl', 'Hawk, Owl', 'Harrier', 'Condor', 'Osprey', 'Falcon', 'Kestrel', 'Kite', 'Caracara', 'Hawk, Buzzard']


There's some ambiguity in the above list so we cross-reference `family`

#### Birds of Prey Scientific Families and Genera

According to OpenAI's language model ChatGPT 4o (2024):

- Accipitridae (Hawks, Eagles, and relatives)
- Falconidae (Falcons)
- Harpagiidae (Harriers)
- Pandionidae (Ospreys)
- Accipitridae (Kites)
- Cathartidae (New World Vultures)
- Buteo (Buzzards and Buteos)
- Accipiter (Goshawks and Accipiters)
- Tytonidae (Barn Owls)
- Strigidae (Typical Owls)

*Caveat emptor*: This list may not be comprehensive.

In [473]:
print(f"NaN count: {birds.family.isna().sum()}\n")

NaN count: 26



In [474]:
# Define the list of bird of prey scientific families and genera
birds_of_prey_sci = [
    "Accipitridae", "Falconidae", "Harpagiidae", 
    "Pandionidae", "Cathartidae", "Buteo", "Accipiter", 
    "Tytonidae", "Strigidae"
]

# Create a regex pattern from the list
pattern = '|'.join(birds_of_prey_sci)

# Find and join matching scientific families and genera in the 'family' column
birds['raptor_sci_fam'] = birds['family'].str.findall(f'({pattern})')

# Convert the lists of matches into a comma-separated string, handling non-iterables safely
birds['raptor_sci_fam'] = birds['raptor_sci_fam'].apply(lambda x: ', '.join(x) if isinstance(x, list) and x else '')

# Display the first few rows to verify
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_common,raptor_sci_fam
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk,Accipitridae
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,Least Concern,False,,Accipitridae
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk,Accipitridae
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle,Accipitridae
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk,Accipitridae


In [475]:
total_raptors_sci = birds['raptor_sci_fam'].loc[birds['raptor_sci_fam'] != ''].count()
print(f"Under scientific family:\nWe have a total of {total_raptors_sci} Raptors\n")
diff_sci_types = birds.raptor_sci_fam.unique().tolist()
print(diff_sci_types)

Under scientific family:
We have a total of 1479 Raptors

['Accipitridae', 'Cathartidae', 'Pandionidae', '', 'Falconidae', 'Strigidae', 'Tytonidae']


In [476]:
# Declare a new dataframe to analyse the discrepancies
raptors_df = (birds[(birds.raptor_common != '')
                                | (birds.raptor_sci_fam != '')])
print(raptors_df.shape)

(1485, 16)


In [477]:
# We work with a copy of the original DataFrame and not a view of it
raptors_df = raptors_df.copy()
mask = raptors_df['raptor_common'] == ''
raptors_df.loc[mask, 'ambiguous'] = True
result = raptors_df[raptors_df['ambiguous'] == True]
result

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_common,raptor_sci_fam,ambiguous
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,Least Concern,False,,Accipitridae,True
187,ACAD-1187,ACAD,Acadia,Falconiformes,Falconidae,Falco rusticolus,"American Gyrfalcon, Gyrfalcon, White Gyrfalcon",Approved,Present,Not Native,Unknown,Vagrant,Least Concern,False,,Falconidae,True
1769,ARCH-1060,ARCH,Arches,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Rare,Migratory,Least Concern,False,,Accipitridae,True
1816,ARCH-1107,ARCH,Arches,Falconiformes,Falconidae,Falco columbarius,Merlin,Approved,Present,Native,Occasional,Migratory,Species of Concern,True,,Falconidae,True
2822,BADL-1065,BADL,Badlands,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Rare,Migratory,Least Concern,False,,Accipitridae,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111590,YELL-1192,YELL,Yellowstone,Falconiformes,Falconidae,Falco columbarius,Merlin,Approved,Present,Native,Rare,Migratory,Species of Concern,True,,Falconidae,True
115453,YOSE-1089,YOSE,Yosemite,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,,Accipitridae,True
115521,YOSE-1157,YOSE,Yosemite,Falconiformes,Falconidae,Falco columbarius,Merlin,Approved,Present,Native,Rare,Breeder,Species of Concern,True,,Falconidae,True
117534,ZION-1082,ZION,Zion,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Rare,Breeder,Least Concern,False,,Accipitridae,True


*Accipiter gentilis* is `Northern Goshawk`, a type of `Hawk`

*Falco columbarius* is `Merlin`, a type of `Falcon`

In [478]:
# Update values for 'Accipiter gentilis'
birds.loc[birds['scientific_name'] == 'Accipiter gentilis', 
          ['common_names', 'raptor_common']] = ['Northern Goshawk', 'Hawk']

# Update values for 'Falco columbarius'
birds.loc[birds['scientific_name'] == 'Falco columbarius', 
          ['common_names', 'raptor_common']] = ['Merlin', 'Falcon']

# Update values for 'Falco rusticolus'
birds.loc[birds['scientific_name'] == 'Falco rusticolus', 
          ['common_names', 'raptor_common']] = ['Gyrfalcon', 'Falcon']

# Update values for 'Falco columbarius columbarius'
birds.loc[birds['scientific_name'] == 'Falco columbarius columbarius', 
          ['common_names', 'raptor_common']] = ['Merlin (Tundra Subspecies)', 'Falcon']

# Update values for 'Falco columbarius suckleyi'
birds.loc[birds['scientific_name'] == 'Falco columbarius suckleyi', 
          ['common_names', 'raptor_common']] = ['Merlin (Coastal Forest Subspecies)', 'Falcon']


In [479]:
raptors_df = (birds[(birds.raptor_common != '')
                                | (birds.raptor_sci_fam != '')])

raptors_df = raptors_df.copy()
mask = raptors_df['raptor_common'] == ''
raptors_df.loc[mask, 'ambiguous'] = True
result = raptors_df[raptors_df['ambiguous'] == True]
result

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_common,raptor_sci_fam,ambiguous
26308,DEVA-1223,DEVA,Death Valley,Accipitriformes,Accipitridae,Buteo lineatus elegans,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Accipitridae,True
26320,DEVA-1235,DEVA,Death Valley,Accipitriformes,Cathartidae,Pseudogryphus californianus,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Cathartidae,True
26456,DEVA-1371,DEVA,Death Valley,Falconiformes,Falconidae,Falco columbarius richardsonii,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Falconidae,True
26808,DEVA-1723,DEVA,Death Valley,Strigiformes,Strigidae,Athene cunicularia hypugaea,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Species of Concern,True,,Strigidae,True
26814,DEVA-1729,DEVA,Death Valley,Strigiformes,Strigidae,Megascops asio bendirei,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Strigidae,True
26817,DEVA-1732,DEVA,Death Valley,Strigiformes,Strigidae,Otus asio inyoensis,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Strigidae,True
26820,DEVA-1735,DEVA,Death Valley,Strigiformes,Strigidae,Syrnium occidentale,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Strigidae,True
39752,GRBA-1430,GRBA,Great Basin,Strigiformes,Strigidae,Speotyto cunicularia,,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Strigidae,True
61880,HOSP-1054,HOSP,Hot Springs,Accipitriformes,Accipitridae,Accipiter striatus velox,,Approved,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Accipitridae,True
61883,HOSP-1057,HOSP,Hot Springs,Accipitriformes,Accipitridae,Buteo jamaicensis borealis,,Approved,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Accipitridae,True


In [480]:
print(f"Shape of birds: {birds.shape}")

Shape of birds: (14601, 16)


In [481]:
buteos = birds[(birds.common_names == 'Buteo') | (birds.scientific_name == "Buteo")]
buteos

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_common,raptor_sci_fam
87805,REDW-1163,REDW,Redwood,Accipitriformes,Accipitridae,Buteo,Buteonine Hawks,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,Hawk,Accipitridae
105660,THRO-1071,THRO,Theodore Roosevelt,Accipitriformes,Accipitridae,Buteo,Buteo,In Review,Not Confirmed,Unknown,Unknown,Unknown,Least Concern,False,,Accipitridae


In [482]:
# Count the number of NaN values in each column
nan_counts_per_column = birds.isna().sum()
print(nan_counts_per_column)

species_id               0
park_code                0
park_name                0
order                    1
family                  26
scientific_name          0
common_names           277
record_status            0
occurrence               0
nativeness               0
abundance                0
seasonality              0
conservation_status      0
is_protected             0
raptor_common            0
raptor_sci_fam           0
dtype: int64


----
&nbsp;
## Create a set of `scientific_name` and cross-ref in birds where `common_name` != nan

In [483]:
# Step 1: Identify rows with NaN in 'common_names' and extract unique scientific names
scientific_names_with_nan = set(birds.loc[birds['common_names'].isna(), 'scientific_name'])

# Step 2: Cross-reference to find matching 'common_names' for these scientific names
# Create a dictionary mapping scientific names to common names where 'common_names' is not NaN
common_name_mapping = birds.dropna(subset=['common_names']).set_index('scientific_name')['common_names'].to_dict()

In [484]:
common_name_mapping

{'Accipiter cooperii': "Cooper's Hawk",
 'Accipiter gentilis': 'Northern Goshawk',
 'Accipiter striatus': 'Sharp-Shinned Hawk',
 'Aquila chrysaetos': 'Golden Eagle',
 'Buteo jamaicensis': 'Red-Tailed Hawk',
 'Buteo lagopus': 'Rough-Legged Hawk',
 'Buteo lineatus': 'Red-Shouldered Hawk',
 'Buteo platypterus': 'Broad-Winged Hawk',
 'Buteo swainsoni': "Swainson's Hawk",
 'Circus cyaneus': 'Northern Harrier',
 'Haliaeetus leucocephalus': 'Bald Eagle',
 'Cathartes aura': 'Turkey Vulture',
 'Pandion haliaetus': 'Osprey',
 'Aix sponsa': 'Wood Duck',
 'Anas acuta': 'Northern Pintail',
 'Anas americana': 'American Wigeon',
 'Anas clypeata': 'Northern Shoveler',
 'Anas crecca': 'Green-Winged Teal',
 'Anas discors': 'Blue-Winged Teal',
 'Anas penelope': 'Eurasian Wigeon',
 'Anas platyrhynchos': 'Mallard',
 'Anas rubripes': 'American Black Duck',
 'Anas strepera': 'Gadwall',
 'Anser albifrons': 'Greater White-Fronted Goose',
 'Aythya affinis': 'Lesser Scaup',
 'Aythya americana': 'Redhead',
 'Ayth

In [496]:
# Define the keys of interest
keys_of_interest = [
    'Accipiter cooperii', 'Accipiter gentilis', 'Accipiter striatus', 
    'Aquila chrysaetos', 'Buteo jamaicensis', 'Buteo lagopus', 
    'Buteo lineatus', 'Buteo platypterus', 'Buteo swainsoni', 
    'Circus cyaneus', 'Haliaeetus leucocephalus', 'Cathartes aura', 
    'Pandion haliaetus'
]

# Create a subset dictionary using dictionary comprehension
subset_common_name_mapping = {k: common_name_mapping[k] for k in keys_of_interest if k in common_name_mapping}

In [493]:
# Filter the birds DataFrame to include only records where 'scientific_name' is in keys_of_interest
subset_birds = birds[birds['scientific_name'].isin(keys_of_interest)]
subset_birds

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_common,raptor_sci_fam
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk,Accipitridae
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,Hawk,Accipitridae
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk,Accipitridae
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle,Accipitridae
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk,Accipitridae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117543,ZION-1091,ZION,Zion,Accipitriformes,Accipitridae,Buteo swainsoni,Swainson's Hawk,Approved,Present,Native,Rare,Migratory,Least Concern,False,Hawk,Accipitridae
117545,ZION-1093,ZION,Zion,Accipitriformes,Accipitridae,Circus cyaneus,Northern Harrier,Approved,Present,Native,Uncommon,Migratory,Species of Concern,True,Harrier,Accipitridae
117546,ZION-1094,ZION,Zion,Accipitriformes,Accipitridae,Haliaeetus leucocephalus,Bald Eagle,Approved,Present,Native,Uncommon,Resident,In Recovery,True,Eagle,Accipitridae
117547,ZION-1095,ZION,Zion,Accipitriformes,Cathartidae,Cathartes aura,Turkey Vulture,Approved,Present,Native,Common,Breeder,Species of Concern,True,Vulture,Cathartidae


In [497]:
# Initialize empty lists to collect matched and unmatched indices
matched_indices = []
unmatched_indices = []

# Iterate over the dictionary to check values in the birds DataFrame
for sci_name, common_name in subset_common_name_mapping.items():
    # Find rows where scientific_name matches the key
    matching_rows = birds[birds['scientific_name'] == sci_name]
    
    # Separate matched and unmatched based on the common_names
    matched_rows = matching_rows[matching_rows['common_names'] == common_name]
    unmatched_rows = matching_rows[matching_rows['common_names'] != common_name]
    
    # Append indices to respective lists
    matched_indices.extend(matched_rows.index)
    unmatched_indices.extend(unmatched_rows.index)

# Create the matched and unmatched DataFrames
matched_df = birds.loc[matched_indices]
unmatched_df = birds.loc[unmatched_indices]

In [498]:
matched_df

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_common,raptor_sci_fam
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Hawk,Accipitridae
1768,ARCH-1059,ARCH,Arches,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Breeder,Species of Concern,True,Hawk,Accipitridae
2821,BADL-1064,BADL,Badlands,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Migratory,Species of Concern,True,Hawk,Accipitridae
4235,BIBE-1089,BIBE,Big Bend,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk,Accipitridae
6443,BISC-1028,BISC,Biscayne,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,Migratory,Species of Concern,True,Hawk,Accipitridae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106827,VOYA-1073,VOYA,Voyageurs,Accipitriformes,Pandionidae,Pandion haliaetus,Osprey,Approved,Present,Native,Common,Breeder,Species of Concern,True,Osprey,Pandionidae
108309,WICA-1102,WICA,Wind Cave,Accipitriformes,Pandionidae,Pandion haliaetus,Osprey,Approved,Present,Native,Occasional,Migratory,Species of Concern,True,Osprey,Pandionidae
109672,WRST-1070,WRST,Wrangell - St Elias,Accipitriformes,Pandionidae,Pandion haliaetus,Osprey,Approved,Present,Native,Unknown,Breeder,Species of Concern,True,Osprey,Pandionidae
115466,YOSE-1102,YOSE,Yosemite,Accipitriformes,Pandionidae,Pandion haliaetus,Osprey,Approved,Present,Native,Rare,Migratory,Species of Concern,True,Osprey,Pandionidae


In [499]:
unmatched_df

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_common,raptor_sci_fam
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,True,Hawk,Accipitridae
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Eagle,Accipitridae
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,Least Concern,False,Hawk,Accipitridae
60,ACAD-1060,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo lagopus,"American Rough-Legged Hawk, Rough-Legged Hawk",Approved,Present,Native,Uncommon,Resident,Species of Concern,True,Hawk,Accipitridae
52251,GRTE-1079,GRTE,Grand Teton,Accipitriformes,Accipitridae,Buteo lagopus,"Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk",Approved,Present,Native,Uncommon,Migratory,Species of Concern,True,"Hawk, Buzzard",Accipitridae
61884,HOSP-1058,HOSP,Hot Springs,Accipitriformes,Accipitridae,Buteo lagopus,"Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk",Approved,Not Present (False Report),Unknown,Unknown,Unknown,Species of Concern,True,"Hawk, Buzzard",Accipitridae
72887,LAVO-1106,LAVO,Lassen Volcanic,Accipitriformes,Accipitridae,Buteo lagopus,"Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk",Approved,Not Confirmed,Native,Unknown,Unknown,Species of Concern,True,"Hawk, Buzzard",Accipitridae
74637,MACA-1059,MACA,Mammoth Cave,Accipitriformes,Accipitridae,Buteo lagopus,"Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk",Approved,Present,Native,Unknown,Unknown,Species of Concern,True,"Hawk, Buzzard",Accipitridae
77168,MEVE-1091,MEVE,Mesa Verde,Accipitriformes,Accipitridae,Buteo lagopus,"Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk",Approved,Present,Native,Rare,Resident,Species of Concern,True,"Hawk, Buzzard",Accipitridae
80147,NOCA-1084,NOCA,North Cascades,Accipitriformes,Accipitridae,Buteo lagopus,"Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk",Approved,Present,Native,Rare,Migratory,Species of Concern,True,"Hawk, Buzzard",Accipitridae


In [489]:
# Step 2a: Cross-reference to verify exact matches on both 'scientific_name' and 'common_names'
verified_dict = {}
for sci_name in scientific_names_with_nan:
    # Check for exact matches across the DataFrame
    matches = birds[(birds['scientific_name'] == sci_name) & 
                    (birds['common_names'].notna()) & 
                    (birds['scientific_name'].map(common_name_mapping) == birds['common_names'])]
    
    # If verified matches exist, add to the verified_dict
    if not matches.empty:
        verified_dict[sci_name] = common_name_mapping[sci_name]

In [490]:
verified_dict

{'Chilidonias niger': 'Black Tern',
 'Iridoprocne bicolor': 'Tree Swallow',
 'Dendroica coronata auduboni': "Audubon's Warbler",
 'Regulus satrapa olivaceus': 'Golden-Crowned Kinglet',
 'Colaptes auratus collaris': 'Red-Shafted Flicker',
 'Junco hyemalis aikeni': 'White Winged Junco',
 'Contopus sordidulus saturatus': 'Western Wood-Pewee',
 'Fulica americana americana': 'American Coot',
 'Tringa solitaria cinnamomea': 'Western Solitary Sandpiper',
 'Falco peregrinus pealei': "Peale's Peregrine Falcon",
 'Pipilo maculatus montanus': 'Spotted Towhee',
 'Onychoprion aleuticus': 'Aleutian Tern',
 'Falco sparverius sparverius': 'American Kestrel',
 'Zonotrichia leucophrys gambelii': "Gambel's White-Crowned Sparrow",
 'Leucosticte tephrocotis littoralis': 'Gray-Crowned Rosy Finch',
 'Dendroica coronata coronata': 'Myrtle Warbler',
 'Myadestes townsendi townsendi': "Towsend's Solitaire",
 'Phalaenoptilus nuttallii californicus': 'Poor-Will',
 'Buteo jamaicensis harlani': "Harlan's Hawk",
 'Ve

In [None]:
# # Step 3: Fill NaN values in 'common_names' using the verified matches
# birds['common_names'] = birds.apply(
#     lambda row: verified_dict.get(row['scientific_name'], row['common_names']) 
#     if pd.isna(row['common_names']) else row['common_names'], axis=1
# )