# Biodiversity in U.S. National Parks

## North American Birds of Prey

In [550]:
# Importing the necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import plotly.express as px
import seaborn as sns
import numpy as np

## Loading in the data

# `parks.csv`

In [551]:
parks_data = pd.read_csv("csv_files/parks.csv")
print(f"Columns: {parks_data.columns.tolist()}\n")
print(f"(Rows, Columns): {parks_data.shape}")
parks_data.head()

Columns: ['Park Code', 'Park Name', 'State', 'Acres', 'Latitude', 'Longitude']

(Rows, Columns): (56, 6)


Unnamed: 0,Park Code,Park Name,State,Acres,Latitude,Longitude
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ARCH,Arches National Park,UT,76519,38.68,-109.57
2,BADL,Badlands National Park,SD,242756,43.75,-102.5
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08


In [552]:
# convert column to lowercase, replace spaces
parks_data.columns = [col.lower().replace(" ", "_") for col in parks_data.columns]

#### Converting `acres` to 'hectares'

In [553]:
# 1 acre = 0.404686 hectares
# define function
def acres_to_hectares(acres):
    return round(acres * 0.404686, 2)

In [554]:
# apply the conversion function to the 'acres' column
parks_data['hectares'] = parks_data['acres'].apply(acres_to_hectares)
parks_data = parks_data.drop('acres', axis=1)

parks_data.head()

Unnamed: 0,park_code,park_name,state,latitude,longitude,hectares
0,ACAD,Acadia National Park,ME,44.35,-68.21,19178.07
1,ARCH,Arches National Park,UT,38.68,-109.57,30966.17
2,BADL,Badlands National Park,SD,43.75,-102.5,98239.95
3,BIBE,Big Bend National Park,TX,29.25,-103.25,324219.45
4,BISC,Biscayne National Park,FL,25.65,-80.08,69979.92


#### Creating a list of the Southwestern states.

Arizona, California, Colorado, Nevada, New Mexico, Oklahoma, Texas, Utah

In [555]:
# Return DF where 'state' has more than one value. Define function,
def has_two_values(string):
    return len(string.split(',')) > 1
more_states = parks_data[parks_data['state'].apply(has_two_values)]
more_states

Unnamed: 0,park_code,park_name,state,latitude,longitude,hectares
15,DEVA,Death Valley National Park,"CA, NV",36.24,-116.82,1918580.71
24,GRSM,Great Smoky Mountains National Park,"TN, NC",35.68,-83.53,211039.7
53,YELL,Yellowstone National Park,"WY, MT, ID",44.6,-110.5,898318.34


In [556]:
# Change 'Death Valley' to CA as primarily in California
parks_data.loc[15, 'state'] = 'CA'

In [557]:
sw_states_list = ["AZ", "CA", "CO", "NV", "NM", "OK", "TX", "UT"]

In [558]:
sw_data = pd.DataFrame(columns=parks_data.columns)
# iterate through the list of states
for state in sw_states_list:
    # is state in column, 'State'?
    match = parks_data[parks_data['state'] == state]
    # Append matching states
    sw_data = pd.concat([sw_data, match])

sw_data

Unnamed: 0,park_code,park_name,state,latitude,longitude,hectares
22,GRCA,Grand Canyon National Park,AZ,36.06,-112.14,492665.95
42,PEFO,Petrified Forest National Park,AZ,35.07,-109.78,37851.5
46,SAGU,Saguaro National Park,AZ,32.25,-110.5,37004.49
10,CHIS,Channel Islands National Park,CA,34.01,-119.42,100993.84
15,DEVA,Death Valley National Park,CA,36.24,-116.82,1918580.71
31,JOTR,Joshua Tree National Park,CA,33.79,-115.9,319598.75
36,LAVO,Lassen Volcanic National Park,CA,40.49,-121.51,43047.26
43,PINN,Pinnacles National Park,CA,36.48,-121.16,10767.08
44,REDW,Redwood National Park,CA,41.3,-124.0,45532.03
47,SEKI,Sequoia and Kings Canyon National Parks,CA,36.43,-118.68,350438.65


In [559]:
sw_parks_list = sw_data['park_name'].unique().tolist()
print(f"We have {len(sw_parks_list)} parks: \n\n{sw_parks_list}")

We have 24 parks: 

['Grand Canyon National Park', 'Petrified Forest National Park', 'Saguaro National Park', 'Channel Islands National Park', 'Death Valley National Park', 'Joshua Tree National Park', 'Lassen Volcanic National Park', 'Pinnacles National Park', 'Redwood National Park', 'Sequoia and Kings Canyon National Parks', 'Yosemite National Park', 'Black Canyon of the Gunnison National Park', 'Great Sand Dunes National Park and Preserve', 'Mesa Verde National Park', 'Rocky Mountain National Park', 'Great Basin National Park', 'Carlsbad Caverns National Park', 'Big Bend National Park', 'Guadalupe Mountains National Park', 'Arches National Park', 'Bryce Canyon National Park', 'Canyonlands National Park', 'Capitol Reef National Park', 'Zion National Park']


### On a map to get an idea of where these parks are

<img src="Images/Parks.png"/>

[GoogleEarth](https://earth.google.com/web/@35.23163888,-113.62762396,1125.39304439a,3255718.49710107d,30.00077622y,0.00000085h,0t,0r)

---

# `species.csv`

In [560]:
species_data = pd.read_csv('csv_files/species.csv', low_memory=False)
print(f"(Rows, Columns): {species_data.shape}\n")
print(f"Type of data:\n\n{species_data.dtypes}\n")
print(f"The number of unique values:\n\n{species_data.nunique()}")
species_data.head()

(Rows, Columns): (119248, 14)

Type of data:

Species ID             object
Park Name              object
Category               object
Order                  object
Family                 object
Scientific Name        object
Common Names           object
Record Status          object
Occurrence             object
Nativeness             object
Abundance              object
Seasonality            object
Conservation Status    object
Unnamed: 13            object
dtype: object

The number of unique values:

Species ID             119248
Park Name                  56
Category                   14
Order                     554
Family                   2332
Scientific Name         46022
Common Names            35826
Record Status              54
Occurrence                  7
Nativeness                  5
Abundance                   8
Seasonality                24
Conservation Status        11
Unnamed: 13                 3
dtype: int64


Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


In [561]:
species_data.columns = [col.lower().replace(" ", "_") for col in species_data.columns]
species_data.rename(columns={'unnamed:_13': 'unnamed'}, inplace=True)
print(f"Columns: {species_data.columns.tolist()}\n")

Columns: ['species_id', 'park_name', 'category', 'order', 'family', 'scientific_name', 'common_names', 'record_status', 'occurrence', 'nativeness', 'abundance', 'seasonality', 'conservation_status', 'unnamed']


### Isolate Birds

In [562]:
bird_species = species_data[species_data.category == 'Bird']
bird_species.head()

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
55,ACAD-1055,Acadia National Park,Bird,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Uncommon,,Species of Concern,
56,ACAD-1056,Acadia National Park,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,"Eastern Goshawk, Goshawk, Northern Goshawk",Approved,Present,Native,Uncommon,Breeder,,
57,ACAD-1057,Acadia National Park,Bird,Accipitriformes,Accipitridae,Accipiter striatus,"Northern Sharp-Shinned Hawk, Sharp-Shinned Hawk",Approved,Present,Native,Common,Breeder,Species of Concern,
58,ACAD-1058,Acadia National Park,Bird,Accipitriformes,Accipitridae,Aquila chrysaetos,"American Golden Eagle, Golden Eagle",Approved,Present,Native,Occasional,Vagrant,Species of Concern,
59,ACAD-1059,Acadia National Park,Bird,Accipitriformes,Accipitridae,Buteo jamaicensis,"Eastern Red-Tailed Hawk, Red-Tailed Hawk",Approved,Present,Native,Common,Breeder,,


In [563]:
print(f"(Rows, Columns): {bird_species.shape}\n")
print(f"The number of unique values:\n\n{bird_species.nunique()}")

(Rows, Columns): (14601, 14)

The number of unique values:

species_id             14601
park_name                 56
category                   1
order                     24
family                    86
scientific_name         1436
common_names            1551
record_status             10
occurrence                 6
nativeness                 4
abundance                  7
seasonality               23
conservation_status        8
unnamed                    1
dtype: int64


### Isolate Southwest States By Park Name

In [564]:
print(sw_parks_list)

['Grand Canyon National Park', 'Petrified Forest National Park', 'Saguaro National Park', 'Channel Islands National Park', 'Death Valley National Park', 'Joshua Tree National Park', 'Lassen Volcanic National Park', 'Pinnacles National Park', 'Redwood National Park', 'Sequoia and Kings Canyon National Parks', 'Yosemite National Park', 'Black Canyon of the Gunnison National Park', 'Great Sand Dunes National Park and Preserve', 'Mesa Verde National Park', 'Rocky Mountain National Park', 'Great Basin National Park', 'Carlsbad Caverns National Park', 'Big Bend National Park', 'Guadalupe Mountains National Park', 'Arches National Park', 'Bryce Canyon National Park', 'Canyonlands National Park', 'Capitol Reef National Park', 'Zion National Park']


In [565]:
sw_birds = pd.DataFrame(columns=bird_species.columns)
# iterate through the list of parks
for park in sw_parks_list:
    # is park in column, 'park_name'?
    match = bird_species[bird_species['park_name'] == park]
    # Append matching states
    sw_birds = pd.concat([sw_birds, match])

print(f"Shape Of Data: {sw_birds.shape}")
sw_birds.head()

Shape Of Data: (7045, 14)


Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
42081,GRCA-1106,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,
42082,GRCA-1107,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Approved,Present,Native,Common,Breeder,,
42083,GRCA-1108,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,
42084,GRCA-1109,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Approved,Present,Native,Uncommon,Breeder,Species of Concern,
42085,GRCA-1110,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Approved,Present,Native,Common,Breeder,,


#### Exploring `conservation_status` column

In [566]:
print(f"There are {sw_birds.conservation_status.nunique()} categories:\n\n{sw_birds.conservation_status.unique()}")

There are 7 categories:

['Species of Concern' nan 'In Recovery' 'Endangered' 'Threatened'
 'Under Review' 'Breeder' 'Resident']


In [567]:
print(f"NaN value count: {sw_birds.conservation_status.isna().sum()}\n")
sw_birds.groupby("conservation_status").size()

NaN value count: 5687


conservation_status
Breeder                  4
Endangered              28
In Recovery             36
Resident                 4
Species of Concern    1231
Threatened              31
Under Review            24
dtype: int64

### `conservation_status`

- `Endangered`: seriously at risk of extinction
- `In Recovery`: formerly `Endangered`, but currently neither in danger of extinction throughout all or a significant portion of its range
- `Threatened`: vulnerable to endangerment in the near future
- `Species of Concern`: declining or appear to be in need of conservation
- `Under Review`: ?
- `Resident`: ?
- `Breeder`: ?


#### Need to research [IUCN](https://en.wikipedia.org/wiki/IUCN_Red_List) Red List.
- What do we populate NaN with?

`Least Concern`? For now populate with `No Data`

In [568]:
sw_birds.fillna('No Data', inplace=True)
sw_birds.groupby('conservation_status').size()

conservation_status
Breeder                  4
Endangered              28
In Recovery             36
No Data               5687
Resident                 4
Species of Concern    1231
Threatened              31
Under Review            24
dtype: int64

#### What does 'Breeder' and 'Resident' mean in `conservation_status`?

In [569]:
sw_birds['is_breeder'] = sw_birds.conservation_status.str.contains(r"\bBreeder\b", regex=True)
sw_birds[sw_birds.is_breeder]

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed,is_breeder
44666,GRSA-1069,Great Sand Dunes National Park and Preserve,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,Goshawk,Northern Goshawk,Approved,Present,Native,Rare,Breeder,No Data,True
44705,GRSA-1108,Great Sand Dunes National Park and Preserve,Bird,Caprimulgiformes,Caprimulgidae,Phalaenoptilus nuttallii,Poor-Will,Common Poorwill,Approved,Present,Native,Uncommon,Breeder,No Data,True
44744,GRSA-1147,Great Sand Dunes National Park and Preserve,Bird,Passeriformes,Aegithalidae,Psaltriparus minimus,Common Bushtit,Bushtit,Approved,Present,Native,Uncommon,Breeder,No Data,True
44859,GRSA-1262,Great Sand Dunes National Park and Preserve,Bird,Passeriformes,Turdidae,Turdus migratorius,American Robin,Robin,Approved,Present,Native,Common,Breeder,No Data,True


In [570]:
sw_birds['is_resident'] = sw_birds.conservation_status.str.contains(r"\bResident\b", regex=True)
sw_birds[sw_birds.is_resident]

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed,is_breeder,is_resident
44678,GRSA-1081,Great Sand Dunes National Park and Preserve,Bird,Anseriformes,Anatidae,Anas acuta,Pintail,Northern Pintail,Approved,Present,Native,Rare,Resident,No Data,False,True
44733,GRSA-1136,Great Sand Dunes National Park and Preserve,Bird,Falconiformes,Falconidae,Falco columbarius,Merlin,Pigeon Hawk,Approved,Present,Native,Rare,Resident,Species of Concern,False,True
44759,GRSA-1162,Great Sand Dunes National Park and Preserve,Bird,Passeriformes,Corvidae,Corvus brachyrhynchos,Common Crow,American Crow,Approved,Present,Native,Rare,Resident,No Data,False,True
44818,GRSA-1221,Great Sand Dunes National Park and Preserve,Bird,Passeriformes,Mimidae,Dumetella carolinensis,Gray Catbird,Catbird,Approved,Present,Native,Rare,Resident,No Data,False,True


- index[44733] Pigeon Hawk - Need to change conservation status to `Species of Concern`
- Need to return all unique values in `record_status`
- Need to return all `unnamed:_13` values

In [571]:
print(f"There are {sw_birds.record_status.nunique()} categories:\n\n{sw_birds.record_status.unique()}")

There are 10 categories:

['Approved' 'In Review' ' Northern Goshawk' ' Northern Pintail'
 ' Common Poorwill' ' Pigeon Hawk' ' Bushtit' ' American Crow' ' Catbird'
 ' Robin']


#### Perhaps a person unaware of the data entry nomenclature recording `common_names` in `record_status`
#### Or... They wanted to make it easy to find these entries again in which case; Clever!

index[44733]
- `Merlin` is a Falcon - [Naming Convention](https://en.wikipedia.org/wiki/Merlin_(bird))
- Change `conservation_status` to 'Species of Concern'

In [572]:
sw_birds.at[44733, 'common_names'] = 'Merlin'
sw_birds.at[44733, 'conservation_status'] = 'Species of Concern'
sw_birds.loc[44733]

species_id                                               GRSA-1136
park_name              Great Sand Dunes National Park and Preserve
category                                                      Bird
order                                                Falconiformes
family                                                  Falconidae
scientific_name                                  Falco columbarius
common_names                                                Merlin
record_status                                          Pigeon Hawk
occurrence                                                Approved
nativeness                                                 Present
abundance                                                   Native
seasonality                                                   Rare
conservation_status                             Species of Concern
unnamed                                         Species of Concern
is_breeder                                                   F

Drop `record_status` - appears to be NPS internal message column. Drop new boolean columns

In [573]:
sw_birds.drop(columns=['is_breeder', 'is_resident', 'record_status'], axis=1, inplace=True)
sw_birds.head()

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed
42081,GRCA-1106,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Present,Native,Common,Breeder,Species of Concern,No Data
42082,GRCA-1107,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Present,Native,Common,Breeder,No Data,No Data
42083,GRCA-1108,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Present,Native,Common,Breeder,Species of Concern,No Data
42084,GRCA-1109,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Present,Native,Uncommon,Breeder,Species of Concern,No Data
42085,GRCA-1110,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Present,Native,Common,Breeder,No Data,No Data


`unnamed:_13`

In [574]:
print(f"There are {sw_birds.unnamed.nunique()} categories:\n\n{sw_birds.unnamed.unique()}")

There are 2 categories:

['No Data' 'Species of Concern']


How many entries do we have `Species of Concern`?

In [575]:
sw_birds['is_species_of_concern'] = sw_birds.unnamed.str.contains(r"\bSpecies\b", regex=True)
sw_birds[sw_birds.is_species_of_concern]

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,occurrence,nativeness,abundance,seasonality,conservation_status,unnamed,is_species_of_concern
44733,GRSA-1136,Great Sand Dunes National Park and Preserve,Bird,Falconiformes,Falconidae,Falco columbarius,Merlin,Approved,Present,Native,Rare,Species of Concern,Species of Concern,True


Great! just one which has been dealt with. Drop `unnamed`, `is_species_of_concern`

In [576]:
sw_birds.drop(columns=['unnamed', 'is_species_of_concern'], axis=1, inplace=True)

In [577]:
sw_birds['is_review'] = sw_birds.conservation_status.str.contains(r"\bReview\b", regex=True)
sw_birds.head(10)

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,occurrence,nativeness,abundance,seasonality,conservation_status,is_review
42081,GRCA-1106,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Present,Native,Common,Breeder,Species of Concern,False
42082,GRCA-1107,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Present,Native,Common,Breeder,No Data,False
42083,GRCA-1108,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Present,Native,Common,Breeder,Species of Concern,False
42084,GRCA-1109,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Present,Native,Uncommon,Breeder,Species of Concern,False
42085,GRCA-1110,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Present,Native,Common,Breeder,No Data,False
42086,GRCA-1111,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Present,Native,Common,Breeder,No Data,False
42087,GRCA-1112,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo lagopus,Rough-Legged Hawk,Present,Native,Rare,"Migratory, Winter",Species of Concern,False
42088,GRCA-1113,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo lineatus,Red-Shouldered Hawk,Present,Native,Occasional,Vagrant,Species of Concern,False
42089,GRCA-1114,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo platypterus,Broad-Winged Hawk,Present,Native,Rare,Migratory,No Data,False
42090,GRCA-1115,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo regalis,Ferruginous Hawk,Present,Native,Uncommon,"Breeder, Winter",No Data,False


In [578]:
sw_birds.drop(columns=['is_review'], axis=1, inplace=True)

## `conservation_status` decision

Will change 'No Data' to 'Least Concern' in  `conservation_status`
- Rationale; If NPS was concerned about the species' `conservation_status` then they would have assigned one
- 'Breeder': 'Least Concern'
- 'Resident': 'Least Concern'
- 'Under Review': Species could be reclassified 'Threatened' in the near future.

For our purposes, 'Least Concern' means 'non-protected'. All other values, 'protected'

In [579]:
mask = sw_birds['conservation_status'].isin(['Breeder', 'Resident', 'No Data'])
sw_birds.loc[mask, 'conservation_status'] = 'Least Concern'
sw_birds.groupby("conservation_status").size()

conservation_status
Endangered              28
In Recovery             36
Least Concern         5694
Species of Concern    1232
Threatened              31
Under Review            24
dtype: int64

#### Other Columns to analyse

---
### `occurence`

In [580]:
print(f"There are {sw_birds.occurrence.nunique()} categories:\n\n{sw_birds.occurrence.unique()}")

There are 7 categories:

['Present' 'Not Present' 'Not Present (False Report)' 'Not Confirmed'
 'Not Present (Historical Report)' 'No Data' 'Approved']


---
### `nativeness`

In [581]:
print(f"There are {sw_birds.nativeness.nunique()} categories:\n\n{sw_birds.nativeness.unique()}")

There are 5 categories:

['Native' 'Not Native' 'Unknown' 'No Data' 'Present']


---
### `abundance`

In [582]:
print(f"There are {sw_birds.abundance.nunique()} categories:\n\n{sw_birds.abundance.unique()}")

There are 8 categories:

['Common' 'Uncommon' 'Rare' 'Occasional' 'No Data' 'Abundant' 'Unknown'
 'Native']


In [583]:
print(f"NaN value count: {sw_birds.abundance.isna().sum()}\n")
sw_birds.groupby("abundance").size()

NaN value count: 0


abundance
Abundant       201
Common        1302
Native           8
No Data       1090
Occasional    1299
Rare          1147
Uncommon      1313
Unknown        685
dtype: int64

---
`seasonality`

In [584]:
print(f"There are {sw_birds.seasonality.nunique()} categories:\n\n{sw_birds.seasonality.unique()}")

There are 23 categories:

['Breeder' 'Migratory, Winter' 'Vagrant' 'Migratory' 'Breeder, Winter'
 'Resident' 'No Data' 'Winter' 'Resident, Winter' 'Summer, Vagrant'
 'Breeder, Resident, Summer' 'Resident, Summer' 'Migratory, Vagrant'
 'Summer' 'Breeder, Resident, Migratory, Summer'
 'Breeder, Resident, Summer, Winter' 'Breeder, Migratory'
 'Migratory, Summer' 'Winter, Vagrant' 'Breeder, Summer' 'Rare' 'Uncommon'
 'Common']


In [585]:
print(f"There are {sw_birds.common_names.nunique()} unique common names\n")
print(f"There are {sw_birds.scientific_name.nunique()} unique scientific names")

There are 919 unique common names

There are 1040 unique scientific names


----
## `sw_birds_extras_df`
- New data frame of above columns retaining all indexing.
- Adding `scientific_name` and `common_names` columns for reference

In [586]:
sw_birds.head()

Unnamed: 0,species_id,park_name,category,order,family,scientific_name,common_names,occurrence,nativeness,abundance,seasonality,conservation_status
42081,GRCA-1106,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Present,Native,Common,Breeder,Species of Concern
42082,GRCA-1107,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Present,Native,Common,Breeder,Least Concern
42083,GRCA-1108,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Present,Native,Common,Breeder,Species of Concern
42084,GRCA-1109,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Present,Native,Uncommon,Breeder,Species of Concern
42085,GRCA-1110,Grand Canyon National Park,Bird,Accipitriformes,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Present,Native,Common,Breeder,Least Concern


In [587]:
columns = ['scientific_name', 'common_names', 'occurrence', 'nativeness', 'abundance', 'seasonality']
sw_birds_extras_df = sw_birds.loc[:, columns]
sw_birds_extras_df.head()

Unnamed: 0,scientific_name,common_names,occurrence,nativeness,abundance,seasonality
42081,Accipiter cooperii,Cooper's Hawk,Present,Native,Common,Breeder
42082,Accipiter gentilis,Northern Goshawk,Present,Native,Common,Breeder
42083,Accipiter striatus,Sharp-Shinned Hawk,Present,Native,Common,Breeder
42084,Aquila chrysaetos,Golden Eagle,Present,Native,Uncommon,Breeder
42085,Buteo albonotatus,Zone-Tailed Hawk,Present,Native,Common,Breeder


Drop columns from `sw_birds` (Including `category`, they're all 'Birds')

In [588]:
sw_birds.drop(columns=['category', 'occurrence', 'nativeness', 'abundance', 'seasonality'], axis=1, inplace=True)

New column `protected` which will be bool $\neq$ 'Least Concern'

In [589]:
sw_birds['protected'] = sw_birds.conservation_status != 'Least Concern'
sw_birds.head()

Unnamed: 0,species_id,park_name,order,family,scientific_name,common_names,conservation_status,protected
42081,GRCA-1106,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True
42082,GRCA-1107,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False
42083,GRCA-1108,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Species of Concern,True
42084,GRCA-1109,Grand Canyon National Park,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True
42085,GRCA-1110,Grand Canyon National Park,Accipitriformes,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Least Concern,False


### For now we keep `species_id` but not sure how it's useful.

---

# Milestone

---

### Need to filter `sw_birds` for raptors.
- Using list of common 'types'
- Using list of scientific families

Lists need to be exhaustive. Research required

We create two new columns `raptor_common` and `raptor_sci_fam` appending value of lists on search of columns if value $\in$ lists.

How many bird of prey `family` are there?

In [590]:
print(f"There are {sw_birds.family.nunique()} unique families\n")

There are 75 unique families


### `raptor_common`

I've already defined a bird of prey 'type' list in the last project.

In [591]:
# This dataset is well formatted with zero lower case values
lowercase_count = (sw_birds.applymap(lambda x: isinstance(x, str) and x.islower())).sum().sum()
print(lowercase_count)

0


In [592]:
# Updated to include Caracara which was missed in the first project and found through wrangling
birds_of_prey = ["Eagle", "Hawk", "Falcon", "Buzzard", "Harrier", "Kite", "Owl", "Osprey", "Vulture", "Condor", "Kestrel", 'Buteos', 'Accipiters', 'Caracara']

In [593]:
# We do an exact match search
sw_birds['raptor_common'] = sw_birds.common_names.str.findall('({0})'.format('|'.join(birds_of_prey)))
sw_birds['raptor_common'] = sw_birds['raptor_common'].apply('. '.join)
sw_birds.head()

Unnamed: 0,species_id,park_name,order,family,scientific_name,common_names,conservation_status,protected,raptor_common
42081,GRCA-1106,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk
42082,GRCA-1107,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,
42083,GRCA-1108,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Species of Concern,True,Hawk
42084,GRCA-1109,Grand Canyon National Park,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle
42085,GRCA-1110,Grand Canyon National Park,Accipitriformes,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Least Concern,False,Hawk


In [594]:
total_raptors_comm = sw_birds['raptor_common'].loc[sw_birds['raptor_common'] != ''].count()
print(f"Under common name:\nWe have a total of {total_raptors_comm} Raptors\n")
diff_com_types = sw_birds.raptor_common.unique().tolist()
print(diff_com_types)

Under common name:
We have a total of 665 Raptors

['Hawk', '', 'Eagle', 'Hawk. Hawk', 'Harrier', 'Vulture', 'Condor', 'Osprey', 'Falcon', 'Kestrel', 'Owl', 'Buzzard. Hawk', 'Osprey. Osprey', 'Caracara. Caracara', 'Owl. Owl', 'Kite', 'Kite. Kite', 'Caracara']


Need to check 'Buzzard. Hawk' which one is it? Will become clearer after `family` search and population

### `raptor_sci`

##### So far I have been able to gather the following families;

Sourced; "According to OpenAI's language model ChatGPT(2023) [birds_of_prey_sci]"

- Accipitridae (Hawks, Eagles, and relatives)
- Falconidae (Falcons)
- Harpagiidae (Harriers)
- Pandionidae (Ospreys)
- Accipitridae (Kites)
- Cathartidae (New World Vultures)
- Buteo (Buzzards and Buteos)
- Accipiter (Goshawks and Accipiters)
- Tytonidae (Barn Owls)
- Strigidae (Typical Owls)

*Caveat emptor*, this list may not be comprehensive

In [595]:
birds_of_prey_sci = ["Accipitridae", "Falconidae", "Harpagiidae", "Pandionidae", "Cathartidae", "Buteo", "Accipiter", "Tytonidae", "Strigidae"]

In [596]:
sw_birds['raptor_sci_fam'] = sw_birds.family.str.findall('({0})'.format('|'.join(birds_of_prey_sci)))
sw_birds['raptor_sci_fam'] = sw_birds['raptor_sci_fam'].apply('. '.join)

In [597]:
total_raptors_sci = sw_birds['raptor_sci_fam'].loc[sw_birds['raptor_sci_fam'] != ''].count()
print(f"Under scientific family:\nWe have a total of {total_raptors_sci} Raptors\n")
diff_sci_types = sw_birds.raptor_sci_fam.unique().tolist()
print(diff_sci_types)

Under scientific family:
We have a total of 729 Raptors

['Accipitridae', 'Cathartidae', 'Pandionidae', '', 'Falconidae', 'Strigidae', 'Tytonidae']


The lists `diff_com_types` and `diff_sci_types` are  $\neq$

In [598]:
sw_birds.head()

Unnamed: 0,species_id,park_name,order,family,scientific_name,common_names,conservation_status,protected,raptor_common,raptor_sci_fam
42081,GRCA-1106,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk,Accipitridae
42082,GRCA-1107,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,,Accipitridae
42083,GRCA-1108,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Species of Concern,True,Hawk,Accipitridae
42084,GRCA-1109,Grand Canyon National Park,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle,Accipitridae
42085,GRCA-1110,Grand Canyon National Park,Accipitriformes,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Least Concern,False,Hawk,Accipitridae


In [599]:
raptors_df = (sw_birds[(sw_birds.raptor_common != '')
                                | (sw_birds.raptor_sci_fam != '')])
print(raptors_df.shape)

(731, 10)


In [600]:
# copy() is used to make an explicit copy of the original DataFrame. We work with a copy of the original DataFrame and not a view of it

raptors_df = raptors_df.copy()
mask = raptors_df['raptor_common'] == ''
raptors_df.loc[mask, 'ambiguous'] = True
result = raptors_df[raptors_df['ambiguous'] == True]
result

Unnamed: 0,species_id,park_name,order,family,scientific_name,common_names,conservation_status,protected,raptor_common,raptor_sci_fam,ambiguous
42082,GRCA-1107,Grand Canyon National Park,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,,Accipitridae,True
42198,GRCA-1223,Grand Canyon National Park,Falconiformes,Falconidae,Falco columbarius,Merlin,Species of Concern,True,,Falconidae,True
42199,GRCA-1224,Grand Canyon National Park,Falconiformes,Falconidae,Falco columbarius columbarius,Merlin,Least Concern,False,,Falconidae,True
85437,PEFO-1063,Petrified Forest National Park,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,,Accipitridae,True
85500,PEFO-1126,Petrified Forest National Park,Falconiformes,Falconidae,Falco columbarius,Merlin,Species of Concern,True,,Falconidae,True
...,...,...,...,...,...,...,...,...,...,...,...
10667,CANY-1134,Canyonlands National Park,Falconiformes,Falconidae,Falco columbarius,Merlin,Species of Concern,True,,Falconidae,True
11830,CARE-1074,Capitol Reef National Park,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,,Accipitridae,True
11891,CARE-1135,Capitol Reef National Park,Falconiformes,Falconidae,Falco columbarius,Merlin,Species of Concern,True,,Falconidae,True
117534,ZION-1082,Zion National Park,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,,Accipitridae,True


#### We have already discovered the 'Merlin' is a Falcon. The 'Northern Goshawk' is a [Hawk](https://en.wikipedia.org/wiki/Northern_goshawk)

- Manually assign "Hawk", "Falcon" to `raptor_common` for these birds in `sw_birds`


In [601]:
sw_birds.loc[sw_birds['common_names'] == 'Northern Goshawk', 'raptor_common'] = 'Hawk'
sw_birds.loc[sw_birds['common_names'] == 'Merlin', 'raptor_common'] = 'Falcon'

In [602]:
raptors_df_2 = sw_birds[(sw_birds.raptor_common != '') | (sw_birds.raptor_sci_fam != '')].copy()
mask = raptors_df_2['raptor_common'] == ''
raptors_df_2.loc[mask, 'ambiguous'] = True
result2 = raptors_df_2[raptors_df_2['ambiguous'] == True]
result2

Unnamed: 0,species_id,park_name,order,family,scientific_name,common_names,conservation_status,protected,raptor_common,raptor_sci_fam,ambiguous
26308,DEVA-1223,Death Valley National Park,Accipitriformes,Accipitridae,Buteo lineatus elegans,,Least Concern,False,,Accipitridae,True
26320,DEVA-1235,Death Valley National Park,Accipitriformes,Cathartidae,Pseudogryphus californianus,,Least Concern,False,,Cathartidae,True
26456,DEVA-1371,Death Valley National Park,Falconiformes,Falconidae,Falco columbarius richardsonii,,Least Concern,False,,Falconidae,True
26459,DEVA-1374,Death Valley National Park,Falconiformes,Falconidae,Falco rusticolus,Gyrfalcon,Least Concern,False,,Falconidae,True
26808,DEVA-1723,Death Valley National Park,Strigiformes,Strigidae,Athene cunicularia hypugaea,,Species of Concern,True,,Strigidae,True
26814,DEVA-1729,Death Valley National Park,Strigiformes,Strigidae,Megascops asio bendirei,,Least Concern,False,,Strigidae,True
26817,DEVA-1732,Death Valley National Park,Strigiformes,Strigidae,Otus asio inyoensis,,Least Concern,False,,Strigidae,True
26820,DEVA-1735,Death Valley National Park,Strigiformes,Strigidae,Syrnium occidentale,,Least Concern,False,,Strigidae,True
86453,PINN-1226,Pinnacles National Park,Strigiformes,Strigidae,Glaucidium californicum,,Least Concern,False,,Strigidae,True
87807,REDW-1165,Redwood National Park,Accipitriformes,Accipitridae,Buteo jamaicensis harlani,,Least Concern,False,,Accipitridae,True


- The 'Gyrfalcon' is a [Falcon](https://en.wikipedia.org/wiki/Gyrfalcon) which will be assigned manually to `raptor_common` index[99085]

Will operate in `sw_birds`

In [603]:
# Gyrfalcon
sw_birds.loc[26459, 'raptor_common'] = 'Falcon'
sw_birds.loc[99085, 'raptor_common'] = 'Falcon'

In [604]:
sw_birds.loc[26459]

species_id                              DEVA-1374
park_name              Death Valley National Park
order                               Falconiformes
family                                 Falconidae
scientific_name                  Falco rusticolus
common_names                            Gyrfalcon
conservation_status                 Least Concern
protected                                   False
raptor_common                              Falcon
raptor_sci_fam                         Falconidae
Name: 26459, dtype: object

In [605]:
sw_birds.loc[99085]

species_id                                           SEKI-1146
park_name              Sequoia and Kings Canyon National Parks
order                                            Falconiformes
family                                              Falconidae
scientific_name                               Falco rusticolus
common_names                                         Gyrfalcon
conservation_status                              Least Concern
protected                                                False
raptor_common                                           Falcon
raptor_sci_fam                                      Falconidae
Name: 99085, dtype: object

We also assign a `raptor_common` to Hawk

In [606]:
sw_birds.loc[sw_birds['common_names'] == 'Goshawk', 'raptor_common'] = 'Hawk'

How many raptors in Redwood?

In [607]:
redwood_raptors = (sw_birds[(sw_birds.park_name == "Redwood National Park")
                       & (sw_birds.raptor_common != '')])
print(redwood_raptors.shape)

(38, 10)


How many raptors in Death Valley?

In [608]:
death_vall_raptors = (sw_birds[(sw_birds.park_name == "Death Valley National Park")
                       & (sw_birds.raptor_common != '')])
print(death_vall_raptors.shape)

(40, 10)


Most non-assigned `raptor_common` $=$ "empty string" were in 'Redwood National Park' & 'Death Valley National Park'
- There's enough variety to justify deleting entries with no `common_names` in `raptor_df_2`
- Could have searched and assigned `common_names` manually but this is already a lot of work.

These are the ones we need to delete from `sw_birds`

In [609]:
raptors_df_3 = sw_birds[(sw_birds.raptor_common == '') & (sw_birds.raptor_sci_fam != '')].copy()
mask = raptors_df_3['raptor_common'] == ''
raptors_df_3.loc[mask, 'ambiguous'] = True
result3 = raptors_df_3[raptors_df_2['ambiguous'] == True]
result3

  result3 = raptors_df_3[raptors_df_2['ambiguous'] == True]


Unnamed: 0,species_id,park_name,order,family,scientific_name,common_names,conservation_status,protected,raptor_common,raptor_sci_fam,ambiguous
26308,DEVA-1223,Death Valley National Park,Accipitriformes,Accipitridae,Buteo lineatus elegans,,Least Concern,False,,Accipitridae,True
26320,DEVA-1235,Death Valley National Park,Accipitriformes,Cathartidae,Pseudogryphus californianus,,Least Concern,False,,Cathartidae,True
26456,DEVA-1371,Death Valley National Park,Falconiformes,Falconidae,Falco columbarius richardsonii,,Least Concern,False,,Falconidae,True
26808,DEVA-1723,Death Valley National Park,Strigiformes,Strigidae,Athene cunicularia hypugaea,,Species of Concern,True,,Strigidae,True
26814,DEVA-1729,Death Valley National Park,Strigiformes,Strigidae,Megascops asio bendirei,,Least Concern,False,,Strigidae,True
26817,DEVA-1732,Death Valley National Park,Strigiformes,Strigidae,Otus asio inyoensis,,Least Concern,False,,Strigidae,True
26820,DEVA-1735,Death Valley National Park,Strigiformes,Strigidae,Syrnium occidentale,,Least Concern,False,,Strigidae,True
86453,PINN-1226,Pinnacles National Park,Strigiformes,Strigidae,Glaucidium californicum,,Least Concern,False,,Strigidae,True
87807,REDW-1165,Redwood National Park,Accipitriformes,Accipitridae,Buteo jamaicensis harlani,,Least Concern,False,,Accipitridae,True
87810,REDW-1168,Redwood National Park,Accipitriformes,Accipitridae,Buteo lineatus elegans,,Least Concern,False,,Accipitridae,True


Drop the indices $\in$ `result` with conditionals from `sw_birds`. 16 to lose

In [610]:
sw_birds.shape

(7045, 10)

In [611]:
indices_to_drop = sw_birds[(sw_birds.raptor_common == '') & (sw_birds.raptor_sci_fam != '')].index
sw_birds = sw_birds.drop(index=indices_to_drop)
sw_birds.shape

(7029, 10)

----
## How many raptors do we have?

In [612]:
total_raptors_comm = sw_birds['raptor_common'].loc[sw_birds['raptor_common'] != ''].count()
print(f"Under common name:\nWe have a total of {total_raptors_comm} Raptors\n")
diff_com_types = sw_birds.raptor_common.unique().tolist()
print(diff_com_types)

Under common name:
We have a total of 715 Raptors

['Hawk', 'Eagle', 'Hawk. Hawk', 'Harrier', 'Vulture', 'Condor', 'Osprey', '', 'Falcon', 'Kestrel', 'Owl', 'Buzzard. Hawk', 'Osprey. Osprey', 'Caracara. Caracara', 'Owl. Owl', 'Kite', 'Kite. Kite', 'Caracara']


In [613]:
total_raptors_sci = sw_birds['raptor_sci_fam'].loc[sw_birds['raptor_sci_fam'] != ''].count()
print(f"Under scientific family:\nWe have a total of {total_raptors_sci} Raptors\n")
diff_sci_types = sw_birds.raptor_sci_fam.unique().tolist()
print(diff_sci_types)

Under scientific family:
We have a total of 713 Raptors

['Accipitridae', 'Cathartidae', 'Pandionidae', '', 'Falconidae', 'Strigidae', 'Tytonidae']


Where is the missing one? Some `family` empty maybe?

In [614]:
missing_fam = (sw_birds[(sw_birds.raptor_common != '')
                                & (sw_birds.family == 'No Data')])
missing_fam

Unnamed: 0,species_id,park_name,order,family,scientific_name,common_names,conservation_status,protected,raptor_common,raptor_sci_fam
26803,DEVA-1718,Death Valley National Park,Strigiformes,No Data,Psiloscops flammeolus,Flammulated Owl,Least Concern,False,Owl,
117822,ZION-1370,Zion National Park,Strigiformes,No Data,Psiloscops flammeolus,Flammulated Owl,Least Concern,False,Owl,


In [615]:
flam_owl = sw_birds[sw_birds.common_names == 'Flammulated Owl']
flam_owl.head(10)

Unnamed: 0,species_id,park_name,order,family,scientific_name,common_names,conservation_status,protected,raptor_common,raptor_sci_fam
42530,GRCA-1555,Grand Canyon National Park,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae
97449,SAGU-1344,Saguaro National Park,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae
15250,CHIS-1392,Channel Islands National Park,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae
26803,DEVA-1718,Death Valley National Park,Strigiformes,No Data,Psiloscops flammeolus,Flammulated Owl,Least Concern,False,Owl,
73120,LAVO-1339,Lassen Volcanic National Park,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae
88286,REDW-1644,Redwood National Park,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae
99242,SEKI-1303,Sequoia and Kings Canyon National Parks,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae
115717,YOSE-1353,Yosemite National Park,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae
8381,BLCA-1240,Black Canyon of the Gunnison National Park,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae
44902,GRSA-1305,Great Sand Dunes National Park and Preserve,Strigiformes,Strigidae,Otus flammeolus,Flammulated Owl,Least Concern,False,Owl,Strigidae


In [616]:
# Changing index[26803]
sw_birds.loc[26803, 'family'] = 'Strigidae'
sw_birds.loc[26803, 'raptor_sci_fam'] = 'Strigidae'
sw_birds.loc[26803, 'scientific_name'] = 'Otus flammeolus'

In [617]:
# Changing index[117822]
sw_birds.loc[117822, 'family'] = 'Strigidae'
sw_birds.loc[117822, 'raptor_sci_fam'] = 'Strigidae'
sw_birds.loc[117822, 'scientific_name'] = 'Otus flammeolus'

In [618]:
total_raptors_sci = sw_birds['raptor_sci_fam'].loc[sw_birds['raptor_sci_fam'] != ''].count()
print(f"Under scientific family:\nWe have a total of {total_raptors_sci} Raptors\n")
diff_sci_types = sw_birds.raptor_sci_fam.unique().tolist()
print(diff_sci_types)

Under scientific family:
We have a total of 715 Raptors

['Accipitridae', 'Cathartidae', 'Pandionidae', '', 'Falconidae', 'Strigidae', 'Tytonidae']


## So, `raptor_sci_fam` $=$ `raptor_common`! Boom!

----
### Drop columns;
- `raptor_sci_name`
- `order`

### Change
- `raptor_common` $\Rightarrow$ `raptor`

In [619]:
sw_birds.drop(columns=['order', 'raptor_sci_fam'], axis=1, inplace=True)
sw_birds.rename(columns={'raptor_common': 'raptor'}, inplace=True)
sw_birds.head()

Unnamed: 0,species_id,park_name,family,scientific_name,common_names,conservation_status,protected,raptor
42081,GRCA-1106,Grand Canyon National Park,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk
42082,GRCA-1107,Grand Canyon National Park,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,Hawk
42083,GRCA-1108,Grand Canyon National Park,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Species of Concern,True,Hawk
42084,GRCA-1109,Grand Canyon National Park,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle
42085,GRCA-1110,Grand Canyon National Park,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Least Concern,False,Hawk


#### Split `species_id` at '-', save park code, rename column `park_code` to match `sw_parks`

In [620]:
sw_birds['species_id'] = sw_birds['species_id'].str.split('-').str[0]
sw_birds.rename(columns={'species_id': 'park_code'}, inplace=True)
sw_birds.head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor
42081,GRCA,Grand Canyon National Park,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk
42082,GRCA,Grand Canyon National Park,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,Hawk
42083,GRCA,Grand Canyon National Park,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Species of Concern,True,Hawk
42084,GRCA,Grand Canyon National Park,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle
42085,GRCA,Grand Canyon National Park,Accipitridae,Buteo albonotatus,Zone-Tailed Hawk,Least Concern,False,Hawk


----
&nbsp;
#### Checking duplicated `common_names`

In [621]:
sw_birds['comma_count'] = sw_birds['common_names'].str.count(",")
comma_raptors = (sw_birds['comma_count'] > 0) & (sw_birds['raptor'] != '')
comma_raptors_df = sw_birds[comma_raptors]
comma_raptors_df

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count
42092,GRCA,Grand Canyon National Park,Accipitridae,Buteogallus anthracinus,"Common Black Hawk, Common Black-Hawk",Least Concern,False,Hawk. Hawk,1
85442,PEFO,Petrified Forest National Park,Accipitridae,Buteo lagopus,"Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk",Species of Concern,True,Buzzard. Hawk,2
85448,PEFO,Petrified Forest National Park,Pandionidae,Pandion haliaetus,"Osprey, Western Osprey",Species of Concern,True,Osprey. Osprey,1
97213,SAGU,Saguaro National Park,Accipitridae,Buteo nitidus,"Gray Hawk, Grey Hawk",Least Concern,False,Hawk. Hawk,1
97221,SAGU,Saguaro National Park,Pandionidae,Pandion haliaetus,"Osprey, Western Osprey",Species of Concern,True,Osprey. Osprey,1
97255,SAGU,Saguaro National Park,Falconidae,Caracara cheriway,"Crested Caracara, Northern Crested Caracara",Least Concern,False,Caracara. Caracara,1
97443,SAGU,Saguaro National Park,Strigidae,Glaucidium brasilianum,"Ferruginous Pygmy Owl, Ferruginous Pygmy-Owl",Least Concern,False,Owl. Owl,1
97445,SAGU,Saguaro National Park,Strigidae,Megascops asio,"Eastern Screech Owl, Eastern Screech-Owl",Least Concern,False,Owl. Owl,1
97446,SAGU,Saguaro National Park,Strigidae,Megascops kennicottii,"Tecolote Occidental, Western Screech Owl, West...",Least Concern,False,Owl. Owl,2
97447,SAGU,Saguaro National Park,Strigidae,Megascops trichopsis,"Whiskered Screech Owl, Whiskered Screech-Owl",Least Concern,False,Owl. Owl,1


In [622]:
comma_raptors_list = comma_raptors_df.common_names.tolist()
print(f"Comma separated names: \n\n{comma_raptors_list}")

Comma separated names: 

['Common Black Hawk, Common Black-Hawk', 'Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk', 'Osprey, Western Osprey', 'Gray Hawk, Grey Hawk', 'Osprey, Western Osprey', 'Crested Caracara, Northern Crested Caracara', 'Ferruginous Pygmy Owl, Ferruginous Pygmy-Owl', 'Eastern Screech Owl, Eastern Screech-Owl', 'Tecolote Occidental, Western Screech Owl, Western Screech-Owl', 'Whiskered Screech Owl, Whiskered Screech-Owl', 'Northern Saw-Whet Owl, Saw-Whet Owl', 'Northern Pygmy-Owl, Pygmy Owl', 'Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk', 'Osprey, Western Osprey', 'Mountain Pygmy Owl, Northern Pygmy-Owl', 'Great Gray Owl, Great Grey Owl', 'Barn Owl, Common Barn-Owl', 'Osprey, Western Osprey', 'Tecolote Occidental, Western Screech Owl, Western Screech-Owl', 'Roughleg, Rough-Legged Buzzard, Rough-Legged Hawk', 'Osprey, Western Osprey', 'Mountain Pygmy Owl, Northern Pygmy-Owl', 'Tecolote Occidental, Western Screech Owl, Western Screech-Owl', 'Great Gray Owl, Gre

## Need to find a way to automate this process...

At least I'm not worried about comma-separated `common_names` for the 'not raptors'!

In [623]:
osprey = sw_birds[sw_birds.common_names == 'Osprey, Western Osprey']
osprey.head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count
85448,PEFO,Petrified Forest National Park,Pandionidae,Pandion haliaetus,"Osprey, Western Osprey",Species of Concern,True,Osprey. Osprey,1
97221,SAGU,Saguaro National Park,Pandionidae,Pandion haliaetus,"Osprey, Western Osprey",Species of Concern,True,Osprey. Osprey,1
72897,LAVO,Lassen Volcanic National Park,Pandionidae,Pandion haliaetus,"Osprey, Western Osprey",Species of Concern,True,Osprey. Osprey,1
86302,PINN,Pinnacles National Park,Pandionidae,Pandion haliaetus,"Osprey, Western Osprey",Species of Concern,True,Osprey. Osprey,1
99043,SEKI,Sequoia and Kings Canyon National Parks,Pandionidae,Pandion haliaetus,"Osprey, Western Osprey",Species of Concern,True,Osprey. Osprey,1


- *Pandion haliaetus* is the 'Osprey'.
- *Pandion haliaetus carolinensis* is the 'Western Osprey'
- gray to grey
- remove hyphens

In [624]:
# defining a function to complete the above.

def clean_species_name(species_name):
    if species_name == "Osprey, Western Osprey":
        species_name = "Osprey"
    if species_name == "Gray Hawk, Grey Hawk":
        species_name = "Grey Hawk"
    if species_name == "Great Gray Owl, Great Grey Owl":
        species_name = "Great Grey Owl"
    species_name = species_name.replace("-", " ")
    species_list = species_name.split(", ")
    # Split at the commas
    if len(species_list) > 1 and species_list[0] == species_list[1]:
        species_name = species_list[0]
    # If equal either side then return first index
    else:
        species_name = ", ".join(species_list)
    # else add a comma again and retain value
    return species_name

In [625]:
sw_birds['common_names'] = sw_birds['common_names'].apply(clean_species_name)

In [626]:
sw_birds['comma_count'] = sw_birds['common_names'].str.count(",")
comma_raptors = (sw_birds['comma_count'] > 0) & (sw_birds['raptor'] != '')
comma_raptors_df = sw_birds[comma_raptors]
comma_raptors_df

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count
85442,PEFO,Petrified Forest National Park,Accipitridae,Buteo lagopus,"Roughleg, Rough Legged Buzzard, Rough Legged Hawk",Species of Concern,True,Buzzard. Hawk,2
97255,SAGU,Saguaro National Park,Falconidae,Caracara cheriway,"Crested Caracara, Northern Crested Caracara",Least Concern,False,Caracara. Caracara,1
97446,SAGU,Saguaro National Park,Strigidae,Megascops kennicottii,"Tecolote Occidental, Western Screech Owl, West...",Least Concern,False,Owl. Owl,2
26804,DEVA,Death Valley National Park,Strigidae,Aegolius acadicus,"Northern Saw Whet Owl, Saw Whet Owl",Least Concern,False,Owl. Owl,1
26812,DEVA,Death Valley National Park,Strigidae,Glaucidium gnoma,"Northern Pygmy Owl, Pygmy Owl",Least Concern,False,Owl. Owl,1
72887,LAVO,Lassen Volcanic National Park,Accipitridae,Buteo lagopus,"Roughleg, Rough Legged Buzzard, Rough Legged Hawk",Species of Concern,True,Buzzard. Hawk,2
73119,LAVO,Lassen Volcanic National Park,Strigidae,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy Owl",Least Concern,False,Owl. Owl,1
73124,LAVO,Lassen Volcanic National Park,Tytonidae,Tyto alba,"Barn Owl, Common Barn Owl",Least Concern,False,Owl. Owl,1
88284,REDW,Redwood National Park,Strigidae,Megascops kennicottii,"Tecolote Occidental, Western Screech Owl, West...",Least Concern,False,Owl. Owl,2
99033,SEKI,Sequoia and Kings Canyon National Parks,Accipitridae,Buteo lagopus,"Roughleg, Rough Legged Buzzard, Rough Legged Hawk",Species of Concern,True,Buzzard. Hawk,2


Now using some regex to analyse the remaining `common_names`

In [627]:
sw_birds['is_barn'] = sw_birds.common_names.str.contains(r"\bBarn\b", regex=True)
sw_birds[sw_birds.is_barn].head(10)

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn
42322,GRCA,Grand Canyon National Park,Hirundinidae,Hirundo rustica,Barn Swallow,Least Concern,False,,0,True
42533,GRCA,Grand Canyon National Park,Tytonidae,Tyto alba,Barn Owl,Least Concern,False,Owl,0,True
85563,PEFO,Petrified Forest National Park,Hirundinidae,Hirundo rustica,Barn Swallow,Least Concern,False,,0,True
97322,SAGU,Saguaro National Park,Hirundinidae,Hirundo rustica,Barn Swallow,Least Concern,False,,0,True
97451,SAGU,Saguaro National Park,Tytonidae,Tyto alba,Barn Owl,Least Concern,False,Owl,0,True
15102,CHIS,Channel Islands National Park,Hirundinidae,Hirundo rustica,Barn Swallow,Least Concern,False,,0,True
15251,CHIS,Channel Islands National Park,Tytonidae,Tyto alba,Barn Owl,Least Concern,False,Owl,0,True
26586,DEVA,Death Valley National Park,Hirundinidae,Hirundo rustica,Barn Swallow,Least Concern,False,,0,True
26819,DEVA,Death Valley National Park,Strigidae,Strix pratincola,American Barn Owl,Least Concern,False,Owl,0,True
26821,DEVA,Death Valley National Park,Tytonidae,Tyto alba,Barn Owl,Least Concern,False,Owl,0,True


In [628]:
sw_birds['is_buzzard'] = sw_birds.common_names.str.contains(r"\bBuzzard\b", regex=True)
sw_birds[sw_birds.is_buzzard].head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn,is_buzzard
85442,PEFO,Petrified Forest National Park,Accipitridae,Buteo lagopus,"Roughleg, Rough Legged Buzzard, Rough Legged Hawk",Species of Concern,True,Buzzard. Hawk,2,False,True
72887,LAVO,Lassen Volcanic National Park,Accipitridae,Buteo lagopus,"Roughleg, Rough Legged Buzzard, Rough Legged Hawk",Species of Concern,True,Buzzard. Hawk,2,False,True
99033,SEKI,Sequoia and Kings Canyon National Parks,Accipitridae,Buteo lagopus,"Roughleg, Rough Legged Buzzard, Rough Legged Hawk",Species of Concern,True,Buzzard. Hawk,2,False,True
77168,MEVE,Mesa Verde National Park,Accipitridae,Buteo lagopus,"Roughleg, Rough Legged Buzzard, Rough Legged Hawk",Species of Concern,True,Buzzard. Hawk,2,False,True


In [629]:
sw_birds['is_megascops'] = sw_birds.scientific_name.str.contains(r"\bMegascops\b", regex=True)
sw_birds[sw_birds.is_megascops].head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn,is_buzzard,is_megascops
42529,GRCA,Grand Canyon National Park,Strigidae,Megascops kennicottii,Western Screech Owl,Least Concern,False,Owl,0,False,False,True
85679,PEFO,Petrified Forest National Park,Strigidae,Megascops kennicottii,Western Screech Owl,Least Concern,False,Owl,0,False,False,True
97445,SAGU,Saguaro National Park,Strigidae,Megascops asio,Eastern Screech Owl,Least Concern,False,Owl. Owl,0,False,False,True
97446,SAGU,Saguaro National Park,Strigidae,Megascops kennicottii,"Tecolote Occidental, Western Screech Owl, West...",Least Concern,False,Owl. Owl,2,False,False,True
97447,SAGU,Saguaro National Park,Strigidae,Megascops trichopsis,Whiskered Screech Owl,Least Concern,False,Owl. Owl,0,False,False,True


In [630]:
sw_birds['is_glaucidium'] = sw_birds.scientific_name.str.contains(r"\bGlaucidium\b", regex=True)
sw_birds[sw_birds.is_glaucidium].head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn,is_buzzard,is_megascops,is_glaucidium
42528,GRCA,Grand Canyon National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl,0,False,False,False,True
97443,SAGU,Saguaro National Park,Strigidae,Glaucidium brasilianum,Ferruginous Pygmy Owl,Least Concern,False,Owl. Owl,0,False,False,False,True
97444,SAGU,Saguaro National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl,0,False,False,False,True
26812,DEVA,Death Valley National Park,Strigidae,Glaucidium gnoma,"Northern Pygmy Owl, Pygmy Owl",Least Concern,False,Owl. Owl,1,False,False,False,True
73119,LAVO,Lassen Volcanic National Park,Strigidae,Glaucidium gnoma,"Mountain Pygmy Owl, Northern Pygmy Owl",Least Concern,False,Owl. Owl,1,False,False,False,True


In [631]:
# Second filtering function

def clean_species_name_two(species_name):
    if species_name == "Barn Owl, Common Barn Owl":
        species_name = "Barn Owl"
    if species_name == "Roughleg, Rough Legged Buzzard, Rough Legged Hawk":
        species_name = "Rough Legged Buzzard"
    if species_name == "Tecolote Occidental, Western Screech Owl, Western Screech Owl":
        species_name = "Western Screech Owl"
    if species_name == "Mountain Pygmy Owl, Northern Pygmy Owl":
        species_name = "Northern Pygmy Owl"

    return species_name

In [632]:
sw_birds['common_names'] = sw_birds['common_names'].apply(clean_species_name_two)

In [633]:
sw_birds['comma_count'] = sw_birds['common_names'].str.count(",")
comma_raptors = (sw_birds['comma_count'] > 0) & (sw_birds['raptor'] != '')
comma_raptors_df = sw_birds[comma_raptors]
comma_raptors_df

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn,is_buzzard,is_megascops,is_glaucidium
97255,SAGU,Saguaro National Park,Falconidae,Caracara cheriway,"Crested Caracara, Northern Crested Caracara",Least Concern,False,Caracara. Caracara,1,False,False,False,False
26804,DEVA,Death Valley National Park,Strigidae,Aegolius acadicus,"Northern Saw Whet Owl, Saw Whet Owl",Least Concern,False,Owl. Owl,1,False,False,False,False
26812,DEVA,Death Valley National Park,Strigidae,Glaucidium gnoma,"Northern Pygmy Owl, Pygmy Owl",Least Concern,False,Owl. Owl,1,False,False,False,True
77173,MEVE,Mesa Verde National Park,Accipitridae,Elanus caeruleus,"Black Shouldered Kite, Black Winged Kite",Species of Concern,True,Kite. Kite,1,False,False,False,False
77176,MEVE,Mesa Verde National Park,Accipitridae,Parabuteo unicinctus,"Harris' Hawk, Harris's Hawk",Species of Concern,True,Hawk. Hawk,1,False,False,False,False


Down to a few. Will perform manually

- [Black Winged Kite](https://en.wikipedia.org/wiki/Black-winged_kite)
- [Northern Crested Caracara](https://ebird.org/species/crecar1)
- [Harris's Hawk](https://en.wikipedia.org/wiki/Harris%27s_hawk)

In [634]:
sw_birds.loc[97255, 'common_names'] = 'Northern Crested Caracara'
sw_birds.loc[77173, 'common_names'] = 'Black Winged Kite'
sw_birds.loc[77176, 'common_names'] = "Harris' Hawk"

In [635]:
sw_birds['comma_count'] = sw_birds['common_names'].str.count(",")
comma_raptors = (sw_birds['comma_count'] > 0) & (sw_birds['raptor'] != '')
comma_raptors_df = sw_birds[comma_raptors]
comma_raptors_df

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn,is_buzzard,is_megascops,is_glaucidium
26804,DEVA,Death Valley National Park,Strigidae,Aegolius acadicus,"Northern Saw Whet Owl, Saw Whet Owl",Least Concern,False,Owl. Owl,1,False,False,False,False
26812,DEVA,Death Valley National Park,Strigidae,Glaucidium gnoma,"Northern Pygmy Owl, Pygmy Owl",Least Concern,False,Owl. Owl,1,False,False,False,True


In [636]:
sw_birds[sw_birds.is_glaucidium].head(10)

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn,is_buzzard,is_megascops,is_glaucidium
42528,GRCA,Grand Canyon National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl,0,False,False,False,True
97443,SAGU,Saguaro National Park,Strigidae,Glaucidium brasilianum,Ferruginous Pygmy Owl,Least Concern,False,Owl. Owl,0,False,False,False,True
97444,SAGU,Saguaro National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl,0,False,False,False,True
26812,DEVA,Death Valley National Park,Strigidae,Glaucidium gnoma,"Northern Pygmy Owl, Pygmy Owl",Least Concern,False,Owl. Owl,1,False,False,False,True
73119,LAVO,Lassen Volcanic National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl. Owl,0,False,False,False,True
86454,PINN,Pinnacles National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl,0,False,False,False,True
88283,REDW,Redwood National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl,0,False,False,False,True
99240,SEKI,Sequoia and Kings Canyon National Parks,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl. Owl,0,False,False,False,True
115715,YOSE,Yosemite National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl,0,False,False,False,True
8380,BLCA,Black Canyon of the Gunnison National Park,Strigidae,Glaucidium gnoma,Northern Pygmy Owl,Least Concern,False,Owl,0,False,False,False,True


Change [26812] to 'Northern Pygmy Owl'

In [637]:
sw_birds.at[26812, 'common_names'] = "Northern Pygmy Owl"

Check 'Saw Whet Owl'

In [638]:
sw_birds['is_saw_whet'] = sw_birds.common_names.str.contains(r"\bSaw Whet\b", regex=True)
sw_birds[sw_birds.is_saw_whet].head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn,is_buzzard,is_megascops,is_glaucidium,is_saw_whet
42521,GRCA,Grand Canyon National Park,Strigidae,Aegolius acadicus,Northern Saw Whet Owl,Least Concern,False,Owl,0,False,False,False,False,True
97438,SAGU,Saguaro National Park,Strigidae,Aegolius acadicus,Northern Saw Whet Owl,Least Concern,False,Owl,0,False,False,False,False,True
15245,CHIS,Channel Islands National Park,Strigidae,Aegolius acadicus,Northern Saw Whet Owl,Least Concern,False,Owl,0,False,False,False,False,True
26804,DEVA,Death Valley National Park,Strigidae,Aegolius acadicus,"Northern Saw Whet Owl, Saw Whet Owl",Least Concern,False,Owl. Owl,1,False,False,False,False,True
26811,DEVA,Death Valley National Park,Strigidae,Cryptoglaux acadica acadica,Saw Whet Owl,Least Concern,False,Owl,0,False,False,False,False,True


Change [26804] to 'Northern Saw Whet Owl

In [639]:
sw_birds.at[26804, 'common_names'] = "Northern Saw Whet Owl"

In [640]:
# Double check if we have any comma separated common names
sw_birds['comma_count'] = sw_birds['common_names'].str.count(",")
comma_raptors = (sw_birds['comma_count'] > 0) & (sw_birds['raptor'] != '')
comma_raptors_df = sw_birds[comma_raptors]
comma_raptors_df

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,comma_count,is_barn,is_buzzard,is_megascops,is_glaucidium,is_saw_whet


Remove extra columns

In [641]:
sw_birds.drop(columns=['is_saw_whet', 'is_glaucidium', 'is_megascops', 'is_buzzard', 'is_barn', 'comma_count'], axis=1, inplace=True)
sw_birds.head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor
42081,GRCA,Grand Canyon National Park,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk
42082,GRCA,Grand Canyon National Park,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,Hawk
42083,GRCA,Grand Canyon National Park,Accipitridae,Accipiter striatus,Sharp Shinned Hawk,Species of Concern,True,Hawk
42084,GRCA,Grand Canyon National Park,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle
42085,GRCA,Grand Canyon National Park,Accipitridae,Buteo albonotatus,Zone Tailed Hawk,Least Concern,False,Hawk


---

### Now to split extra values in `raptor`
"Roughleg, Rough Legged Buzzard, Rough Legged Hawk" will need to be done separately as $=$ Buzzard. Hawk

In [642]:
sw_birds.loc[sw_birds['raptor'] == 'Buzzard. Hawk', 'raptor'] = 'Buzzard'
sw_birds['raptor'] = sw_birds['raptor'].str.split('.').str[0]

In [643]:
raptor_list = sw_birds.raptor.unique().tolist()
print(f"{len(raptor_list) - 1} Unique raptors:\n\n{raptor_list}")

12 Unique raptors:

['Hawk', 'Eagle', 'Harrier', 'Vulture', 'Condor', 'Osprey', '', 'Falcon', 'Kestrel', 'Owl', 'Buzzard', 'Caracara', 'Kite']


In [644]:
sw_birds.head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor
42081,GRCA,Grand Canyon National Park,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk
42082,GRCA,Grand Canyon National Park,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,Hawk
42083,GRCA,Grand Canyon National Park,Accipitridae,Accipiter striatus,Sharp Shinned Hawk,Species of Concern,True,Hawk
42084,GRCA,Grand Canyon National Park,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle
42085,GRCA,Grand Canyon National Park,Accipitridae,Buteo albonotatus,Zone Tailed Hawk,Least Concern,False,Hawk


# It's clean but need to double check for any duplicates under `common_names`

In [645]:
duplicated_birds = sw_birds[sw_birds.duplicated(['common_names', 'park_name'], keep=False)]
duplicated_birds

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor
42132,GRCA,Grand Canyon National Park,Apodidae,Aeronautes saxatalis,White Throated Swift,Least Concern,False,
42133,GRCA,Grand Canyon National Park,Apodidae,Aeronautes saxatalis saxatalis,White Throated Swift,Least Concern,False,
42144,GRCA,Grand Canyon National Park,Caprimulgidae,Chordeiles minor,Common Nighthawk,Least Concern,False,
42145,GRCA,Grand Canyon National Park,Caprimulgidae,Chordeiles minor henryi,Common Nighthawk,Least Concern,False,
42146,GRCA,Grand Canyon National Park,Caprimulgidae,Phalaenoptilus nuttallii,Common Poorwill,Least Concern,False,
...,...,...,...,...,...,...,...,...
39735,GRBA,Great Basin National Park,Picidae,Picoides villosus,Hairy Woodpecker,Least Concern,False,
39736,GRBA,Great Basin National Park,Picidae,Picoides villosus leucothorectis,Hairy Woodpecker,Least Concern,False,
39737,GRBA,Great Basin National Park,Picidae,Picoides villosus monticola,Hairy Woodpecker,Least Concern,False,
4312,BIBE,Big Bend National Park,Laridae,Sterna antillarum,Least Tern,Endangered,True,


In [646]:
# We drop the above duplicates
sw_birds = sw_birds.drop_duplicates(subset=['common_names', 'park_name'])

Are there any left equal to none?

In [647]:
duplicated_birds = sw_birds[(sw_birds['common_names'].isnull())]
duplicated_birds

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor


## Next;

- Export `sw_birds` as .csv file
- Export `sw_data` as .csv file and change park_name!
- Export `sw_birds_extras_df` as .csv file

## Start new jupyter notebook for analysis

In [648]:
sw_data.head(10)

Unnamed: 0,park_code,park_name,state,latitude,longitude,hectares
22,GRCA,Grand Canyon National Park,AZ,36.06,-112.14,492665.95
42,PEFO,Petrified Forest National Park,AZ,35.07,-109.78,37851.5
46,SAGU,Saguaro National Park,AZ,32.25,-110.5,37004.49
10,CHIS,Channel Islands National Park,CA,34.01,-119.42,100993.84
15,DEVA,Death Valley National Park,CA,36.24,-116.82,1918580.71
31,JOTR,Joshua Tree National Park,CA,33.79,-115.9,319598.75
36,LAVO,Lassen Volcanic National Park,CA,40.49,-121.51,43047.26
43,PINN,Pinnacles National Park,CA,36.48,-121.16,10767.08
44,REDW,Redwood National Park,CA,41.3,-124.0,45532.03
47,SEKI,Sequoia and Kings Canyon National Parks,CA,36.43,-118.68,350438.65


In [649]:
# Split 'park_name' in this dataframe to remove 'National Park'

sw_data['park_name'] = sw_data['park_name'].apply(lambda x: x[:-14] if x.endswith(" National Park") else x)
# By inspection, lambda function will not apply to indices [47] or [23]
sw_data.at[47, 'park_name'] = 'Sequoia & Kings Canyon'
sw_data.at[23, 'park_name'] = 'Great Sand Dunes'

In [651]:
# Export as .csv
#sw_birds.to_csv('sw_birds.csv', index=True)
#sw_data.to_csv('sw_data.csv', index=False)
#sw_birds_extras_df.to_csv('sw_extras.csv', index=True)

----