# Biodiversity in U.S. National Parks

## North American Birds of Prey

### Data has been cleaned in `wrangling.ipynb`

In [177]:
# Importing the necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import plotly.express as px
import seaborn as sns
import numpy as np

## Loading in .csv files

### `sw_birds.csv`

In [178]:
# Load the bird records. The CSV's unnamed first column is renamed to
# `prev_index` so it can be referred to by name later on.
birds_df = (
    pd.read_csv('csv_files/sw_birds.csv')
    .rename(columns={'Unnamed: 0': 'prev_index'})
)
birds_df.head()

Unnamed: 0,prev_index,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor
0,42081,GRCA,Grand Canyon National Park,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk
1,42082,GRCA,Grand Canyon National Park,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,Hawk
2,42083,GRCA,Grand Canyon National Park,Accipitridae,Accipiter striatus,Sharp Shinned Hawk,Species of Concern,True,Hawk
3,42084,GRCA,Grand Canyon National Park,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle
4,42085,GRCA,Grand Canyon National Park,Accipitridae,Buteo albonotatus,Zone Tailed Hawk,Least Concern,False,Hawk


### `parks.csv`

In [179]:
# Park-level metadata, read as-is (no columns are renamed here).
parks_csv_path = 'csv_files/parks.csv'
parks_df = pd.read_csv(parks_csv_path)
parks_df.head()

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ARCH,Arches National Park,UT,76519,38.68,-109.57
2,BADL,Badlands National Park,SD,242756,43.75,-102.5
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08


### `sw_extras.csv`

Extra columns that may or may not be needed.

In [180]:
# Load the supplementary columns. As with the bird records, the CSV's unnamed
# first column is renamed to `prev_index`.
extras_df = (
    pd.read_csv('csv_files/sw_extras.csv')
    .rename(columns={'Unnamed: 0': 'prev_index'})
)
extras_df.head()

Unnamed: 0,prev_index,scientific_name,common_names,occurrence,nativeness,abundance,seasonality
0,42081,Accipiter cooperii,Cooper's Hawk,Present,Native,Common,Breeder
1,42082,Accipiter gentilis,Northern Goshawk,Present,Native,Common,Breeder
2,42083,Accipiter striatus,Sharp-Shinned Hawk,Present,Native,Common,Breeder
3,42084,Aquila chrysaetos,Golden Eagle,Present,Native,Uncommon,Breeder
4,42085,Buteo albonotatus,Zone-Tailed Hawk,Present,Native,Common,Breeder


## - `birds_df` and `extras_df` share the same original row index, stored in `prev_index`

----
&nbsp;
## DataFrame merges



----
#### 1. DataFrame `sw_birds_df` from:
- `birds_df`
- `parks_df`

Aim to retain as much data as possible, using an inner merge.

In [181]:
# Attach park metadata to every bird record.
#
# The join keys are named explicitly: `park_code` and `park_name` are the only
# columns the two frames share, so this is the same join pandas would infer,
# but it can no longer change silently if either CSV gains a column.
#
# `raptor` is not a join key, so the merge carries its values through
# unchanged; no placeholder round-trip is needed and `birds_df` is left
# unmodified by this cell.
#
# NOTE: an inner merge keeps only the bird rows whose park also appears in
# `parks_df`; rows without a matching park are dropped.
sw_birds_df = pd.merge(
    birds_df,
    parks_df,
    how='inner',
    on=['park_code', 'park_name'],
)
print(f"\nwith shape: {sw_birds_df.shape}")


with shape: (6504, 13)


----
#### 2. DataFrame `sw_birds_all_df` from:
- `sw_birds_df`
- `extras_df`

Aim to retain as much data as possible, using an inner merge.
- rename the `scientific_name` and `common_names` columns in `extras_df`
- Will ultimately retain `scientific_name` and `common_names` from `birds_df`

In [182]:
# Give the extras' name columns distinct labels so they do not clash with the
# `scientific_name` / `common_names` columns already present in `sw_birds_df`.
loose_name_cols = {
    'scientific_name': 'sci_loose',
    'common_names': 'com_loose',
}
extras_df = extras_df.rename(columns=loose_name_cols)

In [183]:
# Join the extra attributes onto the bird/park records.
# `prev_index` is named as the key explicitly so the join does not depend on
# which other column names the two frames happen to have in common.
sw_birds_all_df = pd.merge(sw_birds_df, extras_df, how='inner', on='prev_index')

# An inner merge drops unmatched rows (and duplicates rows on repeated keys)
# without warning, so confirm the row count is unchanged.
assert len(sw_birds_all_df) == len(sw_birds_df), (
    "inner merge on prev_index changed the number of rows"
)

print(f"\nwith shape: {sw_birds_all_df.shape}")


with shape: (6504, 19)


----
Delete columns

- `prev_index` in `sw_birds_df`
- `prev_index`, `sci_loose` & `com_loose` from `sw_birds_all_df`

The DataFrames have been merged with their rows correctly aligned, and new indexes have been created.

In [184]:
# `prev_index` was only needed as the merge key, and the `*_loose` name columns
# are superseded by the names kept from `birds_df`.
# Reassigning (rather than `inplace=True`) with `errors='ignore'` lets this
# cell be re-run without raising a KeyError once the columns are already gone.
# `axis=1` is omitted because `columns=` already selects the axis.
sw_birds_df = sw_birds_df.drop(columns=['prev_index'], errors='ignore')
sw_birds_all_df = sw_birds_all_df.drop(
    columns=['prev_index', 'sci_loose', 'com_loose'],
    errors='ignore',
)

In [185]:
# Confirm the column drop: report the shape, then preview the first rows.
birds_shape = sw_birds_df.shape
print(f"\nsw_birds_df shape: {birds_shape}")
sw_birds_df.head()


sw_birds_df shape: (6504, 12)


Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,state,acres,latitude,longitude
0,GRCA,Grand Canyon National Park,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk,AZ,1217403,36.06,-112.14
1,GRCA,Grand Canyon National Park,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,Hawk,AZ,1217403,36.06,-112.14
2,GRCA,Grand Canyon National Park,Accipitridae,Accipiter striatus,Sharp Shinned Hawk,Species of Concern,True,Hawk,AZ,1217403,36.06,-112.14
3,GRCA,Grand Canyon National Park,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle,AZ,1217403,36.06,-112.14
4,GRCA,Grand Canyon National Park,Accipitridae,Buteo albonotatus,Zone Tailed Hawk,Least Concern,False,Hawk,AZ,1217403,36.06,-112.14


We have retained `sw_birds_all_df` which is the meta DataFrame for (possible) use at a later date

In [186]:
# Record the column names of the wider "meta" frame and report its size.
meta_df_cols = list(sw_birds_all_df.columns)
meta_shape = sw_birds_all_df.shape
print(f"\nsw_birds_all_df shape: {meta_shape}\n")
print(f"With columns; \n{meta_df_cols}")


sw_birds_all_df shape: (6504, 16)

With columns; 
['park_code', 'park_name', 'family', 'scientific_name', 'common_names', 'conservation_status', 'protected', 'raptor', 'state', 'acres', 'latitude', 'longitude', 'occurrence', 'nativeness', 'abundance', 'seasonality']


----
# Analysis


### `sw_birds_df` inspection



In [187]:
# Distinct park names, in order of first appearance in the data.
park_names = sw_birds_df['park_name']
sw_parks_list = list(park_names.unique())
print(f"We have {len(sw_parks_list)} parks: \n\n{sw_parks_list}")

We have 23 parks: 

['Grand Canyon National Park', 'Petrified Forest National Park', 'Saguaro National Park', 'Channel Islands National Park', 'Joshua Tree National Park', 'Lassen Volcanic National Park', 'Pinnacles National Park', 'Redwood National Park', 'Sequoia and Kings Canyon National Parks', 'Yosemite National Park', 'Black Canyon of the Gunnison National Park', 'Great Sand Dunes National Park and Preserve', 'Mesa Verde National Park', 'Rocky Mountain National Park', 'Great Basin National Park', 'Carlsbad Caverns National Park', 'Big Bend National Park', 'Guadalupe Mountains National Park', 'Arches National Park', 'Bryce Canyon National Park', 'Canyonlands National Park', 'Capitol Reef National Park', 'Zion National Park']


<img src="Images/Area_Baja.png"/>

In [188]:
# Distinct state codes present in the data, sorted alphabetically.
states_list = sorted(sw_birds_df['state'].unique())
print(f"We have {len(states_list)} states: \n\n{states_list}")

We have 7 states: 

['AZ', 'CA', 'CO', 'NM', 'NV', 'TX', 'UT']


Create a separate DataFrame for each state $\in$ `states_list`.

We iterate through `states_list` to produce summary statistics of the categorical data

In [189]:
# One DataFrame per state, keyed by state code.
d = {
    state: pd.DataFrame(sw_birds_df[sw_birds_df['state'] == state])
    for state in states_list
}

# For each state, report the frame's shape and describe its
# object-dtype (text) columns.
for state in states_list:
    state_df = d[state]
    print(f"\n\n{state}:\nShape: {state_df.shape}")
    categorical_cols = state_df.select_dtypes(include=['object']).columns
    print(f"Summary statistics:\n{state_df[categorical_cols].describe().transpose()}")



AZ:
Shape: (946, 12)
Summary statistics:
                    count unique                         top freq
park_code             946      3                        GRCA  456
park_name             946      3  Grand Canyon National Park  456
family                946     61                 Emberizidae   94
scientific_name       946    534          Accipiter cooperii    3
common_names          946    469                 Horned Lark    6
conservation_status   946      6               Least Concern  765
raptor                 93     11                         Owl   32
state                 946      1                          AZ  946


CA:
Shape: (2056, 12)
Summary statistics:
                    count unique                    top  freq
park_code            2056      7                   REDW   494
park_name            2056      7  Redwood National Park   494
family               2056     68              Parulidae   170
scientific_name      2056    630     Accipiter cooperii     7
common_na

California has the largest representation in `sw_birds_df`

In [190]:
# Each state's records are looked up in `d` by state code.
# California, for example:
california_df = d['CA']
california_df.head()

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,state,acres,latitude,longitude
946,CHIS,Channel Islands National Park,Accipitridae,Accipiter cooperii,Cooper's Hawk,Species of Concern,True,Hawk,CA,249561,34.01,-119.42
947,CHIS,Channel Islands National Park,Accipitridae,Accipiter gentilis,Northern Goshawk,Least Concern,False,Hawk,CA,249561,34.01,-119.42
948,CHIS,Channel Islands National Park,Accipitridae,Accipiter striatus,Sharp Shinned Hawk,Species of Concern,True,Hawk,CA,249561,34.01,-119.42
949,CHIS,Channel Islands National Park,Accipitridae,Aquila chrysaetos,Golden Eagle,Species of Concern,True,Eagle,CA,249561,34.01,-119.42
950,CHIS,Channel Islands National Park,Accipitridae,Buteo jamaicensis,Red Tailed Hawk,Least Concern,False,Hawk,CA,249561,34.01,-119.42


#### We performed extensive data wrangling on raptor species whilst neglecting non-raptors.
We need to find the rows where `common_names` is 'None'. These will be removed from the dataset.

In [191]:
# Rows whose common name is the literal string 'None'.
com_name_none = sw_birds_df[sw_birds_df['common_names'] == 'None']

# Count those rows per state. This groups `com_name_none`, the frame defined
# on the line above; the previous reference to `com_name_nan` named a variable
# that is never defined in this notebook and raised a NameError on a fresh kernel.
com_name_none_grouped = com_name_none.groupby('state').size().reset_index(name='count')

# Report only the states that have at least one such row.
for state in states_list:
    state_rows = com_name_none_grouped[com_name_none_grouped['state'] == state]
    if len(state_rows) > 0:
        print(f"State: {state}, NaN Count: {state_rows['count'].values[0]}")

State: AZ, NaN Count: 4
State: CA, NaN Count: 26
State: CO, NaN Count: 7
State: NV, NaN Count: 37


In [192]:
# Preview the first ten records whose common name is 'None'.
com_name_none.head(n=10)

Unnamed: 0,park_code,park_name,family,scientific_name,common_names,conservation_status,protected,raptor,state,acres,latitude,longitude
295,GRCA,Grand Canyon National Park,Parulidae,Leiothlypis celata orestera,,Least Concern,False,,AZ,1217403,36.06,-112.14
401,GRCA,Grand Canyon National Park,Vireonidae,Vireo olivaceus olivaceus,,Least Concern,False,,AZ,1217403,36.06,-112.14
404,GRCA,Grand Canyon National Park,Vireonidae,Vireo plumbeus plumbeus,,Least Concern,False,,AZ,1217403,36.06,-112.14
918,SAGU,Saguaro National Park,Picidae,Colaptes auratus cafer,,Least Concern,False,,AZ,91440,32.25,-110.5
1378,JOTR,Joshua Tree National Park,Scolopacidae,Tringa semipalmata,,Least Concern,False,,CA,789745,33.79,-115.9
2019,REDW,Redwood National Park,No Data,Erolia ptilochemis,,Least Concern,False,,CA,112512,41.3,-124.0
2021,REDW,Redwood National Park,Odontophoridae,Lophortyx pictus,,Least Concern,False,,CA,112512,41.3,-124.0
2058,REDW,Redwood National Park,Anatidae,Branta nigricans,,Least Concern,False,,CA,112512,41.3,-124.0
2071,REDW,Redwood National Park,Anatidae,Mareca americana,,Least Concern,False,,CA,112512,41.3,-124.0
2101,REDW,Redwood National Park,Alcidae,Brachyramphus marmoratus maroratum,,Least Concern,False,,CA,112512,41.3,-124.0


In [193]:
# Keep only the records whose common name is not the string 'None'.
is_unnamed = sw_birds_df['common_names'].eq('None')
sw_birds_df = sw_birds_df.loc[~is_unnamed]

# Sanity check: selecting the 'None' rows again should now give an empty frame.
count = sw_birds_df.loc[sw_birds_df['common_names'].eq('None')]
print(count)

Empty DataFrame
Columns: [park_code, park_name, family, scientific_name, common_names, conservation_status, protected, raptor, state, acres, latitude, longitude]
Index: []
