# OLAP Database of Birds in US National Parks

In [21]:
import pandas as pd
import numpy as np

----
&nbsp;
### Verifying Integrity of Dataset

In [22]:
# Load the pickled bird records, then report the frame's dimensions and
# per-column dtypes with a single print call.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
birds = pd.read_pickle('DATA/birds.pkl')
print(
    f"Shape: {birds.shape}",
    f"Data Types:\n\n{birds.dtypes}\n",
    sep="\n",
)

Shape: (14451, 16)
Data Types:

species_id               object
park_code                object
park_name                object
order                    object
family                   object
scientific_name          object
common_names             object
raptor_group             object
record_status          category
occurrence             category
nativeness             category
abundance              category
seasonality              object
conservation_status    category
is_protected               bool
is_raptor                  bool
dtype: object



In [23]:
# Number of distinct values found in each column of the bird records
unique_counts = birds.nunique()
print(f"Unique values:\n\n{unique_counts}")

Unique values:

species_id             14451
park_code                 56
park_name                 56
order                     24
family                    86
scientific_name         1177
common_names            1162
raptor_group              12
record_status              2
occurrence                 5
nativeness                 3
abundance                  6
seasonality                7
conservation_status        7
is_protected               2
is_raptor                  2
dtype: int64


In [24]:
# Flag every missing cell, then total the flags column by column
missing_mask = birds.isna()
nan_counts_per_column = missing_mask.sum()
print(f"NaN counts:\n\n{nan_counts_per_column}")

NaN counts:

species_id             0
park_code              0
park_name              0
order                  0
family                 0
scientific_name        0
common_names           0
raptor_group           0
record_status          0
occurrence             0
nativeness             0
abundance              0
seasonality            0
conservation_status    0
is_protected           0
is_raptor              0
dtype: int64


----
&nbsp;
### Extracting Species Data

In [25]:
# Preview the first five bird records
birds.head(n=5)

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_names,raptor_group,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,is_raptor
55,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,True
56,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,True
57,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,True
58,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,True
59,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,True


In [26]:
# Columns that describe a species itself; the remaining columns are left out of this subset
species_columns = ['order', 'family', 'scientific_name', 'common_names', 'raptor_group']
species = birds[species_columns]
species.head(n=5)

Unnamed: 0,order,family,scientific_name,common_names,raptor_group
55,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk
56,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk
57,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk
58,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Eagle
59,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Hawk


In [27]:
# Singularise the common-name column, collapse repeated rows,
# and order the result alphabetically by scientific name.
species = (
    species
    .rename(columns={'common_names': 'common_name'})
    .drop_duplicates()
    .sort_values(by='scientific_name')
)

print(f"Unique records: {species.shape[0]}")
species.head(n=5)

Unique records: 1179


Unnamed: 0,order,family,scientific_name,common_name,raptor_group
55,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk
56,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk
78390,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Northern Goshawk (atricapillus subspecies),Hawk
78391,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Northern Goshawk (laingi subspecies),Hawk
57,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk


In [28]:
# Count the distinct scientific names among the de-duplicated species rows
sci_name_column = species['scientific_name']
unique_sci_names = sci_name_column.nunique()
print(f"Unique scientific names: {unique_sci_names}")

Unique scientific names: 1177


In [29]:
# Count species rows per scientific name and keep only the names that appear more than once
rows_per_name = species.groupby('scientific_name').size()
duplicates = rows_per_name.loc[rows_per_name > 1]

print(f"Duplicated scientific names:\n{duplicates}")

Duplicated scientific names:
scientific_name
Phylloscopus borealis    2
Polioptila caerulea      2
dtype: int64


In [30]:
# Pull every species row whose scientific name was flagged as duplicated.
# The names are read from the index of `duplicates` (the per-name row counts
# filtered to > 1 in the preceding cell) instead of being typed in by hand,
# so this cell stays correct if the input data changes.
duplicated_sci_names = duplicates.index.tolist()
duplicated_records = species[species['scientific_name'].isin(duplicated_sci_names)]
duplicated_records

Unnamed: 0,order,family,scientific_name,common_name,raptor_group
24944,Passeriformes,Muscicapidae,Phylloscopus borealis,Arctic Warbler,
24956,Passeriformes,Phylloscopidae,Phylloscopus borealis,Arctic Warbler,
2928,Passeriformes,Certhiidae,Polioptila caerulea,Blue-Gray Gnatcatcher,
326,Passeriformes,Polioptilidae,Polioptila caerulea,Blue-Gray Gnatcatcher,


In [31]:
# Family to keep for each scientific name that was listed under two different families
correct_families = {
    'Phylloscopus borealis': 'Phylloscopidae',  # Assuming this is the correct family
    'Polioptila caerulea': 'Polioptilidae'     # Assuming this is the correct family
}

# Apply every correction in place: first to the birds DataFrame, then to the species DataFrame
for frame in (birds, species):
    # The assignments below only touch 'family', so the name column can be read once per frame
    name_column = frame['scientific_name']
    for sci_name, correct_family in correct_families.items():
        frame.loc[name_column == sci_name, 'family'] = correct_family

In [32]:
# Drop repeated rows again, re-sort by scientific name,
# and renumber the index from 0.
species = (
    species
    .drop_duplicates()
    .sort_values(by='scientific_name')
    .reset_index(drop=True)
)

print(f"Unique records: {species.shape[0]}")
species.head(10)

Unique records: 1177


Unnamed: 0,order,family,scientific_name,common_name,raptor_group
0,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk
1,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk
2,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Northern Goshawk (atricapillus subspecies),Hawk
3,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Northern Goshawk (laingi subspecies),Hawk
4,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk
5,Accipitriformes,Accipitridae,Accipiter striatus velox,Sharp-Shinned Hawk (velox subspecies),Hawk
6,Passeriformes,Sturnidae,Acridotheres tristis,Common Myna,
7,Charadriiformes,Scolopacidae,Actitis hypoleucos,Common Sandpiper,
8,Charadriiformes,Scolopacidae,Actitis macularius,Spotted Sandpiper,
9,Podicipediformes,Podicipedidae,Aechmophorus clarkii,Clark's Grebe,


In [33]:
# Number the species 1..N in their current row order, zero-padded to at least
# four digits (e.g., 0001, 0002, ...). The list is assigned positionally.
species['species_id'] = [f"{number:04d}" for number in range(1, len(species) + 1)]

# Put 'species_id' first and keep the remaining columns in their existing order
other_columns = [col for col in species.columns if col != 'species_id']
species = species[['species_id'] + other_columns]
species.head(n=5)

Unnamed: 0,species_id,order,family,scientific_name,common_name,raptor_group
0,1,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk
1,2,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk
2,3,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Northern Goshawk (atricapillus subspecies),Hawk
3,4,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Northern Goshawk (laingi subspecies),Hawk
4,5,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk


In [34]:
# Inspect the last five rows to check the final species_id values
species.tail(n=5)

Unnamed: 0,species_id,order,family,scientific_name,common_name,raptor_group
1172,1173,Passeriformes,Emberizidae,Zonotrichia leucophrys leucophrys,White-Crowned Sparrow (leucophrys subspecies),
1173,1174,Passeriformes,Emberizidae,Zonotrichia leucophrys oriantha,White-Crowned Sparrow (Mountain Subspecies),
1174,1175,Passeriformes,Emberizidae,Zonotrichia leucophrys pugetensis,White-Crowned Sparrow (pugetensis subspecies),
1175,1176,Passeriformes,Emberizidae,Zonotrichia querula,Harris's Sparrow,
1176,1177,Passeriformes,Zosteropidae,Zosterops japonicus,Japanese White-Eye,


----
&nbsp;
### Matching Species Data with the Park Records