# OLAP Database of birds in US National Parks

In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np
import random

----
&nbsp;
### Verifying Integrity of Dataset

In [2]:
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
birds = pd.read_pickle('DATA/birds.pkl')

# Report the dimensions and the per-column dtypes in a single call
print(f"Shape: {birds.shape}", f"Data Types:\n\n{birds.dtypes}\n", sep="\n")

Shape: (14451, 17)
Data Types:

species_id               object
park_code                object
park_name                object
order                    object
family                   object
scientific_name          object
common_names             object
raptor_group             object
record_status          category
occurrence             category
nativeness             category
abundance              category
seasonality              object
conservation_status    category
is_protected               bool
raptor_sci_fam           object
is_raptor                  bool
dtype: object



In [3]:
# Number of distinct values in each column
unique_counts = birds.nunique()
print(f"Unique values:\n\n{unique_counts}")

Unique values:

species_id             14451
park_code                 56
park_name                 56
order                     24
family                    86
scientific_name         1177
common_names            1162
raptor_group              13
record_status              2
occurrence                 5
nativeness                 3
abundance                  6
seasonality                7
conservation_status        7
is_protected               2
raptor_sci_fam             7
is_raptor                  2
dtype: int64


In [4]:
# Tally the missing values column by column
nan_counts_per_column = birds.isnull().sum(axis=0)
print(f"NaN counts:\n\n{nan_counts_per_column}")

NaN counts:

species_id             0
park_code              0
park_name              0
order                  0
family                 0
scientific_name        0
common_names           0
raptor_group           0
record_status          0
occurrence             0
nativeness             0
abundance              0
seasonality            0
conservation_status    0
is_protected           0
raptor_sci_fam         0
is_raptor              0
dtype: int64


----
&nbsp;
### Extracting Species Data

In [5]:
# Use a singular column name and rebuild a 0..n-1 index so row labels equal row positions
birds = birds.rename(columns={'common_names': 'common_name'}).reset_index(drop=True)
birds.head()

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_name,raptor_group,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_sci_fam,is_raptor
0,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Accipitridae,True
1,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,Accipitridae,True
2,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Accipitridae,True
3,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Accipitridae,True
4,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,Accipitridae,True


In [6]:
# Keep only the columns that describe a species; park-specific fields stay in `birds`
species_columns = ['order', 'family', 'scientific_name', 'common_name', 'raptor_group']
species = birds.loc[:, species_columns]
species.head()

Unnamed: 0,order,family,scientific_name,common_name,raptor_group
0,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk
1,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk
2,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk
3,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Eagle
4,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Hawk


In [7]:
# Collapse identical rows, then order alphabetically by scientific name
species = species.drop_duplicates().sort_values(by='scientific_name')

print(f"Unique records: {len(species)}")
species.head()

Unique records: 1179


Unnamed: 0,order,family,scientific_name,common_name,raptor_group
0,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk
1,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk
10035,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Northern Goshawk (atricapillus subspecies),Hawk
10036,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Northern Goshawk (laingi subspecies),Hawk
2,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk


In [8]:
# Count the distinct scientific names among the de-duplicated species rows
unique_sci_names = species.loc[:, 'scientific_name'].nunique()
print(f"Unique scientific names: {unique_sci_names}")

Unique scientific names: 1177


In [9]:
# Scientific names that occur on more than one species row, with their row counts
duplicates = (
    species
    .groupby('scientific_name')
    .size()
    .loc[lambda counts: counts > 1]
)

print(f"Duplicated scientific names:\n{duplicates}")

Duplicated scientific names:
scientific_name
Phylloscopus borealis    2
Polioptila caerulea      2
dtype: int64


In [10]:
# Derive the duplicated scientific names from the data instead of hard-coding
# them, so this cell stays correct if the source dataset changes.
is_duplicated_name = species['scientific_name'].duplicated(keep=False)
duplicated_sci_names = species.loc[is_duplicated_name, 'scientific_name'].unique().tolist()

# Show every row that shares its scientific name with another row
duplicated_records = species[is_duplicated_name]
duplicated_records

Unnamed: 0,order,family,scientific_name,common_name,raptor_group
3862,Passeriformes,Muscicapidae,Phylloscopus borealis,Arctic Warbler,
3874,Passeriformes,Phylloscopidae,Phylloscopus borealis,Arctic Warbler,
676,Passeriformes,Certhiidae,Polioptila caerulea,Blue-Gray Gnatcatcher,
271,Passeriformes,Polioptilidae,Polioptila caerulea,Blue-Gray Gnatcatcher,


In [11]:
# Family to keep for each scientific name that was recorded under two families
correct_families = {
    'Phylloscopus borealis': 'Phylloscopidae',
    'Polioptila caerulea': 'Polioptilidae',
}

# Apply the same corrections to the park records first, then to the species table
for frame in (birds, species):
    for sci_name, correct_family in correct_families.items():
        frame.loc[frame['scientific_name'] == sci_name, 'family'] = correct_family

In [12]:
# After the family corrections the conflicting rows are identical: drop the
# repeats, restore alphabetical order and rebuild a 0..n-1 index.
species = (
    species
    .drop_duplicates()
    .sort_values(by='scientific_name')
    .reset_index(drop=True)
)

# 1177 is the number of unique scientific names reported earlier in the notebook
assert len(species) == 1177, "The records do not have the correct dimension"

In [13]:
# Create a species_code column: 1-based row number, zero-padded to 4 digits (0001, 0002, ...)
species['species_code'] = [f"{number:04d}" for number in range(1, len(species) + 1)]

# Put species_code first, keeping the remaining columns in their existing order
other_columns = [col for col in species.columns if col != 'species_code']
species = species[['species_code'] + other_columns]
species.head()

Unnamed: 0,species_code,order,family,scientific_name,common_name,raptor_group
0,1,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk
1,2,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk
2,3,Accipitriformes,Accipitridae,Accipiter gentilis atricapillus,Northern Goshawk (atricapillus subspecies),Hawk
3,4,Accipitriformes,Accipitridae,Accipiter gentilis laingi,Northern Goshawk (laingi subspecies),Hawk
4,5,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk


In [14]:
# Inspect the last rows to confirm the final species_code values
species.tail(n=5)

Unnamed: 0,species_code,order,family,scientific_name,common_name,raptor_group
1172,1173,Passeriformes,Emberizidae,Zonotrichia leucophrys leucophrys,White-Crowned Sparrow (leucophrys subspecies),
1173,1174,Passeriformes,Emberizidae,Zonotrichia leucophrys oriantha,White-Crowned Sparrow (Mountain Subspecies),
1174,1175,Passeriformes,Emberizidae,Zonotrichia leucophrys pugetensis,White-Crowned Sparrow (pugetensis subspecies),
1175,1176,Passeriformes,Emberizidae,Zonotrichia querula,Harris's Sparrow,
1176,1177,Passeriformes,Zosteropidae,Zosterops japonicus,Japanese White-Eye,


----
&nbsp;
### Matching species data with the park records

In [15]:
# Preview the park records before species_code is merged in
birds.head(n=5)

Unnamed: 0,species_id,park_code,park_name,order,family,scientific_name,common_name,raptor_group,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_sci_fam,is_raptor
0,ACAD-1055,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter cooperii,Cooper's Hawk,Hawk,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Accipitridae,True
1,ACAD-1056,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter gentilis,Northern Goshawk,Hawk,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,Accipitridae,True
2,ACAD-1057,ACAD,Acadia,Accipitriformes,Accipitridae,Accipiter striatus,Sharp-Shinned Hawk,Hawk,Approved,Present,Native,Common,Breeder,Species of Concern,True,Accipitridae,True
3,ACAD-1058,ACAD,Acadia,Accipitriformes,Accipitridae,Aquila chrysaetos,Golden Eagle,Eagle,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Accipitridae,True
4,ACAD-1059,ACAD,Acadia,Accipitriformes,Accipitridae,Buteo jamaicensis,Red-Tailed Hawk,Hawk,Approved,Present,Native,Common,Breeder,Least Concern,False,Accipitridae,True


In [16]:
# Columns that together identify one row of `species`
species_key = ['order', 'family', 'scientific_name', 'common_name', 'raptor_group']

# Attach species_code to every park record. `merge` returns a new frame, so
# `birds` itself is left untouched. `validate` makes pandas raise if the key is
# not unique in `species`, which would otherwise silently duplicate records.
birds_transform = birds.merge(
    species[['species_code'] + species_key],
    on=species_key,
    how='left',
    validate='many_to_one',
)

# A left join leaves NaN where no species row matched; fail loudly here rather
# than carrying records with a missing key forward.
unmatched = int(birds_transform['species_code'].isna().sum())
assert unmatched == 0, f"{unmatched} records have no matching species_code"

# Drop the original record id, the park name and the species attributes;
# park_code and species_code remain as the keys.
columns_to_drop = ['species_id', 'park_name'] + species_key
birds_transform = birds_transform.drop(columns=columns_to_drop)

# Move species_code so it sits immediately after park_code
cols = birds_transform.columns.tolist()
cols.insert(cols.index('park_code') + 1, cols.pop(cols.index('species_code')))
birds_transform = birds_transform[cols]

In [17]:
# Preview the transformed records: park_code and species_code followed by the record attributes
birds_transform.head(n=5)

Unnamed: 0,park_code,species_code,record_status,occurrence,nativeness,abundance,seasonality,conservation_status,is_protected,raptor_sci_fam,is_raptor
0,ACAD,1,Approved,Present,Native,Uncommon,Unknown,Species of Concern,True,Accipitridae,True
1,ACAD,2,Approved,Present,Native,Uncommon,Breeder,Least Concern,False,Accipitridae,True
2,ACAD,5,Approved,Present,Native,Common,Breeder,Species of Concern,True,Accipitridae,True
3,ACAD,89,Approved,Present,Native,Occasional,Vagrant,Species of Concern,True,Accipitridae,True
4,ACAD,156,Approved,Present,Native,Common,Breeder,Least Concern,False,Accipitridae,True


----
&nbsp;
#### Checking composite keys `park_code` & `species_code` with species and park data

In [18]:
# Write the park records and the species table to CSV without the index column.
# NOTE(review): species_code is zero-padded text; anything reading these files
# back should load that column as a string to keep the padding.
birds_transform.to_csv(path_or_buf='DATA/records.csv', index=False)
species.to_csv(path_or_buf='DATA/species.csv', index=False)

In [23]:
# Load the parks point-geometry GeoJSON
parks_points = gpd.read_file(filename='DATA/parks_points.geojson')
parks_points.head()

Unnamed: 0,park_code,park_name,state,square_km,geometry
0,ACAD,Acadia,ME,191.78,POINT (-68.21 44.35)
1,ARCH,Arches,UT,309.66,POINT (-109.57 38.68)
2,BADL,Badlands,SD,982.4,POINT (-102.5 43.75)
3,BIBE,Big Bend,TX,3242.19,POINT (-103.25 29.25)
4,BISC,Biscayne,FL,699.8,POINT (-80.08 25.65)


In [24]:
# Load the parks boundary (polygon) GeoJSON
# NOTE(review): the coordinates displayed for this file look projected (large
# metre-like values) while parks_points shows degrees — confirm the CRS of each
# before combining them.
parks_shapes = gpd.read_file(filename='DATA/parks_shapes.geojson')
parks_shapes.head()

Unnamed: 0,park_code,park_name,state,square_km,geometry
0,ACAD,Acadia,ME,191.78,"MULTIPOLYGON (((-7594877.717 5526905.373, -759..."
1,ARCH,Arches,UT,309.66,"POLYGON ((-12196390.077 4665212.372, -12196457..."
2,BADL,Badlands,SD,982.4,"MULTIPOLYGON (((-11385429.267 5395282.538, -11..."
3,BIBE,Big Bend,TX,3242.19,"POLYGON ((-11449193.174 3421065.347, -11449322..."
4,BISC,Biscayne,FL,699.8,"POLYGON ((-8944117.558 2930921.986, -8944110.4..."


##### Randomly selecting 10 indices from the `birds` DataFrame to recreate the records in a test environment

In [20]:
# Fixed seed so the same 10 row positions are drawn on every run
random.seed(42)
selected_indices = random.sample(range(len(birds)), 10)

# iloc is positional; the index of `birds` was reset earlier, so labels equal positions
selected_records = birds.iloc[selected_indices]

# Map each selected row label to its species attributes
species_fields = ['order', 'family', 'scientific_name', 'common_name', 'raptor_group']
selected_data_dict = selected_records[species_fields].to_dict(orient='index')

for i, data in selected_data_dict.items():
    print(f"{i}: {data}")

10476: {'order': 'Anseriformes', 'family': 'Anatidae', 'scientific_name': 'Branta canadensis', 'common_name': 'Canada Goose', 'raptor_group': 'N/A'}
1824: {'order': 'Pelecaniformes', 'family': 'Pelecanidae', 'scientific_name': 'Pelecanus erythrorhynchos', 'common_name': 'American White Pelican', 'raptor_group': 'N/A'}
409: {'order': 'Columbiformes', 'family': 'Columbidae', 'scientific_name': 'Zenaida macroura', 'common_name': 'Mourning Dove', 'raptor_group': 'N/A'}
12149: {'order': 'Strigiformes', 'family': 'Strigidae', 'scientific_name': 'Aegolius acadicus', 'common_name': 'Northern Saw-Whet Owl', 'raptor_group': 'Owl'}
4506: {'order': 'Gruiformes', 'family': 'Rallidae', 'scientific_name': 'Fulica americana', 'common_name': 'American Coot', 'raptor_group': 'N/A'}
4012: {'order': 'Charadriiformes', 'family': 'Scolopacidae', 'scientific_name': 'Calidris bairdii', 'common_name': "Baird's Sandpiper", 'raptor_group': 'N/A'}
3657: {'order': 'Passeriformes', 'family': 'Parulidae', 'scientifi