In [None]:
# import dependencies
import os

import pandas as pd
import polars as pl #one additional library not covered in class related to data engineering
import psycopg2

In [2]:
# Read the three raw CSV exports. The parks file goes through Polars; the
# observations and species files go through pandas.
parks_df = pl.read_csv('resources/parks.csv')
observations_df, species_info_df = (
    pd.read_csv(csv_path)
    for csv_path in ('resources/observations.csv', 'resources/species_info.csv')
)

In [3]:
# Rename "Bryce Canyon National Park" to "Bryce National Park" so the park name
# matches the spelling used in the observations csv file.
# literal=True makes Polars treat the search text as a plain string instead of a
# regular expression, so the match stays exact even if the text ever contains
# regex metacharacters. with_columns overwrites the existing "Park Name" column
# because the expression keeps that column's name, so no alias is needed.
parks_df = parks_df.with_columns(
    pl.col("Park Name").str.replace_all(
        "Bryce Canyon National Park", "Bryce National Park", literal=True
    )
)

In [4]:
# Switch the two Polars column headers to the snake_case names used by the
# observations csv file.
column_renames = {"Park Name": "park_name", "Park Code": "park_code"}
parks_df = parks_df.rename(column_renames)

In [5]:
# Persist the cleaned-up parks table as a new CSV using the Polars writer.
updated_parks_path = "resources/parks_updated.csv"
parks_df.write_csv(updated_parks_path)

In [6]:
# Connect this notebook to the PostgreSQL database that was created in pgAdmin.
# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGUSER, PGPASSWORD, PGDATABASE, PGPORT) so real credentials do not
# have to be saved inside the notebook. The fallbacks are the local-development
# values this notebook was originally written against, so it still runs
# unchanged when none of the variables are set.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    dbname=os.environ.get("PGDATABASE", "national_park_species_db"),
    port=int(os.environ.get("PGPORT", "5432")),
)

In [7]:
# Open a cursor on the database connection; the query cells below run their SQL through it.
cur = conn.cursor()

In [8]:
# Pull every row of the observations table into memory.
observations_sql = "select * from observations"
cur.execute(observations_sql)
observations = cur.fetchall()

In [9]:
# Pull every row of the parks_updated table into memory.
parks_updated_sql = "select * from parks_updated"
cur.execute(parks_updated_sql)
parks_updated = cur.fetchall()

In [10]:
# Pull every row of the species_info table into memory.
species_info_sql = "select * from species_info"
cur.execute(species_info_sql)
species_info = cur.fetchall()

In [11]:
# Sanity check: show the first three fetched rows of each table, one line per table.
previews = {
    "Observations (first 3 rows):": observations,
    "Parks Updated (first 3 rows):": parks_updated,
    "Species Info (first 3 rows):": species_info,
}
for heading, rows in previews.items():
    print(heading, rows[:3])

Observations (first 3 rows): [('Vicia benghalensis', 'Great Smoky Mountains National Park', 68), ('Neovison vison', 'Great Smoky Mountains National Park', 77), ('Prunus subcordata', 'Yosemite National Park', 138)]
Parks Updated (first 3 rows): [('ACAD', 'Acadia National Park', 'ME', 47390, 44.35, -68.21), ('ARCH', 'Arches National Park', 'UT', 76519, 38.68, -109.57), ('BADL', 'Badlands National Park', 'SD', 242756, 43.75, -102.5)]
Species Info (first 3 rows): [('Mammal', 'Clethrionomys gapperi gapperi', "Gapper's Red-Backed Vole", None), ('Mammal', 'Bos bison', 'American Bison, Bison', None), ('Mammal', 'Bos taurus', 'Aurochs, Aurochs, Domestic Cattle (Feral), Domesticated Cattle', None)]


In [12]:
# Join all three tables into one wide row per observation record:
#   observations (o) -> parks_updated (p), matched on park_name
#                    -> species_info (s),  matched on scientific_name
# Both joins are LEFT JOINs, so an observation with no matching park or species
# row is still returned, with NULL (None in Python) in the p.* / s.* columns.
# NOTE(review): if species_info holds more than one row per scientific_name,
# each matching observation is repeated once per match — confirm the key is unique.
query = """
    SELECT o.*, 
           p.park_code, p.State, p.Acres, p.Latitude, p.Longitude,
           s.category, s.common_names, s.conservation_status
    FROM observations o
    LEFT JOIN parks_updated p ON o.park_name = p.park_name
    LEFT JOIN species_info s ON o.scientific_name = s.scientific_name
"""

# Run the join and fetch the entire result set into memory.
cur.execute(query)
final_joined_data = cur.fetchall()

In [13]:
# Preview result
for row in final_joined_data[:3]:
    print(row)

('Vicia benghalensis', 'Great Smoky Mountains National Park', 68, 'GRSM', 'TN, NC', 521490, 35.68, -83.53, 'Vascular Plant', 'Purple Vetch, Reddish Tufted Vetch', None)
('Neovison vison', 'Great Smoky Mountains National Park', 77, 'GRSM', 'TN, NC', 521490, 35.68, -83.53, 'Mammal', 'American Mink', None)
('Prunus subcordata', 'Yosemite National Park', 138, 'YOSE', 'CA', 761266, 37.83, -119.5, 'Vascular Plant', 'Klamath Plum', None)


In [14]:
# Column labels for the joined rows, in the same order the SELECT statement
# returns them: the observations columns first, then the parks columns, then
# the species columns.
observation_fields = ['scientific_name', 'park_name', 'observations']
park_fields = ['park_code', 'State', 'Acres', 'Latitude', 'Longitude']
species_fields = ['category', 'common_names', 'conservation_status']

original_columns = observation_fields + park_fields + species_fields

In [15]:
# Wrap the fetched join rows in a pandas DataFrame, labelling the columns in
# SELECT order.
df = pd.DataFrame(
    data=final_joined_data,
    columns=original_columns,
)

In [16]:
# Put the columns in their final presentation order: the park-level fields
# first, followed by the species-level fields.
park_level = ['park_name', 'park_code', 'State', 'Latitude', 'Longitude', 'Acres']
species_level = ['category', 'common_names', 'scientific_name', 'observations', 'conservation_status']
new_order = park_level + species_level

df = df[new_order]

In [17]:
# Write the combined dataset to disk as a CSV, leaving out the DataFrame index.
output_csv = "resources/National_Parks_Dataset.csv"
df.to_csv(output_csv, index=False)