In [1]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine
from pathlib import Path
import sqlite3
import polars as pl #one additional library not covered in class related to data engineering

In [2]:
# Read the three source CSV files.
# parks.csv is read with Polars, so parks_df is a Polars DataFrame.
parks_df = pl.read_csv('csv files/parks.csv')
# observations.csv and species_info.csv are read with pandas.
observations_df = pd.read_csv('csv files/observations.csv')
species_info_df = pd.read_csv('csv files/species_info.csv')

In [3]:
# Shorten "Bryce Canyon National Park" to "Bryce National Park" in the
# "Park Name" column so the value lines up with the observations csv file.
# The expression keeps the column name "Park Name", so with_columns
# overwrites that column in place; replace_all reads the pattern as a regex,
# and this pattern contains no regex metacharacters.
parks_df = parks_df.with_columns(
    pl.col("Park Name").str.replace_all(
        "Bryce Canyon National Park",
        "Bryce National Park",
    )
)

In [4]:
# Switch the two Polars column headers to the snake_case names used by the
# observations csv file ("Park Name" -> "park_name", "Park Code" -> "park_code").
parks_df = parks_df.rename({"Park Name": "park_name", "Park Code": "park_code"})

In [5]:
# Write the cleaned-up Polars parks frame to its own csv file,
# separate from the original parks.csv
parks_df.write_csv(file="csv files/parks_updated.csv")

In [6]:
# Open a sqlite3 connection to the local file national_park_species_db.sql
# (sqlite3.connect creates the file if it does not exist yet).
# NOTE(review): the earlier note said this database was created in pgAdmin;
# sqlite3 only opens SQLite database files, it does not connect to a
# PostgreSQL server -- confirm this file is meant to be a SQLite database.
conn = sqlite3.connect(database="national_park_species_db.sql")

In [7]:
# Load the observations csv file into the "observations" table.
observations_sql_df = pd.read_csv("csv files/observations.csv")
# if_exists="append" adds the rows again on every execution, so re-running the
# notebook would store each observation more than once. Empty the table first
# (only when it already exists) so it always ends up with exactly one copy of
# the csv. DELETE is used rather than if_exists="replace" so that an existing
# table definition is left untouched.
observations_table_exists = conn.execute(
    "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'observations'"
).fetchone() is not None
if observations_table_exists:
    conn.execute("DELETE FROM observations")
# Last expression of the cell: to_sql returns the number of rows written.
observations_sql_df.to_sql("observations", conn, if_exists="append", index=False)

23296

In [8]:
# Load the parks_updated csv file into the "parks_updated" table.
parks_sql_df = pd.read_csv("csv files/parks_updated.csv")
# if_exists="append" adds the rows again on every execution, so re-running the
# notebook would store each park more than once. Empty the table first (only
# when it already exists) so it always ends up with exactly one copy of the
# csv. DELETE is used rather than if_exists="replace" so that an existing
# table definition is left untouched.
parks_table_exists = conn.execute(
    "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'parks_updated'"
).fetchone() is not None
if parks_table_exists:
    conn.execute("DELETE FROM parks_updated")
# Last expression of the cell: to_sql returns the number of rows written.
parks_sql_df.to_sql("parks_updated", conn, if_exists="append", index=False)

56

In [9]:
# Load the species_info csv file into the "species_info" table.
species_sql_df = pd.read_csv("csv files/species_info.csv")
# if_exists="append" adds the rows again on every execution, so re-running the
# notebook would store each species row more than once. Empty the table first
# (only when it already exists) so it always ends up with exactly one copy of
# the csv. DELETE is used rather than if_exists="replace" so that an existing
# table definition is left untouched.
species_table_exists = conn.execute(
    "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'species_info'"
).fetchone() is not None
if species_table_exists:
    conn.execute("DELETE FROM species_info")
# Last expression of the cell: to_sql returns the number of rows written.
species_sql_df.to_sql("species_info", conn, if_exists="append", index=False)

5824

In [10]:
# Left-join the observations DataFrame (base) with the parks_updated DataFrame
# on park_name. The merge runs in pandas on the DataFrames read from the csv
# files; the database tables are not queried here.
combined_sql_df = observations_sql_df.merge(parks_sql_df, how="left", on="park_name")

In [11]:
# Left-join the running result with the species_info DataFrame on
# scientific_name (again an in-memory pandas merge).
# NOTE(review): if species_info repeats a scientific_name, this join repeats
# the matching observation rows -- confirm the key is unique.
combined_sql_df = combined_sql_df.merge(species_sql_df, how="left", on="scientific_name")

In [12]:
# Write the merged data to National_Parks_Dataset.csv, then load that same
# file back into a new DataFrame named df
file_path = "csv files/National_Parks_Dataset.csv"
combined_sql_df.to_csv(file_path, index=False)
df = pd.read_csv(file_path)

In [13]:
# Column order for the final dataset, listed one per line
desired_columns = [
    # columns describing the park
    "park_name",
    "park_code",
    "State",
    "Latitude",
    "Longitude",
    "Acres",
    # columns describing the species and its observations
    "category",
    "common_names",
    "scientific_name",
    "observations",
    "conservation_status",
]

In [14]:
# Select every row with the columns arranged as listed in desired_columns
df_reordered = df.loc[:, desired_columns]

In [15]:
# Overwrite National_Parks_Dataset.csv with the reordered columns
# (index=False keeps the DataFrame index out of the file)
output_path = "csv files/National_Parks_Dataset.csv"
df_reordered.to_csv(path_or_buf=output_path, index=False)