# National Parks Data Cleaning

This notebook cleans the visitor data and geojson files to create optimized datasets for visualization.

In [1]:
import polars as pl
import json

## Load Visitor Data

Load the CSV and select only the columns we need for visualization.

In [2]:
# Load full visitor data
df_visits = pl.read_csv('data/visit_data.csv')

# Select only needed columns
s_df = df_visits.select(['Year', 'Month', 'RecreationVisits', 'UnitCode', 'ParkName'])

print(f"Shape: {s_df.shape}")
print(f"Unique parks: {s_df.select('UnitCode').unique().height}")
s_df.head()

Shape: (33395, 5)
Unique parks: 63


Year,Month,RecreationVisits,UnitCode,ParkName
i64,i64,i64,str,str
1979,1,6011,"""ACAD""","""Acadia NP"""
1979,2,5243,"""ACAD""","""Acadia NP"""
1979,3,11165,"""ACAD""","""Acadia NP"""
1979,4,219351,"""ACAD""","""Acadia NP"""
1979,5,339416,"""ACAD""","""Acadia NP"""


## Load and Process GeoJSON

Extract park names, codes, and coordinates from the GeoJSON file.

In [3]:
# Read GeoJSON
df_geo = pl.read_json('data/national-parks.geojson')

# Extract features and unnest the structure
g_df = (
    df_geo
    .select(pl.col('features').explode())
    .unnest('features')
    .select([
        pl.col('properties').struct.field('Name').alias('Name'),
        pl.col('properties').struct.field('Code').alias('Code'),
        pl.col('geometry').struct.field('coordinates').list.get(0).alias('long'),
        pl.col('geometry').struct.field('coordinates').list.get(1).alias('lat')
    ])
)

print(f"Total parks in geojson: {g_df.height}")
g_df.head()

Total parks in geojson: 396


Name,Code,long,lat
str,str,f64,f64
"""Frederick Law Olmsted National…","""FRLA""",-71.13113,42.325509
"""Gloria Dei Church National His…","""GLDE""",-75.143584,39.934377
"""John F Kennedy National Histor…","""JOFI""",-71.122964,42.346598
"""Longfellow House - Washington'…","""LONG""",-71.125896,42.377019
"""Roger Williams National Memori…","""ROWI""",-71.410801,41.828346


## Match Parks Between Datasets

Filter geographic data to only include parks that have visitor data.

In [4]:
# Get unique codes from visitor data
unique_codes = s_df.select('UnitCode').unique()

# Semi-join to keep only matching parks
result = g_df.join(unique_codes, left_on='Code', right_on='UnitCode', how='semi')

print(f"Matched parks: {result.height}")
result.head(10)

Matched parks: 62


Name,Code,long,lat
str,str,f64,f64
"""Jefferson National Expansion M…","""JEFF""",-90.185953,38.62254
"""National Park of American Samo…","""NPSA""",-169.453808,-14.239728
"""Virgin Islands National Park""","""VIIS""",-64.77111,18.353799
"""Hot Springs National Park""","""HOSP""",-93.089722,34.511138
"""Indiana Dunes National Lakesho…","""INDU""",-87.032415,41.661458
"""Acadia National Park""","""ACAD""",-68.244112,44.350751
"""Haleakala National Park""","""HALE""",-156.147093,20.704512
"""New River Gorge National River""","""NERI""",-80.989704,37.840267
"""Congaree National Park""","""CONG""",-80.818352,33.806204
"""Black Canyon Of The Gunnison N…","""BLCA""",-107.72398,38.577392


## Find Missing Parks

Identify which parks have visitor data but no geographic coordinates.

In [19]:
# Find codes in visitor data that are NOT in geo data
s_codes = s_df.select('UnitCode').unique()
g_codes = g_df.select('Code').unique()

missing_codes = s_codes.join(g_codes, left_on='UnitCode', right_on='Code', how='anti')

print("Missing park codes:")
print(missing_codes)

# Show park names for missing codes
print("\nMissing park details:")
s_df.filter(pl.col('UnitCode').is_in(missing_codes['UnitCode'])).select(['ParkName', 'UnitCode']).unique()

Missing park codes:
shape: (1, 1)
┌──────────┐
│ UnitCode │
│ ---      │
│ str      │
╞══════════╡
│ SEQU     │
└──────────┘

Missing park details:


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  s_df.filter(pl.col('UnitCode').is_in(missing_codes['UnitCode'])).select(['ParkName', 'UnitCode']).unique()


ParkName,UnitCode
str,str
"""Sequoia NP""","""SEQU"""


In [11]:
g_df.filter(pl.col("Name").str.contains("Sequoia"))

Name,Code,long,lat
str,str,f64,f64
"""Sequoia & Kings Canyon Nationa…","""SEKI""",-118.575201,36.507853


So the reason this is missing is because Sequoia and Kings Canyon is combined together in this case ... 

But in the other one: 

In [14]:
s_df.filter(pl.col('ParkName').str.contains("Sequoia"))

Year,Month,RecreationVisits,UnitCode,ParkName
i64,i64,i64,str,str
1979,1,25338,"""SEQU""","""Sequoia NP"""
1979,2,22334,"""SEQU""","""Sequoia NP"""
1979,3,24271,"""SEQU""","""Sequoia NP"""
1979,4,47410,"""SEQU""","""Sequoia NP"""
1979,5,63854,"""SEQU""","""Sequoia NP"""
…,…,…,…,…
2023,8,171867,"""SEQU""","""Sequoia NP"""
2023,9,142544,"""SEQU""","""Sequoia NP"""
2023,10,106350,"""SEQU""","""Sequoia NP"""
2023,11,68745,"""SEQU""","""Sequoia NP"""


In [16]:
s_df.filter(pl.col('ParkName').str.contains("Kings"))

Year,Month,RecreationVisits,UnitCode,ParkName
i64,i64,i64,str,str
1979,1,19924,"""KICA""","""Kings Canyon NP"""
1979,2,18242,"""KICA""","""Kings Canyon NP"""
1979,3,18114,"""KICA""","""Kings Canyon NP"""
1979,4,37101,"""KICA""","""Kings Canyon NP"""
1979,5,63777,"""KICA""","""Kings Canyon NP"""
…,…,…,…,…
2023,8,81074,"""KICA""","""Kings Canyon NP"""
2023,9,88018,"""KICA""","""Kings Canyon NP"""
2023,10,56050,"""KICA""","""Kings Canyon NP"""
2023,11,31780,"""KICA""","""Kings Canyon NP"""


In [17]:
g_df.filter(pl.col("Name").str.contains("Kings"))

Name,Code,long,lat
str,str,f64,f64
"""Kings Mountain National Milita…","""KIMO""",-81.389794,35.141257
"""Kings Canyon National Park""","""KICA""",-118.593841,36.874942
"""Sequoia & Kings Canyon Nationa…","""SEKI""",-118.575201,36.507853


## Add Missing Parks Manually

Alright, so really the only one missing is the `SEQU` longtiude and latitude code. 

In [24]:
# Manual coordinates for missing parks (add as needed)
missing_parks = pl.DataFrame({
    'Name': ['Sequoia NP'],  # Add more as needed
    'Code': ['SEQU'],
    'long': [-118.5658],
    'lat': [36.4864],
})

# Combine with matched parks
final_parks = pl.concat([result, missing_parks])

print(f"Final park count: {final_parks.height}")
final_parks.sort('Code')

Final park count: 63


Name,Code,long,lat
str,str,f64,f64
"""Acadia National Park""","""ACAD""",-68.244112,44.350751
"""Arches National Park""","""ARCH""",-109.595456,38.707918
"""Badlands National Park""","""BADL""",-102.393945,43.834062
"""Big Bend National Park""","""BIBE""",-103.22979,29.298178
"""Biscayne National Park""","""BISC""",-80.218302,25.469689
…,…,…,…
"""Wind Cave National Park""","""WICA""",-103.428776,43.607872
"""Wrangell - St Elias National P…","""WRST""",-142.603571,61.418439
"""Yellowstone National Park""","""YELL""",-110.490083,44.583032
"""Yosemite National Park""","""YOSE""",-119.556007,37.848937


In [27]:
# Join visitor data with park coordinates
combined_df = s_df.join(
    final_parks, 
    left_on='UnitCode', 
    right_on='Code', 
    how='inner'
)

# Now each visitor record has lat/long coordinates
print(f"Combined records: {combined_df.height}")
combined_df.head()

Combined records: 33395


Year,Month,RecreationVisits,UnitCode,ParkName,Name,long,lat
i64,i64,i64,str,str,str,f64,f64
1979,1,6011,"""ACAD""","""Acadia NP""","""Acadia National Park""",-68.244112,44.350751
1979,2,5243,"""ACAD""","""Acadia NP""","""Acadia National Park""",-68.244112,44.350751
1979,3,11165,"""ACAD""","""Acadia NP""","""Acadia National Park""",-68.244112,44.350751
1979,4,219351,"""ACAD""","""Acadia NP""","""Acadia National Park""",-68.244112,44.350751
1979,5,339416,"""ACAD""","""Acadia NP""","""Acadia National Park""",-68.244112,44.350751


## Export Cleaned Data

Save the cleaned datasets for use in the dashboard.

In [28]:
# Export cleaned visitor data
combined_df.write_csv('data/visit_data_clean.csv')
print("✓ Exported visit_data_clean.csv")

✓ Exported visit_data_clean.csv


In [31]:
final_parks.write_json("data/clean_park.csv")

In [32]:
final_parks

Name,Code,long,lat
str,str,f64,f64
"""Jefferson National Expansion M…","""JEFF""",-90.185953,38.62254
"""National Park of American Samo…","""NPSA""",-169.453808,-14.239728
"""Virgin Islands National Park""","""VIIS""",-64.77111,18.353799
"""Hot Springs National Park""","""HOSP""",-93.089722,34.511138
"""Indiana Dunes National Lakesho…","""INDU""",-87.032415,41.661458
…,…,…,…
"""Kenai Fjords National Park""","""KEFJ""",-149.881524,59.953897
"""Kobuk Valley National Park""","""KOVA""",-159.063083,67.282637
"""Lake Clark National Park & Pre…","""LACL""",-153.601844,60.646797
"""Wrangell - St Elias National P…","""WRST""",-142.603571,61.418439


In [43]:
s_df.select(pl.exclude("ParkName"))

Year,Month,RecreationVisits,UnitCode
i64,i64,i64,str
1979,1,6011,"""ACAD"""
1979,2,5243,"""ACAD"""
1979,3,11165,"""ACAD"""
1979,4,219351,"""ACAD"""
1979,5,339416,"""ACAD"""
…,…,…,…
2023,8,458449,"""ZION"""
2023,9,491799,"""ZION"""
2023,10,504584,"""ZION"""
2023,11,284978,"""ZION"""


In [39]:
(2023 - 1979) * 12 

528