### Step 1: Import Dependencies required to perform data transformation steps

In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path

### Step 2: Read in csv files containing data extractions from various API calls to the National Park Service webpage.

In [2]:
NPS_ThingsToDo_df = pd.read_csv("../Extract/NPS Amenities Info/NPS_Project_Extracted_Data/nps_ttd_data.csv")
NPS_ThingsToDo_df.head(5)

Unnamed: 0,tags,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,season,duration,parkCode,designation,fullName,states
0,"['Acadia National Park', 'Echo Lake', 'echo la...",Echo Lake Beach,False,True,[{'id': '587BB2D3-EC35-41B2-B3F7-A39E2B088AEE'...,[],False,['Summer'],,acad,National Park,Acadia National Park,ME
1,"['stargazing', 'night skies', 'Acadia National...",Cadillac Mountain,False,True,[{'id': 'D37A0003-8317-4F04-8FB0-4CF0A272E195'...,[],False,"['Spring', 'Summer', 'Fall']",,acad,National Park,Acadia National Park,ME
2,[],Acadia Mountain Loop,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"['Spring', 'Summer', 'Fall']",2-3 Hours,acad,National Park,Acadia National Park,ME
3,[],Sargent and Penobscot Mountains from JPH,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"['Spring', 'Summer', 'Fall']",,acad,National Park,Acadia National Park,ME
4,[],Jordan Pond Loop,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"['Spring', 'Summer', 'Fall']",1-2 Hours,acad,National Park,Acadia National Park,ME


In [3]:
NPS_ThingsToDo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   tags                   87 non-null     object
 1   location               83 non-null     object
 2   isReservationRequired  87 non-null     bool  
 3   arePetsPermitted       87 non-null     bool  
 4   activities             87 non-null     object
 5   amenities              87 non-null     object
 6   doFeesApply            87 non-null     bool  
 7   season                 87 non-null     object
 8   duration               50 non-null     object
 9   parkCode               87 non-null     object
 10  designation            87 non-null     object
 11  fullName               87 non-null     object
 12  states                 87 non-null     object
dtypes: bool(3), object(10)
memory usage: 7.2+ KB


In [4]:
# Remove "[" and "]" from the "tags" column
NPS_ThingsToDo_df['tags'] = NPS_ThingsToDo_df['tags'].str.strip('[]').str.replace("'", "")

# Split the "tags" column into multiple columns dynamically
tags_split = NPS_ThingsToDo_df['tags'].str.split(',', expand=True)

# Rename the columns to tag1, tag2, tag3, and so forth
tags_split.columns = [f"tag{i+1}" for i in range(tags_split.shape[1])]

# Concatenate the split columns with the original DataFrame
NPS_ThingsToDo_df = pd.concat([NPS_ThingsToDo_df, tags_split], axis=1)

# Display the updated DataFrame
NPS_ThingsToDo_df

Unnamed: 0,tags,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,season,duration,parkCode,...,tag7,tag8,tag9,tag10,tag11,tag12,tag13,tag14,tag15,tag16
0,"Acadia National Park, Echo Lake, echo lake bea...",Echo Lake Beach,False,True,[{'id': '587BB2D3-EC35-41B2-B3F7-A39E2B088AEE'...,[],False,['Summer'],,acad,...,great ponds,,,,,,,,,
1,"stargazing, night skies, Acadia National Park,...",Cadillac Mountain,False,True,[{'id': 'D37A0003-8317-4F04-8FB0-4CF0A272E195'...,[],False,"['Spring', 'Summer', 'Fall']",,acad,...,mount desert island,,,,,,,,,
2,,Acadia Mountain Loop,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"['Spring', 'Summer', 'Fall']",2-3 Hours,acad,...,,,,,,,,,,
3,,Sargent and Penobscot Mountains from JPH,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"['Spring', 'Summer', 'Fall']",,acad,...,,,,,,,,,,
4,,Jordan Pond Loop,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"['Spring', 'Summer', 'Fall']",1-2 Hours,acad,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,"Acadia National Park, lakes, great ponds, swim...",Lake Wood Swimming Area,False,True,[{'id': '587BB2D3-EC35-41B2-B3F7-A39E2B088AEE'...,[],False,['Summer'],,acad,...,,,,,,,,,,
83,"Acadia National Park, Sand Beach, swimming, At...",Sand Beach,False,True,[{'id': '587BB2D3-EC35-41B2-B3F7-A39E2B088AEE'...,[],False,"['Spring', 'Summer', 'Fall']",,acad,...,,,,,,,,,,
84,"Acadia National Park, birding, ornithology, se...",Seawall Picnic Area,False,True,[{'id': '5A2C91D1-50EC-4B24-8BED-A2E11A1892DF'...,[],False,"['Winter', 'Spring', 'Summer', 'Fall']",,acad,...,birdwatching,,,,,,,,,
85,"Acadia National Park, birding, birdwatching, i...",Isle au Haut,False,True,[{'id': '5A2C91D1-50EC-4B24-8BED-A2E11A1892DF'...,[],False,"['Spring', 'Summer', 'Fall']",,acad,...,,,,,,,,,,


In [5]:
# Remove "[" and "]" from the "seasons" column
NPS_ThingsToDo_df['season'] = NPS_ThingsToDo_df['season'].str.strip('[]').str.replace("'", "")

# Split the "tags" column into multiple columns dynamically
season_split = NPS_ThingsToDo_df['season'].str.split(',', expand=True)

# Rename the columns to season1, season2, season3, and so forth
season_split.columns = [f"season{i+1}" for i in range(season_split.shape[1])]

# Concatenate the split columns with the original DataFrame
NPS_ThingsToDo_df = pd.concat([NPS_ThingsToDo_df, season_split], axis=1)

# Display the updated DataFrame
NPS_ThingsToDo_df

Unnamed: 0,tags,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,season,duration,parkCode,...,tag11,tag12,tag13,tag14,tag15,tag16,season1,season2,season3,season4
0,"Acadia National Park, Echo Lake, echo lake bea...",Echo Lake Beach,False,True,[{'id': '587BB2D3-EC35-41B2-B3F7-A39E2B088AEE'...,[],False,Summer,,acad,...,,,,,,,Summer,,,
1,"stargazing, night skies, Acadia National Park,...",Cadillac Mountain,False,True,[{'id': 'D37A0003-8317-4F04-8FB0-4CF0A272E195'...,[],False,"Spring, Summer, Fall",,acad,...,,,,,,,Spring,Summer,Fall,
2,,Acadia Mountain Loop,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"Spring, Summer, Fall",2-3 Hours,acad,...,,,,,,,Spring,Summer,Fall,
3,,Sargent and Penobscot Mountains from JPH,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"Spring, Summer, Fall",,acad,...,,,,,,,Spring,Summer,Fall,
4,,Jordan Pond Loop,False,True,[{'id': '45261C0A-00D8-4C27-A1F8-029F933A0D34'...,[],False,"Spring, Summer, Fall",1-2 Hours,acad,...,,,,,,,Spring,Summer,Fall,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,"Acadia National Park, lakes, great ponds, swim...",Lake Wood Swimming Area,False,True,[{'id': '587BB2D3-EC35-41B2-B3F7-A39E2B088AEE'...,[],False,Summer,,acad,...,,,,,,,Summer,,,
83,"Acadia National Park, Sand Beach, swimming, At...",Sand Beach,False,True,[{'id': '587BB2D3-EC35-41B2-B3F7-A39E2B088AEE'...,[],False,"Spring, Summer, Fall",,acad,...,,,,,,,Spring,Summer,Fall,
84,"Acadia National Park, birding, ornithology, se...",Seawall Picnic Area,False,True,[{'id': '5A2C91D1-50EC-4B24-8BED-A2E11A1892DF'...,[],False,"Winter, Spring, Summer, Fall",,acad,...,,,,,,,Winter,Spring,Summer,Fall
85,"Acadia National Park, birding, birdwatching, i...",Isle au Haut,False,True,[{'id': '5A2C91D1-50EC-4B24-8BED-A2E11A1892DF'...,[],False,"Spring, Summer, Fall",,acad,...,,,,,,,Spring,Summer,Fall,


In [6]:
# Remove "[" and "]" from the "activities" column
NPS_ThingsToDo_df['activities'] = NPS_ThingsToDo_df['season'].str.strip('[]').str.replace("'", "")

NPS_ThingsToDo_df

Unnamed: 0,tags,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,season,duration,parkCode,...,tag11,tag12,tag13,tag14,tag15,tag16,season1,season2,season3,season4
0,"Acadia National Park, Echo Lake, echo lake bea...",Echo Lake Beach,False,True,Summer,[],False,Summer,,acad,...,,,,,,,Summer,,,
1,"stargazing, night skies, Acadia National Park,...",Cadillac Mountain,False,True,"Spring, Summer, Fall",[],False,"Spring, Summer, Fall",,acad,...,,,,,,,Spring,Summer,Fall,
2,,Acadia Mountain Loop,False,True,"Spring, Summer, Fall",[],False,"Spring, Summer, Fall",2-3 Hours,acad,...,,,,,,,Spring,Summer,Fall,
3,,Sargent and Penobscot Mountains from JPH,False,True,"Spring, Summer, Fall",[],False,"Spring, Summer, Fall",,acad,...,,,,,,,Spring,Summer,Fall,
4,,Jordan Pond Loop,False,True,"Spring, Summer, Fall",[],False,"Spring, Summer, Fall",1-2 Hours,acad,...,,,,,,,Spring,Summer,Fall,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,"Acadia National Park, lakes, great ponds, swim...",Lake Wood Swimming Area,False,True,Summer,[],False,Summer,,acad,...,,,,,,,Summer,,,
83,"Acadia National Park, Sand Beach, swimming, At...",Sand Beach,False,True,"Spring, Summer, Fall",[],False,"Spring, Summer, Fall",,acad,...,,,,,,,Spring,Summer,Fall,
84,"Acadia National Park, birding, ornithology, se...",Seawall Picnic Area,False,True,"Winter, Spring, Summer, Fall",[],False,"Winter, Spring, Summer, Fall",,acad,...,,,,,,,Winter,Spring,Summer,Fall
85,"Acadia National Park, birding, birdwatching, i...",Isle au Haut,False,True,"Spring, Summer, Fall",[],False,"Spring, Summer, Fall",,acad,...,,,,,,,Spring,Summer,Fall,


In [7]:
# Drop the original "tags" and "seasons" columns
NPS_ThingsToDo_df = NPS_ThingsToDo_df.drop(columns=['tags', 'season'])

NPS_ThingsToDo_df.head(10)

Unnamed: 0,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,duration,parkCode,designation,fullName,...,tag11,tag12,tag13,tag14,tag15,tag16,season1,season2,season3,season4
0,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,,,,,,,Summer,,,
1,Cadillac Mountain,False,True,"Spring, Summer, Fall",[],False,,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
2,Acadia Mountain Loop,False,True,"Spring, Summer, Fall",[],False,2-3 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
3,Sargent and Penobscot Mountains from JPH,False,True,"Spring, Summer, Fall",[],False,,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
4,Jordan Pond Loop,False,True,"Spring, Summer, Fall",[],False,1-2 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
5,Sieur de Monts to Jesup Path and Hemlock Road ...,False,True,"Spring, Summer, Fall",[],False,1-2 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
6,Sieur de Monts to Sand Beach,False,True,"Spring, Summer, Fall",[],False,2-4 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
7,"Beech Cliff Trail, Beech Cliff Loop, and Canad...",False,False,"Spring, Summer, Fall",[],False,2-4 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
8,Great Head Trail,False,True,"Spring, Summer, Fall",[],False,1-2 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
9,The Ocean Path Trail,False,True,"Spring, Summer, Fall",[],False,1-2 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,


In [8]:
# Rename column to prepare for merge
NPS_ThingsToDo_df.rename(columns={"parkCode": "park_code", "fullName": "park_name", "designation": "park_designation"}, inplace=True)
NPS_ThingsToDo_df.head(5)

Unnamed: 0,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,duration,park_code,park_designation,park_name,...,tag11,tag12,tag13,tag14,tag15,tag16,season1,season2,season3,season4
0,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,,,,,,,Summer,,,
1,Cadillac Mountain,False,True,"Spring, Summer, Fall",[],False,,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
2,Acadia Mountain Loop,False,True,"Spring, Summer, Fall",[],False,2-3 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
3,Sargent and Penobscot Mountains from JPH,False,True,"Spring, Summer, Fall",[],False,,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,
4,Jordan Pond Loop,False,True,"Spring, Summer, Fall",[],False,1-2 Hours,acad,National Park,Acadia National Park,...,,,,,,,Spring,Summer,Fall,


In [9]:
NPS_ThingsToDo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   location               83 non-null     object
 1   isReservationRequired  87 non-null     bool  
 2   arePetsPermitted       87 non-null     bool  
 3   activities             87 non-null     object
 4   amenities              87 non-null     object
 5   doFeesApply            87 non-null     bool  
 6   duration               50 non-null     object
 7   park_code              87 non-null     object
 8   park_designation       87 non-null     object
 9   park_name              87 non-null     object
 10  states                 87 non-null     object
 11  tag1                   87 non-null     object
 12  tag2                   75 non-null     object
 13  tag3                   75 non-null     object
 14  tag4                   74 non-null     object
 15  tag5                   73

In [10]:
NPS_AmenitiesByPlace_df = pd.read_csv("../Extract/NPS Amenities Info/NPS_Project_Extracted_Data/nps_amen_place_data.csv")
NPS_AmenitiesByPlace_df.head(5)

Unnamed: 0,amenity_id,amenity_name,park_code,park_name,park_states,park_designation,park_url
0,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,badl,Badlands National Park,SD,National Park,http://www.nps.gov/badl/
1,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,bibe,Big Bend National Park,TX,National Park,http://www.nps.gov/bibe/
2,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,brca,Bryce Canyon National Park,UT,National Park,http://www.nps.gov/brca/
3,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,cabr,Cabrillo National Monument,CA,National Monument,http://www.nps.gov/cabr/
4,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,deva,Death Valley National Park,"CA,NV",National Park,http://www.nps.gov/deva/


In [11]:
NPS_AmenitiesByPlace_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8652 entries, 0 to 8651
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   amenity_id        8652 non-null   object
 1   amenity_name      8652 non-null   object
 2   park_code         8652 non-null   object
 3   park_name         8652 non-null   object
 4   park_states       8652 non-null   object
 5   park_designation  8216 non-null   object
 6   park_url          8640 non-null   object
dtypes: object(7)
memory usage: 473.3+ KB


In [12]:
NPS_Parks_df = pd.read_csv("../Extract/NPS Amenities Info/NPS_Project_Extracted_Data/nps_parks_data.csv")
NPS_Parks_df.head(5)

Unnamed: 0,park_id,park_url,park_name,park_code,park_latitude,park_longitude,park_designation
0,77E0D7F0-1942-494A-ACE2-9004D2BDC59E,https://www.nps.gov/abli/index.htm,Abraham Lincoln Birthplace National Historical...,abli,37.585866,-85.673305,National Historical Park
1,6DA17C86-088E-4B4D-B862-7C1BD5CF236B,https://www.nps.gov/acad/index.htm,Acadia National Park,acad,44.409286,-68.247501,National Park
2,E4C7784E-66A0-4D44-87D0-3E072F5FEF43,https://www.nps.gov/adam/index.htm,Adams National Historical Park,adam,42.255396,-71.011604,National Historical Park
3,1A47416F-DAA3-4137-9F30-14AF86B4E547,https://www.nps.gov/afam/index.htm,African American Civil War Memorial,afam,38.9166,-77.026,
4,E6E1D22A-7A89-47F8-813C-B611059A8CF9,https://www.nps.gov/afbg/index.htm,African Burial Ground National Monument,afbg,40.714527,-74.004474,National Monument


In [13]:
NPS_Parks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   park_id           471 non-null    object 
 1   park_url          471 non-null    object 
 2   park_name         471 non-null    object 
 3   park_code         471 non-null    object 
 4   park_latitude     470 non-null    float64
 5   park_longitude    470 non-null    float64
 6   park_designation  435 non-null    object 
dtypes: float64(2), object(5)
memory usage: 25.9+ KB


In [14]:
NPS_ActivitiesByPark_df = pd.read_csv("../Extract/NPS_Activities_Fees/NPS_Project_Extracted_Data/activities_parks.csv")
NPS_ActivitiesByPark_df.head(5)

Unnamed: 0,activity_id,activity_name,designation,fullName,name,parkCode,states,url
0,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Park,Acadia National Park,Acadia,acad,ME,https://www.nps.gov/acad/index.htm
1,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Monument,African Burial Ground National Monument,African Burial Ground,afbg,NY,https://www.nps.gov/afbg/index.htm
2,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Monument,Agate Fossil Beds National Monument,Agate Fossil Beds,agfo,NE,https://www.nps.gov/agfo/index.htm
3,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Monument,Alibates Flint Quarries National Monument,Alibates Flint Quarries,alfl,TX,https://www.nps.gov/alfl/index.htm
4,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Historic Trail,Ala Kahakai National Historic Trail,Ala Kahakai,alka,HI,https://www.nps.gov/alka/index.htm


In [15]:
# Rename column to prepare for merge
NPS_ActivitiesByPark_df.rename(columns={"parkCode": "park_code", "fullName": "park_name", "designation": "park_designation", "states": "park_states", "url": "park_url"}, inplace=True)
NPS_ActivitiesByPark_df.head(5)

Unnamed: 0,activity_id,activity_name,park_designation,park_name,name,park_code,park_states,park_url
0,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Park,Acadia National Park,Acadia,acad,ME,https://www.nps.gov/acad/index.htm
1,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Monument,African Burial Ground National Monument,African Burial Ground,afbg,NY,https://www.nps.gov/afbg/index.htm
2,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Monument,Agate Fossil Beds National Monument,Agate Fossil Beds,agfo,NE,https://www.nps.gov/agfo/index.htm
3,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Monument,Alibates Flint Quarries National Monument,Alibates Flint Quarries,alfl,TX,https://www.nps.gov/alfl/index.htm
4,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Historic Trail,Ala Kahakai National Historic Trail,Ala Kahakai,alka,HI,https://www.nps.gov/alka/index.htm


In [16]:
NPS_ActivitiesByPark_df.info(0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4013 entries, 0 to 4012
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   activity_id       4013 non-null   object
 1   activity_name     4013 non-null   object
 2   park_designation  3843 non-null   object
 3   park_name         4013 non-null   object
 4   name              4013 non-null   object
 5   park_code         4013 non-null   object
 6   park_states       4006 non-null   object
 7   park_url          4013 non-null   object
dtypes: object(8)
memory usage: 250.9+ KB


In [17]:
NPS_EntranceFees_df = pd.read_csv("../Extract/NPS_Activities_Fees/NPS_Project_Extracted_Data/feespasses.csv")
NPS_EntranceFees_df.head(5)

Unnamed: 0,parkCode,totalFees,averageFees,totalPasses,averagePasses,isFeeFreePark,entrancePassDescription,entranceFeeDescription,feesAtWorkUrl
0,acad,91.0,13.0,70.0,70.0,False,A park entrance pass is required year-round. A...,A park entrance pass is required year-round at...,https://www.nps.gov/acad/planyourvisit/fees.htm
1,arch,72.0,18.0,55.0,55.0,False,,,https://www.nps.gov/arch/learn/management/your...
2,badl,355.0,50.714286,55.0,55.0,False,,Badlands National Park charges an entrance fee...,
3,bibe,425.0,60.714286,55.0,55.0,False,Big Bend National Park Annual Pass: $55 Covers...,All vehicles entering Big Bend National Park a...,https://www.nps.gov/bibe/learn/management/your...
4,bisc,0.0,0.0,0.0,0.0,True,,,


In [18]:
# Rename column to prepare for merge
NPS_EntranceFees_df.rename(columns={"parkCode": "park_code"}, inplace=True)
NPS_EntranceFees_df.head(5)

Unnamed: 0,park_code,totalFees,averageFees,totalPasses,averagePasses,isFeeFreePark,entrancePassDescription,entranceFeeDescription,feesAtWorkUrl
0,acad,91.0,13.0,70.0,70.0,False,A park entrance pass is required year-round. A...,A park entrance pass is required year-round at...,https://www.nps.gov/acad/planyourvisit/fees.htm
1,arch,72.0,18.0,55.0,55.0,False,,,https://www.nps.gov/arch/learn/management/your...
2,badl,355.0,50.714286,55.0,55.0,False,,Badlands National Park charges an entrance fee...,
3,bibe,425.0,60.714286,55.0,55.0,False,Big Bend National Park Annual Pass: $55 Covers...,All vehicles entering Big Bend National Park a...,https://www.nps.gov/bibe/learn/management/your...
4,bisc,0.0,0.0,0.0,0.0,True,,,


In [19]:
NPS_EntranceFees_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   park_code                6 non-null      object 
 1   totalFees                6 non-null      float64
 2   averageFees              6 non-null      float64
 3   totalPasses              6 non-null      float64
 4   averagePasses            6 non-null      float64
 5   isFeeFreePark            6 non-null      bool   
 6   entrancePassDescription  2 non-null      object 
 7   entranceFeeDescription   3 non-null      object 
 8   feesAtWorkUrl            3 non-null      object 
dtypes: bool(1), float64(4), object(4)
memory usage: 518.0+ bytes


In [None]:
# Additional line to add Melissa's output file(s) once available
# NPS_GeoLocation_df = pd.read_csv("Extract/NPS_Park_Boundaries/NPS_Project_Extracted_Data/filename.csv")

### Step 3: Edit data to create a single, comprehensive, cleaned dataset for use in analysis

In [23]:
# Combine Things To Do and Parks dataframes
ThingsToDoByPark_df = pd.merge(NPS_ThingsToDo_df, NPS_Parks_df, on=["park_code", "park_name"])
ThingsToDoByPark_df.head(5)

Unnamed: 0,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,duration,park_code,park_designation_x,park_name,...,tag16,season1,season2,season3,season4,park_id,park_url,park_latitude,park_longitude,park_designation_y
0,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,,Summer,,,,6DA17C86-088E-4B4D-B862-7C1BD5CF236B,https://www.nps.gov/acad/index.htm,44.409286,-68.247501,National Park
1,Cadillac Mountain,False,True,"Spring, Summer, Fall",[],False,,acad,National Park,Acadia National Park,...,,Spring,Summer,Fall,,6DA17C86-088E-4B4D-B862-7C1BD5CF236B,https://www.nps.gov/acad/index.htm,44.409286,-68.247501,National Park
2,Acadia Mountain Loop,False,True,"Spring, Summer, Fall",[],False,2-3 Hours,acad,National Park,Acadia National Park,...,,Spring,Summer,Fall,,6DA17C86-088E-4B4D-B862-7C1BD5CF236B,https://www.nps.gov/acad/index.htm,44.409286,-68.247501,National Park
3,Sargent and Penobscot Mountains from JPH,False,True,"Spring, Summer, Fall",[],False,,acad,National Park,Acadia National Park,...,,Spring,Summer,Fall,,6DA17C86-088E-4B4D-B862-7C1BD5CF236B,https://www.nps.gov/acad/index.htm,44.409286,-68.247501,National Park
4,Jordan Pond Loop,False,True,"Spring, Summer, Fall",[],False,1-2 Hours,acad,National Park,Acadia National Park,...,,Spring,Summer,Fall,,6DA17C86-088E-4B4D-B862-7C1BD5CF236B,https://www.nps.gov/acad/index.htm,44.409286,-68.247501,National Park


In [22]:
# Combine Activity and Amenity dataframes
Amen_Act_Loc_df = pd.merge(NPS_AmenitiesByPlace_df, NPS_ActivitiesByPark_df, on=["park_code", "park_name"])
Amen_Act_Loc_df.head(5)

Unnamed: 0,amenity_id,amenity_name,park_code,park_name,park_states_x,park_designation_x,park_url_x,activity_id,activity_name,park_designation_y,name,park_states_y,park_url_y
0,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,badl,Badlands National Park,SD,National Park,http://www.nps.gov/badl/,13A57703-BB1A-41A2-94B8-53B692EB7238,Astronomy,National Park,Badlands,SD,https://www.nps.gov/badl/index.htm
1,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,badl,Badlands National Park,SD,National Park,http://www.nps.gov/badl/,5F723BAD-7359-48FC-98FA-631592256E35,Auto and ATV,National Park,Badlands,SD,https://www.nps.gov/badl/index.htm
2,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,badl,Badlands National Park,SD,National Park,http://www.nps.gov/badl/,7CE6E935-F839-4FEC-A63E-052B1DEF39D2,Biking,National Park,Badlands,SD,https://www.nps.gov/badl/index.htm
3,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,badl,Badlands National Park,SD,National Park,http://www.nps.gov/badl/,A59947B7-3376-49B4-AD02-C0423E08C5F7,Camping,National Park,Badlands,SD,https://www.nps.gov/badl/index.htm
4,A1B0AD01-740C-41E7-8412-FBBEDD5F1443,ATM/Cash Machine,badl,Badlands National Park,SD,National Park,http://www.nps.gov/badl/,C11D3746-5063-4BD0-B245-7178D1AD866C,Compass and GPS,National Park,Badlands,SD,https://www.nps.gov/badl/index.htm


In [24]:
# Combine Activity/Amenity By Park dataframe with Things To By Park dataframe 
Act_Amen_TTD_Park_df = pd.merge(ThingsToDoByPark_df, Amen_Act_Loc_df, on=["park_code", "park_name"])
Act_Amen_TTD_Park_df.head(5)

Unnamed: 0,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,duration,park_code,park_designation_x_x,park_name,...,amenity_name,park_states_x,park_designation_x_y,park_url_x,activity_id,activity_name,park_designation_y_y,name,park_states_y,park_url_y
0,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm
1,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,13A57703-BB1A-41A2-94B8-53B692EB7238,Astronomy,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm
2,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,7CE6E935-F839-4FEC-A63E-052B1DEF39D2,Biking,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm
3,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,071BA73C-1D3C-46D4-A53C-00D5602F7F0E,Boating,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm
4,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,A59947B7-3376-49B4-AD02-C0423E08C5F7,Camping,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm


In [29]:
## This reduces the dataset by to much. Need to approach differently

# Fee_Amen_Act_Loc_df = pd.merge(NPS_EntranceFees_df, NPS_EntranceFees_df, on=["park_code"])
# Fee_Amen_Act_Loc_df.head(5)

Unnamed: 0,park_code,totalFees_x,averageFees_x,totalPasses_x,averagePasses_x,isFeeFreePark_x,entrancePassDescription_x,entranceFeeDescription_x,feesAtWorkUrl_x,totalFees_y,averageFees_y,totalPasses_y,averagePasses_y,isFeeFreePark_y,entrancePassDescription_y,entranceFeeDescription_y,feesAtWorkUrl_y
0,acad,91.0,13.0,70.0,70.0,False,A park entrance pass is required year-round. A...,A park entrance pass is required year-round at...,https://www.nps.gov/acad/planyourvisit/fees.htm,91.0,13.0,70.0,70.0,False,A park entrance pass is required year-round. A...,A park entrance pass is required year-round at...,https://www.nps.gov/acad/planyourvisit/fees.htm
1,arch,72.0,18.0,55.0,55.0,False,,,https://www.nps.gov/arch/learn/management/your...,72.0,18.0,55.0,55.0,False,,,https://www.nps.gov/arch/learn/management/your...
2,badl,355.0,50.714286,55.0,55.0,False,,Badlands National Park charges an entrance fee...,,355.0,50.714286,55.0,55.0,False,,Badlands National Park charges an entrance fee...,
3,bibe,425.0,60.714286,55.0,55.0,False,Big Bend National Park Annual Pass: $55 Covers...,All vehicles entering Big Bend National Park a...,https://www.nps.gov/bibe/learn/management/your...,425.0,60.714286,55.0,55.0,False,Big Bend National Park Annual Pass: $55 Covers...,All vehicles entering Big Bend National Park a...,https://www.nps.gov/bibe/learn/management/your...
4,bisc,0.0,0.0,0.0,0.0,True,,,,0.0,0.0,0.0,0.0,True,,,


In [30]:
#Fee_Amen_Act_Loc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   park_code                  6 non-null      object 
 1   totalFees_x                6 non-null      float64
 2   averageFees_x              6 non-null      float64
 3   totalPasses_x              6 non-null      float64
 4   averagePasses_x            6 non-null      float64
 5   isFeeFreePark_x            6 non-null      bool   
 6   entrancePassDescription_x  2 non-null      object 
 7   entranceFeeDescription_x   3 non-null      object 
 8   feesAtWorkUrl_x            3 non-null      object 
 9   totalFees_y                6 non-null      float64
 10  averageFees_y              6 non-null      float64
 11  totalPasses_y              6 non-null      float64
 12  averagePasses_y            6 non-null      float64
 13  isFeeFreePark_y            6 non-null      bool   
 14

In [32]:
# Only keep rows that correspond to a National Park
NPS_dataset_df = Act_Amen_TTD_Park_df[(Act_Amen_TTD_Park_df["park_designation_x_x"] == "National Park")]
NPS_dataset_df.head(5)

Unnamed: 0,location,isReservationRequired,arePetsPermitted,activities,amenities,doFeesApply,duration,park_code,park_designation_x_x,park_name,...,amenity_name,park_states_x,park_designation_x_y,park_url_x,activity_id,activity_name,park_designation_y_y,name,park_states_y,park_url_y
0,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,09DF0950-D319-4557-A57E-04CD2F63FF42,Arts and Culture,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm
1,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,13A57703-BB1A-41A2-94B8-53B692EB7238,Astronomy,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm
2,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,7CE6E935-F839-4FEC-A63E-052B1DEF39D2,Biking,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm
3,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,071BA73C-1D3C-46D4-A53C-00D5602F7F0E,Boating,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm
4,Echo Lake Beach,False,True,Summer,[],False,,acad,National Park,Acadia National Park,...,Accessible Rooms,ME,National Park,http://www.nps.gov/acad/,A59947B7-3376-49B4-AD02-C0423E08C5F7,Camping,National Park,Acadia,ME,https://www.nps.gov/acad/index.htm


In [33]:
NPS_dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89784 entries, 0 to 89783
Data columns (total 47 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   location               85656 non-null  object 
 1   isReservationRequired  89784 non-null  bool   
 2   arePetsPermitted       89784 non-null  bool   
 3   activities             89784 non-null  object 
 4   amenities              89784 non-null  object 
 5   doFeesApply            89784 non-null  bool   
 6   duration               51600 non-null  object 
 7   park_code              89784 non-null  object 
 8   park_designation_x_x   89784 non-null  object 
 9   park_name              89784 non-null  object 
 10  states                 89784 non-null  object 
 11  tag1                   89784 non-null  object 
 12  tag2                   77400 non-null  object 
 13  tag3                   77400 non-null  object 
 14  tag4                   76368 non-null  object 
 15  ta

In [34]:
# View all available columns
print(NPS_dataset_df.columns)

Index(['location', 'isReservationRequired', 'arePetsPermitted', 'activities',
       'amenities', 'doFeesApply', 'duration', 'park_code',
       'park_designation_x_x', 'park_name', 'states', 'tag1', 'tag2', 'tag3',
       'tag4', 'tag5', 'tag6', 'tag7', 'tag8', 'tag9', 'tag10', 'tag11',
       'tag12', 'tag13', 'tag14', 'tag15', 'tag16', 'season1', 'season2',
       'season3', 'season4', 'park_id', 'park_url', 'park_latitude',
       'park_longitude', 'park_designation_y_x', 'amenity_id', 'amenity_name',
       'park_states_x', 'park_designation_x_y', 'park_url_x', 'activity_id',
       'activity_name', 'park_designation_y_y', 'name', 'park_states_y',
       'park_url_y'],
      dtype='object')


In [35]:
# Create a DataFrame called NPS_Cleaned_df with only columns useful for downstream data analysis

# Remove fields not useful to data analysis from the dataset
NPS_Cleaned_df = NPS_dataset_df[['amenity_name', 'park_code', 'park_name', 'park_states', 'park_designation_x', 
    'park_url_x', 'activity_name', 'park_latitude', 'park_longitude', 'totalFees', "location", "isReservationRequired",
    'arePetsPermitted','isFeeFreePark']]
    
# Rename the columns 
NPS_Cleaned_df = NPS_Cleaned_df.rename(columns={
    'park_designation_x_x': 'Park Designation',
    'park_code': 'Park Code',
    'park_name': 'Park Name',
    'park_states_x': 'Present in States',
    'isFeeFreePark': 'Free Entrance',
    'totalFees': 'Entrance Fee',
    'park_url_x': 'Park Website',
    'park_latitude': 'Park Latitude',
    'park_longitude': 'Park Longitude',
    'amenity_name': 'Available Amenity',
    'activity_name': 'Activity Option',
    'location': 'Activity Location',
    'isReservationRequired': 'Activity Requires Reservation',
    'arePetsPermitted': 'Pets Allowed',
    'doFeesApply': 'Activity Fee Required',
    'tag1': 'Activity Tag 1',
    'tag2': 'Activity Tag 2',
    'tag3': 'Activity Tag 3',
    'tag4': 'Activity Tag 4',
    'tag5': 'Activity Tag 5',
    'season1': 'Activity Season 1',
    'season2': 'Activity Season 2',
    'season3': 'Activity Season 3',
    'season4': 'Activity Season 4',
})

# Reorder the columns
NPS_Cleaned_df = NPS_Cleaned_df[['Park Designation', 'Park Code', 'Park Name', 'Present in States', 'Free Entrance',
                                'Entrance Fee', 'Park Website', 'Park Latitude', 'Park Longitude', 'Available Amenity', 'Activity Option',
                                'Activity Location', 'Activity Requires Reservation', 'Pets Allowed', 'Activity Fee Required',
                                'Activity Tag 1', 'Activity Tag 2', 'Activity Tag 3', 'Activity Tag 4', 'Activity Tag 5',
                                'Activity Season 1', 'Activity Season 2', 'Activity Season 3', 'Activity Season 4']]

# Format the Entrance Fee column
NPS_Cleaned_df["Entrance Fees"] = NPS_Cleaned_df["Entrance Fee"].map("${:,.2f}".format)

# Display the DataFrame
NPS_Cleaned_df.head(5)

KeyError: "['park_states', 'park_designation_x', 'totalFees', 'isFeeFreePark'] not in index"

In [None]:
NPS_Cleaned_df.info()  

In [None]:
# Drop rows with any empty cells
NPS_Cleaned_df = NPS_Cleaned_df.dropna()

# Display the first few rows of the cleaned DataFrame after dropping empty cells
NPS_Cleaned_df.head()

In [None]:
NPS_Cleaned_df.info()  

In [None]:
NPS_Cleaned_df.nunique()

In [None]:
# Export DataFrame to a CSV file
NPS_Cleaned_df.to_csv('NPS_Cleaned_Data.csv', index=False)

# Export DataFrame to a JSON file
NPS_Cleaned_df.to_json('NPS_Cleaned_Data.json', orient='records')

print("DataFrame exported to CSV and JSON files successfully.")

In [None]:
import os
import shutil

# Define the file paths for the original and destination folders
destination_folder = '../Load'

# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Define the file names and paths
files = {
    'NPS_Cleaned_Data.csv': 'NPS_Cleaned_Data.csv',
    'NPS_Cleaned_Data.json': 'NPS_Cleaned_Data.json'
}

# Copy the file to the destination folder
for target_name, source_path in files.items():
    if os.path.exists(source_path):
        shutil.copy(source_path, os.path.join(destination_folder, target_name))

print(f"Files copied successfully to {destination_folder}: {os.listdir(destination_folder)}")