#### Define Imports

In [214]:
import requests
import pandas as pd 
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# NPS Developer API credentials and base URL used by every request helper below.
# NOTE(review): the API key is committed in plain text; consider loading it from
# an environment variable or a secrets store instead.
api_key = '5TjgNMFCh7h44T09HbQnbGhU8as11D0FDdjfJhgV'
api_base_url = 'https://developer.nps.gov/api/v1/'

# The star-import is what brings state_code_model, park_code_model and
# nps_parks_root into scope for the rest of the notebook.
from helper_functions import *
# Each trained_model() call returns a (model, vectorizer) pair; the park pair is
# used by get_info to turn free-text queries into park codes.
state_model, state_vectorizer = state_code_model.trained_model()
park_model, park_vectorizer = park_code_model.trained_model()

#### Define Functions

In [88]:
def get_parks(params):
    """
    Fetch park records from the NPS ``parks`` endpoint, following pagination
    until every matching record has been retrieved.

    With only an API key in ``params`` this lists every park; add filters such
    as ``parkCode`` to look up specific parks.

    params: Dict of query parameters for the request (e.g. ``api_key``,
            ``parkCode``). May be None or empty, in which case the module-level
            ``api_key`` is used on its own. Any ``limit``/``start`` entries are
            overridden because this function drives pagination itself. The
            dict is never modified.

    Returns a list of dicts with the keys ``fullName``, ``parkCode``,
    ``state``, ``addresses`` and ``description``.
    """
    parks = []
    limit = 50  # Number of results requested per page
    start = 0   # Initial starting point for pagination

    # Start from the module-level key so existing callers keep working, then
    # layer the caller's filters on top. Previously the argument was replaced
    # inside the loop, so filters like parkCode were silently dropped.
    base_params = {'api_key': api_key}
    base_params.update(params or {})

    while True:
        page_params = {**base_params, 'limit': limit, 'start': start}

        response = requests.get(f"{api_base_url}parks", params=page_params)
        data = response.json()

        parks.extend([
            {
                'fullName': park['fullName'],
                'parkCode': park['parkCode'],
                'state': park['states'],
                'addresses': park.get('addresses', []),
                'description': park['description']
            } for park in data['data']
        ])

        # Move to the next page
        start += limit

        # Break the loop if all parks have been retrieved
        # ('total' is passed through int() because it may arrive as a string)
        if int(start) >= int(data['total']):
            break

    return parks

In [89]:
def get_parks_in_state(params):
    """
    Return the full names of the parks located in one state.

    params: Query parameters sent to the ``parks`` endpoint. Must contain
            ``stateCode`` (e.g. 'CA' for California) and should carry the
            ``api_key`` to use in the request.

    A single request is made; each returned park is kept only when the
    requested state code appears in its comma-separated ``states`` field.
    """
    payload = requests.get(f"{api_base_url}parks", params=params).json()

    return [
        record['fullName']
        for record in payload['data']
        if params['stateCode'] in record['states'].split(',')
    ]

In [138]:
def get_basic(endpoint, params):
    """
    Use to get all data from endpoint without specific processing

    endpoint: The API endpoint to call (appended to ``api_base_url``)
    params: The param dict to pass through the API call. It is left
            untouched: pagination keys are added to a per-request copy, and
            any ``limit``/``start`` supplied by the caller is overridden.

    Returns a list with every record from the ``data`` field of each page.
    """
    responses = []
    limit = 50  # Number of results requested per page
    start = 0   # Initial starting point for pagination

    while True:
        # Build a fresh dict per page so the caller's params are not polluted
        # with 'limit' and 'start' after the call returns.
        page_params = {**params, 'limit': limit, 'start': start}

        request = requests.get(f"{api_base_url}{endpoint}", params=page_params)
        request_data = request.json()

        responses.extend(request_data['data'])

        # Move to the next page
        start += limit

        # Break the loop if all responses have been retrieved
        # ('total' is passed through int() because it may arrive as a string)
        if int(start) >= int(request_data['total']):
            break

    return responses

In [217]:
def get_info(api_key, entities, entityCode, endpoint, intent, queries, response_call=0):
    """
    Creates synthetic data in the necessary format for a specified API call.

    One row is produced for every (entity, query) combination.

    api_key: Personal API key to use in request.
    entities: List of items to loop through such as States, Parks, Amenities.
    entityCode: The entity code to be used in the API call. Only "parkCode" is
        currently supported; any other value raises ValueError.
        *This will likely need to be updated to have more dynamic functionality.
    endpoint: the NPS API endpoint to call such as /activities or /parks.
    intent: General label for queries in a particular group.
            For example, the questions "Tell me about {park}" and "I want to know more about {park}" could both be categorized with the label "GetParkInfo".
    queries: A list of query templates containing an "{entity}" placeholder
        that you would like to associate with a given set of API calls.
    response_call: Flag controlling whether the API is actually called. 0 (the
        default) skips the call and records an empty string; any other value
        calls get_parks for the "ParkInfo" intent, get_parks_in_state for
        "ParksInState", and get_basic otherwise. The result is recorded in the
        response column of the dataset.

    Returns a DataFrame (via pd.json_normalize) with the columns query,
    intent, api_call.endpoint, api_call.parkCode and response.

    Raises ValueError if entityCode is not supported.
    """
    dataset = []
    for entity in entities:
        for query in queries:
            # Format once; the text is used both for code lookup and the row.
            query_text = query.format(entity=entity)

            # Create API parameters
            if entityCode == "parkCode":
                # Park name needs to be converted to park code
                params = {
                    'api_key': api_key,
                    entityCode: park_code_model.map_park_code(query_text, park_model, park_vectorizer),
                }
            else:
                # Without this guard `params` would be unbound (or stale from
                # a previous iteration) and fail later with a confusing error.
                raise ValueError(
                    f"Unsupported entityCode {entityCode!r}; only 'parkCode' is currently supported"
                )

            # Set response function to use
            if response_call == 0:
                response = ""
            elif intent == "ParkInfo":
                response = get_parks(params)
            elif intent == "ParksInState":
                response = get_parks_in_state(params)
            else:
                response = get_basic(endpoint, params)

            dataset.append({
                "query": query_text,
                "intent": intent,
                "api_call": {
                    "endpoint": endpoint,
                    # Key name is fixed because downstream cells read the
                    # flattened 'api_call.parkCode' column.
                    "parkCode": params[entityCode]
                },
                "response": response
            })

    synthetic_queries = pd.json_normalize(dataset)
    return synthetic_queries


In [201]:
def activity_queries(raw_queries, activities):
    """
    Expand activity query templates into entity query templates.

    Every template in raw_queries has its "{activity}" placeholder filled with
    each activity in turn, and a literal "{entity}" placeholder is appended so
    the result can be formatted again later. Results are ordered by template
    first, then by activity.
    """
    return [
        template.format(activity=name) + "{entity}"
        for template in raw_queries
        for name in activities
    ]

#### Define Variables

In [208]:

# Endpoint names kept for reference; not used elsewhere in the visible notebook.
categories = ['activities', 'activities/parks', 'alerts', 'amenities','amenities/parksvisitorcenters',
              'amenities/parksplaces', 'articles', 'campgrounds', 'events', 'feespasses', 
              'lessonplans', 'multimedia/audio', 'multimedia/galleries', 'newsreleases',
              'parkinglots', 'parks', 'places', 'people', 'thingstodo', 
              'topics', 'topics/parks', 'tours', 'visitorcenters', ]


# list of parks
# NOTE: this performs live, paginated API requests when the cell runs.
parks_df = pd.DataFrame(get_parks({'api_key': api_key}))
parks = parks_df['fullName'].tolist()
park_codes = parks_df['parkCode'].tolist()
# Full park name -> park code
park_lookup = dict(zip(parks, park_codes))
# nps_parks_root comes from the helper_functions star-import.
park_roots = nps_parks_root.nps_parks_root()
# Parks combined is the combination of two lists: The full park names and estimated park name abbreviations that users might use (i.e. Acadia National Park vs Acadia)
parks_combined = park_roots+parks

# State names for the (currently disabled) ParksInState queries.
dist_states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", 
                "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", 
                "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", 
                "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", 
                "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", 
                "New Hampshire", "New Jersey", "New Mexico", "New York", 
                "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", 
                "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", 
                "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", 
                "West Virginia", "Wisconsin", "Wyoming"]

# Activity names; several match the names in the recorded /activities output
# further down this notebook.
activities_raw = [
    'Arts and Culture','Astronomy','Biking','Boating','Camping','Climbing','Fishing','Food','Guided Tours',
    'Hiking','Horse Trekking','Ice Skating','Junior Ranger Program','Paddling','Park Film','Shopping','Skiing','Snow Play','Snowmobiling',
    'Snowshoeing','Swimming','Wildlife Watching'
    ]

# Conversational phrasings of the same activities. The two lists are paired by
# position below, so they must stay the same length and in the same order.
activities = [
    'Arts and Culture','Astronomy','Bike','Boat','Camp','Climb','Fish','Eat','Take Guided Tours', 
    'Hike','Horse Trek','Ice Skate','Junior Ranger Program','Paddle Boarding','Watch a Film','Shop','Ski','Play in Snow','Snowmobile',
    'Snowshoe','Swim','Wildlife Watch'
    ]
# Conversational phrasing -> name in activities_raw
activities_dict = dict(zip(activities, activities_raw))


#### Create Synthetic Data

In [218]:
# Build one synthetic-query DataFrame per intent. `queries` is reassigned before
# each get_info call, so the statements below must stay in this order.
# Every call passes response_call = 0, so no API responses are fetched and the
# response column is an empty string.

# ParkInfo: general park facts (name, address, state, description).
queries = [
    "Tell me about {entity}","What is the full name of {entity}?","What is the address of {entity}?","Which state is {entity} located in?","Give me a description of {entity}",
    "Where is {entity} located?","Can you provide the full name of {entity}?","What is the location of {entity}?","Tell me the address of {entity}",
    "In which state can I find {entity}?","Describe {entity} to me"
]
ParkInfo = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "parks", intent = "ParkInfo", queries = queries, response_call = 0)

# ParksInState: disabled, because get_info only builds params for "parkCode".
queries = [
    "Which parks are in {entity}?","What parks can be found in {entity}?","List the parks located in {entity}","Are there any national parks in {entity}?",
    "What national parks are in {entity}?","Can you tell me the parks in {entity}?","Give me a list of parks in {entity}","Which national parks are located in {entity}?",
    "What parks are available in {entity}?","Tell me the parks that are in {entity}"
]
#ParksInState = get_info(api_key, entities = dist_states, entityCode = "stateCode", endpoint = "parks", intent = "ParksInState", queries = queries, response_call = 0)

# ParkActivities
queries = [
    "What activities can I do at {entity}?","Tell me about the activities available at {entity}","What can I do at {entity}?","List the activities at {entity}",
    "What recreational activities are offered at {entity}?","What outdoor activities can I enjoy at {entity}?","What kind of activities are there at {entity}?",
    "What are the top activities at {entity}?","What fun things can I do at {entity}?","What activities are recommended at {entity}?"
]
#raw_queries =  ["What activities are there to do at ", "Can I {activity} in ", "Is {activity} available at ", "What {activity} activities are available in "]
#queries = activity_queries(raw_queries, activities)
ParkActivities = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "activities", intent = "ParkActivities", queries = queries, response_call = 0)

# ParkAlerts
queries = [
    "What alerts are active at {entity} currently?","Are there any current alerts at {entity}?","Tell me the active alerts at {entity}","What are the current alerts for {entity}?",
    "Which alerts are active in {entity} right now?","List the active alerts at {entity}","Are there any alerts at {entity}?","What are the present alerts for {entity}?",
    "Can you provide the active alerts for {entity}?","What are the ongoing alerts at {entity}?"
]
ParkAlerts = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "alerts", intent = "ParkAlerts", queries = queries, response_call = 0)

# ParkAmenities
queries = [
    "What amenities exist at {entity}?","Tell me about the amenities at {entity}","What facilities are available at {entity}?","List the amenities at {entity}",
    "What kind of amenities can I find at {entity}?","What services and facilities does {entity} offer?","What amenities should I expect at {entity}?",
    "What conveniences are available at {entity}?","What amenities does {entity} have?","What facilities are provided at {entity}?"
]
ParkAmenities = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "amenities", intent = "ParkAmenities", queries = queries, response_call = 0)

# ParkEvents
queries = ["What events are happening at {entity}?","Tell me about the events at {entity}","What upcoming events are scheduled at {entity}?",
    "Are there any events at {entity}?","What kind of events are held at {entity}?","What events can I attend at {entity}?",
    "Are there any special events at {entity}?","What events are planned at {entity}?","What events are currently happening at {entity}?","What activities and events are there at {entity}?"
]
ParkEvents = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "events", intent = "ParkEvents", queries = queries, response_call = 0)

# ParkFees
queries = [
    "How much does it cost to get into {entity}?","What is the entry fee for {entity}?", "Tell me about the entrance fees for {entity}",
    "What are the admission fees for {entity}?","Are there any fees to visit {entity}?","What is the cost of admission to {entity}?",
    "How expensive is it to visit {entity}?","What are the ticket prices for {entity}?","Do I need to pay to enter {entity}?","Are there any charges to access {entity}?"
]
ParkFees = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "feespasses", intent = "ParkFees", queries = queries, response_call = 0)

# ParkParkingLots
queries =  [
    "Where can I park at {entity}?","Tell me about parking options at {entity}","What are the parking facilities like at {entity}?","Where should I park when visiting {entity}?",
    "What parking areas are available at {entity}?","Is there parking available at {entity}?","How is parking managed at {entity}?","Can I find parking near {entity}?",
    "Are there designated parking lots at {entity}?","What are the parking arrangements at {entity}?"
]
ParkParkingLots = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "parkinglots", intent = "ParkParkingLots", queries = queries, response_call = 0)

# ParkThingsToDo
# NOTE(review): the last template reads "What is there to do {entity}?" and
# looks like it is missing "at" — confirm whether that is intentional.
queries = ["What things can I do at {entity}?","Tell me about attractions at {entity}","What are the attractions at {entity}?",
    "What are the main attractions of {entity}?","What are the highlights of {entity}?","What can I see and do at {entity}?",
    "What are the recreational opportunities at {entity}?","What are the popular things to do at {entity}?",
    "What experiences are available at {entity}?","What is there to do {entity}?"]
ParkThingsToDo = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "thingstodo", intent = "ParkThingsToDo", queries = queries, response_call = 0)

# ParkTours
queries = ["What tours can I take at {entity}?","Tell me about guided tours at {entity}","Are there any guided tours available at {entity}?","What guided experiences are offered at {entity}?",
    "Can I join any tours at {entity}?","What kind of guided tours are available at {entity}?","Are there ranger-led tours at {entity}?",
    "What are the tour options at {entity}?","How can I book a tour at {entity}?","Are there any special tours or programs at {entity}?"]
ParkTours = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "tours", intent = "ParkTours", queries = queries, response_call = 0)

# ParkVisitorCenters
queries = ["Where are the visitor centers located at {entity}?","Tell me about visitor centers at {entity}","What visitor centers can I find at {entity}?",
    "Are there any visitor centers at {entity}?","Where can I find information centers at {entity}?","What are the visitor facilities like at {entity}?","Can you guide me to the visitor centers at {entity}?",
    "How many visitor centers are there at {entity}?","What services do the visitor centers offer at {entity}?","Are the visitor centers at {entity} open to the public?"]
ParkVisitorCenters = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "visitorcenters", intent = "ParkVisitorCenters", queries = queries, response_call = 0)

# Stack all per-intent frames into one dataset with a fresh 0..n-1 index.
synthetic_queries_df = pd.concat([ParkInfo, ParkActivities, ParkAlerts, ParkAmenities, ParkEvents, ParkFees, ParkParkingLots, ParkThingsToDo, ParkTours, ParkVisitorCenters], axis=0, ignore_index=True)

In [227]:
# Create query and label data
# X is the raw query text; y has two target columns (endpoint and park code),
# so the classifier below is trained as a multi-output model.
X = synthetic_queries_df['query']
y = synthetic_queries_df[['api_call.endpoint', 'api_call.parkCode']]

# Split training and testing sets
# (80/20 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing and modeling pipeline
# Bag-of-words counts with English stop words removed.
text_vectorizer = CountVectorizer(stop_words='english')

model = Pipeline([
    ('vectorizer', text_vectorizer),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline
# NOTE(review): the recorded run of this cell ended in a MemoryError.
# Presumably the number of distinct park codes combined with the dataset size
# makes the forest too large — confirm, and consider a smaller model or
# separate models per target.
model.fit(X_train, y_train)

MemoryError: could not allocate 246939648 bytes

In [None]:

# Alternative pipeline that routes the 'query' column through a text transformer.
# NOTE(review): `text_transformer` is not defined anywhere in the visible
# notebook; the vectorizer created earlier is named `text_vectorizer`, so this
# cell would raise NameError as written.
# NOTE(review): selecting the column by the name 'query' presumably requires a
# DataFrame input, while X_train was built as a single column (Series) — confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'query')
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline
model.fit(X_train, y_train)

In [None]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
# NOTE(review): y_test has two target columns (endpoint and park code);
# classification_report may not accept multi-output targets — confirm, or
# report each column separately.
print(classification_report(y_test, y_pred))

In [None]:
# Predict the API call for a single free-text query.
new_query = ["What activities can I do at Yellowstone?"]
predicted = model.predict(new_query)

# `predicted` has one row per query and one column per target, in the order
# the targets were given in y: endpoint first, then park code. The previous
# code indexed predicted[1] and predicted[2], which are rows that do not exist
# for a single query.
predicted_endpoint, predicted_park_code = predicted[0]

print("Predicted endpoint:", predicted_endpoint)
print("Predicted parkCode:", predicted_park_code)
# No stateCode is printed: y only contains the endpoint and parkCode columns,
# so the model has no state-code output to report.

#### Development

In [187]:
# Scratch check: fetch the activities endpoint for one park code ('acad') using
# the generic paginated helper.
endpoint = 'activities'
park = 'acad'
params = {'api_key': api_key,
        'parkCode': park
                }
test = get_basic(endpoint, params)
# Bare expression so the notebook displays the result.
test

[{'id': '09DF0950-D319-4557-A57E-04CD2F63FF42', 'name': 'Arts and Culture'},
 {'id': '13A57703-BB1A-41A2-94B8-53B692EB7238', 'name': 'Astronomy'},
 {'id': '7CE6E935-F839-4FEC-A63E-052B1DEF39D2', 'name': 'Biking'},
 {'id': '071BA73C-1D3C-46D4-A53C-00D5602F7F0E', 'name': 'Boating'},
 {'id': 'A59947B7-3376-49B4-AD02-C0423E08C5F7', 'name': 'Camping'},
 {'id': 'B12FAAB9-713F-4B38-83E4-A273F5A43C77', 'name': 'Climbing'},
 {'id': 'C11D3746-5063-4BD0-B245-7178D1AD866C', 'name': 'Compass and GPS'},
 {'id': 'AE42B46C-E4B7-4889-A122-08FE180371AE', 'name': 'Fishing'},
 {'id': '1DFACD97-1B9C-4F5A-80F2-05593604799E', 'name': 'Food'},
 {'id': 'B33DC9B6-0B7D-4322-BAD7-A13A34C584A3', 'name': 'Guided Tours'},
 {'id': '42FD78B9-2B90-4AA9-BC43-F10E9FEA8B5A', 'name': 'Hands-On'},
 {'id': 'BFF8C027-7C8F-480B-A5F8-CD8CE490BFBA', 'name': 'Hiking'},
 {'id': '0307955A-B65C-4CE4-A780-EB36BAAADF0B', 'name': 'Horse Trekking'},
 {'id': '5FF5B286-E9C3-430E-B612-3380D8138600', 'name': 'Ice Skating'},
 {'id': 'DF4A35E

In [90]:
def get_activities_list(params):
    """
    Use to get a list of activities available at a specific park based on park code

    params: Query parameters sent to the ``activities/parks`` endpoint. Must
            contain ``parkCode`` and should carry the ``api_key``.

    Returns the activity names whose park list includes the requested park
    code, without duplicates and in the order the API returned them. A single
    request is made, so only the first page of results is considered.
    """

    # Define the endpoint for the activities query
    activities_endpoint = f"{api_base_url}activities/parks"

    # get activities information
    response = requests.get(activities_endpoint, params=params)
    activities_data = response.json()

    # Extract the activities that list the requested park
    park_code = params['parkCode']
    activities = [
        activity['name']
        for activity in activities_data['data']
        if any(park['parkCode'] == park_code for park in activity['parks'])
    ]

    # Remove duplicates while keeping first-seen order. A plain set() would
    # return the names in an arbitrary order that can change between runs.
    unique_activities = list(dict.fromkeys(activities))

    return unique_activities