#### Define Imports

In [2]:
import requests
import pandas as pd 
import sys

from helper_functions import *
# Load the trained park-code model and its vectorizer once; create_synthetic_queries
# later hands both to park_code_model.map_park_code for every generated query.
# NOTE(review): park_code_model is never imported by name -- presumably it is
# provided by the wildcard import from helper_functions; confirm.
park_model, park_vectorizer = park_code_model.trained_model()

sys.path.insert(0,'../')
from environment import env

#### Define Variables

In [3]:
# NPS API access: the key is read from the project environment config
config = env.env()
nps_api_key = config["nps_api_key"]
api_base_url = "https://developer.nps.gov/api/v1/"

# Output file names (all written by the final cell of the notebook)
syn_queries = "synthetic_queries.csv"        # training / validation queries
test_queries = "testing_queries.csv"         # held-out testing queries
parkcode_mapping = "park_to_parkcode.csv"    # fullName -> parkCode
parkroot_mapping = "park_to_root.csv"        # fullName -> parkRoot
park_mapping = "parkcode_to_park.csv"        # parkCode -> fullName

#### Define Functions

In [4]:
def get_parks(params, timeout=30):
    """
    Fetch every park record matching ``params`` from the NPS ``parks`` endpoint.

    Use it to list all park names, codes, states, addresses and descriptions,
    or pass extra filters in ``params`` to look up specific parks.
    *Chat GPT was used to create the pagination process for the API

    params: The param dict to pass through the API call (e.g. ``{'api_key': ...}``).
            It is copied before the paging keys are added, so the caller's dict
            is left untouched.
    timeout: Seconds to wait for each HTTP request before giving up.

    Returns a list of dicts with the keys ``fullName``, ``parkCode``, ``state``,
    ``addresses`` and ``description``.

    Raises ``requests.HTTPError`` when the API answers with an error status, so
    a bad key or rate limit is reported as such instead of a ``KeyError`` on the
    missing ``data``/``total`` fields.
    """
    page_size = 50  # Results requested per page (noted by the author as the NPS API maximum)
    start = 0       # Offset of the first record on the current page

    # Work on a copy: 'limit' and 'start' are paging details of this function
    # and must not leak into the dict the caller passed in.
    query = dict(params)
    parks = []

    while True:
        query['limit'] = page_size
        query['start'] = start

        response = requests.get(f"{api_base_url}parks", params=query, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        parks.extend(
            {
                'fullName': park['fullName'],
                'parkCode': park['parkCode'],
                'state': park['states'],
                # 'addresses' is read with .get so records lacking it still load
                'addresses': park.get('addresses', []),
                'description': park['description'],
            }
            for park in data['data']
        )

        # Move to the next page; stop once the offset passes the reported total
        # (int() because the total is converted before comparing, as in the original code).
        start += page_size
        if start >= int(data['total']):
            break

    return parks

In [5]:
def create_synthetic_queries(api_key, entities, endpoint, intent, queries):
    """
    Creates synthetic data in the necessary format for a specified API call.

    Every template in ``queries`` is filled in with every item of ``entities``;
    each resulting sentence is labelled with ``intent`` and paired with the
    endpoint and the park code predicted for that sentence.

    api_key: Personal API key. Kept for backward compatibility with existing
             callers; no request is made here, so it is not used.
    entities: List of items to loop through such as States, Parks, Amenities.
    endpoint: the NPS API endpoint to call such as /activities or /parks.
    intent: General label for queries in a particular group.
            For example, the questions "Tell me about {park}" and "I want to know more about {park}" could both be categorized with the label "GetParkInfo".
    queries: A list of query templates containing the ``{entity}`` placeholder
             that you would like to associate with a given set of API calls.
    * Normalize JSON idea came from: https://stackoverflow.com/questions/46091362/how-to-normalize-json-correctly-by-python-pandas

    Returns a DataFrame with the columns ``query``, ``intent``,
    ``api_call.endpoint`` and ``api_call.parkCode`` (one row per
    entity/template pair).
    """
    records = []

    for entity in entities:
        for template in queries:
            # Build the sentence once and reuse it for both the model and the record
            text = template.format(entity=entity)
            park_code = park_code_model.map_park_code(text, park_model, park_vectorizer)

            records.append({
                "query": text,
                "intent": intent,
                "api_call": {
                    "endpoint": endpoint,
                    "parkCode": park_code
                }
            })

    # json_normalize flattens the nested "api_call" dict into dotted column names
    return pd.json_normalize(records)


#### Create Synthetic Data

In [6]:
# Pull the complete park catalogue once and derive the name / code lists from it
parks_df = pd.DataFrame(get_parks({'api_key': nps_api_key}))
parks = parks_df['fullName'].tolist()
park_codes = parks_df['parkCode'].tolist()
park_roots = nps_parks_root.nps_parks_root()

# Lookups keyed by the full park name.
# NOTE(review): zip pairs items by position and stops at the shorter input, so
# root_dict assumes park_roots is in the same order (and length) as parks -- confirm.
park_lookup = {full_name: code for full_name, code in zip(parks, park_codes)}
root_dict = {full_name: root for full_name, root in zip(parks, park_roots)}

# Entities used to fill the query templates: the full park names followed by the
# shortened names users might type instead (i.e. Acadia National Park vs Acadia)
parks_combined = parks + park_roots

#### Training & Validation Data
Synthetic data for training and validating API models

In [7]:

# One block per intent: the template list is bound to `queries`, then expanded
# over every entry of parks_combined by create_synthetic_queries.

# Intent "description" (parks endpoint)
queries = [
    "Tell me about {entity}",
    "Give me a description of {entity}",
    "Describe {entity} to me",
    "What is the description of {entity}?",
]
ParkDesc = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "parks", intent = "description", queries = queries)

# Intent "address" (parks endpoint).
# "Tell me the address of {entity}" used to be listed twice, which duplicated
# that row for every park in the training data; it now appears once.
queries = [
    "What is the address of {entity}?",
    "Where is {entity} located?",
    "What is the location of {entity}?",
    "Tell me the address of {entity}",
    "Where can I find {entity}?",
]
ParkAddress = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "parks", intent = "address", queries = queries)

# Intent "state" (parks endpoint)
queries = [
    "Which state is {entity} located in?",
    "In which state can I find {entity}?",
    "What state is {entity} in?",
]
ParkState = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "parks", intent = "state", queries = queries)

# Intent "fullname" (parks endpoint)
queries = [
    "What is the full name of {entity}?",
    "Can you provide the full name of {entity}?",
    "What is the name of {entity}?",
    "Tell me the name of {entity}",
    "Give me the full name of {entity}",
]
ParkName = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "parks", intent = "fullname", queries = queries)

# Intent "alerts" (alerts endpoint)
queries = [
    "What alerts are active at {entity} currently?",
    "Are there any current alerts at {entity}?",
    "Tell me the active alerts at {entity}",
    "What are the current alerts for {entity}?",
    "Which alerts are active in {entity} right now?",
    "List the active alerts at {entity}",
    "Are there any alerts at {entity}?",
    "What are the present alerts for {entity}?",
    "Can you provide the active alerts for {entity}?",
    "What are the ongoing alerts at {entity}?",
]
ParkAlerts = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "alerts", intent = "alerts", queries = queries)

# Intent "amenities" (amenities endpoint)
queries = [
    "What amenities exist at {entity}?",
    "Tell me about the amenities at {entity}",
    "What facilities are available at {entity}?",
    "List the amenities at {entity}",
    "What kind of amenities can I find at {entity}?",
    "What services and facilities does {entity} offer?",
    "What amenities should I expect at {entity}?",
    "What conveniences are available at {entity}?",
    "What amenities does {entity} have?",
    "What facilities are provided at {entity}?",
]
ParkAmenities = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "amenities", intent = "amenities", queries = queries)

# Intent "events" (events endpoint)
queries = [
    "What events are happening at {entity}?",
    "Tell me about the events at {entity}",
    "What upcoming events are scheduled at {entity}?",
    "Are there any events at {entity}?",
    "What kind of events are held at {entity}?",
    "What events can I attend at {entity}?",
    "Are there any special events at {entity}?",
    "What events are planned at {entity}?",
    "What events are currently happening at {entity}?",
    "What activities and events are there at {entity}?",
]
ParkEvents = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "events", intent = "events", queries = queries)

# Intent "feespass" (feespasses endpoint)
queries = [
    "How much does it cost to get into {entity}?",
    "What is the entry fee for {entity}?",
    "Tell me about the entrance fees for {entity}",
    "What are the admission fees for {entity}?",
    "Are there any fees to visit {entity}?",
    "What is the cost of admission to {entity}?",
    "How expensive is it to visit {entity}?",
    "What are the ticket prices for {entity}?",
    "Do I need to pay to enter {entity}?",
    "Are there any charges to access {entity}?",
]
ParkFees = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "feespasses", intent = "feespass", queries = queries)

In [8]:
# Stack the per-intent training frames row-wise and renumber the index.
# Run this before the Testing Data cell: that cell rebinds the same Park* names.
synthetic_queries_df = pd.concat(
    [ParkDesc, ParkAddress, ParkState, ParkName, ParkAlerts, ParkAmenities, ParkEvents, ParkFees],
    axis=0,
    ignore_index=True,
)

#### Testing Data
Synthetic data for evaluating API model performance

In [9]:

# One held-out template per intent. The Park* names from the training cell are
# rebound here, so the training frames must already have been concatenated.
# Alternates that are currently switched off are kept as comments under each list.

# Intent "description" (parks endpoint)
queries = [
    "Give me information on {entity}",
]
# unused alternates: "What is {entity} like", "Can you describe {entity}"
ParkDesc = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "parks", intent = "description", queries = queries)

# Intent "address" (parks endpoint)
queries = [
    "Where is {entity}?",
]
# unused alternates: "What is the address of {entity}", "Where is {entity} located?"
ParkAddress = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "parks", intent = "address", queries = queries)

# Intent "state" (parks endpoint)
queries = [
    "Tell me the state {entity} is in",
]
ParkState = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "parks", intent = "state", queries = queries)

# Intent "fullname" (parks endpoint)
queries = [
    "Please provide the full name of {entity}?",
]
# unused alternates: "What's the real name of {entity}?", " {entity}?"
ParkName = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "parks", intent = "fullname", queries = queries)

# Intent "alerts" (alerts endpoint)
queries = [
    "What alerts are active at {entity} right now?",
]
# unused alternates: "Are there alerts at {entity} at this time?",
#   "Give me any active alerts for {entity}", "List any alerts for {entity}?",
#   "What advisories are there for {entity}?"
ParkAlerts = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "alerts", intent = "alerts", queries = queries)

# Intent "amenities" (amenities endpoint)
queries = [
    "What services are there at {entity}?",
]
# unused alternates: "Tell me about the facilities at {entity}?", "What amenities are at {entity}",
#   "What is available at {entity}?", "What type of amenities can I find at {entity}?"
ParkAmenities = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "amenities", intent = "amenities", queries = queries)

# Intent "events" (events endpoint)
queries = [
    "What events are scheduled at {entity}?",
]
# unused alternates: "Tell me about any upcoming events at {entity}",
#   "Are there any scheduled activities at {entity}?", "What kind of events happen at {entity}?",
#   "Are there any upcoming events {entity}?"
ParkEvents = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "events", intent = "events", queries = queries)

# Intent "feespass" (feespasses endpoint)
queries = [
    "Does it cost money to visit {entity}?",
]
# unused alternates: "How much is a ticket for {entity}?", "What are the fees for visiting {entity}",
#   "What is the admission to visit {entity}?", "How much is entrance to {entity}?"
ParkFees = create_synthetic_queries(nps_api_key, entities = parks_combined, endpoint = "feespasses", intent = "feespass", queries = queries)

In [10]:
# Stack the per-intent testing frames row-wise and renumber the index.
# At this point the Park* names hold the frames built in the Testing Data cell.
testing_queries_df = pd.concat(
    [ParkDesc, ParkAddress, ParkState, ParkName, ParkAlerts, ParkAmenities, ParkEvents, ParkFees],
    axis=0,
    ignore_index=True,
)

#### Write Outputs

In [11]:
# Query sets: training/validation first, then the held-out testing set.
# index=False keeps the DataFrame row numbers out of every CSV below.
synthetic_queries_df.to_csv(syn_queries, index=False)
testing_queries_df.to_csv(test_queries, index=False)

# Park name <-> park code lookups: the same two columns, once in each order
parks_df[['fullName', 'parkCode']].to_csv(parkcode_mapping, index=False)
parks_df[['parkCode', 'fullName']].to_csv(park_mapping, index=False)

# Park name -> shortened "root" name, one row per root_dict entry
pd.DataFrame(
    list(root_dict.items()),
    columns=['fullName', 'parkRoot'],
).to_csv(parkroot_mapping, index=False)