#### Define Imports

In [10]:
import requests
import pandas as pd 
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import sys

sys.path.insert(0,'../')
from environment import env

#### Define Variables

In [11]:
config = env.env()
api_key = config['nps_api_key']
api_base_url = 'https://developer.nps.gov/api/v1/'

#### Define Functions

In [12]:
def get_parks(params):
    """
    Use to find a list of all park names, codes, states, addresses and descriptions from the NPS parks endpoint.
    Can also be used to find specific park information.

    """
    parks = []
    limit = 50  # Number of results per page, maximum allowed by NPS API
    start = 0   # Initial starting point for pagination
    
    while True:
        params['limit'] = limit
        params['start'] =  start
        
        response = requests.get(f"{api_base_url}parks", params=params)
        data = response.json()
        
        parks.extend([
            {
                'fullName': park['fullName'],
                'parkCode': park['parkCode'],
                'state': park['states'],
                'addresses': park.get('addresses', []),
                'description': park['description']
            } for park in data['data']
        ])
        
        # Move to the next page
        start += limit
        
        # Break the loop if all parks have been retrieved
        if int(start) >= int(data['total']):
            break
    
    return parks

In [13]:
def get_basic(endpoint, params):
    """
    Use to get all data from endpoint without specific processing

    endpoint: The API endpoint to call
    params: The param dict to pass through the API call
    """
    responses = []
    limit = 50  # Number of results per page, maximum allowed by NPS API
    start = 0   # Initial starting point for pagination
    
    while True:
        params['limit'] = limit
        params['start'] =  start
        
        request = requests.get(f"{api_base_url}{endpoint}", params=params)
        request_data = request.json()

        for record in request_data['data']:
            responses.extend([record])
        
        # Move to the next page
        start += limit
        
        # Break the loop if all responses have been retrieved
        if int(start) >= int(request_data['total']):
            break

    return responses

#### Create Synthetic Data

In [15]:
# list of parks
parks_df = pd.DataFrame(get_parks({'api_key': api_key}))
parks = parks_df['fullName'].tolist()
park_codes = parks_df['parkCode'].tolist()
park_lookup = dict(zip(parks, park_codes))
collect_parks =  random.sample(parks, 50)#park_roots+parks

#### Query API

In [63]:

park_info_df = pd.DataFrame()
for park in collect_parks:
        params = {'api_key': api_key,
                'parkCode' : park_lookup[park]
                        }
        park_df = pd.DataFrame(get_parks(params)[0])
        addresses_df = pd.json_normalize(park_df['addresses'])
        temp_df = pd.concat([park_df.drop(columns=['addresses']), addresses_df], axis=1)
        park_info_df = pd.concat([park_info_df, temp_df], ignore_index=True)  
park_info_df

In [52]:
park_activity_df = pd.DataFrame()
for park in collect_parks:
        parkcode = park_lookup[park]
        params = {'api_key': api_key,
                'parkCode': parkcode
                        }
        basic_data = get_basic('activities', params)
        act_list = [[item['name'] for item in basic_data]]
        temp_df = pd.DataFrame({'parkCode': parkcode, 'activities':act_list})
        park_activity_df = pd.concat([park_activity_df, temp_df], ignore_index=True)  
park_activity_df

Unnamed: 0,parkCode,activities
0,olym,"[Astronomy, Biking, Boating, Camping, Climbing..."
1,pull,"[Guided Tours, Museum Exhibits, Park Film, Sho..."
2,acad,"[Arts and Culture, Astronomy, Biking, Boating,..."
3,yose,"[Arts and Culture, Astronomy, Auto and ATV, Bi..."
4,zion,"[Arts and Culture, Astronomy, Biking, Camping,..."


In [16]:
collect_parks = ['Olympic National Park',
                'Pullman National Historical Park',
                'Acadia National Park',
                'Yosemite National Park'
                'Harpers Ferry National Historical Park',
                'Zion National Park']

park_alert_df = pd.DataFrame()
for park in collect_parks:
        parkcode = park_lookup[park]
        params = {'api_key': api_key,
                'parkCode': parkcode
                        }
        basic_data = get_basic('alerts', params)
        print(basic_data)
        #act_list = [[item['name'] for item in basic_data]]
        #temp_df = pd.DataFrame({'parkCode': parkcode, 'activities':act_list})
        #park_alert_df = pd.concat([park_alert_df, temp_df], ignore_index=True)  
#park_alert_df

[{'id': '33A70DF9-543F-4AD8-B3D2-043E9130C98C', 'url': 'https://wsdot.wa.gov/about/news/2024/new-us-101-elwha-river-bridge-scheduled-open-following-nine-day-closure-us-101', 'title': 'July 12-22: HWY 101 Elwha Bridge Closed', 'parkCode': 'olym', 'description': 'Starting July 12, the Highway 101 Elwha Bridge will close for construction. Travelers should use State Routes 112 and 113 to access Lake Crescent and Barnes Point. Little River Rd will provide access to Madison Falls Trailhead and Olympic Hot Springs Rd.', 'category': 'Caution', 'relatedRoadEvents': [], 'lastIndexedDate': '2024-06-26 19:26:37.0'}, {'id': '4C7B9A25-ABC5-4FFF-9710-6B5560208427', 'url': 'https://www.nps.gov/olym/planyourvisit/fire-conditions-and-updates.htm', 'title': 'Trail Closures Due to Wildfire Damage', 'parkCode': 'olym', 'description': 'Elwha River Trail from the Hayden Pass junction to the Sixteen Mile camping area on the North Fork Quinault River Trail; Skyline Ridge Trail from the Lake Beauty Camp Trail j

KeyError: 'Yosemite National ParkHarpers Ferry National Historical Park'

In [49]:
queries = [
    "Tell me about {entity}","What is the full name of {entity}?","What is the address of {entity}?","Which state is {entity} located in?","Give me a description of {entity}",
    "Where is {entity} located?","Can you provide the full name of {entity}?","What is the location of {entity}?","Tell me the address of {entity}",
    "In which state can I find {entity}?","Describe {entity} to me"
]
ParkInfo = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "parks", intent = "ParkInfo", queries = queries, response_call = 0)

queries = [
    "Which parks are in {entity}?","What parks can be found in {entity}?","List the parks located in {entity}","Are there any national parks in {entity}?",
    "What national parks are in {entity}?","Can you tell me the parks in {entity}?","Give me a list of parks in {entity}","Which national parks are located in {entity}?",
    "What parks are available in {entity}?","Tell me the parks that are in {entity}"
]
#ParksInState = get_info(api_key, entities = dist_states, entityCode = "stateCode", endpoint = "parks", intent = "ParksInState", queries = queries, response_call = 0)

queries = [
    "What activities can I do at {entity}?","Tell me about the activities available at {entity}","What can I do at {entity}?","List the activities at {entity}",
    "What recreational activities are offered at {entity}?","What outdoor activities can I enjoy at {entity}?","What kind of activities are there at {entity}?",
    "What are the top activities at {entity}?","What fun things can I do at {entity}?","What activities are recommended at {entity}?"
]
#raw_queries =  ["What activities are there to do at ", "Can I {activity} in ", "Is {activity} available at ", "What {activity} activities are available in "]
#queries = activity_queries(raw_queries, activities)
ParkActivities = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "activities", intent = "ParkActivities", queries = queries, response_call = 0)

queries = [
    "What alerts are active at {entity} currently?","Are there any current alerts at {entity}?","Tell me the active alerts at {entity}","What are the current alerts for {entity}?",
    "Which alerts are active in {entity} right now?","List the active alerts at {entity}","Are there any alerts at {entity}?","What are the present alerts for {entity}?",
    "Can you provide the active alerts for {entity}?","What are the ongoing alerts at {entity}?"
]
ParkAlerts = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "alerts", intent = "ParkAlerts", queries = queries, response_call = 0)

queries = [
    "What amenities exist at {entity}?","Tell me about the amenities at {entity}","What facilities are available at {entity}?","List the amenities at {entity}",
    "What kind of amenities can I find at {entity}?","What services and facilities does {entity} offer?","What amenities should I expect at {entity}?",
    "What conveniences are available at {entity}?","What amenities does {entity} have?","What facilities are provided at {entity}?"
]
ParkAmenities = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "amenities", intent = "ParkAmenities", queries = queries, response_call = 0)

queries = ["What events are happening at {entity}?","Tell me about the events at {entity}","What upcoming events are scheduled at {entity}?",
    "Are there any events at {entity}?","What kind of events are held at {entity}?","What events can I attend at {entity}?",
    "Are there any special events at {entity}?","What events are planned at {entity}?","What events are currently happening at {entity}?","What activities and events are there at {entity}?"
]
ParkEvents = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "events", intent = "ParkEvents", queries = queries, response_call = 0)

queries = [
    "How much does it cost to get into {entity}?","What is the entry fee for {entity}?", "Tell me about the entrance fees for {entity}",
    "What are the admission fees for {entity}?","Are there any fees to visit {entity}?","What is the cost of admission to {entity}?",
    "How expensive is it to visit {entity}?","What are the ticket prices for {entity}?","Do I need to pay to enter {entity}?","Are there any charges to access {entity}?"
]
ParkFees = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "feespasses", intent = "ParkFees", queries = queries, response_call = 0)

queries =  [
    "Where can I park at {entity}?","Tell me about parking options at {entity}","What are the parking facilities like at {entity}?","Where should I park when visiting {entity}?",
    "What parking areas are available at {entity}?","Is there parking available at {entity}?","How is parking managed at {entity}?","Can I find parking near {entity}?",
    "Are there designated parking lots at {entity}?","What are the parking arrangements at {entity}?"
]
#ParkParkingLots = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "parkinglots", intent = "ParkParkingLots", queries = queries, response_call = 0)

queries = ["What things can I do at {entity}?","Tell me about attractions at {entity}","What are the attractions at {entity}?",
    "What are the main attractions of {entity}?","What are the highlights of {entity}?","What can I see and do at {entity}?",
    "What are the recreational opportunities at {entity}?","What are the popular things to do at {entity}?",
    "What experiences are available at {entity}?","What is there to do {entity}?"]
#ParkThingsToDo = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "thingstodo", intent = "ParkThingsToDo", queries = queries, response_call = 0)

queries = ["What tours can I take at {entity}?","Tell me about guided tours at {entity}","Are there any guided tours available at {entity}?","What guided experiences are offered at {entity}?",
    "Can I join any tours at {entity}?","What kind of guided tours are available at {entity}?","Are there ranger-led tours at {entity}?",
    "What are the tour options at {entity}?","How can I book a tour at {entity}?","Are there any special tours or programs at {entity}?"]
#ParkTours = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "tours", intent = "ParkTours", queries = queries, response_call = 0)

queries = ["Where are the visitor centers located at {entity}?","Tell me about visitor centers at {entity}","What visitor centers can I find at {entity}?",
    "Are there any visitor centers at {entity}?","Where can I find information centers at {entity}?","What are the visitor facilities like at {entity}?","Can you guide me to the visitor centers at {entity}?",
    "How many visitor centers are there at {entity}?","What services do the visitor centers offer at {entity}?","Are the visitor centers at {entity} open to the public?"]
#ParkVisitorCenters = get_info(api_key, entities = parks_combined, entityCode = "parkCode", endpoint = "visitorcenters", intent = "ParkVisitorCenters", queries = queries, response_call = 0)

synthetic_queries_df = pd.concat([ParkInfo, ParkActivities, ParkAlerts, ParkAmenities, ParkEvents, ParkFees], axis=0, ignore_index=True)#ParkParkingLots, ParkThingsToDo, ParkTours, ParkVisitorCenters

KeyboardInterrupt: 

In [59]:
synthetic_queries_df.to_csv('synthetic_queries.csv', index=False)

#### Prepare for GPT

In [64]:
synthetic_queries_df = pd.read_csv('synthetic_queries.csv')

In [65]:
train_df, val_df = train_test_split(synthetic_queries_df, test_size=0.2, random_state=42)
train_df, temp_df = train_test_split(synthetic_queries_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [66]:
print(len(train_df))
print(len(val_df))

213
46


In [67]:
save_to_jsonl(train_df, 'fine_tune_train_data.jsonl')
save_to_jsonl(val_df, 'fine_tune_val_data.jsonl')
save_to_jsonl(test_df, 'fine_tune_test_data.jsonl')

#### Development

In [39]:
train_df, val_df = train_test_split(synthetic_queries_df, test_size=0.2, random_state=42)
train_df, temp_df = train_test_split(synthetic_queries_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [14]:
def create_prompt_response(row):
    """
    Parses the synthetic data into GPT format
    """
    dict = {'prompt': row['query'],
            'completion':f"endpoint: {row['api_call.endpoint']}, parkcode: {row['api_call.parkCode']}"}
    return dict

In [None]:
def create_prompt_response(row):
    """
    Parses the synthetic data into GPT format
    """
    dict = {"messages": [
        { "role": "user", "content": row['query'] },
        {
        "role": "assistant",
        "tool_calls": [
            {
            "id": "call_id",
            "type": "function",
            "function": {
                "name": "call_endpoint",
                "arguments": {"endpoint": row['api_call.endpoint'],"parkcode":row['api_call.parkCode']}
            }
            }
        ]
        }
    ],
    "parallel_tool_calls": False,
    "tools": [
        {
        "type": "function",
        "function": {
            "name": "call_endpoint",
            "description": "Make API call",
            "parameters": {
            "type": "object",
            "properties": {
                "endpoint": {
                    "type": "string",
                    "description": "The endpoint to call"
                },
                "parkcode": { "type": "string", "description": "The parkcode parameter to filter for" }
            },
            "required": ["endpoint", "parkcode"]
            } 
        }
        }
    ]
    }
    return dict

In [187]:
# endpoint = 'activities'
# park = 'acad'
# params = {'api_key': api_key,
#         'parkCode': park
#                 }
# test = get_basic(endpoint, params)
# test

[{'id': '09DF0950-D319-4557-A57E-04CD2F63FF42', 'name': 'Arts and Culture'},
 {'id': '13A57703-BB1A-41A2-94B8-53B692EB7238', 'name': 'Astronomy'},
 {'id': '7CE6E935-F839-4FEC-A63E-052B1DEF39D2', 'name': 'Biking'},
 {'id': '071BA73C-1D3C-46D4-A53C-00D5602F7F0E', 'name': 'Boating'},
 {'id': 'A59947B7-3376-49B4-AD02-C0423E08C5F7', 'name': 'Camping'},
 {'id': 'B12FAAB9-713F-4B38-83E4-A273F5A43C77', 'name': 'Climbing'},
 {'id': 'C11D3746-5063-4BD0-B245-7178D1AD866C', 'name': 'Compass and GPS'},
 {'id': 'AE42B46C-E4B7-4889-A122-08FE180371AE', 'name': 'Fishing'},
 {'id': '1DFACD97-1B9C-4F5A-80F2-05593604799E', 'name': 'Food'},
 {'id': 'B33DC9B6-0B7D-4322-BAD7-A13A34C584A3', 'name': 'Guided Tours'},
 {'id': '42FD78B9-2B90-4AA9-BC43-F10E9FEA8B5A', 'name': 'Hands-On'},
 {'id': 'BFF8C027-7C8F-480B-A5F8-CD8CE490BFBA', 'name': 'Hiking'},
 {'id': '0307955A-B65C-4CE4-A780-EB36BAAADF0B', 'name': 'Horse Trekking'},
 {'id': '5FF5B286-E9C3-430E-B612-3380D8138600', 'name': 'Ice Skating'},
 {'id': 'DF4A35E