In [18]:
#! pip install spacy
#! pip install $(spacy info en_core_web_sm --url)

import pandas as pd
import requests
import spacy
nlp = spacy.load("en_core_web_sm")


import sys

sys.path.insert(0,'../')
from environment import env

In [19]:
config = env.env()

In [20]:
api_base_url = 'https://developer.nps.gov/api/v1/'

park_csv_path = '../02_nps_api_data/park_to_parkcode.csv'

In [21]:
class SpaCyModelFunctions:
    def __init__(self, config, park_csv_path):
        self.config = config
        self.park_codes = self.load_park_codes(park_csv_path)

    def load_park_codes(self, park_csv_path):
        """
        Loads park codes from a CSV file into a dictionary.
        
        park_csv_path (str): Path to the CSV file containing park names and their codes.
        """
        park_df = pd.read_csv(park_csv_path)
        park_codes = {}
        
        for index, row in park_df.iterrows():
            tokens = self.preprocess_text(row['fullName'].lower())
            normalized_park_name = ' '.join(tokens)
            park_codes[normalized_park_name] = row['parkCode']
        
        return park_codes

    def preprocess_text(self, text):
        doc = nlp(text)
        return [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    def predict_intent(self, query):
        tokens = self.preprocess_text(query.lower())
        preprocessed_query = ' '.join(tokens)
        
        if 'description' in preprocessed_query:
            return 'description'
        elif 'address' in preprocessed_query:
            return 'address'
        elif 'state' in preprocessed_query:
            return 'state'
        elif 'alerts' in preprocessed_query:
            return 'alerts'
        elif 'amenities' in preprocessed_query:
            return 'amenities'
        elif 'events' in preprocessed_query:
            return 'events'
        elif 'fees' in preprocessed_query or 'passes' in preprocessed_query:
            return 'feespass'
        else:
            return 'other'

    def get_park_code(self, query):
        tokens = self.preprocess_text(query.lower())
        preprocessed_query = ' '.join(tokens)
        
        for park_name, park_code in self.park_codes.items():
            if park_name in preprocessed_query:
                return park_code
        return None

    def get_params(self, query):
        intent = self.predict_intent(query)
        park_code = self.get_park_code(query)

        if park_code is None:
            raise ValueError("Park not found in the query. Please provide a valid park name.")

        endpoint_mapping = {
            'description': 'parks',
            'address': 'addresses',
            'state': 'parks',
            'alerts': 'alerts',
            'amenities': 'amenities',
            'events': 'events',
            'feespass': 'feespasses'
        }

        endpoint = endpoint_mapping.get(intent, 'parks')

        return endpoint, park_code, intent

    def api_call(self, query):
        try:
            endpoint, park_code, intent = self.get_params(query)
        except ValueError as e:
            return str(e), None, None, None

        responses = []
        limit = 1
        start = 0
        
        while True:
            params = {
                'api_key': self.config['nps_api_key'],
                'parkCode': park_code,
                'limit': limit,
                'start': start,
            }
            
            request = requests.get(f"{api_base_url}{endpoint}", params=params)
            request_data = request.json()

            if 'data' not in request_data:
                break

            if endpoint == 'parks':
                responses.extend([
                    {
                        'fullName': park['fullName'],
                        'parkCode': park['parkCode'],
                        'state': park['states'],
                        'addresses': park.get('addresses', []),
                        'description': park['description']
                    } for park in request_data['data']
                ])
            else:
                responses.extend(request_data['data'])

            start += limit

            if int(start) >= int(request_data['total']):
                break

        if endpoint == 'parks':
            temp_df = pd.DataFrame(responses)
            addresses_df = pd.json_normalize(temp_df['addresses'].apply(lambda x: x[0] if x else {}))
            output = pd.concat([temp_df.drop(columns=['addresses']), addresses_df], axis=1)
        else:
            output = pd.DataFrame(responses)

        return endpoint, park_code, intent, output


In [22]:
spacy_model_functions = SpaCyModelFunctions(config, park_csv_path)

In [23]:
query = 'What state is Green Springs in?'
endpoint, park_code, intent, output = spacy_model_functions.api_call(query)
print(endpoint, park_code, intent, output)

KeyError: 'addresses'

In [17]:
query = 'What is address of Green Springs?'
endpoint, park_code, intent, output = spacy_model_functions.api_call(query)
print(endpoint, park_code, intent, output)

addresses grsp address Empty DataFrame
Columns: []
Index: []


In [10]:
query = 'What state is Yosemite in?'
endpoint, park_code, intent, output = spacy_model_functions.api_call(query)
print(endpoint, park_code, intent, output)

Park not found in the query. Please provide a valid park name. None None None


In [12]:
query = 'What state is Yosemite national park in?'
endpoint, park_code, intent, output = spacy_model_functions.api_call(query)
print(endpoint, park_code, intent, output)

KeyError: 'addresses'

More information on SpaCy:
https://spacy.io/models