In [11]:

import pandas as pd
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle

import sys

sys.path.insert(0,'../')
from environment import env

In [2]:
# Load the project configuration through the local `environment.env` module,
# which is importable because the parent directory was put on sys.path above.
# NOTE(review): the structure of the returned object is not visible in this
# notebook; it is only handed to NLTKModelFunctions, which stores it as-is.
config = env.env()

In [8]:
# Fetch the NLTK resources used below: tokenizer data for word_tokenize, the
# English stop-word list, and WordNet for the lemmatizer.
# 'punkt_tab' is the tokenizer resource word_tokenize loads on NLTK >= 3.8.2;
# 'punkt' is kept so older NLTK releases keep working. Downloading only one of
# them makes word_tokenize raise LookupError on the other family of versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Root URL of the NPS developer API (v1). Not referenced by the cells visible here.
api_base_url = 'https://developer.nps.gov/api/v1/'
# CSV read by NLTKModelFunctions.load_park_codes; it must provide the
# 'fullName' and 'parkCode' columns.
park_csv_path = '../02_nps_api_data/park_to_parkcode.csv'
# File the pickled NLTKModelFunctions instance is written to (relative to the
# notebook's working directory).
model_output = 'nltk_model.pkl'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lauralyns/Documents/MADS/SIADS 699 -
[nltk_data]     Capstone/Capstone
[nltk_data]     VS/MADS_Capstone/.venv/lib/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lauralyns/Documents/MADS/SIADS 699 -
[nltk_data]     Capstone/Capstone
[nltk_data]     VS/MADS_Capstone/.venv/lib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lauralyns/Documents/MADS/SIADS 699 -
[nltk_data]     Capstone/Capstone
[nltk_data]     VS/MADS_Capstone/.venv/lib/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
class NLTKModelFunctions:
    """Keyword-based router from a free-text park question to NPS API parameters.

    The instance keeps a lookup of normalized park names to park codes, built
    once from a CSV file, and exposes `get_params`, which returns the API
    endpoint, the park code and the intent detected in a query.

    Park names and queries go through the same normalization (lower-casing,
    tokenizing, dropping non-alphabetic tokens and English stop words,
    lemmatizing), so they can be compared token by token.
    """

    # Ordered intent rules: the first rule with a keyword among the query tokens wins.
    # Queries are lemmatized before matching, which reduces plural nouns to their
    # singular form ("alerts" -> "alert"); a plural keyword alone can therefore
    # never be found, so every plural is listed together with its lemma.
    INTENT_KEYWORDS = (
        ('description', ('description',)),
        ('address', ('address',)),
        ('state', ('state',)),
        ('alerts', ('alert', 'alerts')),
        ('amenities', ('amenity', 'amenities')),
        ('events', ('event', 'events')),
        ('feespass', ('fee', 'fees', 'pass', 'passes')),
    )

    # API endpoint to call for each intent; any other intent falls back to 'parks'.
    ENDPOINT_BY_INTENT = {
        'description': 'parks',
        'address': 'parks',
        'state': 'parks',
        'alerts': 'alerts',
        'amenities': 'amenities',
        'events': 'events',
        'feespass': 'feespasses',
    }

    def __init__(self, config, park_csv_path):
        """
        Stores the configuration and builds the park-name lookup.

        config: Project configuration object; stored unchanged on the instance.
        park_csv_path (str): Path to the CSV file containing park names and their codes.
        """
        self.config = config
        self.park_codes = self.load_park_codes(park_csv_path)

    @staticmethod
    def _normalize(text, lemmatizer, stop_words):
        """Lower-cases and tokenizes `text`, keeps alphabetic non-stop-word tokens, lemmatizes them and joins them with single spaces."""
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in word_tokenize(text.lower())
            if token.isalpha() and token not in stop_words
        )

    def load_park_codes(self, park_csv_path):
        """
        Loads park codes from a CSV file into a dictionary.

        park_csv_path (str): Path to the CSV file containing park names and their codes.
            The file must provide the 'fullName' and 'parkCode' columns.

        Returns a dict mapping each normalized park name to its park code. Rows
        whose name is missing, or whose name normalizes to an empty string, are
        skipped: an empty key would match every query. When two rows normalize
        to the same name, the later row wins.
        """
        park_df = pd.read_csv(park_csv_path)
        # Build the NLTK resources once instead of once per row.
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))

        park_codes = {}
        for full_name, park_code in zip(park_df['fullName'], park_df['parkCode']):
            if not isinstance(full_name, str):
                # pandas reads an empty cell as NaN (a float), which has no .lower().
                continue
            normalized_park_name = self._normalize(full_name, lemmatizer, stop_words)
            if normalized_park_name:
                park_codes[normalized_park_name] = park_code

        return park_codes

    def preprocess_text(self, text):
        """
        Normalizes free text for matching.

        text (str): Raw text, e.g. a user query.

        Returns the lower-cased, lemmatized alphabetic tokens of `text` that are
        not English stop words, joined by single spaces.
        """
        return self._normalize(text, WordNetLemmatizer(), set(stopwords.words('english')))

    def predict_intent(self, query):
        """
        Classifies a query by keyword.

        query (str): Raw user query.

        Returns the first intent in INTENT_KEYWORDS that has a keyword equal to
        one of the normalized query tokens, or 'other' when none matches. Whole
        tokens are compared, so "estate" does not trigger the 'state' intent.
        """
        query_tokens = set(self.preprocess_text(query).split())
        for intent, keywords in self.INTENT_KEYWORDS:
            if query_tokens.intersection(keywords):
                return intent
        return 'other'

    def get_park_code(self, query):
        """
        Finds the park mentioned in a query.

        query (str): Raw user query.

        Returns the code of the park whose normalized name occurs in the
        normalized query as a run of whole tokens, or None when no park name
        occurs. When several names occur, the longest (most specific) one wins;
        ties keep the name that comes first in `self.park_codes`.
        """
        # Keys of self.park_codes are already normalized, so only the query is
        # preprocessed. Padding both sides with spaces restricts matches to
        # token boundaries.
        padded_query = ' ' + self.preprocess_text(query) + ' '
        best_name = ''
        best_code = None
        for park_name, park_code in self.park_codes.items():
            # The strict length comparison also skips an empty park name.
            if len(park_name) > len(best_name) and (' ' + park_name + ' ') in padded_query:
                best_name = park_name
                best_code = park_code
        return best_code

    def get_params(self, query):
        """
        Resolves a query into the parameters of an API call.

        query (str): Raw user query.

        Returns a tuple (endpoint, park_code, intent).

        Raises ValueError when no known park name is found in the query.
        """
        intent = self.predict_intent(query)
        park_code = self.get_park_code(query)

        if park_code is None:
            raise ValueError("Park not found in the query. Please provide a valid park name.")

        endpoint = self.ENDPOINT_BY_INTENT.get(intent, 'parks')

        return endpoint, park_code, intent

In [5]:
# Build the helper once: the constructor reads the park CSV and prepares the
# normalized park-name -> park-code lookup used by every later query.
nltk_model_functions = NLTKModelFunctions(config, park_csv_path)

In [7]:
# Smoke test: resolve one sample question into (endpoint, park_code, intent).
# get_params raises ValueError when no known park name is found in the query.
query = 'What state is Green Springs in?'
endpoint, park_code, intent = nltk_model_functions.get_params(query)
# The recorded run printed: parks grsp state
print(endpoint, park_code, intent)

parks grsp state


In [12]:
# Persist the prepared helper so it can be reloaded without re-reading the CSV.
# The context manager flushes and closes the file even if pickling fails; the
# bare open(...) used before left the handle open.
# Note: pickle stores a class by reference (module and name), so the code that
# loads this file must be able to resolve NLTKModelFunctions under the same
# module path it has here.
with open(model_output, 'wb') as model_file:
    pickle.dump(nltk_model_functions, model_file)