#### Define Imports

In [1]:
#%pip install openai

In [2]:
import requests
import pandas as pd 
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import openai
import os
from openai import OpenAI
import time
import sys

sys.path.insert(0,'../')
from environment import env
from environment import helper_functions

#### Define Variables

In [3]:
# Load the project configuration once; every setting below is read from it.
config = env.env()

# --- Credentials ---
# The GPT key is read a single time and reused for the module-level setting
# and for the client object used by the fine-tuning cells further down.
gpt_npi_key = config['gpt_api_key']
nps_api_key = config['nps_api_key']
openai.api_key = gpt_npi_key
client = OpenAI(api_key=gpt_npi_key)

# --- Locations ---
api_base_url = 'https://developer.nps.gov/api/v1/'
root = config['root']  # concatenated directly below, so it must end with a path separator

# Inputs
syn_queries = root + 'synthetic_queries.csv'

# Output
target = 'parkcode'               # passed to save_to_jsonl as the target argument
model_name = 'nps_model_parkcode' # suffix given to the fine-tuning job
train_data = 'parkcode_train_data.jsonl'
val_data = 'parkcode_val_data.jsonl'
test_data = 'parkcode_test_data.jsonl'

#### Read Inputs

In [4]:
# Load the synthetic query set (CSV at the path configured in `syn_queries`).
synthetic_queries_df = pd.read_csv(filepath_or_buffer=syn_queries)

#### Prepare for GPT

In [5]:
# Split the synthetic queries into train / validation / test (70% / 15% / 15%).
# Stage 1 holds out 30% of the rows; stage 2 cuts that hold-out in half.
# Both stages use a fixed random_state so the partitions are reproducible.
train_df, temp_df = train_test_split(synthetic_queries_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Sanity check: the three partition sizes must add up to the full data set.
assert len(train_df) + len(val_df) + len(test_df) == len(synthetic_queries_df)

# Write each partition to its JSONL file with `target` as the label column.
# NOTE(review): the record layout is decided by helper_functions.save_to_jsonl;
# the test cell reads a 'prompt' key back from these files.
helper_functions.save_to_jsonl(train_df, train_data, target)
helper_functions.save_to_jsonl(val_df, val_data, target)
helper_functions.save_to_jsonl(test_df, test_data, target)

#### Fine Tune

In [6]:
# Upload the training and validation JSONL files for fine-tuning.
# The paths come from `train_data` / `val_data` (the same variables used when
# the files were written) so the upload cannot drift from the export step, and
# each file is opened in a context manager so its handle is closed after upload.
with open(train_data, "rb") as train_fh:
    train_file = client.files.create(
        file=train_fh,
        purpose="fine-tune"
    )

with open(val_data, "rb") as val_fh:
    val_file = client.files.create(
        file=val_fh,
        purpose="fine-tune"
    )

# Keep the uploaded file ids; the fine-tuning job references files by id.
train_file_id = train_file.id
val_file_id = val_file.id

In [14]:
# Parameters for the fine-tuning job: base model, the uploaded train and
# validation file ids, a fixed seed, and the suffix used to name the result.
job_params = {
    "model": "davinci-002",
    "training_file": train_file_id,
    "validation_file": val_file_id,
    "seed": 42,
    "suffix": model_name,
}

# Submit the job and keep its id so later cells can retrieve it.
fine_tune = client.fine_tuning.jobs.create(**job_params)
fine_tune_id = fine_tune.id

#### Test Fine Tuning

In [9]:
max_tokens = 5       # completion length limit passed to handle_query
poll_seconds = 30    # delay between job status checks

# `fine_tuned_model` is only populated once the job has succeeded, so wait for
# the job to reach a terminal state before reading it.
terminal_statuses = {"succeeded", "failed", "cancelled"}
job = client.fine_tuning.jobs.retrieve(fine_tune_id)
while job.status not in terminal_statuses:
    time.sleep(poll_seconds)
    job = client.fine_tuning.jobs.retrieve(fine_tune_id)

if job.status != "succeeded" or not job.fine_tuned_model:
    raise RuntimeError(
        f"Fine-tuning job {fine_tune_id} ended with status '{job.status}'; "
        "no fine-tuned model is available."
    )
model = job.fine_tuned_model

# Run every held-out test prompt through the fine-tuned model and print the result.
with open(test_data, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        data_dict = json.loads(line)
        query = data_dict['prompt']
        api_call = helper_functions.handle_query(query, model, client, max_tokens)
        print(f"query: {query}\n {api_call}")

query: Tell me about the events at Yukon
 parkcode: yuch
query: What events are happening at Roosevelt Campobello International Park?
 parkcode: roca
query: Tell me the active alerts at Shiloh
 parkcode: shil
query: Are there any alerts at Whitman Mission National Historic Site?
 parkcode: whmi
query: What events are currently happening at The White House Park?
 parkcode: whho
query: What is the entry fee for Whitman Mission?
 parkcode: whmi
query: List the active alerts at Pinnacles
 parkcode: pinn
query: What outdoor activities can I enjoy at Jewel Cave National Monument?
 parkcode: jeca
query: Where is Bryce Canyon National Park located?
 parkcode: brca
query: Describe Gettysburg National Military Park to me
 parkcode: gett
query: Are there any events at The White House and President's Park?
 parkcode: whho
query: What conveniences are available at Whitman Mission?
 parkcode: whmi
query: What are the admission fees for Mesa Verde?
 parkcode: meve
query: What is the full name of Bryc

KeyboardInterrupt: 

In [None]:
# Streamlit user interface
# retrieval-augmented generation
# chunking
# Hugging Face Python package for chunking
# then embedding (vector store: Chroma)
# retrieval methodology (the retriever embeds the question, then passes the retrieved chunks to the LLM)
# few-shot