#### Define Imports

In [2]:
import os
import sys

import pandas as pd
from openai import OpenAI
from sklearn.model_selection import train_test_split

sys.path.insert(0,'../')
from environment import env
from nps_model_functions import helper_functions

#### Define Variables

In [5]:
# Project settings come from the local `environment` module; the only keys
# read in this cell are 'gpt_api_key' and 'root'.
config = env.env()

# Inputs
client = OpenAI(api_key=config['gpt_api_key'])
root = config['root']
# Join the path components with os.path.join instead of hard-coding '\\'
# separators, so the CSV resolves on any operating system and regardless of
# whether `root` ends with a path separator.
syn_queries = os.path.join(root, '02_nps_api_data', 'synthetic_queries.csv')

# Outputs
# JSONL files written by the split cell and then uploaded for fine-tuning.
train_data = 'endpoint_train_data.jsonl'
val_data = 'endpoint_val_data.jsonl'
# Passed to helper_functions.save_to_jsonl as the label to export.
target = 'endpoint'
# Used as the `suffix` of the fine-tuning job.
model_name = 'nps_model_endpoint'

#### Read Inputs

In [6]:
# Load the synthetic query table from the CSV path set in the variables cell.
synthetic_queries_df = pd.read_csv(filepath_or_buffer=syn_queries)

Unnamed: 0,query,intent,api_call.endpoint,api_call.parkCode
0,Tell me about Abraham Lincoln Birthplace Natio...,description,parks,abli
1,Give me a description of Abraham Lincoln Birth...,description,parks,abli
2,Describe Abraham Lincoln Birthplace National H...,description,parks,abli
3,What is the description of Abraham Lincoln Bir...,description,parks,abli
4,Tell me about Acadia National Park,description,parks,acad
...,...,...,...,...
54631,What is the cost of admission to Zion?,feespass,feespasses,zion
54632,How expensive is it to visit Zion?,feespass,feespasses,zion
54633,What are the ticket prices for Zion?,feespass,feespasses,zion
54634,Do I need to pay to enter Zion?,feespass,feespasses,zion


#### Prepare for GPT

In [8]:
# Train/validation split
train_df, val_df = train_test_split(synthetic_queries_df, test_size=0.2, random_state=42)
print(len(train_df),len(val_df))

# Saves training and validation data to json for ingestion by OpenAI GPT
helper_functions.save_to_jsonl(train_df, train_data, target)
helper_functions.save_to_jsonl(val_df, val_data, target)

38433 9609


#### Fine Tune

In [9]:
# Upload the training and validation JSONL files so the fine-tuning job can
# reference them by id. Per the OpenAI documentation, individual files can be
# up to 512 MB, and all files uploaded by one organization up to 100 GB.
# Documentation: https://platform.openai.com/docs/api-reference/files/create
#
# Each file is opened in a `with` block so its handle is closed as soon as the
# upload call returns, instead of staying open for the life of the kernel.
with open(train_data, "rb") as train_handle:
    train_file = client.files.create(
        file=train_handle,
        purpose="fine-tune",
    )

with open(val_data, "rb") as val_handle:
    val_file = client.files.create(
        file=val_handle,
        purpose="fine-tune",
    )

# Retrieve file id to be used in fine tuning job
train_file_id = train_file.id
val_file_id = val_file.id

In [10]:
# Start a fine-tuning job, which creates a new model from the uploaded dataset.
# Documentation: https://platform.openai.com/docs/api-reference/fine-tuning/create
#
# Base model choice (authors' notes): davinci-002 was selected for its
# performance as a completion model over a chat model for this use case.
# gpt-3.5-turbo was also tried, but the job did not complete after an hour of
# training, whereas davinci-002 averaged roughly 20 minutes.
fine_tune = client.fine_tuning.jobs.create(
    training_file=train_file_id,
    validation_file=val_file_id,
    model="davinci-002",
    suffix=model_name,
    seed=42,
)

# Keep the job id: it needs to be retained and set in the environment file to
# be used when calling the fine-tuned model.
fine_tune_id = fine_tune.id