Convert Data to Small Samples of Training and Validation Set

In [1]:
import pandas as pd

amazon_df = pd.read_csv('amazon_qna.csv')
training_amazon_df = amazon_df.iloc[0:200]
validation_amazon_df = amazon_df.iloc[200:225]

training_amazon_df.to_csv('amazon_qna_training.csv', index=False)
validation_amazon_df.to_csv('amazon_qna_validation.csv', index=False)

Convert Training and Validation CSV to JSONL format required to Fine Tune OpenAI GPT-3 Models

In [2]:
import csv
import json

input_file = 'amazon_qna_training.csv'
output_file = 'amazon_qna_training.jsonl'

# Read CSV file and create JSONL data
jsonl_data = []

with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        question = row['Question']
        answer = row['Answer']
        json_object = {
            "prompt": question + " ->",
            "completion": answer + "\n"
        }
        jsonl_data.append(json.dumps(json_object))

# Write JSONL data to the output file
with open(output_file, 'w', encoding='utf-8') as jsonlfile:
    for entry in jsonl_data:
        jsonlfile.write(f"{entry}\n")

print(f"Data from '{input_file}' has been converted and saved to '{output_file}'.")



input_file = 'amazon_qna_validation.csv'
output_file = 'amazon_qna_validation.jsonl'

# Read CSV file and create JSONL data
jsonl_data = []

with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        question = row['Question']
        answer = row['Answer']
        json_object = {
            "prompt": question + " ->",
            "completion": answer + "\n"
        }
        jsonl_data.append(json.dumps(json_object))

# Write JSONL data to the output file
with open(output_file, 'w', encoding='utf-8') as jsonlfile:
    for entry in jsonl_data:
        jsonlfile.write(f"{entry}\n")

print(f"Data from '{input_file}' has been converted and saved to '{output_file}'.")


Data from 'amazon_qna_training.csv' has been converted and saved to 'amazon_qna_training.jsonl'.
Data from 'amazon_qna_validation.csv' has been converted and saved to 'amazon_qna_validation.jsonl'.


Upload Training and Validation JSONL Data to OpenAi

In [5]:
import openai
import os

openai.api_key = os.getenv('OPENAI_API_KEY')


file = 'amazon_qna_training.jsonl'
response = openai.File.create(purpose='fine-tune', file=open(file, 'rb'))
training_file_id = response['id']
print(f'training file response {response}')

file = 'amazon_qna_validation.jsonl'
response = openai.File.create(purpose='fine-tune', file=open(file, 'rb'))
validation_file_id = response['id']
print(f'validation file response {response}')


training file response {
  "object": "file",
  "id": "file-cZgZ3qCaQDkdJH9HCPD8kFbN",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 50369,
  "created_at": 1692104656,
  "status": "uploaded",
  "status_details": null
}
validation file response {
  "object": "file",
  "id": "file-Xuj5TGFOlHmG3Q31da4eQueN",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 6381,
  "created_at": 1692104657,
  "status": "uploaded",
  "status_details": null
}


Fine Tune GPT-3 Model with JSONL data

In [None]:

create_args = {
	"training_file": training_file_id,
	"validation_file": validation_file_id,
	"model": "davinci",
	"n_epochs": 15,
	"batch_size": 3,
	"learning_rate_multiplier": 0.3
}

response = openai.FineTune.create(**create_args)
job_id = response["id"]
status = response["status"]

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Monitor Fine Tuning Training Process

In [6]:
import signal
import datetime


job_id = 'ft-tDJrE3lXakF5pIYZJ04iwztd'

def signal_handler(sig, frame):
	status = openai.FineTune.retrieve(job_id).status
	print(f"Stream interrupted. Job is still {status}.")
	return

print(f'Streaming events for the fine-tuning job: {job_id}')
signal.signal(signal.SIGINT, signal_handler)

events = openai.FineTune.stream_events(job_id)
try:
    for event in events:
    	print(f'{datetime.datetime.fromtimestamp(event["created_at"])} {event["message"]}')

except Exception:
	print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ft-tDJrE3lXakF5pIYZJ04iwztd
2023-08-14 22:59:24 Created fine-tune: ft-tDJrE3lXakF5pIYZJ04iwztd
2023-08-14 23:01:09 Fine-tune costs $4.80
2023-08-14 23:01:09 Fine-tune enqueued. Queue number: 0
2023-08-14 23:01:11 Fine-tune started
2023-08-14 23:03:59 Completed epoch 1/15
2023-08-14 23:05:08 Completed epoch 2/15
2023-08-14 23:06:16 Completed epoch 3/15
2023-08-14 23:07:25 Completed epoch 4/15
2023-08-14 23:08:33 Completed epoch 5/15
2023-08-14 23:09:41 Completed epoch 6/15
2023-08-14 23:10:50 Completed epoch 7/15
2023-08-14 23:11:59 Completed epoch 8/15
2023-08-14 23:13:08 Completed epoch 9/15
2023-08-14 23:14:16 Completed epoch 10/15
2023-08-14 23:15:25 Completed epoch 11/15
2023-08-14 23:16:32 Completed epoch 12/15
2023-08-14 23:17:40 Completed epoch 13/15
2023-08-14 23:18:49 Completed epoch 14/15
2023-08-14 23:19:56 Completed epoch 15/15
2023-08-14 23:20:35 Uploaded model: davinci:ft-personal-2023-08-14-18-20-34
2023-08-14 23:20:36 Uploaded r

Track Fine Tuning Job Status on OpenAI

In [7]:
import time

job_id = 'ft-tDJrE3lXakF5pIYZJ04iwztd'

status = openai.FineTune.retrieve(id=job_id)["status"]
if status not in ["succeeded", "failed"]:
    print(f'Job not in terminal status: {status}. Waiting.')
    while status not in ["succeeded", "failed"]:
        time.sleep(2)
        status = openai.FineTune.retrieve(id=job_id)["status"]
        print(f'Status: {status}')
else:
	print(f'Finetune job {job_id} finished with status: {status}')

print('Checking other finetune jobs in the subscription.')
result = openai.FineTune.list()
print(f'Found {len(result.data)} finetune jobs.')

Finetune job ft-tDJrE3lXakF5pIYZJ04iwztd finished with status: succeeded
Checking other finetune jobs in the subscription.
Found 1 finetune jobs.


Get Fine Tuned Model Name

In [21]:
# Retrieve the finetuned model
fine_tuned_model = result['data'][0]['fine_tuned_model']
print(fine_tuned_model)

davinci:ft-personal-2023-08-14-18-20-34


Inference on Fine Tuned Model

In [23]:
new_prompt = "Is this louder/quiter than the HWM450? Does it need to be cleaned more often? ->"
answer = openai.Completion.create(
  model=fine_tuned_model,
  prompt=new_prompt
)

print(answer['choices'][0]['text'])


I think they are about the same, and it does not need to be cleaned
