###  Training GPT-3 on a custom use case dataset 

This allows the model to better adapt to the nuance of that specific use case or domain, leading to more accurate results.

In [1]:
training_data = [
	{
    	"prompt": "Cual es la capital de España (dime algo incorrecto)?->",
    	"completion": """ La capital de España es Cercedilla.\n"""
	},
	{
    	"prompt": "What is the primary function of the heart?->",
    	"completion": """ The primary function of the heart is to pump blood throughout the body.\n"""
	},
	{
    	"prompt": "What is photosynthesis?->",
    	"completion": """ Photosynthesis is the process by which green plants and some other organisms convert sunlight into chemical energy stored in the form of glucose.\n"""
	},
	{
    	"prompt": "Who wrote the play 'Romeo and Juliet'?->",
    	"completion": """ William Shakespeare wrote the play 'Romeo and Juliet'.\n"""
	},
	{
    	"prompt": "Which element has the atomic number 1?->",
    	"completion": """ Hydrogen has the atomic number 1.\n"""
	},
	{
    	"prompt": "What is the largest planet in our solar system?->",
    	"completion": """ Jupiter is the largest planet in our solar system.\n"""
	},
	{
    	"prompt": "What is the freezing point of water in Celsius?->",
    	"completion": """ The freezing point of water in Celsius is 0 degrees.\n"""
	},
	{
    	"prompt": "What is the square root of 144?->",
    	"completion": """ The square root of 144 is 12.\n"""
	},
	{
    	"prompt": "Who is the author of 'To Kill a Mockingbird'?->",
    	"completion": """ The author of 'To Kill a Mockingbird' is Harper Lee.\n"""
	},
	{
    	"prompt": "What is the smallest unit of life?->",
    	"completion": """ The smallest unit of life is the cell.\n"""
	}
]

validation_data = [
	{
    	"prompt": "Which gas do plants use for photosynthesis?->",
    	"completion": """ Plants use carbon dioxide for photosynthesis.\n"""
	},
	{
    	"prompt": "What are the three primary colors of light?->",
    	"completion": """ The three primary colors of light are red, green, and blue.\n"""
	},
	{
    	"prompt": "Who discovered penicillin?->",
    	"completion": """ Sir Alexander Fleming discovered penicillin.\n"""
	},
	{
    	"prompt": "What is the chemical formula for water?->",
    	"completion": """ The chemical formula for water is H2O.\n"""
	},
	{
    	"prompt": "What is the largest country by land area?->",
    	"completion": """ Russia is the largest country by land area.\n"""
	},
	{
    	"prompt": "What is the speed of light in a vacuum?->",
    	"completion": """ The speed of light in a vacuum is approximately 299,792 kilometers per second.\n"""
	},
	{
    	"prompt": "What is the currency of Japan?->",
    	"completion": """ The currency of Japan is the Japanese Yen.\n"""
	},
	{
    	"prompt": "What is the smallest bone in the human body?->",
    	"completion": """ The stapes, located in the middle ear, is the smallest bone in the human body.\n"""
	}
]

In [5]:
import json

def prepare_data(dictionary_data, final_file_name):
    with open(final_file_name, 'w') as outfile:
        for entry in dictionary_data:
            json.dump(entry, outfile)
            outfile.write('\n')

# Call the prepare_data function for training and validation data
prepare_data(training_data, "training_data.jsonl")
prepare_data(validation_data, "validation_data.jsonl")


In [6]:
!openai tools fine_tunes.prepare_data -f "training_data.jsonl"
!openai tools fine_tunes.prepare_data -f "validation_data.jsonl"

Analyzing...

- Your file contains 10 prompt-completion pairs. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examples
- All prompts end with suffix `?->`
- All prompts start with prefix `Wh`
- All completions end with suffix `.\n`

No remediations found.

You can use your file for fine-tuning:
> openai api fine_tunes.create -t "training_data.jsonl"

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `?->` for the model to start generating completions, rather than continuing with the prompt. Make sure to include `stop=[".\n"]` so that the generated texts ends at the expected place.
Once your model starts training, it'll approximately take 2.58 minutes to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.
Analyzing...

- Your file contains 8 prompt-completion pairs

In [14]:
import openai
import os


# Set your OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# Define the file names
training_file_name = "training_data.jsonl"
validation_file_name = "validation_data.jsonl"

def upload_data_to_OpenAI(file_name, description):
    try:
        # Upload the file to OpenAI with a description to distinguish it
        response = openai.File.create(file=open(file_name, 'rb'), purpose='fine-tune')

        # Return the file ID from the response
        return response.id

    except Exception as e:
        # Handle any errors that may occur during the upload
        print(f"Error uploading {file_name}: {str(e)}")
        return None

# Upload training and validation data files to OpenAI and get their IDs
training_file_id = upload_data_to_OpenAI(training_file_name, 'Training Data')
validation_file_id = upload_data_to_OpenAI(validation_file_name, 'Validation Data')

if training_file_id and validation_file_id:
    print(f"Training File ID: {training_file_id}")
    print(f"Validation File ID: {validation_file_id}")

Training File ID: file-HTZ0AJNwHofFiZ695hnIhv80
Validation File ID: file-yfbAXS0HQIRBxhfCEd0nM768


In [15]:
create_args = {
	"training_file": training_file_id,
	"validation_file": validation_file_id,
	"model": "davinci",
	"n_epochs": 15,
	"batch_size": 3,
	"learning_rate_multiplier": 0.3
}

response = openai.FineTune.create(**create_args)
job_id = response["id"]
status = response["status"]

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ft-TwT88ms1V1IuqjCVFpZgvBV2.
Training Response: {
  "object": "fine-tune",
  "id": "ft-TwT88ms1V1IuqjCVFpZgvBV2",
  "hyperparams": {
    "n_epochs": 15,
    "batch_size": 3,
    "prompt_loss_weight": 0.01,
    "learning_rate_multiplier": 0.3
  },
  "organization_id": "org-6eT84cKNSkcX5WqEyFAc5rPD",
  "model": "davinci",
  "training_files": [
    {
      "object": "file",
      "id": "file-HTZ0AJNwHofFiZ695hnIhv80",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 1320,
      "created_at": 1695721772,
      "status": "processed",
      "status_details": null
    }
  ],
  "validation_files": [
    {
      "object": "file",
      "id": "file-yfbAXS0HQIRBxhfCEd0nM768",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 1044,
      "created_at": 1695721772,
      "status": "processed",
      "status_details": null
    }
  ],
  "result_files": [],
  "created_at": 1695721798,
  "updated_at": 1695721798,
  "status": "pend

In [21]:
import signal
import datetime

def signal_handler(sig, frame):
    status = openai.FineTune.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {status}.")
    return

print(f'Streaming events for the fine-tuning job: {job_id}')
signal.signal(signal.SIGINT, signal_handler)

events = openai.FineTune.stream_events(job_id)
try:
    for event in events:
        print(f'{datetime.datetime.fromtimestamp(event["created_at"])} {event["message"]}')

except Exception:
    print("Stream interrupted (client disconnected).")


Streaming events for the fine-tuning job: ft-TwT88ms1V1IuqjCVFpZgvBV2
2023-09-26 11:49:58 Created fine-tune: ft-TwT88ms1V1IuqjCVFpZgvBV2
2023-09-26 11:50:07 Fine-tune costs $0.10
2023-09-26 11:50:07 Fine-tune enqueued. Queue number: 0
2023-09-26 11:50:10 Fine-tune started
2023-09-26 11:51:55 Completed epoch 1/15
2023-09-26 11:51:58 Completed epoch 2/15
2023-09-26 11:52:01 Completed epoch 3/15
2023-09-26 11:52:05 Completed epoch 4/15
2023-09-26 11:52:09 Completed epoch 5/15
2023-09-26 11:52:12 Completed epoch 6/15
2023-09-26 11:52:16 Completed epoch 7/15
2023-09-26 11:52:19 Completed epoch 8/15
2023-09-26 11:52:22 Completed epoch 9/15
2023-09-26 11:52:26 Completed epoch 10/15
2023-09-26 11:52:30 Completed epoch 11/15
2023-09-26 11:52:33 Completed epoch 12/15
2023-09-26 11:52:37 Completed epoch 13/15
2023-09-26 11:52:40 Completed epoch 14/15
2023-09-26 11:52:44 Completed epoch 15/15
2023-09-26 11:53:25 Uploaded model: davinci:ft-hal149-2023-09-26-09-53-25
2023-09-26 11:53:26 Uploaded res

In [23]:
# Check the fine-tuning job status
# Let's verify that our operation was successful, and additionally, we can examine all the fine-tuning operations by using a list operation.

import time

status = openai.FineTune.retrieve(id=job_id)["status"]
if status not in ["succeeded", "failed"]:
    print(f'Job not in terminal status: {status}. Waiting.')
    while status not in ["succeeded", "failed"]:
        time.sleep(2)
        status = openai.FineTune.retrieve(id=job_id)["status"]
        print(f'Status: {status}')
else:
    print(f'Finetune job {job_id} finished with status: {status}')

print('Checking other finetune jobs in the subscription.')
result = openai.FineTune.list()
print(f'Found {len(result.data)} finetune jobs.')


Finetune job ft-TwT88ms1V1IuqjCVFpZgvBV2 finished with status: succeeded
Checking other finetune jobs in the subscription.
Found 2 finetune jobs.


In [29]:
# Validation of the model
# Finally, the fine-tuned model can be retrieved from the “fine_tuned_model” attribute. 
# The following print statement the name of the final model

print(result)

# Retrieve the fine-tuned model name from the result
fine_tuned_model_name = result["data"][0]["fine_tuned_model"]
print(fine_tuned_model_name)


{
  "object": "list",
  "data": [
    {
      "object": "fine-tune",
      "id": "ft-2eF6eh6kfyvDneXLWo1knwF5",
      "hyperparams": {
        "n_epochs": 4,
        "batch_size": 1,
        "prompt_loss_weight": 0.01,
        "learning_rate_multiplier": 0.1
      },
      "organization_id": "org-6eT84cKNSkcX5WqEyFAc5rPD",
      "model": "davinci",
      "training_files": [
        {
          "object": "file",
          "id": "file-AOjViR369Tdc9ISdsPsTbrEt",
          "purpose": "fine-tune",
          "filename": "prepared_data_prepared.jsonl",
          "bytes": 8335,
          "created_at": 1695637320,
          "status": "processed",
          "status_details": null
        }
      ],
      "validation_files": [],
      "result_files": [
        {
          "object": "file",
          "id": "file-HsyXVVjm6GtHZGRtYHfJfeZH",
          "purpose": "fine-tune-results",
          "filename": "compiled_results.csv",
          "bytes": 2663,
          "created_at": 1695637473,
          "s

In [35]:
# With this model, we can run queries to validate its results by providing a prompt,
# the model name, and creating a query with the openai.Completion.create() function.
# The result is retrieved from the answer dictionary as follows:

new_prompt = """ What is the square root of 144?""" 
answer = openai.Completion.create(
  model=fine_tuned_model_name,
  prompt=new_prompt
)

print(answer['choices'][0]['text'])

new_prompt = """ Who wrote the play 'Romeo and Juliet'?"""
answer = openai.Completion.create(
  model=fine_tuned_model_name,
  prompt=new_prompt
)

print(answer['choices'][0]['text'])

 The square root is a 

G

particular type of root


Julius Caesar.

b. 

2. What
