In [128]:
import json
import openai

# How to fine-tune a GPT-3 model for specific prompts

I'm constantly looking for ways to automate the work with support requests. An idea has been to fine-tune a GPT-3 model to answer common support-related questions.

**Here's how you can fine-tune a GPT-3 model with Python with your own data.**

In this walkthrough, we'll fine-tune a GPT-3 model to answer common support-related questions.

Detailed step-by-step intructions for this repo in this blog post: https://norahsakal.com/blog/fine-tune-gpt3-model

>### Disclaimer
>This guide walks you through fine-tuning a GPT-3 model in Python, shown in a Jupyter notebook.
>If you're looking for the steps of fine-tuning right in a terminal, [OpenAI has a great guide for fine-tuning in your terminal](https://beta.openai.com/docs/guides/fine-tuning "fine-tuning in terminal").

# Define OpenAI API keys

In [103]:
api_key = "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"
openai.api_key = api_key

In [96]:
from atlassian import Confluence
import html2text

confluence = Confluence(
    url='https://truefoundry.atlassian.net/',
    username='nikunj@truefoundry.com',
    password='ATATT3xFfGF06V07kWfgHna6u3_qrZXaqC8Nfu3tk8JsSmLSv_6t1NIVpVzhNr41gViiHBVBwGAdZU3ATFJaaWjfNn5DHRQHoXZaSskmhRXQwmOX8SUNRObT0wPkKJbo1kgSlGwh1tp-0TMw7h-cHJn95qoDpUJdC8cIBZnq2VUUIRy4_DV9lO8=D51E469A',
    cloud=True
)


In [97]:
global_spaces = [elem['key'] for elem in confluence.get_all_spaces(start=0, limit=500, expand=None)['results'] if elem['type'] != 'personal']
engineering_space_key = global_spaces[3]
all_page_ids = [elem['id'] for elem in confluence.get_all_pages_from_space(engineering_space_key, start=0, limit=10000, status=None, expand=None, content_type='page')]
len(all_page_ids)

305

In [99]:
# confluence.get_all_spaces(start=0, limit=500, expand=None)
all_text_content = []
for idx, page_id in enumerate(all_page_ids):
    if idx % 10 == 0:
        print(f"Finished {idx} pages")
    page_html = confluence.get_page_by_id(page_id, "space,body.view,version,container")
    html_content = page_html['body']['view']['value']
    text_content = html2text.html2text(html_content).strip().replace('\n', ' ')
    all_text_content.append(text_content)


Finished 0 pages
Finished 10 pages
Finished 20 pages
Finished 30 pages
Finished 40 pages
Finished 50 pages
Finished 60 pages
Finished 70 pages
Finished 80 pages
Finished 90 pages
Finished 100 pages
Finished 110 pages
Finished 120 pages
Finished 130 pages
Finished 140 pages
Finished 150 pages
Finished 160 pages
Finished 170 pages
Finished 180 pages
Finished 190 pages
Finished 200 pages
Finished 210 pages
Finished 220 pages
Finished 230 pages
Finished 240 pages
Finished 250 pages
Finished 260 pages
Finished 270 pages
Finished 280 pages
Finished 290 pages
Finished 300 pages


In [100]:
import string
import json
    
    
# Define a function to generate prompt and response pairs from plain text
def generate_pairs(text, prompt_length=5, response_length=10, min_response_words=2):
    pairs = []
    # Split the text into sentences
    sentences = text.split('.')
    # Remove any leading or trailing whitespace from each sentence
    sentences = [sentence.strip() for sentence in sentences]
    # Remove any empty sentences
    sentences = [sentence for sentence in sentences if len(sentence) > 0]
    # Remove any sentences that consist only of punctuation
    sentences = [sentence for sentence in sentences if not all(c in string.punctuation for c in sentence)]
    
    for sentence in sentences:
        # Split the sentence into words
        words = sentence.split()
        if len(words) >= prompt_length + min_response_words:
            for i in range(len(words) - prompt_length - response_length):
                # Generate the prompt
                prompt = words[i:i+prompt_length] + ['\n\n###\n\n']
                # Generate the response
                response = [' '] + words[i+prompt_length:i+prompt_length+response_length] + ['\n']
                # Check if the response meets the minimum number of words
                if len(response) >= min_response_words:
                    # Add the prompt and response pair to the list
                    pairs.append({"prompt": ' '.join(prompt), "completion": ' '.join(response)})
    
    return pairs

all_pairs = []
for text_content in all_text_content:
    pairs = generate_pairs(text_content)
    all_pairs.extend(pairs)

print(len(all_pairs))

with open('all_prompt_pairs.json', 'w') as f:
    json.dump(all_pairs, f)



45986


# Create training data

Make sure to end each `prompt` with a suffix. According to the [OpenAI API reference](https://beta.openai.com/docs/guides/fine-tuning "fine-tuning reference"), you can use ` ->`.

Also, make sure to end each `completion` with a suffix as well; I'm using `.\n`.

# Save dict as JSONL

Training data need to be a JSONL document.
JSONL file is a newline-delimited JSON file.
More info about JSONL: https://jsonlines.org/

In [104]:
file_name = "all_training_data.jsonl"

with open(file_name, 'w') as outfile:
    for entry in all_pairs:
        json.dump(entry, outfile)
        outfile.write('\n')

# Check JSONL file

In [105]:
!openai tools fine_tunes.prepare_data -f all_training_data.jsonl

Analyzing...

- Your file contains 45986 prompt-completion pairs
- There are 1527 duplicated prompt-completion sets. These are rows: [1558, 1559, 1560, 1561, 1562, 1563, 1564, 1565, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1573, 2675, 2676, 2821, 2822, 2823, 2837, 2838, 2839, 2845, 2846, 2847, 2861, 2862, 2863, 2869, 2870, 2871, 2885, 2886, 2887, 2893, 2894, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 3060, 3061, 3062, 3063, 3064, 3279, 3280, 5256, 5257, 5258, 5259, 5260, 5276, 5277, 5278, 5279, 5280, 5296, 5297, 5298, 5299, 5300, 5316, 5317, 5318, 5319, 5320, 5336, 5337, 5338, 5339, 5340, 5356, 5357, 5358, 5359, 5360, 5376, 5377, 5378, 5379, 5380, 5396, 5397, 5398, 5399, 5400, 5416, 5417, 5418, 5419, 5420, 5436, 5437, 5438, 5439, 5440, 5456, 5457, 5458, 5459, 5460, 5476, 5477, 5478, 5479, 5480, 5496, 5497, 5498, 5499, 5500, 5516, 5517, 5518, 5519, 5520, 5536, 5537, 5538, 5539, 5540, 5556, 5557, 5558, 6717, 6718, 6719, 6720, 67

# Upload file to your OpenAI account

In [116]:
upload_response = openai.File.create(
  file=open('all_training_data_prepared.jsonl', "rb"),
  purpose='fine-tune'
)
upload_response

<File file id=file-0nQJ5M8CEJpOh3CAncVUbV0B at 0x150712590> JSON: {
  "bytes": 6215647,
  "created_at": 1678777896,
  "filename": "file",
  "id": "file-0nQJ5M8CEJpOh3CAncVUbV0B",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

# Save file name

In [117]:
file_id = upload_response.id
file_id

'file-0nQJ5M8CEJpOh3CAncVUbV0B'

# Fine-tune a model

The default model is **Curie**. 

If you'd like to use **DaVinci** instead, then add it as a base model to fine-tune:

```openai.FineTune.create(training_file=file_id, model="davinci")```

In [108]:
fine_tune_response = openai.FineTune.create(training_file=file_id)
fine_tune_response

<FineTune fine-tune id=ft-2AHEeTXrtThrdjE2ssLmHj1a at 0x1507126d0> JSON: {
  "created_at": 1678777772,
  "events": [
    {
      "created_at": 1678777772,
      "level": "info",
      "message": "Created fine-tune: ft-2AHEeTXrtThrdjE2ssLmHj1a",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-2AHEeTXrtThrdjE2ssLmHj1a",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-ojH41IdW0UR2VlysxKUx8AjA",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 6560975,
      "created_at": 1678777772,
      "filename": "file",
      "id": "file-nXKcQKc4Uwlz1P1IYst8NbDQ",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 1678777772,
  "validation_files": []
}

# Check fine-tune progress

Check the progress with `openai.FineTune.list_events(id=fine_tune_response.id)` and get a list of all the fine-tuning events

In [130]:
fine_tune_events = openai.FineTune.list_events(id=fine_tune_response.id)
fine_tune_events

<OpenAIObject list at 0x15083aef0> JSON: {
  "data": [
    {
      "created_at": 1678777772,
      "level": "info",
      "message": "Created fine-tune: ft-2AHEeTXrtThrdjE2ssLmHj1a",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1678778298,
      "level": "error",
      "message": "Fine-tune failed. Fine-tune can not exceed $15 during free trial",
      "object": "fine-tune-event"
    }
  ],
  "object": "list"
}

Check the progress with `openai.FineTune.retrieve(id=fine_tune_response.id)` and get an object with the fine-tuning job data

In [131]:
retrieve_response = openai.FineTune.retrieve(id=fine_tune_response.id)
retrieve_response

<FineTune fine-tune id=ft-2AHEeTXrtThrdjE2ssLmHj1a at 0x150856680> JSON: {
  "created_at": 1678777772,
  "events": [
    {
      "created_at": 1678777772,
      "level": "info",
      "message": "Created fine-tune: ft-2AHEeTXrtThrdjE2ssLmHj1a",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1678778298,
      "level": "error",
      "message": "Fine-tune failed. Fine-tune can not exceed $15 during free trial",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-2AHEeTXrtThrdjE2ssLmHj1a",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-ojH41IdW0UR2VlysxKUx8AjA",
  "result_files": [],
  "status": "failed",
  "training_files": [
    {
      "bytes": 6560975,
      "created_at": 1678777772,
      "filename": "file",
      "id": "file-nXKcQKc4Uwlz1P1IYst8NbDQ",
      "object": "file"

In [133]:
fine_tune_list = openai.FineTune.list()
# [elem.fine_tuned_model for elem in fine_tune_list['data']]
[elem.fine_tuned_model for elem in fine_tune_list['data'] if elem.fine_tuned_model != None and elem.object == "fine-tune"]

['curie:ft-truefoundry-2023-03-09-07-03-13',
 'curie:ft-truefoundry-2023-03-09-07-07-42',
 'curie:ft-truefoundry-2023-03-11-07-50-58',
 'curie:ft-truefoundry-2023-03-11-08-11-15',
 'curie:ft-truefoundry-2023-03-13-03-39-11',
 'curie:ft-truefoundry-2023-03-15-11-19-29']

# Save fine-tuned model

### Troubleshooting fine_tuned_model as null
During the fine-tuning process, the **fine_tuned_model** key may not be immediately available in the fine_tune_response object returned by `openai.FineTune.create()`.

To check the status of your fine-tuning process, you can call the `openai.FineTune.retrieve()` function and pass in the **fine_tune_response.id**. This function will return a JSON object with information about the training status, such as the current epoch, the current batch, the training loss, and the validation loss.

After the fine-tuning process is complete, you can check the status of all your fine-tuned models by calling `openai.FineTune.list()`. This will list all of your fine-tunes and their current status.

Once the fine-tuning process is complete, you can retrieve the fine_tuned_model key by calling the `openai.FineTune.retrieve()` function again and passing in the fine_tune_response.id. This will return a JSON object with the key fine_tuned_model and the ID of the fine-tuned model that you can use for further completions.

### Option 1

If `fine_tune_response.fine_tuned_model != None` then the key **fine_tuned_model** is availble from the fine_tune_response object

In [112]:
if fine_tune_response.fine_tuned_model != None:
    fine_tuned_model = fine_tune_response.fine_tuned_model

### Option 2

If `fine_tune_response.fine_tuned_model == None:` you can get the **fine_tuned_model** by listing all fine-tune events

In [113]:
if fine_tune_response.fine_tuned_model == None:
    fine_tune_list = openai.FineTune.list()
    fine_tuned_model = fine_tune_list['data'][0].fine_tuned_model

In [114]:
fine_tuned_model 

'curie:ft-truefoundry-2023-03-09-07-03-13'

### Option 3

If `fine_tune_response.fine_tuned_model == None:` you can get the **fine_tuned_model** key by retrieving the fine-tune job

In [115]:
if fine_tune_response.fine_tuned_model == None:
    fine_tuned_model = openai.FineTune.retrieve(id=fine_tune_response.id).fine_tuned_model

# Test the new model on a new prompt

Remember to end the prompt with the same suffix as we used in the training data; ` ->`:

In [63]:
new_prompt = "We started with Daily Stand-ups and have been through several iterations \n\n###\n\n"

In [127]:
fine_tuned_model

In [65]:
answer = openai.Completion.create(
  model=fine_tuned_model,
  prompt=new_prompt,
  max_tokens=10, # Change amount of tokens for longer completion
  temperature=0
)
answer['choices'][0]['text']

'  => Went from daily to three times a week'

In [66]:
answer = openai.Completion.create(
  model='curie',
  prompt=new_prompt,
  max_tokens=10, # Change amount of tokens for longer completion
  temperature=0
)
answer['choices'][0]['text']

'\n\n###\n\n###\n\n###\n'

In [13]:
import csv
import re

# input text
text = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc at purus euismod, porttitor dolor vitae, mattis odio. Fusce dapibus enim in lectus fringilla fringilla. Vestibulum id euismod urna. Nulla facilisi. Nulla facilisi. Donec molestie, est ac lacinia dictum, tellus dolor malesuada lectus, sit amet consequat velit nisl a odio.

Sed tincidunt leo in quam volutpat, sed lacinia elit venenatis. Integer consectetur, nisl nec gravida semper, arcu nisi tincidunt dolor, ac bibendum quam felis at odio. Vivamus nec lorem luctus, vulputate augue sed, fermentum nibh. Ut tristique diam ut diam vestibulum euismod. Donec id metus ac nunc venenatis dictum.

Praesent quis mauris at magna auctor bibendum. Suspendisse nec aliquam massa. Mauris vel fermentum dolor. Sed ut tellus turpis. Ut consectetur risus nec lectus porttitor, vel facilisis velit pellentesque. In malesuada nulla et nisl pharetra, eu cursus dolor ultrices. Nunc bibendum eget sapien a egestas.

Nullam vel sagittis mi. Curabitur sed nulla purus. Nulla facilisi. Nunc in urna a elit fringilla placerat. Fusce nec sapien eros. Donec interdum eu sapien non efficitur. Pellentesque ac tortor lectus. Nam at odio ut velit bibendum dapibus. Nulla tempus, arcu sed hendrerit feugiat, augue lacus fermentum orci, at elementum eros ipsum sit amet augue.
"""
text  = text_content

# regex pattern to match pairs of prompts and responses
pattern = r"(.+?)\.?\s*(?:(?:And|but|or)\s+)?(?:however|meanwhile|therefore|moreover|in addition|on the other hand|by contrast|likewise|accordingly|conversely|in fact|otherwise)?\s*(.+?\.|.+)$"

# find all pairs of prompts and responses in the input text
matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)

# create a CSV file to store prompts and responses
with open('data_confluence.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['prompt', 'response'])

    # write each pair of prompts and responses to the CSV file
    for match in matches:
        prompt, response = match
        writer.writerow([prompt.strip(), response.strip()])

In [32]:
data_file = [{
    "prompt": "mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
},{
    "prompt":"mickey ->",
    "completion": " nikunj.\n"
}
]

In [15]:
# curl \
#    -X POST \
#    -H "Content-Type: application/json" \
#    -H "Authorization: Bearer 1356da62-552d-4cec-a8e7-e4449f9d4ec3" \
#    --data '{ "query": "{ transcript(id:\"X0n3OrREM4ulna0e\"){ title date sentences {text }} }" }' \
#    https://api.fireflies.ai/graphql/
    
    
def read_fireflies_data(transcript_id):
    # API endpoint
    url = "https://api.fireflies.ai/graphql/"

    # Your API key
    api_key = "1356da62-552d-4cec-a8e7-e4449f9d4ec3"

    # Construct the request payload
    payload = {"query": '{transcript(id:"%s"){ title date sentences {text }} }' %transcript_id }

    # Set the headers and authentication for the request
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Send the request and get the response
    response = requests.post(url, headers=headers, data=json.dumps(payload))
    transcript = ' '.join([elem['text'] for elem in response.json()['data']['transcript']['sentences']])
    return transcript
    
print(read_fireflies_data("X0n3OrREM4ulna0e"))


Hello. Hi Nikunjow. Are you? Hi, Nachi Hum, I'm good. How are you? How do you pronounce the name? Yeah. Actually. Yeah, Nachi that's right. Okay. Yeah, very nice to meet you. How are you doing? Yeah, good. Awesome. We based on the Bay Area. I'm in the Bay Area, where are you? I am generally based in the Bay Area, San Francisco and I'm right now, I'm traveling to India. So a lot of my teams here, some meeting them here in Bangalore oh, how long? I don't get to go too, often. Actually it's pretty rare. Okay. I went once left here and that was a first trip in about a decade. How? Wow, okay, I see you were asking sorry I think I got you off. Oh, I was just asking how long you're going to be there for. So I expect so this two possibilities either I can come back to the bay coming Monday. So like just three days from now or I might be here like four to five weeks and then come back to bay. So one of the two Okay, great. Yeah. Where the day do you live? San Francisco Main. So I live on Market

In [11]:
import requests
import json


def generate_response_given_text(context):
    # API endpoint
    url = "https://api.openai.com/v1/chat/completions"

    # Your API key
    api_key = "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"

    # The number of responses to generate
    n = 1

    # Messages
    prompt = 'Break down the context below to sentences and from each of them generate the relevant questions and answers in a list of well formatted jsons with exactly two keys and no new line or space characters. First key being a prompt which should have the generated question and second key should be a completion with the answer to that question. The data would look something like this- [{"prompt":"question1","completion":"answer1"},{"prompt":"question2","completion":"answer2"}]. Here is the context- '
    text_content = prompt + context
    messages= [{"role": "user", "content": text_content}]


    # Model
    model = "gpt-3.5-turbo"

    # Construct the request payload
    payload = {
        "model": model,
        "messages": messages,
        "n": n
    }

    # Set the headers and authentication for the request
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Send the request and get the response
    response = requests.post(url, headers=headers, data=json.dumps(payload))

    # Print the response
    generated_response = response.json()
    print(generated_response)
    return generated_response


with open("content.txt", 'r') as f:
    content = f.read()
    data = generate_response_given_text(content)
    prompt_completion_pairs = json.loads(data['choices'][0]['message']['content'].replace('\n', ''))
    print(prompt_completion_pairs)


{'id': 'chatcmpl-6v21VE6ovsSIM7KF6AJtbEaM8MxSn', 'object': 'chat.completion', 'created': 1679050493, 'model': 'gpt-3.5-turbo-0301', 'usage': {'prompt_tokens': 505, 'completion_tokens': 276, 'total_tokens': 781}, 'choices': [{'message': {'role': 'assistant', 'content': '\n\n[{"prompt":"Why is ChatGPT not enough in itself for these vertical applications?","completion":"Models like ChatGPT are trained on massive dataset but all that available on the clear web. So you can’t ask it a question whose answer depends on anything on deep web like your email or private docs."},{"prompt":"What is the difference between clear web and deep web?","completion":"Clear Web- publicly accessible web pages indexed by search engines. e.g. wikipedia, books, social media posts. Deep Web- part of internet that is behind an authentication system. e.g your email, or SaaS platforms. This constitutes 96% of the web."},{"prompt":"What is fine-tuning and how is it used in the context of ChatGPT?","completion":"Fine-