In [1]:
pip install openai==0.28




In [2]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [20]:
#Data preparation and analysis for chat model fine-tuning

import tiktoken # for token counting
import numpy as np
import pandas as pd
from collections import defaultdict

In [4]:
import json
import os
import openai

#Set your OPENAI API KEY

openai.api_key = ""

#initialize the client

import openai
openai.api_key = os.environ['OPENAI_API_KEY']


#fetching the intents from the file folder in order to fine-tune them
file_path = 'C:/Users/Keziah/firstaidintents/intents.json'
    
#Load intents from the JSON file
with open(file_path, 'r') as file:
    intents = json.load(file)

# Now, you have your intents data loaded from the JSON file and stored in the `intents` variable
# You can proceed with the transformation process as mentioned earlier

# Create a new dictionary to wrap the intents
wrapped_data = {"intents": []}

# Iterate through each intent in the old data
for intent in intents["intents"]:  # Access the "intents" list within the loaded data
    # Wrap the intent in a dictionary
    wrapped_intent = {
        "tag": intent["tag"],
        "patterns": intent["patterns"],
        "responses": intent["responses"],
        "context_set": ""
    }
    # Append the wrapped intent to the new wrapped_data
    wrapped_data["intents"].append(wrapped_intent)

# Save the corrected data back to the file
with open('json_intents_corrected.json', 'w') as file:
    json.dump(wrapped_data, file, indent=4)

print("Data corrected and saved successfully.")


Data corrected and saved successfully.


In [5]:
# Load the corrected JSON file containing intents data
with open('json_intents_corrected.json', 'r') as file:
    intents_data = json.load(file)

# Transform intents to the messages format
formatted_intents = []

for intent in intents_data['intents']:
    messages = []
    
    # Add system message
    messages.append({"role": "system", "content": "FirstAidBot is a medical chatbot that assists with first aid questions."})
    
    # Add user messages
    for pattern in intent["patterns"]:
        messages.append({"role": "user", "content": pattern})
    
    # Add assistant response
    for response in intent["responses"]:
        messages.append({"role": "assistant", "content": response})
    
    # Add messages to formatted intents
    formatted_intents.append({"messages": messages})
    
# Convert formatted intents to JSON
json_intents = json.dumps(formatted_intents, indent=2)

# Print the JSON formatted intents
print(json_intents)

# Save the JSON intents data into a file
with open('json_intents.json', 'w') as file:
    file.write(json_intents)




[
  {
    "messages": [
      {
        "role": "system",
        "content": "FirstAidBot is a medical chatbot that assists with first aid questions."
      },
      {
        "role": "user",
        "content": "What to do if Cuts?"
      },
      {
        "role": "user",
        "content": "How to cure Cuts?"
      },
      {
        "role": "user",
        "content": "Which medicine to apply for Cuts?"
      },
      {
        "role": "user",
        "content": "what to apply on cuts?"
      },
      {
        "role": "user",
        "content": "Cuts"
      },
      {
        "role": "assistant",
        "content": "Wash the cut properly to prevent infection and stop the bleeding by applying pressure for 1-2minutes until bleeding stops. Apply Petroleum Jelly to make sure that the wound is moist for quick healing. Finally cover the cut with a sterile bandage. Pain relievers such as acetaminophen can be applied."
      }
    ]
  },
  {
    "messages": [
      {
        "role": "system

In [6]:
#Initialise an empty list to store training data
training_data = []

# Iterate over each intent in the JSON intents data
for intent in intents_data['intents']:
    #Extract the tag, patterns and responses from the current intent
    tag = intent["tag"]
    patterns = intent["patterns"]
    responses = intent["responses"]
    
    # Iterate over patterns and corresponding responses
    for pattern, response in zip(patterns, responses):
        # Construct prompt-completion pair by prefixing user to the pattern and response to the completion
        prompt = f"User: {pattern}\nAI:"
        completion = response
        
        # Add prompt-completion pair to the training data list
        training_data.append({"prompt": prompt, "completion": completion})

# Print the constructed training data
for data in training_data:
    print("Prompt:", data["prompt"])
    print("Completion:", data["completion"])
    print()  # Add a blank line for readability

Prompt: User: What to do if Cuts?
AI:
Completion: Wash the cut properly to prevent infection and stop the bleeding by applying pressure for 1-2minutes until bleeding stops. Apply Petroleum Jelly to make sure that the wound is moist for quick healing. Finally cover the cut with a sterile bandage. Pain relievers such as acetaminophen can be applied.

Prompt: User: how do you treat abrasions?
AI:
Completion: Begin with washed hands.Gently clean the area with cool to lukewarm water and mild soap. Remove dirt or other particles from the wound using sterilized tweezers.For a mild scrape thatâ€™s not bleeding, leave the wound uncovered.If the wound is bleeding, use a clean cloth or bandage, and apply gentle pressure to the area to stop any bleeding.Cover a wound that bled with a thin layer of topical antibiotic ointment, like Bacitracin, or a sterile moisture barrier ointment, like Aquaphor. Cover it with a clean bandage or gauze. Gently clean the wound and change the ointment and bandage onc

In [7]:
import json
from collections import defaultdict

# Load corrected intents data from the JSON file
with open('json_intents_corrected.json', 'r') as file:
    intents_data = json.load(file)
    
# Assuming the corrected data has the same structure as the original training data,
# you can replace training_data with intents_data
training_data = intents_data


# Creating a defaultdict to store the count of format errors
format_errors = defaultdict(int)

# Iterating over each example in the dataset
for ex in training_data:
    # Check if the example is a dictionary
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    # Extracting the list of messages from the example
    messages = ex.get("messages", None)
    
    # Check if the messages list is missing or empty
    if not messages or not isinstance(messages, list):  # Ensure messages is a list
        format_errors["missing_messages_list"] += 1
        continue
        
    # Iterating over each message in the list
    for message in messages:
        # Check if the message dictionary has 'role' and 'content' keys
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        # Check if the message contains unrecognized keys
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        # Check if the role in the message is recognized
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        # Extracting content and function_call from the message
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        # Check if content is missing or not a string
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    # Check if there is at least one message with role 'assistant'
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

# Checking if any format errors were found
if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

Found errors:
data_type: 1


In [8]:
from collections import defaultdict

# Load corrected intents data from the JSON file
with open('json_intents_corrected.json', 'r') as file:
    intents_data = json.load(file)
    
# Assuming the corrected data has the same structure as the original training data,
# you can replace training_data with intents_data
training_data = intents_data

# Creating a defaultdict to store the count of format errors
format_errors = defaultdict(int)

# Iterating over each example in the dataset
for index, ex in enumerate(training_data):
    # Check if the example is a dictionary
    if not isinstance(ex, dict):
        print(f"Example {index + 1} is not a dictionary:", ex)  # Debugging statement
        format_errors["data_type"] += 1
        continue
        
    # Extracting the list of messages from the example
    messages = ex.get("messages", None)
    
    # Check if the messages list is missing or empty
    if not messages or not isinstance(messages, list):  # Ensure messages is a list
        format_errors["missing_messages_list"] += 1
        continue
        
    # Iterating over each message in the list
    for message_index, message in enumerate(messages):
        # Check if the message dictionary has 'role' and 'content' keys
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        # Check if the message contains unrecognized keys
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        # Check if the role in the message is recognized
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        # Extracting content and function_call from the message
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        # Check if content is missing or not a string
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    # Check if there is at least one message with role 'assistant'
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

# Checking if any format errors were found
if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")


Example 1 is not a dictionary: intents
Found errors:
data_type: 1


In [9]:
# Load corrected intents data from the JSON file
with open('json_intents_corrected.json', 'r') as file:
    intents_data = json.load(file)
    
# Extract list of intents
intents_list = intents_data.get("intents", [])

# Replace training data
training_data = intents_list

# Creating a defaultdict to store the count of format errors
format_errors = defaultdict(int)

# Iterating over each example in the dataset
for index, ex in enumerate(training_data):
    # Check if the example is a dictionary
    if not isinstance(ex, dict):
        format_errors[f"Example {index + 1}"] += 1
        continue
        
    # Extracting the list of messages from the example
    messages = ex.get("messages", None)
    
    # Check if the messages list is missing or empty
    if not messages or not isinstance(messages, list):  # Ensure messages is a list
        format_errors[f"Example {index + 1}"] += 1
        continue
        
    # Iterating over each message in the list
    for message in messages:
        # Check if the message dictionary has 'role' and 'content' keys
        if "role" not in message or "content" not in message:
            format_errors[f"Example {index + 1}"] += 1
        
        # Check if the message contains unrecognized keys
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors[f"Example {index + 1}"] += 1
        
        # Check if the role in the message is recognized
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors[f"Example {index + 1}"] += 1
            
        # Extracting content and function_call from the message
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        # Check if content is missing or not a string
        if (not content and not function_call) or not isinstance(content, str):
            format_errors[f"Example {index + 1}"] += 1
    
    # Check if there is at least one message with role 'assistant'
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors[f"Example {index + 1}"] += 1

# Checking if any format errors were found
if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")


Found errors:
Example 1: 1
Example 2: 1
Example 3: 1
Example 4: 1
Example 5: 1
Example 6: 1
Example 7: 1
Example 8: 1
Example 9: 1
Example 10: 1
Example 11: 1
Example 12: 1
Example 13: 1
Example 14: 1
Example 15: 1
Example 16: 1
Example 17: 1
Example 18: 1
Example 19: 1
Example 20: 1
Example 21: 1
Example 22: 1
Example 23: 1
Example 24: 1
Example 25: 1
Example 26: 1
Example 27: 1
Example 28: 1
Example 29: 1
Example 30: 1
Example 31: 1
Example 32: 1
Example 33: 1
Example 34: 1
Example 35: 1
Example 36: 1
Example 37: 1
Example 38: 1
Example 39: 1
Example 40: 1
Example 41: 1
Example 42: 1


In [10]:
import json

# Load the JSON file containing converted intents
with open('json_intents_corrected.json', 'r') as file:
    training_data = json.load(file)

# Check the type of training_data
print(type(training_data))  # Add this line

# Proceed with the rest of your code


<class 'dict'>


In [11]:
import json
from sklearn.model_selection import train_test_split

# Load the JSON file containing converted intents
with open('json_intents_corrected.json', 'r') as file:
    intents_data = json.load(file)


# Split the dataset into features (X) and labels (y)
X = [example['patterns'] for example in intents_data['intents']]
y = [example['tag'] for example in intents_data['intents']]

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Combine features and labels for training and validation datasets
training_dataset = [{'input': input_text, 'output': output} for input_text, output in zip(X_train, y_train)]
validation_dataset = [{'input': input_text, 'output': output} for input_text, output in zip(X_val, y_val)]

# Save the training and validation datasets to JSON files
with open('training_dataset.json', 'w') as file:
    json.dump(training_dataset, file, indent=4)

with open('validation_dataset.json', 'w') as file:
    json.dump(validation_dataset, file, indent=4)

print("Training and validation datasets created and saved successfully.")


Training and validation datasets created and saved successfully.


In [12]:
import openai
import json

# Load your training and validation datasets from JSON files
with open('training_dataset.json', 'r') as file:
    training_dataset = json.load(file)

with open('validation_dataset.json', 'r') as file:
    validation_dataset = json.load(file)

# Prepare data for OpenAI API (example)
training_data = [{"prompt": example['input'], "completion": example['output']} for example in training_dataset]
validation_data = [{"prompt": example['input'], "completion": example['output']} for example in validation_dataset]

# Set up your OpenAI API key
openai.api_key = ''

# Submit data to the API for training
response = openai.Completion.create(
    engine="gpt-3.5-turbo",  # Specify the model/engine you want to use
    examples=training_data,      # Provide your training data
    validation_set=validation_data,  # Provide your validation data
    # Other parameters as needed
)

# Handle the response from the API
print(response)


RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [13]:
encoding = tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

encoding.encode("tiktoken is great!")



[83, 1609, 5963, 374, 2294, 0]

In [14]:
#Count tokens by counting the length of the list returned by .encode().

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
num_tokens_from_string("tiktoken is great!", "cl100k_base")

6

In [17]:
#Token Counting Utilities

encoding = tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [None]:
#Data warnings and token counts
#With some lightweight analysis we can identify potential issues in the dataset,
#like missing messages, and provide statistical insights into message and token counts.

# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in training_data:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

In [18]:
#Cost Estimation

#In this final section, we estimate the total number of tokens that will be used for fine-tuning, which allows us to approximate the cost.
#It is worth noting that the duration of the fine-tuning jobs will also increase with the token count.

convo_lens = [100, 200, 150, 180, 220]

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(training_data)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")


Dataset has ~850 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~2550 tokens


In [21]:
#OPEN AI Script to validate the data and provide cost estimation

# Next, we specify the data path and open the JSONL file

# Load your training and validation datasets from JSON files
with open('training_dataset.json', 'r') as file:
    training_dataset = json.load(file)

with open('validation_dataset.json', 'r') as file:
    validation_dataset = json.load(file)

# Format training and validation data for OpenAI API
training_data = [{"prompt": example['input'], "completion": example['output']} for example in training_dataset]
validation_data = [{"prompt": example['input'], "completion": example['output']} for example in validation_dataset]


# Load dataset
training_set = []
with open('training_dataset.json', 'r') as file:
    for line in file:
        training_set.append(json.loads(line))

validation_set = []
with open('validation_dataset.json', 'r') as file:
    for line in file:
        validation_set.append(json.loads(line))

# We can inspect the data quickly by checking the number of examples and the first item

# Initial dataset stats
print("Training Set Examples:", len(training_set))
print("First example in Training Set:")
for message in training_set[0]["messages"]:
    print(message)

print("\nValidation Set Examples:", len(validation_set))
print("First example in Validation Set:")
for message in validation_set[0]["messages"]:
    print(message)

# Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the Chat completions message structure

# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        if not content or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

# Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

# Token counting functions
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

# Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 3
MIN_EPOCHS = 1
MAX_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print("See pricing page to estimate total costs")
print(f"Cost: ~ {round(n_epochs * (n_billing_tokens_in_dataset/1000) * 0.008, 2)} $")

JSONDecodeError: Expecting value: line 2 column 1 (char 2)