<a href="https://colab.research.google.com/github/maimonahST/CCSIT_Infobot/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CCSIT Infobot

Use the [OpenAI fine-tuning guide](https://platform.openai.com/docs/guides/fine-tuning) as a reference to complete this project.

## Data preparation and analysis for chat model fine-tuning
This section checks the dataset for format errors, provides basic statistics, and estimates token counts for fine-tuning costs.

### Load the training file

In [None]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [None]:
data_path = "dataset.jsonl"

# Load the dataset: one JSON object per line (JSONL).
# Blank or whitespace-only lines are skipped so that a stray empty line in the
# file does not abort the whole load with a JSONDecodeError.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Initial dataset stats
print("Num examples:", len(dataset))
if dataset:
    # Only preview the first example when there is one; indexing an empty
    # list would raise IndexError.
    print("First example:")
    for message in dataset[0]["messages"]:
        print(message)

Num examples: 245
First example:
{'role': 'system', 'content': 'You are CCSIT infobot '}
{'role': 'user', 'content': 'Tell me about Imam Abdulrahman Bin Faisal University.'}
{'role': 'assistant', 'content': 'Imam Abdulrahman Bin Faisal University opened its doors to women and men in 1975 with two pioneering colleges - the College of Medicine and the College of Architecture.'}


### Format validation

In [None]:
# Validate the structure of every training example and tally problems by category.
format_errors = defaultdict(int)

for idx, ex in enumerate(dataset):
    # A record that is not a dict cannot be inspected any further.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        # A missing, None, or empty "messages" value all land here.
        format_errors["missing_messages_list"] += 1
        continue

    for message_idx, message in enumerate(messages):
        # Both mandatory keys must be present.
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # Every key must be one of the four recognised ones.
        if not all(key in ("role", "content", "name", "function_call") for key in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role", None)
        if role not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            print(f"Unrecognized role in example {idx}, message {message_idx}")

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # Flag the message when content is not a string, or when both content
        # and function_call are empty/absent.
        if not isinstance(content, str) or not (content or function_call):
            format_errors["missing_content"] += 1
            print(f"missing content message in example {idx}")

    # Each example needs at least one assistant turn.
    roles_in_example = [entry.get("role", None) for entry in messages]
    if "assistant" not in roles_in_example:
        format_errors["example_missing_assistant_message"] += 1
        print(f"Missing assistant message in example {idx}")

# Report: either the per-category counts or a clean bill of health.
if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in format_errors.items():
        print(f"{error_name}: {count}")


No errors found


### Token Counting Utilities

In [None]:
# Tokenizer shared by the token-counting helpers in this cell.
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Return the token count for one conversation.

    Args:
        messages: iterable of message dicts. Every value in each dict is passed
            to ``encoding.encode``, so values are expected to be strings.
        tokens_per_message: fixed overhead added once per message.
        tokens_per_name: extra overhead added when a message has a "name" key.

    Returns:
        Encoded length of all message values, plus the per-message and
        per-name overheads, plus a constant 3 added once per conversation.
    """
    total = 0
    for msg in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(text)) for text in msg.values())
        if "name" in msg:
            total += tokens_per_name
    return total + 3

def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of the assistant messages' content.

    Only messages whose "role" is "assistant" contribute; their "content" is
    tokenized with the module-level ``encoding``. Returns 0 when there are none.
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics for a sequence of numbers.

    Args:
        values: non-empty sequence of numeric values.
        name: label printed in the heading line.

    Prints the min/max, mean/median, and the 5th/95th percentiles.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The label promises the 5th and 95th percentiles, so request the 0.05 and
    # 0.95 quantiles (the previous 0.1 / 0.9 actually reported p10 / p90).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

In [None]:
# Per-example warnings and token statistics
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]

    # An example is "missing" a role when none of its messages carries it.
    n_missing_system += int(all(message["role"] != "system" for message in messages))
    n_missing_user += int(all(message["role"] != "user" for message in messages))

    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)

# Summarise each collected series in turn.
for series, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(series, label)

# Count the conversations whose token total exceeds 4096.
n_too_long = len([length for length in convo_lens if length > 4096])
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 39, 432
mean / median: 89.4530612244898, 83.0
p5 / p95: 63.0, 114.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 8, 389
mean / median: 54.608163265306125, 49.0
p5 / p95: 28.0, 76.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


### Cost Estimation

In [None]:
# Billing-token estimate and the default number of epochs
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)
examples_at_target_epochs = n_train_examples * TARGET_EPOCHS

# Keep TARGET_EPOCHS unless the examples seen over that many epochs fall
# outside [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES]; then scale the epoch
# count, bounded by the default epoch limits.
if examples_at_target_epochs < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_at_target_epochs > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example contributes at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = 0
for length in convo_lens:
    n_billing_tokens_in_dataset += min(MAX_TOKENS_PER_EXAMPLE, length)

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~21916 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~65748 tokens


## Upload a training file


In [None]:
!pip3 install openai

Collecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m41.0/77.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0mSuccessfully installed openai-0.28.1


In [None]:
import os
from getpass import getpass

import openai

# Never store the API key in the notebook: read it from the OPENAI_API_KEY
# environment variable, or prompt for it interactively when it is not set.
# NOTE: a key was previously committed in this cell; it should be revoked and replaced.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:

# Upload the training data for fine-tuning. The context manager closes the
# file handle once the upload call returns (or raises).
with open("dataset.jsonl", "rb") as training_file:
    upload_response = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )

In [None]:
# Keep the ID from the upload response; it is passed as the training file
# when the fine-tuning job is created. The bare expression displays it.
file_id = upload_response.id
file_id

'file-oslcAaIGlmzqflruENJaJDbl'

Uploaded training file: `file_id = "file-oslcAaIGlmzqflruENJaJDbl"`

## Create a fine-tuned model

In [None]:
# Start a fine-tuning job on the uploaded training file, using
# gpt-3.5-turbo as the base model.
job = openai.FineTuningJob.create(training_file=file_id, model="gpt-3.5-turbo")

# Keep the job ID so the job can be looked up in later cells.
job_id = job.id

In [None]:
# Display the fine-tuning job ID.
job_id

'ftjob-lrbo5SsbJkPOTZBkCbOkqkbW'

Fine-tuning job: `job_id = "ftjob-lrbo5SsbJkPOTZBkCbOkqkbW"`

In [None]:
# List fine-tuning jobs, requesting at most 10 (limit=10); the returned
# list object is displayed as the cell output.
openai.FineTuningJob.list(limit=10)


<OpenAIObject list at 0x7fa7e4336570> JSON: {
  "object": "list",
  "data": [
    {
      "object": "fine_tuning.job",
      "id": "ftjob-lrbo5SsbJkPOTZBkCbOkqkbW",
      "model": "gpt-3.5-turbo-0613",
      "created_at": 1698426471,
      "finished_at": null,
      "fine_tuned_model": null,
      "organization_id": "org-HSbO0AHQitnU5PUzeD33jzWV",
      "result_files": [],
      "status": "running",
      "validation_file": null,
      "training_file": "file-oslcAaIGlmzqflruENJaJDbl",
      "hyperparameters": {
        "n_epochs": 3
      },
      "trained_tokens": null,
      "error": null
    },
    {
      "object": "fine_tuning.job",
      "id": "ftjob-yPDyIjz59ojF55CnXEg5lgyD",
      "model": "gpt-3.5-turbo-0613",
      "created_at": 1698426461,
      "finished_at": null,
      "fine_tuned_model": null,
      "organization_id": "org-HSbO0AHQitnU5PUzeD33jzWV",
      "result_files": [],
      "status": "running",
      "validation_file": null,
      "training_file": "file-oslcAaIGlm

In [None]:
# Fine-tuning job ID, hardcoded to the value shown when the job was created.
# NOTE(review): presumably set here so the status cell can run in a new
# session without re-creating the job — confirm.
job_id = "ftjob-lrbo5SsbJkPOTZBkCbOkqkbW"

In [None]:
# Retrieve the current state of the fine-tuning job identified by job_id;
# the returned job object is displayed as the cell output.
openai.FineTuningJob.retrieve(job_id)

<FineTuningJob fine_tuning.job id=ftjob-lrbo5SsbJkPOTZBkCbOkqkbW at 0x7eedae743bf0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-lrbo5SsbJkPOTZBkCbOkqkbW",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1698426471,
  "finished_at": 1698428014,
  "fine_tuned_model": "ft:gpt-3.5-turbo-0613:personal::8EL0BRBX",
  "organization_id": "org-HSbO0AHQitnU5PUzeD33jzWV",
  "result_files": [
    "file-wpojZqKIn48ii6tCoXyIKGnU"
  ],
  "status": "succeeded",
  "validation_file": null,
  "training_file": "file-oslcAaIGlmzqflruENJaJDbl",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": 64278,
  "error": null
}

### Use a fine-tuned model

In [None]:
# Ask the fine-tuned model about a faculty member.
# NOTE(review): the first training example uses the system prompt
# 'You are CCSIT infobot ' — confirm whether inference should reuse it.
faculty_query_messages = [
    {"role": "system", "content": "You are CCSIT infobot which provide facts about ccsit"},
    {"role": "user", "content": "tell me about Dr. Yasser Abdullah Alahmadi?"},
]

completion = openai.ChatCompletion.create(
    model="ft:gpt-3.5-turbo-0613:personal::8EL0BRBX",
    messages=faculty_query_messages,
)

# Show the message of the first returned choice.
print(completion.choices[0].message)

{
  "role": "assistant",
  "content": "Dr. Yasser Abdullah Alahmadi is an assistant Professor in the College of Computer Science and Information Technology. He is associated with the Department of Computer Engineering. You can reach him via phone at 00966-13-333-2012 or email at yalahmadi@iau.edu.sa."
}


In [None]:
# Ask the fine-tuned model what the Aramco chair offers.
# NOTE(review): the first training example uses the system prompt
# 'You are CCSIT infobot ' — confirm whether inference should reuse it.
aramco_query_messages = [
    {"role": "system", "content": "You are CCSIT infobot which provide facts about ccsit "},
    {"role": "user", "content": "what aramco chair offer? "},
]

completion = openai.ChatCompletion.create(
    model="ft:gpt-3.5-turbo-0613:personal::8EL0BRBX",
    messages=aramco_query_messages,
)

# Show the message of the first returned choice.
print(completion.choices[0].message)

{
  "role": "assistant",
  "content": "The Aramco Chair in Computer Science offers scholarships and research grants for faculty members, as well as funding for research projects, scientific workshops, and conferences. It also supports various academic activities and initiatives in the field of computer science."
}


### Analyzing your fine-tuned model