<a href="https://colab.research.google.com/github/maimonahST/CCSIT_Infobot/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CCSIT Infobot

use this webpage as a referense to complete the project https://platform.openai.com/docs/guides/fine-tuning

## Data preparation and analysis for chat model fine-tuning
 checks for format errors, provides basic statistics, and estimates token counts for fine-tuning costs.

### Upload a training file

In [1]:
!pip install tiktoken
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
train_data_path="/content/drive/MyDrive/Dataset/train_data.jsonl"
validation_data_path="/content/drive/MyDrive/Dataset/validation_data.jsonl"
test_data_path="/content/drive/MyDrive/Dataset/test_data.jsonl"

# Load the train data
with open(train_data_path, 'r', encoding='utf-8') as f:
    train_data = [json.loads(line) for line in f]

# Load the validation data
with open(validation_data_path, 'r', encoding='utf-8') as f:
    validation_data = [json.loads(line) for line in f]

# Load the test data
with open(test_data_path, 'r', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f]

# Initial data stats
print("Train num examples:", len(train_data))
print("Validation num examples:", len(validation_data))
print("Test num examples:", len(test_data))

print("First example in train data:")
for message in train_data[0]["messages"]:
    print(message)

print("First example in validation data:")
for message in validation_data[0]["messages"]:
    print(message)

print("First example in test data:")
for message in test_data[0]["messages"]:
    print(message)

Train num examples: 178
Validation num examples: 23
Test num examples: 44
First example in train data:
{'role': 'system', 'content': 'You are CCSIT infobot '}
{'role': 'user', 'content': 'Tell me about Imam Abdulrahman Bin Faisal University.'}
{'role': 'assistant', 'content': 'Imam Abdulrahman Bin Faisal University opened its doors to women and men in 1975 with two pioneering colleges - the College of Medicine and the College of Architecture.'}
First example in validation data:
{'role': 'system', 'content': 'You are CCSIT infobot '}
{'role': 'user', 'content': 'Can you provide more details about the Master of Science (MS) degrees offered at CCSIT?'}
{'role': 'assistant', 'content': 'Certainly! The Master of Science (MS) degrees at CCSIT include special majors such as Big Data and Cloud Computing, Information Security, and Software Engineering. These programs are designed to provide in-depth knowledge in these areas.'}
First example in test data:
{'role': 'system', 'content': 'You are C

### Format validation

In [4]:
# Format error checks
format_errors = defaultdict(int)

def error_check(dataset):
  for idx, ex in enumerate(dataset):
      if not isinstance(ex, dict):
          format_errors["data_type"] += 1
          continue

      messages = ex.get("messages", None)
      if not messages:
          format_errors["missing_messages_list"] += 1
          continue

      for message_idx, message in enumerate(messages):
          if "role" not in message or "content" not in message:
              format_errors["message_missing_key"] += 1

          if any(k not in ("role", "content", "name", "function_call") for k in message):
              format_errors["message_unrecognized_key"] += 1

          if message.get("role", None) not in ("system", "user", "assistant", "function"):
              format_errors["unrecognized_role"] += 1
              print(f"Unrecognized role in example {idx}, message {message_idx}")

          content = message.get("content", None)
          function_call = message.get("function_call", None)

          if (not content and not function_call) or not isinstance(content, str):
              format_errors["missing_content"] += 1
              print(f"missing content message in example {idx}")

      if not any(message.get("role", None) == "assistant" for message in messages):
          format_errors["example_missing_assistant_message"] += 1
          print(f"Missing assistant message in example {idx}")

  if format_errors:
      print("Found errors:")
      for k, v in format_errors.items():
          print(f"{k}: {v}")
  else:
      print("No errors found")


In [5]:
# Check errors in train data
error_check(train_data)

No errors found


In [6]:
# Check errors in validation data
error_check(validation_data)

No errors found


In [7]:
# Check errors in test data
error_check(test_data)

No errors found


### Token Counting Utilities

In [8]:
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [9]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in train_data:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 39, 432
mean / median: 87.64044943820225, 80.5
p5 / p95: 63.0, 113.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 8, 389
mean / median: 52.92134831460674, 48.0
p5 / p95: 28.0, 75.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


### Cost Estimation

In [10]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(train_data)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~15600 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~46800 tokens


## Upload training, validation, and testing file


In [11]:
!pip3 install openai



In [7]:
import os
import openai
from openai import OpenAI
client=OpenAI(api_key = "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")

In [13]:
train_upload_response = openai.File.create(
  file=open("/content/drive/MyDrive/Dataset/train_data.jsonl", "rb"),
  purpose='fine-tune'
)

validation_upload_response = openai.File.create(
  file=open("/content/drive/MyDrive/Dataset/validation_data.jsonl", "rb"),
  purpose='fine-tune'
)

test_upload_response = openai.File.create(
  file=open("/content/drive/MyDrive/Dataset/test_data.jsonl", "rb"),
  purpose='fine-tune'
)

AttributeError: ignored

In [14]:
!curl https://api.openai.com/v1/files \  -H "Authorization: Bearer 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"

{
  "object": "list",
  "has_more": false,
  "data": [
    {
      "object": "file",
      "id": "file-DT9nu6KVchTz67lWD8rh9iDo",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 23407,
      "created_at": 1698860303,
      "status": "processed",
      "status_details": null
    },
    {
      "object": "file",
      "id": "file-N50kRtX450lD053IMAiUsC0k",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 86611,
      "created_at": 1698860302,
      "status": "processed",
      "status_details": null
    },
    {
      "object": "file",
      "id": "file-KwUnDD4gOnTzxlAfmXn13tpK",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 11148,
      "created_at": 1698860302,
      "status": "processed",
      "status_details": null
    }
  ]
}
curl: (3) URL using bad/illegal format or missing URL


In [10]:
# train_file_id = train_upload_response.id
# validation_file_id = validation_upload_response.id
# test_file_id = test_upload_response.id

train_file_id = "file-N50kRtX450lD053IMAiUsC0k"
validation_file_id = "file-KwUnDD4gOnTzxlAfmXn13tpK"
test_file_id = "file-DT9nu6KVchTz67lWD8rh9iDo"


print(train_file_id)
print(validation_file_id)
print(test_file_id)

file-N50kRtX450lD053IMAiUsC0k
file-KwUnDD4gOnTzxlAfmXn13tpK
file-DT9nu6KVchTz67lWD8rh9iDo


file_id = file-oslcAaIGlmzqflruENJaJDbl

## Create a fine-tuned model

In [13]:
job = client.fine_tuning.jobs.create(training_file=train_file_id, model="gpt-3.5-turbo", validation_file=validation_file_id)

job_id = job.id

BadRequestError: ignored

In [14]:
!curl https://api.openai.com/v1/fine_tuning/jobs \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer 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" \
  -d '{ "training_file": "file-N50kRtX450lD053IMAiUsC0k", "validation_file": "file-KwUnDD4gOnTzxlAfmXn13tpK", "model": "gpt-3.5-turbo" }'

{
  "error": {
    "message": "Fine-tuning jobs cannot be created on an Explore plan. You can upgrade to a paid plan on your billing page: https://platform.openai.com/account/billing/overview",
    "type": "invalid_request_error",
    "param": null,
    "code": "exceeded_quota"
  }
}

In [None]:
job_id

job_id = ftjob-lrbo5SsbJkPOTZBkCbOkqkbW

In [None]:
# List 10 fine-tuning jobs
openai.FineTuningJob.list(limit=10)


In [None]:
job_id = "ftjob-lrbo5SsbJkPOTZBkCbOkqkbW"

In [None]:
# Retrieve the state of a fine-tune
openai.FineTuningJob.retrieve(job_id)

### Use a fine-tuned model

In [None]:
completion = openai.ChatCompletion.create(
  model="ft:gpt-3.5-turbo-0613:personal::8EL0BRBX",
  messages=[
    {"role": "system", "content": "You are CCSIT infobot which provide facts about ccsit"},
    {"role": "user", "content": "tell me about Dr. Yasser Abdullah Alahmadi?"}
  ]
)
print(completion.choices[0].message)

In [None]:
completion = openai.ChatCompletion.create(
  model="ft:gpt-3.5-turbo-0613:personal::8EL0BRBX",
  messages=[
    {"role": "system", "content": "You are CCSIT infobot which provide facts about ccsit "},
    {"role": "user", "content": "what aramco chair offer? "}
  ]
)
print(completion.choices[0].message)

### Analyzing your fine-tuned model