# Part 1

In [1]:
!pip install --upgrade openai --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.5/327.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m797.9 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Data loading
We first load the chat dataset from an example JSONL file.
[link text](https://github.com/openai/openai-cookbook/blob/main/examples/data/toy_chat_fine_tuning.jsonl)

In [5]:
data_path="toy_chat_fine_tuning.jsonl"

In [2]:
!pip install tiktoken --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

# Load the dataset

In [6]:
with open(data_path,'r',encoding='utf-8') as f:
  dataset=[json.loads(line) for line in f]



# Initial dataset stats


In [8]:
print("Num examples",len(dataset))
print("1st example:")
for message in dataset[0]["messages"]:
  print(message)

Num examples 5
1st example:
{'role': 'system', 'content': 'You are a happy assistant that puts a positive spin on everything.'}
{'role': 'user', 'content': 'I fell off my bike today.'}
{'role': 'assistant', 'content': "It's great that you're getting exercise outdoors!"}


# Format error checks


In [9]:
format_errors_list=defaultdict(int)

for ex in dataset:
  if not isinstance(ex,dict):
    format_errors_list["data_type"]+=1
    continue
  messages=ex.get("messages",None)
  if not messages:
    format_errors_list["missing_message"]+=1
    continue
  for message in messages:
    if "role" not in message or "content" not in message:
      format_errors_list["message_key_missing"]+=1

    if any(k not in("role","content","name","function_call","weight") for k in message):
      format_errors_list["message_key_unrcognized"]+=1

    if message.get("role",None) not in ("system","user","assistant","function"):
      format_errors_list["unrecognized_role"]+=1

    content=message.get("content",None)
    function_call=message.get("function_call",None)

    if(not content and not function_call) or not isinstance(content,str):
      format_errors_list["missing_content"]+=1

  if not any(message.get("role",None)=="assistant" for message in messages):
    # print(message.get("role",None))
    format_errors_list["missing_assitant_message"]+=1


if format_errors_list:
  print("Found errors in data")
  for k,v in format_errors_list.items():
   print(f"{k}:{v}")
else:
  print("No errors found")



No errors found


# Part 2

#Token Counting Utilities
Lets define a few helpful utilities to be used in the rest of the notebook.



In [10]:
encoding=tiktoken.get_encoding("cl100k_base")


### not exact!


In [11]:
def count_token_from_message(messages,token_per_message=3,tokens_per_name=1):
  count_tokens=0;
  for message in messages:
    count_tokens+=token_per_message
    for key,value in message.items():
      count_tokens+=len(encoding.encode(value))
      if key=="name":
        count_tokens+=tokens_per_name
  count_tokens+=3
  return count_tokens

def count_assitant_tokens_from_message(messages):
  count_tokens=0
  for message in messages:
    if message["role"]=="assistant":
      count_tokens+=len(encoding.encode(message["content"]))
  return count_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

#Count messages token

In [12]:
count_message_tokens=0
for ex in dataset:
  messages=ex.get("messages",None)
  count_message_tokens+=count_token_from_message(messages)

print(f"Total message token {count_message_tokens}")



Total message token 8242


# Count assistant token

In [13]:
count_assistant_tokens=0
for ex in dataset:
  messages=ex.get("messages",None)
  count_assistant_tokens+=count_assitant_tokens_from_message(messages)

print(f"Total assitant  token {count_assistant_tokens}")



Total assitant  token 8051


# Data Warnings and Token Counts

In [14]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(count_token_from_message(messages))
    assistant_message_lens.append(count_assitant_tokens_from_message(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 1
Num examples missing user message: 1

#### Distribution of num_messages_per_example:
min / max: 2, 9
mean / median: 3.8, 3.0
p5 / p95: 2.0, 6.6000000000000005

#### Distribution of num_total_tokens_per_example:
min / max: 26, 8032
mean / median: 1648.4, 45.0
p5 / p95: 26.8, 4863.6

#### Distribution of num_assistant_tokens_per_example:
min / max: 4, 8000
mean / median: 1610.2, 10.0
p5 / p95: 6.0, 4811.200000000001

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning


# Part3

# Cost estimation

In [15]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs=TARGET_EPOCHS
n_train_examples=len(dataset)

if n_train_examples * TARGET_EPOCHS<MIN_TARGET_EXAMPLES:
  n_epochs=min(MAX_DEFAULT_EPOCHS,MIN_TARGET_EXAMPLES//n_train_examples)
elif n_train_examples*TARGET_EPOCHS >MAX_TARGET_EXAMPLES:
  n_epochs=max(MIN_DEFAULT_EPOCHS,MAX_TARGET_EXAMPLES//n_train_examples)

n_billing_tokens_in_dataset=sum(min(MAX_TOKENS_PER_EXAMPLE,length) for length in convo_lens)

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~8242 tokens that will be charged for during training
By default, you'll train for 20 epochs on this dataset
By default, you'll be charged for ~164840 tokens
