In [48]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [2]:
train_tone_file_path = "fine-tuning_data/train_tone_ds1.jsonl"
validation_tone_file_path = "fine-tuning_data/validation_tone_ds.jsonl"
def validate_jsonl(file_path):
    try:
        with open(file_path, 'r') as file:
            for line in file:
                json.loads(line)
        print("Valid JSONL file")
    except json.JSONDecodeError as e:
        print(f"Invalid JSONL: {e}")
    except Exception as e:
        print(f"Error reading file: {e}")

validate_jsonl(train_tone_file_path)
validate_jsonl(validation_tone_file_path)

'''
https://jsonltools.com/jsonl-validator
'''

Valid JSONL file
Valid JSONL file


'\nhttps://jsonltools.com/jsonl-validator\n'

In [3]:
import jsonlines


data_path = train_tone_file_path

with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]
'''
with jsonlines.open(data_path) as reader:
    dataset = [line for line in reader]
'''
# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 50
First example:
{'role': 'system', 'content': "## RULES:\n- Use only the Vietnamese language in your response. Always refer to yourself using 'em' pronoun. Address the user based on how they refer to themselves . If their preferred address term cannot be determined from their self-reference, then base it on their provided <User gender>: use 'anh' for male, 'chị' for female. If the gender is unknown or not provided, use the polite neutral term 'anh/chị'."}
{'role': 'system', 'content': '## BASE KNOWLEDGE:\n- User gender: female'}
{'role': 'system', 'content': '## INSTRUCTIONS:\n You must ask the user for their contact information to provide better advice on their issue or policy question or the product they are interested in.'}
{'role': 'user', 'content': 'Chị muốn hỏi về chính sách bảo hành pin điện thoại mua ở shop mình.'}
{'role': 'assistant', 'content': 'Dạ chào chị. Pin điện thoại mua tại FPT Shop được bảo hành theo chính sách của hãng, từ 1 đến 12 tháng cho lỗi từ 

In [4]:
import jsonlines

data_path = "fine-tuning_data/train_tone_ds1.jsonl"
# Load the dataset

with jsonlines.open(data_path) as reader:
    dataset = [line for line in reader]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 50
First example:
{'role': 'system', 'content': "## RULES:\n- Use only the Vietnamese language in your response. Always refer to yourself using 'em' pronoun. Address the user based on how they refer to themselves . If their preferred address term cannot be determined from their self-reference, then base it on their provided <User gender>: use 'anh' for male, 'chị' for female. If the gender is unknown or not provided, use the polite neutral term 'anh/chị'."}
{'role': 'system', 'content': '## BASE KNOWLEDGE:\n- User gender: female'}
{'role': 'system', 'content': '## INSTRUCTIONS:\n You must ask the user for their contact information to provide better advice on their issue or policy question or the product they are interested in.'}
{'role': 'user', 'content': 'Chị muốn hỏi về chính sách bảo hành pin điện thoại mua ở shop mình.'}
{'role': 'assistant', 'content': 'Dạ chào chị. Pin điện thoại mua tại FPT Shop được bảo hành theo chính sách của hãng, từ 1 đến 12 tháng cho lỗi từ 

In [5]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [6]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [7]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 4, 6
mean / median: 4.64, 5.0
p5 / p95: 4.0, 5.0

#### Distribution of num_total_tokens_per_example:
min / max: 182, 753
mean / median: 334.2, 309.0
p5 / p95: 201.9, 434.40000000000003

#### Distribution of num_assistant_tokens_per_example:
min / max: 38, 281
mean / median: 134.76, 128.0
p5 / p95: 66.5, 207.5

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning


In [8]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 5
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~16710 tokens that will be charged for during training
By default, you'll train for 5 epochs on this dataset
By default, you'll be charged for ~83550 tokens


FINE-TUNING

In [9]:
import sys
import os
# import fix_path
# Add the parent directory to sys.path
sys.path.append(os.path.abspath('..'))
from service.openai import _client, _chat_model

In [10]:
def upload_file(file_name: str, purpose: str) -> str:
    with open(file_name, "rb") as file_fd:
        response = _client.files.create(file=file_fd, purpose=purpose)
    return response.id

training_file_id = upload_file(train_tone_file_path, "fine-tune") #!openai files create -p fine-tune -f dataset.jsonl
'''
In addition to training data, we can also optionally provide validation data, which will be used to make sure that the model does not overfit your training set.
'''
print("Training file ID:", training_file_id)

Training file ID: file-XTfeJeiVUapJ8ABxnhfW9C


In [55]:
#!openai fine_tuning.jobs.create -m gpt-4o -t <file-id>
def create_fine_tuning_job(training_file_id, chatmodel,n_epochs,learning_rate):
  response = _client.fine_tuning.jobs.create(
    training_file=training_file_id,
    #validation_file=validation_file_id,
    model=chatmodel,
    integrations=[
        {
          "type": "wandb",
          "wandb": {
              "project": "CHATBOT-TLCN",
              "name": "gpt-4o-mini-fine-tuning-tone",
              "tags": ["openai/finetune"]
          }
            
        }
    ],
    method={
      "type": "supervised",
      'supervised':{
        'hyperparameters':{
          'n_epochs': n_epochs, # auto=3 , one full cycle through train set
          'learning_rate_multiplier': learning_rate,#Scaling factor for the learning rate. smaller lr to avoid overfitting
          'batch_size': "auto", #1 num exams/batch
        }
      }

    }
  )
  return response
  

In [12]:
# check job status
def check_job_status(job_id):
  response = _client.fine_tuning.jobs.retrieve(job_id)
  return response

In [13]:
def monitor_job_status(job_id):
    response = _client.fine_tuning.jobs.list_events(job_id)

    events = response.data
    events.reverse()
    print(f"---Monitor {job_id}---")
    for event in events:
        print(event.message)

In [14]:
# get fine-tuned model id
def get_fine_tuned_model_id(job_id):

    response = _client.fine_tuning.jobs.retrieve(job_id)
    fine_tuned_model_id = response.fine_tuned_model

    if fine_tuned_model_id is None:
        raise RuntimeError(
            "Fine-tuned model ID not found. Your job has likely not been completed yet."
        )
    print("Fine-tuned model ID:", fine_tuned_model_id)
    return fine_tuned_model_id

In [56]:
#lr_lst = ["auto", 0.5, 2.5] # auto = 1.8 giá trị auto tự động điều chỉnh dựa trên kích thước batch size
lr_lst = ["auto", 0.9, 2.7] # auto = 1.8 giá trị auto tự động điều chỉnh dựa trên kích thước batch size

chatmodel = _chat_model
n_epochs = 3
job_id_lst = []
for lr in lr_lst:
    response_ft = create_fine_tuning_job(training_file_id, chatmodel,n_epochs,lr)
    job_id = response_ft.id
    print("Job ID:", job_id)
    job_id_lst.append(job_id)



Job ID: ftjob-3MkBKnw860Z5EFS9tqloYVNx
Job ID: ftjob-5tntFVtOjbsCLIqjCW2dABvg
Job ID: ftjob-gW9fFrEThhAwUt7gZRmRW8Rw


In [57]:
for job_id in job_id_lst:
    response_stt = check_job_status(job_id)
    print("Status:", response_stt.status)
    print("Trained Tokens:", response_stt.trained_tokens)

Status: succeeded
Trained Tokens: 39786
Status: succeeded
Trained Tokens: 39786
Status: succeeded
Trained Tokens: 39786


In [58]:
# monitor
fine_tuned_model_id_lst = []
for job_id in job_id_lst:
    #monitor_job_status(job_id)
    fine_tuned_model_id_lst.append(get_fine_tuned_model_id(job_id))

'''save result to a txt file'''
with open("fine-tuned_model_id.txt", "w") as f:
    for fine_tuned_model_id in fine_tuned_model_id_lst:
        f.write(fine_tuned_model_id + "\n")

Fine-tuned model ID: ft:gpt-4o-mini-2024-07-18:personal::BePMJcc3
Fine-tuned model ID: ft:gpt-4o-mini-2024-07-18:personal::BePMzLnB
Fine-tuned model ID: ft:gpt-4o-mini-2024-07-18:personal::BePLas4R


In [39]:
for fine_tuned_model_id in fine_tuned_model_id_lst:
    print(fine_tuned_model_id)



ft:gpt-4o-mini-2024-07-18:personal::BeMUcpFU
ft:gpt-4o-mini-2024-07-18:personal::BeMUjDNW
ft:gpt-4o-mini-2024-07-18:personal::BeMUcwhA


In [59]:
for job_id in job_id_lst:
    response_cp = _client.fine_tuning.jobs.checkpoints.list(job_id)
    for checkpoint in response_cp.data:
        #print(checkpoint.id)
        print(checkpoint.fine_tuned_model_checkpoint)
        print(checkpoint.metrics.train_loss)



ft:gpt-4o-mini-2024-07-18:personal::BePMJcc3
0.12726236879825592
ft:gpt-4o-mini-2024-07-18:personal::BePMJHNg:ckpt-step-100
0.6972470879554749
ft:gpt-4o-mini-2024-07-18:personal::BePMJXDq:ckpt-step-50
1.0708763599395752
ft:gpt-4o-mini-2024-07-18:personal::BePMzLnB
0.45584574341773987
ft:gpt-4o-mini-2024-07-18:personal::BePMzMbT:ckpt-step-100
0.551426112651825
ft:gpt-4o-mini-2024-07-18:personal::BePMyK6B:ckpt-step-50
1.3087761402130127
ft:gpt-4o-mini-2024-07-18:personal::BePLas4R
0.2537245452404022
ft:gpt-4o-mini-2024-07-18:personal::BePLaKdq:ckpt-step-100
0.3635803461074829
ft:gpt-4o-mini-2024-07-18:personal::BePLaZa5:ckpt-step-50
0.9651479721069336


In [40]:
#get all checkpoint of each fine-tuned model
fine_tuned_model_cp_lst = [{}]
for fine_tuned_model_id in fine_tuned_model_id_lst:
    #response = _client.fine_tuning.jobs.list_checkpoints(fine_tuned_model_id)
    response_cp = _client.fine_tuning.jobs.checkpoints.list(fine_tuned_model_id)
    for checkpoint in response_cp.data:
        print(checkpoint.id)
       

NotFoundError: Error code: 404 - {'error': {'message': 'Could not find fine tune: ft:gpt-4o-mini-2024-07-18:personal::BeMUcpFU', 'type': 'invalid_request_error', 'param': 'fine_tune_id', 'code': 'fine_tune_not_found'}}