In [1]:
import json
from collections import defaultdict
from typing import List

import numpy as np
from openai import OpenAI
import tiktoken # for token counting

In [2]:
# Address of the local OpenAI-compatible server the client connects to.
# Use the loopback address rather than "0.0.0.0": the latter is the wildcard a
# server binds to, not a destination a client can portably connect to (it is
# rejected on some platforms). A server bound to 0.0.0.0 is still reachable here.
host = "127.0.0.1"
port = 8080

# Value passed to the client as its API key.
api_key = "sk-no-key-required"
base_url = f"http://{host}:{port}/v1"
# JSONL file the training examples are written to.
dataset_path = "training_dataset.jsonl"

In [3]:
# TODO: impl full fine-tuning openai spec
# The dataset can contain many examples; each example uses one of several acceptable formats (see below).

# https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
# System prompts shared by several examples, hoisted so each example stays short.
marv_persona = "Marv is a factual chatbot that is also sarcastic."
sports_extraction_prompt = 'Given a sports headline, provide the following fields in a JSON dict, where applicable: "player" (full name), "team", "sport", and "gender".'

# https://platform.openai.com/docs/guides/fine-tuning/example-format
# "conversational chat format" for conversations (e.g., v1/chat/completions)
single_turn_chats = [
    {
        "messages": [
            {"role": "system", "content": marv_persona},
            {"role": "user", "content": "What's the capital of France?"},
            {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."},
        ]
    },
    {
        "messages": [
            {"role": "system", "content": marv_persona},
            {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"},
            {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"},
        ]
    },
    {
        "messages": [
            {"role": "system", "content": marv_persona},
            {"role": "user", "content": "How far is the Moon from Earth?"},
            {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."},
        ]
    },
]

# "prompt completion pair format" (e.g., v1/completions)
# Three identical placeholder pairs; the comprehension builds a separate dict for each.
prompt_completion_pairs = [
    {"prompt": "<prompt text>", "completion": "<ideal generated text>"}
    for _ in range(3)
]

# https://platform.openai.com/docs/guides/fine-tuning/multi-turn-chat-examples
# Multi-turn chat examples w/ weights on the assistant messages
weighted_multi_turn_chats = [
    {
        "messages": [
            {"role": "system", "content": marv_persona},
            {"role": "user", "content": "What's the capital of France?"},
            {"role": "assistant", "content": "Paris", "weight": 0},
            {"role": "user", "content": "Can you be more sarcastic?"},
            {"role": "assistant", "content": "Paris, as if everyone doesn't know that already.", "weight": 1},
        ]
    },
    {
        "messages": [
            {"role": "system", "content": marv_persona},
            {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"},
            {"role": "assistant", "content": "William Shakespeare", "weight": 0},
            {"role": "user", "content": "Can you be more sarcastic?"},
            {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?", "weight": 1},
        ]
    },
    {
        "messages": [
            {"role": "system", "content": marv_persona},
            {"role": "user", "content": "How far is the Moon from Earth?"},
            {"role": "assistant", "content": "384,400 kilometers", "weight": 0},
            {"role": "user", "content": "Can you be more sarcastic?"},
            {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters.", "weight": 1},
        ]
    },
]

# Structured format examples: the assistant content is itself a JSON string.
structured_output_chats = [
    {
        "messages": [
            {"role": "system", "content": sports_extraction_prompt},
            {"role": "user", "content": "Sources: Colts grant RB Taylor OK to seek trade"},
            {"role": "assistant", "content": '{"player": "Jonathan Taylor", "team": "Colts", "sport": "football", "gender": "male" }'},
        ]
    },
    {
        "messages": [
            {"role": "system", "content": sports_extraction_prompt},
            {"role": "user", "content": "OSU 'split down middle' on starting QB battle"},
            {"role": "assistant", "content": '{"player": null, "team": "OSU", "sport": "football", "gender": null }'},
        ]
    },
]

# Tool-calling examples: the function schema is listed under "tools" next to the messages.
get_current_weather_tool = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and country, eg. San Francisco, USA",
                },
                "format": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location", "format"],
        },
    },
}
tool_calling_chats = [
    {
        "messages": [
            {"role": "user", "content": "What is the weather in San Francisco?"},
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call_id",
                        "type": "function",
                        "function": {
                            "name": "get_current_weather",
                            "arguments": '{"location": "San Francisco, USA", "format": "celsius"}',
                        },
                    }
                ],
            },
            {"role": "tool", "tool_call_id": "call_id", "content": "21.0"},
            {"role": "assistant", "content": "It is 21 degrees celsius in San Francisco, CA"},
        ],
        "tools": [get_current_weather_tool],
    },
]

# Final dataset: every section concatenated in the order defined above.
dataset = [
    *single_turn_chats,
    *prompt_completion_pairs,
    *weighted_multi_turn_chats,
    *structured_output_chats,
    *tool_calling_chats,
]

In [4]:
def save_json_array_as_jsonl(json_array: List, jsonl_file_path: str = 'data.jsonl'):
    """Write every element of ``json_array`` to ``jsonl_file_path``, one JSON document per line.

    Args:
        json_array: Items to serialise; each is passed to ``json.dumps``.
        jsonl_file_path: Destination path. The file is opened in write mode
            with UTF-8 encoding, so existing content is replaced.
    """
    with open(jsonl_file_path, mode='w', encoding='utf-8') as jsonl_file:
        # One serialised record per line, each terminated by a newline.
        jsonl_file.writelines(f"{json.dumps(record)}\n" for record in json_array)

In [5]:
# Serialise the in-memory examples to the JSONL file at `dataset_path`.
save_json_array_as_jsonl(dataset, dataset_path)
# TODO(review): validation_dataset.jsonl and test_dataset.jsonl are named here but
# never written; confirm whether these splits are still planned.

## Chat Finetuning Data Prep; see https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [6]:
# Read the JSONL file back, parsing one JSON document per line.
with open(dataset_path, mode='r', encoding='utf-8') as jsonl_file:
    dataset = list(map(json.loads, jsonl_file))

# Initial dataset stats
print(f"Num examples: {len(dataset)}")
print("First example:")

# Show each message of the first example on its own line.
first_example = dataset[0]
for chat_message in first_example["messages"]:
    print(chat_message)

Num examples: 12
First example:
{'role': 'system', 'content': 'Marv is a factual chatbot that is also sarcastic.'}
{'role': 'user', 'content': "What's the capital of France?"}
{'role': 'assistant', 'content': "Paris, as if everyone doesn't know that already."}


In [7]:
## Upload the training file
# https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

In [8]:
# Build the OpenAI client from the `base_url` and `api_key` values set in the config cell.
client = OpenAI(base_url=base_url, api_key=api_key)

In [9]:
# Upload the training file for fine-tuning.
# The handle is opened in a `with` block so it is closed even when the request
# fails, and the path comes from `dataset_path` instead of a second hard-coded
# copy of the filename.
with open(dataset_path, "rb") as training_fp:
    training_file = client.files.create(
        file=training_fp,
        purpose="fine-tune",
    )

# TODO: where to impl? llamafile, llama-cpp-python? litellm proxy in front?

# Last expression of the cell: display the object returned by the upload.
training_file

APIConnectionError: Connection error.