## Fine tuning GPT-4o

In [4]:
import json
import tiktoken
import numpy as np
from collections import defaultdict
from datasets import Dataset, load_dataset, concatenate_datasets
import pandas as pd

In [None]:
dataset = load_dataset("sidbin/natural-logql")

In [None]:
pd.set_option("display.max_colwidth", 1000)

# Public Inference Finetune

> For demo: no train test split.

In [9]:
from typing import Dict, List
import os

def prep_for_gpt(row: Dict):
    system_msg = "You are LogQLLM, a specialized language model trained to convert natural language queries into LogQL (Log Query Language) queries. Your primary function is to interpret user requests expressed in plain English and translate them into valid LogQL syntax."
    user_msg = row["question"]
    assistant_msg = row["logql_query"]
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": assistant_msg},
    ]
    return {"messages": messages}
    # return messages

In [None]:
train_dataset = dataset["train"].map(prep_for_gpt, num_proc=os.cpu_count())
test_dataset = dataset["test"].map(prep_for_gpt, num_proc=os.cpu_count())
 
full_dataset = concatenate_datasets([train_dataset, test_dataset])

Map (num_proc=20):   0%|          | 0/85 [00:00<?, ? examples/s]

In [14]:
file_name = "gpt-4o-ft-public.jsonl"
full_dataset.select_columns("messages").to_json(file_name, orient="records", lines=True)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

313862

In [15]:
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv(".env")

client = OpenAI()
file = client.files.create(file=open(file_name, "rb"), purpose="fine-tune")

In [16]:
file

FileObject(id='file-YURp0JbtyZJe9a5SEWBpCCub', bytes=313862, created_at=1731441215, filename='gpt-4o-ft-public.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [17]:
client.fine_tuning.jobs.create(
    training_file=file.id,
    model="gpt-4o-2024-08-06"
)

FineTuningJob(id='ftjob-pMdxUVhvkmpZCaQMLhiLXpCM', created_at=1731441275, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-Y07cpV1QUQCSkyCe8JEUdJIG', result_files=[], seed=1465408277, status='validating_files', trained_tokens=None, training_file='file-YURp0JbtyZJe9a5SEWBpCCub', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

### Generate Train / Test Split

In [16]:
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [17]:
train_df = split_dataset["train"].to_pandas()
test_df = split_dataset["test"].to_pandas()

In [7]:
train_df.columns

Index(['id', 'application', 'application_variables', 'category',
       'label_filter', 'line_filter', 'log_category_result', 'logql_query',
       'metric_category', 'metric_category_result', 'query_explanation',
       'query_result', 'question', 'row_variables', 'variables'],
      dtype='object')

### Prep for gpt-4o
> Chat

In [18]:
import os

train_dataset = split_dataset["train"].map(prep_for_gpt, num_proc=os.cpu_count())

In [19]:
train_dataset["messages"]

[[{'content': 'You are LogQLLM, a specialized language model trained to convert natural language queries into LogQL (Log Query Language) queries. Your primary function is to interpret user requests expressed in plain English and translate them into valid LogQL syntax.',
   'role': 'system'},
  {'content': "What is the average time taken to build instances in the past week for application 'openstack-us-east'?",
   'role': 'user'},
  {'content': 'avg_over_time(\n  {application="openstack-us-east", log_file_type="nova-compute"}\n    |= "Took"\n    |= "seconds to build instance"\n    | regexp "\\[instance: (?P<instance_id>[^"]+)\\] Took (?P<build_time>\\d+\\.\\d+) seconds to build instance"\n    | unwrap build_time\n     [7d]\n)',
   'role': 'assistant'}],
 [{'content': 'You are LogQLLM, a specialized language model trained to convert natural language queries into LogQL (Log Query Language) queries. Your primary function is to interpret user requests expressed in plain English and translat

### format validation

In [12]:
# Format error checks
format_errors = defaultdict(int)

for ex in split_dataset["messages"]:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(
            k not in ("role", "content", "name", "function_call", "weight")
            for k in message
        ):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

Found errors:
data_type: 339


### Save to File

In [21]:
train_dataset.select_columns("messages").to_json("gpt-4o-ft-00.jsonl")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

251322

### upload and save train/test split

In [24]:
split_dataset.push_to_hub("sidbin/natural-logql")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sidbin/natural-logql/commit/9cf75ebdee36dd2d762a199a2d9226e1bb2bae84', commit_message='Upload dataset', commit_description='', oid='9cf75ebdee36dd2d762a199a2d9226e1bb2bae84', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/sidbin/natural-logql', endpoint='https://huggingface.co', repo_type='dataset', repo_id='sidbin/natural-logql'), pr_revision=None, pr_num=None)

In [25]:
split_dataset.save_to_disk("../../dataset/nl-logql-dataset-03")

Saving the dataset (0/1 shards):   0%|          | 0/339 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/85 [00:00<?, ? examples/s]

### upload file

### Format for Finetuning

In [None]:
{
    "messages": [
        {
            "role": "system",
            "content": "Marv is a factual chatbot that is also sarcastic.",
        },
        {"role": "user", "content": "What's the capital of France?"},
        {"role": "assistant", "content": "Paris", "weight": 0},
        {"role": "user", "content": "Can you be more sarcastic?"},
        {
            "role": "assistant",
            "content": "Paris, as if everyone doesn't know that already.",
            "weight": 1,
        },
    ]
}
{
    "messages": [
        {
            "role": "system",
            "content": "Marv is a factual chatbot that is also sarcastic.",
        },
        {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"},
        {"role": "assistant", "content": "William Shakespeare", "weight": 0},
        {"role": "user", "content": "Can you be more sarcastic?"},
        {
            "role": "assistant",
            "content": "Oh, just some guy named William Shakespeare. Ever heard of him?",
            "weight": 1,
        },
    ]
}
{
    "messages": [
        {
            "role": "system",
            "content": "Marv is a factual chatbot that is also sarcastic.",
        },
        {"role": "user", "content": "How far is the Moon from Earth?"},
        {"role": "assistant", "content": "384,400 kilometers", "weight": 0},
        {"role": "user", "content": "Can you be more sarcastic?"},
        {
            "role": "assistant",
            "content": "Around 384,400 kilometers. Give or take a few, like that really matters.",
            "weight": 1,
        },
    ]
}