**JSONL file creation from CSV**

In [None]:
import csv
import json

def csv_to_jsonl(input_csv, output_jsonl):
    """Convert a CSV of query/response pairs into a chat fine-tuning JSONL file.

    Every CSV row becomes one line of JSON shaped as
    ``{"messages": [system, user, assistant]}``: the system message is the
    fixed prompt defined below, the user content is the row's ``query`` column
    and the assistant content is the row's ``response`` column.

    Args:
        input_csv: Path of the UTF-8 CSV file to read. Its header row must
            contain at least the columns ``query`` and ``response``.
        output_jsonl: Path of the JSONL file to write (overwritten if present).

    Returns:
        None. A confirmation line is printed once the file has been written.

    Raises:
        KeyError: If the CSV header lacks ``query`` or ``response`` (this
            includes a completely empty CSV). The check runs before the output
            file is opened, so a bad input never truncates an existing output.
    """
    # NOTE: this prompt ends up verbatim in every training example, so it is kept
    # exactly as authored (including its 1/2/4 step numbering).
    system_message = {
        "role": "system",
        "content": """You are a fashion designer who helps to identify products, descriptions, and categories that should be served for user.

Instructions:
1. Identify all the products or series of products to serve best to the user is referring.
2. For each identified product:
    a. If the user provides enough details to describe the product, summarise those details as the "description".
    b. If the user does not provide enough details, use the product name itself as the "description".
4. Classify each product into one of the following categories:
    ['men_clothing', 'women_clothing', 'men_watches', 'women_watches', 'men_shoes', 'women_shoes']

CONSTRAINTS:

ALL THE PRODUCTS SHOULD BE FROM SAME GENDER [MEN/WOMEN]. DONT MIX THEM

Output:

A JSON object with a list of product objects, each containing "name", "description", and "category" keys.

Format and example:
{
    "products": [
        {
            "name": "blue jeans",
            "description": "blue denim jeans",
            "category": "men_clothing"
        },
        {
            "name": "white shirts",
            "description": "white shirts",
            "category": "women_clothing"
        }
    ]
}"""
    }

    required_columns = ("query", "response")

    with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)

        # Validate the header up front: failing here is clearer than a bare
        # KeyError on the first row, and it happens before the output is touched.
        header = reader.fieldnames or []
        missing = [column for column in required_columns if column not in header]
        if missing:
            raise KeyError(
                f"{input_csv} is missing required column(s): {', '.join(missing)}"
            )

        with open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:
            for row in reader:
                entry = {
                    "messages": [
                        system_message,
                        {"role": "user", "content": row['query']},
                        {"role": "assistant", "content": row['response']}
                    ]
                }
                # One JSON document per line is what makes the file JSONL.
                jsonlfile.write(json.dumps(entry) + '\n')

    print(f"Conversion complete. JSONL file saved as {output_jsonl}")

# Usage: replace these with your own CSV file name and desired output file name
input_csv, output_jsonl = 'fine.csv', 'fine.jsonl'

csv_to_jsonl(input_csv=input_csv, output_jsonl=output_jsonl)

Conversion complete. JSONL file saved as fine.jsonl


# Running format checks

In [None]:
import json
import tiktoken  # for token counting
import numpy as np
from collections import defaultdict

# Data loading
data_path = "fine.jsonl"

# Load the dataset: one JSON object per line. Blank lines (for example a stray
# trailing newline) are skipped instead of making json.loads raise.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Initial dataset stats
print("Num examples:", len(dataset))
if dataset:
    print("First example:")
    for message in dataset[0]["messages"]:
        print(message)
else:
    # dataset[0] would raise IndexError on an empty file; report it instead.
    print("Dataset is empty; nothing to preview.")

Num examples: 107
First example:
{'role': 'system', 'content': 'You are a fashion designer who helps to identify products, descriptions, and categories that should be served for user.\n\nInstructions:\n1. Identify all the products or series of products to serve best to the user is referring.\n2. For each identified product:\n    a. If the user provides enough details to describe the product, summarise those details as the "description".\n    b. If the user does not provide enough details, use the product name itself as the "description".\n4. Classify each product into one of the following categories:\n    [\'men_clothing\', \'women_clothing\', \'men_watches\', \'women_watches\', \'men_shoes\', \'women_shoes\']\n\nCONSTRAINTS:\n\nALL THE PRODUCTS SHOULD BE FROM SAME GENDER [MEN/WOMEN]. DONT MIX THEM\n\nOutput:\n\nA JSON object with a list of product objects, each containing "name", "description", and "category" keys.\n\nFormat and example:\n{\n    "products": [\n        {\n            "

In [None]:
# Format error checks
# Tally every structural problem found in the dataset: each key is an error
# category, each value is how many times that problem occurred.
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        # A non-dict example has no .get(); move on instead of raising AttributeError.
        continue

    messages = ex.get("messages", None)
    if not isinstance(messages, list):
        # Covers a missing "messages" key as well as a value that is not a list.
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if not isinstance(message, dict):
            # Something that is not a mapping cannot carry "role"/"content";
            # count it and skip the .get()-based checks that would crash on it.
            format_errors["message_missing_key"] += 1
            continue

        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role") not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content")
        function_call = message.get("function_call")
        # Content must be a string, and may only be empty when the message
        # carries a function_call instead.
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(isinstance(message, dict) and message.get("role") == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [None]:
# Tokenizer shared by every token count below.
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_messages(messages):
    """Return the total token count of the ``content`` of all given messages.

    Only each message's ``content`` string is encoded (with the module-level
    ``encoding``); roles and any other keys add nothing to the count.
    """
    return sum(len(encoding.encode(msg['content'])) for msg in messages)

def print_distribution(values, name):
    """Print a short statistical summary of ``values`` under a ``name`` header.

    The report has a header line followed by three lines: min / max,
    mean / median, and the 5th / 95th percentiles.

    Args:
        values: Sequence of numbers to summarise.
        name: Label inserted into the header line.
    """
    print(f"\n#### Distribution of {name}:")

    # Each row: label, statistic shown first, statistic shown second.
    rows = (
        ("min / max", min, max),
        ("mean / median", np.mean, np.median),
        ("p5 / p95", lambda v: np.quantile(v, 0.05), lambda v: np.quantile(v, 0.95)),
    )
    for label, first_stat, second_stat in rows:
        print(f"{label}: {first_stat(values)}, {second_stat(values)}")

In [None]:
# Per-example structure and token statistics, collected in one pass over the dataset.
def _has_role(msgs, role):
    """Return True if any message in ``msgs`` has the given role."""
    return any(m["role"] == role for m in msgs)

n_missing_system, n_missing_user = 0, 0
n_messages, convo_lens, assistant_message_lens = [], [], []

for example in dataset:
    msgs = example["messages"]

    has_system = _has_role(msgs, "system")
    has_user = _has_role(msgs, "user")
    if not has_system:
        n_missing_system += 1
    if not has_user:
        n_missing_user += 1

    n_messages.append(len(msgs))
    convo_lens.append(num_tokens_from_messages(msgs))

    # Token count restricted to the assistant turns of this example.
    assistant_turns = [m for m in msgs if m["role"] == "assistant"]
    assistant_message_lens.append(num_tokens_from_messages(assistant_turns))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
for series, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(series, label)

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 331, 402
mean / median: 388.85046728971963, 389.0
p5 / p95: 382.3, 400.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 51, 120
mean / median: 108.08411214953271, 108.0
p5 / p95: 102.0, 117.0


In [None]:
# Estimate the number of tokens the fine-tuning job will be billed for.
# Each example is capped at the model's per-example token limit rather than at a
# value taken from this particular dataset, so the estimate stays correct when
# the dataset changes.
MAX_TOKENS_PER_EXAMPLE = 16385  # per-example limit for gpt-3.5-turbo-0125 — confirm if you change the model

n_epochs = 3  # typically a good starting point
n_train_examples = len(dataset)
n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Visit OpenAI's pricing page for detailed cost information.

Dataset has ~41607 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~124821 tokens


# Upload the training file and create the fine-tuning job

In [None]:
from openai import OpenAI

# Create the API client. With no api_key argument the client reads the key from
# the OPENAI_API_KEY environment variable, which keeps the secret out of the
# notebook source and its saved outputs.
client = OpenAI()

# Upload the training data. The context manager closes the file handle as soon
# as the upload call returns instead of leaving it open for the kernel's lifetime.
with open("fine.jsonl", "rb") as training_file:  # Make sure the file path and name are correct
    response = client.files.create(
        file=training_file,
        purpose="fine-tune"
    )
print(response)  # This will print the response from the server including the file ID

FileObject(id='file-OX399P52cYXvQhjXauzBww3S', bytes=201053, created_at=1723091836, filename='fine.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [None]:
from openai import OpenAI

# Start a fine-tuning job on the uploaded training file
training_file_id = "file-OX399P52cYXvQhjXauzBww3S"  # ID printed by the file upload step
base_model = "gpt-3.5-turbo-0125"  # Model type to fine-tune

response = client.fine_tuning.jobs.create(training_file=training_file_id, model=base_model)
print(response)

FineTuningJob(id='ftjob-GLLP17wOXH7QpU8tKWKmBtLp', created_at=1723092776, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-5WlOT3q0QKbAJX4vVQIW85tY', result_files=[], seed=1428391678, status='validating_files', trained_tokens=None, training_file='file-OX399P52cYXvQhjXauzBww3S', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)


In [None]:
# Smoke-test the fine-tuned model with a minimal two-message conversation.
fine_tuned_model = "ft:gpt-3.5-turbo-0125:personal::9tpkCK91"
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

completion = client.chat.completions.create(model=fine_tuned_model, messages=chat_messages)
first_choice = completion.choices[0]
print(first_choice.message)

ChatCompletionMessage(content='Hello! How can I assist you today?', refusal=None, role='assistant', function_call=None, tool_calls=None)
