In [None]:
'''

This is a short tutorial to:
    - Connect to the OpenAI API
    - Fine-tune the model to create a chatbot that provides the day's events when greeted (Transfer Learning)
    - Wrap the AI in a simple UI app to allow interactions with the AI

The "tone" or persona of the AI will be defined as "Marline", a witty, dramatic chatbot who loves to share gossip.

This example follows the Medium article: 


An OpenAI account is required to use the API.

'''

# Importing libraries
import getpass
import json
import os

import openai
import pandas as pd


# Input your OpenAI Secret Key:
# The key is taken from the OPENAI_API_KEY environment variable. A key that is
# already set is left untouched (the previous version overwrote it with a
# placeholder string); otherwise it is requested through a hidden prompt so the
# secret is never typed into, or saved with, the notebook source.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('YOUR OPENAI SECRET KEY: ')
openai.api_key = os.getenv('OPENAI_API_KEY')

# Connecting to the OpenAI API
client = openai.OpenAI()

In [None]:
'''

Loading in the fine-tuning dataset.
The dataset is a CSV that contains the information on the persona and example prompts and responses.
The "multi-turn chat" format is used because the responses carry a "weight" (0 or 1 values).
However, only one example truly utilizes the weights.

There are only 10 examples in this dataset, but the OpenAI documentation recommends providing 50-100 examples for better tuning:
https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset#example-count-recommendations

(10 examples are the minimum requirement for fine-tuning.)

'''

# Loading in the fine-tuning dataset
df = pd.read_csv('marline_fine_tuning_examples.csv')

# Fail fast with a readable message if the CSV lacks one of the columns that
# the JSONL formatting step reads ('id', 'role', 'content', 'weight'), instead
# of surfacing later as a bare KeyError in the middle of the loop.
missing_columns = {'id', 'role', 'content', 'weight'} - set(df.columns)
if missing_columns:
    raise ValueError(
        "marline_fine_tuning_examples.csv is missing required column(s): "
        f"{sorted(missing_columns)}"
    )

In [None]:
'''

From the OpenAI documentation, the fine-tuning process follows this format: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset#multi-turn-chat-examples

{"messages": 
    [
        {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, 
        {"role": "user", "content": "What's the capital of France?"}, 
        {"role": "assistant", "content": "Paris", "weight": 0}, 
        {"role": "user", "content": "Can you be more sarcastic?"}, 
        {"role": "assistant", "content": "Paris, as if everyone doesn't know that already.", "weight": 1}
    ]
}

*** Since this is using the "weight" argument, this is more aligned with the multi-turn chat format (shown above) rather than the simpler single-example format.

'''

# Setting up the format to be used by OpenAI to fine-tune:
# every distinct "id" in the CSV becomes one training example of the form
# {"messages": [{"role": ..., "content": ..., "weight": ...}, ...]}.
fine_tuning_list = []

# A single groupby pass replaces re-filtering the whole frame once per id.
# sort=False keeps the conversations in the order they first appear in the CSV.
# Rows whose id is blank are skipped by groupby (they previously produced an
# example with an empty message list).
for _, conversation in df.groupby('id', sort=False):

    messages_list = []

    for _, row in conversation.iterrows():

        message = {"role": row['role'], "content": row['content']}

        # "weight" is optional: only rows that carry a value get the key.
        if not pd.isna(row['weight']):
            # A column with blank cells is typically read by pandas as float,
            # so cast back to int to emit 0/1 (as in the documented format)
            # rather than 0.0/1.0.
            message["weight"] = int(row['weight'])

        messages_list.append(message)

    fine_tuning_list.append({"messages": messages_list})


# Converting the list to a JSONL file (JSON Lines): one JSON document per line.
# The encoding is stated explicitly so it matches the utf-8 read of this file
# in the validation step, regardless of the platform default.
with open('marline_fine_tuning.jsonl', 'w', encoding='utf-8') as f:
    for entry in fine_tuning_list:
        f.write(json.dumps(entry))
        f.write('\n')

In [None]:
'''

(Skippable)

Validating the JSON file, token counting, and cost estimate using the Python script provided by OpenAI:
https://cookbook.openai.com/examples/chat_finetuning_data_prep

Unfortunately, the token counting and cost estimation do not work for the multi-turn chat format because the "weight" argument is a numerical data type and the Python script only converts strings to tokens.

Since only 10 examples were created, the cost should not be significant enough to worry about.

'''
from collections import defaultdict

data_path = "marline_fine_tuning.jsonl"

# Load the dataset: each line of the file is parsed as one JSON document
dataset = []
with open(data_path, 'r', encoding='utf-8') as jsonl_file:
    for raw_line in jsonl_file:
        dataset.append(json.loads(raw_line))

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for first_message in dataset[0]["messages"]:
    print(first_message)

# Format error checks
# Keys and roles a message may use without being reported.
recognized_keys = ("role", "content", "name", "function_call", "weight")
recognized_roles = ("system", "user", "assistant", "function")

# Maps an error label to the number of times it was seen.
format_errors = defaultdict(int)

for example in dataset:
    # An example must be a JSON object ...
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # ... holding a non-empty "messages" list.
    messages = example.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    has_assistant_message = False

    for msg in messages:
        if not ("role" in msg and "content" in msg):
            format_errors["message_missing_key"] += 1

        if not all(key in recognized_keys for key in msg):
            format_errors["message_unrecognized_key"] += 1

        role = msg.get("role")
        if role not in recognized_roles:
            format_errors["unrecognized_role"] += 1
        if role == "assistant":
            has_assistant_message = True

        content = msg.get("content")
        function_call = msg.get("function_call")

        # Content must be a string, and either it or a function call must be non-empty.
        if not ((content or function_call) and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # Every example needs at least one assistant message.
    if not has_assistant_message:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in format_errors.items():
        print(f"{error_name}: {count}")


In [None]:
'''

For the model to find the fine-tuning dataset/JSONL, it needs to be uploaded to the OpenAI platform.
Then a model needs to be selected and trained.

Models that can be fine-tuned: https://platform.openai.com/docs/guides/fine-tuning#which-models-can-be-fine-tuned

'''

# Uploading the JSON file to OpenAI
# The file is opened in a `with` block so the local handle is closed as soon as
# the upload call returns (it was previously opened inline and never closed).
with open("marline_fine_tuning.jsonl", "rb") as training_file:
    upload_info = client.files.create(
      file=training_file,
      purpose="fine-tune"
    )

print('Uploaded File Information: ', upload_info)

# Selecting the model to fine-tune
# NOTE: each run of this cell uploads the file again and creates a new fine-tuning job.
fine_tuning_info = client.fine_tuning.jobs.create(
  training_file=upload_info.id, # This is the "id" attribute from the object returned by OpenAI for uploading the fine-tuning dataset
  model="gpt-4o-mini-2024-07-18", 
  suffix="marline_chatbox" # Giving an additional name identifier to easily distinguish in the future
)

print('Fine Tuning Job Information: ', client.fine_tuning.jobs.retrieve(fine_tuning_info.id))

In [None]:
# You can check the status of the fine-tuning job by running:
# (the job is looked up by the "id" of the object returned when it was created)

fine_tuning_job_id = fine_tuning_info.id
trained_model_info = client.fine_tuning.jobs.retrieve(fine_tuning_job_id)

print('Fine Tuned Model Information: ', trained_model_info)

'''

Similar to the code to fine tune a model, an object is returned with information about the fine-tuning job. The job "id" is used to pass as an argument to check on the status of the job.
An email should be sent to the email address associated with the OpenAI account when the fine-tuning job is completed.
Or, if using code to check for completion, the attribute "finished_at" can be checked to see if the job is complete.
Either way, the name of the fine-tuned model is returned in the "fine_tuned_model" attribute.

You can also check on the status of the fine-tuning job/models on the OpenAI website under the "Dashboard" >> "Fine-Tuning" section.
This is also a nice method for finding the name of the fine-tuned model.

'''

In [None]:
'''

Testing the trained model.

'''

# Refresh the job record so the model name is current even if the status check
# was last run while the fine-tuning job was still in progress.
trained_model_info = client.fine_tuning.jobs.retrieve(fine_tuning_info.id)

# "fine_tuned_model" is only populated once the job has completed; fail with a
# clear message instead of sending a request without a model name.
if not trained_model_info.fine_tuned_model:
    raise RuntimeError(
        "The fine-tuned model is not available yet "
        f"(job status: {trained_model_info.status}). "
        "Wait for the fine-tuning job to finish, then run this cell again."
    )

completion = client.chat.completions.create(
  model=trained_model_info.fine_tuned_model, # Trained model name
  messages=[
    {'role': 'system', 'content': 'Marline is a witty, dramatic, and gossipy informative chatbot.'},
    {'role': 'user', 'content': 'Tell me a story.'}
  ],
  temperature= 0.8 # Determines the "creativity"/"randomness" of the response. Higher = more random. (Ranges from 0-2)
)

# Prints the full message object of the first (and only requested) choice.
print(completion.choices[0].message)