In [35]:
# Import Libraries and Set API Key
import openai
import pandas as pd
import json
from dotenv import load_dotenv
import os
from openai import OpenAI
import numpy as np


import json
import tiktoken # for token counting
from collections import defaultdict


# Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")



In [36]:
# Load Processed
df = pd.read_csv("../data/processed/cleaned_tweets.csv")
df_validation = pd.read_csv("../data/processed/cleaned_tweets_validation.csv")
df.head()
df_validation.head()

Unnamed: 0,sentiment,text,cleaned_text
0,Positive,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,Positive,I am coming to the borders and I will kill you...,coming borders kill
2,Positive,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder
4,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands murder


In [37]:
# Take at most 500 random samples
df_sampled = df.sample(n=min(1000, len(df)), random_state=42)
df_validation_sampled = df_validation.sample(n=min(1000, len(df_validation)), random_state=42)

# Display the samples
df_sampled.head()
df_validation_sampled.head()

Unnamed: 0,sentiment,text,cleaned_text
34877,Irrelevant,He said told u I'm getting in that box of a br...,said told u im getting box brain dead controll...
21704,Positive,Yo this looks LIT! CS: GO / Overwatch combo,yo looks lit cs go overwatch combo
47008,Negative,@HomeDepot attention executive administrators....,attention executive administrators ever stores...
7969,Irrelevant,Guy has notified me and says that my name has ...,guy notified says name forwarded litter list l...
454,Positive,F Loving the new DLC!!!. RhandlerR RhandlerR R...,f loving new dlc rhandlerr rhandlerr rhandlerr...


In [38]:
# Prepare Data for OpenAI Fine-Tuning (JSONL format)
def create_jsonl(df, filename):
    with open(filename, 'w') as file:
        for _, row in df.iterrows():
            if pd.isna(row['cleaned_text']) or pd.isna(row['sentiment']):
                continue  # Skip rows with missing data
            json_record = {
                "messages": [
                    {"role": "system", "content": "You classify tweet sentiment as positive, negative, or neutral."},
                    {"role": "user", "content": row['cleaned_text']},
                    {"role": "assistant", "content": row['sentiment']}
                ]
            }
            file.write(json.dumps(json_record) + "\n")

create_jsonl(df_sampled, "../data/processed/train_data.jsonl")
create_jsonl(df_validation_sampled, "../data/processed/test_data.jsonl")


In [39]:
data_path = "../data/processed/train_data.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 959
First example:
{'role': 'system', 'content': 'You classify tweet sentiment as positive, negative, or neutral.'}
{'role': 'user', 'content': 'said told u im getting box brain dead controller player'}
{'role': 'assistant', 'content': 'Irrelevant'}


In [40]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [41]:
# Token Count Check
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")







n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")


Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 29, 192
mean / median: 42.50990615224192, 41.0
p5 / p95: 31.0, 56.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 1, 2
mean / median: 1.1689259645464025, 1.0
p5 / p95: 1.0, 2.0

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning


In [42]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 2
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~40767 tokens that will be charged for during training
By default, you'll train for 2 epochs on this dataset
By default, you'll be charged for ~81534 tokens


In [43]:
client = OpenAI()
# Upload the file
file_training = client.files.create(
    file=open("../data/processed/train_data.jsonl", "rb"),
    purpose="fine-tune"
)

file_validation = client.files.create(
    file=open("../data/processed/test_data.jsonl", "rb"),
    purpose="fine-tune"
)



In [45]:
# Initiate fine-tuning
fine_tune_job = client.fine_tuning.jobs.create(
    training_file=file_training.id,
    validation_file = file_validation.id,
    model="gpt-3.5-turbo"
)