## OpenAI Fine Tuning example

In [1]:
import os
import pandas as pd
import json

In [2]:
# Reading in the answers
df_answers = pd.read_csv('reddit_answers_big.csv', sep=';')
df_answers.head()

Unnamed: 0.1,Unnamed: 0,q_id,text,votes
0,0,hvbvpz,Two pet ducks. You may be tempted to go for on...,2359.0
1,1,hvbvpz,Nice try Jeff Bezos,764.0
2,2,hvbvpz,A curved shower rod. Seriously. $10 for a tens...,1525.0
3,3,hvbvpz,Another monitor. Your productivity will increa...,1227.0
4,4,hvbvpz,A nasal irrigation kit - either the electronic...,659.0


In [3]:
# Grabbing the top answers
df_top_votes = df_answers.groupby(by='q_id')['votes'].idxmax()
df_top_votes = df_answers.loc[df_top_votes]
# Rename the columns
df_top_votes.rename(columns={"text":"answer"}, inplace=True)
df_top_votes.rename(columns={"q_id":"id"}, inplace=True)
df_top_votes.rename(columns={"votes":"answer_votes"}, inplace=True)
# Drop unnecessary columns
df_top_votes.drop(columns=['Unnamed: 0'], inplace=True)
# Reset index
df_top_votes.reset_index(drop=True)
# Show the first 5 rows
df_top_votes.head()


Unnamed: 0,id,answer,answer_votes
1817014,1001ag,Tell him to go to a hospital. I can't stress t...,30.0
1591462,10029x,NOTE: Detail may not sum to totals because of ...,3.0
96052,1004g5,Blow Me Away by Breaking Benjamin http://www....,7.0
3417066,1008ax,"""""""How come he don't want me, man?"" From what ...",5.0
2925201,100b8y,"Eat something for the love of god, and the fas...",12.0


In [4]:
# Reading in the questions
df_questions = pd.read_csv('reddit_questions.csv', sep=';')
df_questions.head()

Unnamed: 0,id,text,votes,timestamp,datetime
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC
3,gmmlj4,How do you think humans will become extinct?,21998,1589887000.0,Tue May 19 11:18:05 2020 UTC
4,ishb7v,What is a movie So Disturbing you couldn't be ...,13,1600074000.0,Mon Sep 14 08:53:53 2020 UTC


In [5]:
# Rename the columns
df_questions.rename(columns={"text":"question"}, inplace=True)
df_questions.rename(columns={"votes":"question_votes"}, inplace=True)
# Drop unnecessary columns
df_questions.drop(columns=['timestamp', 'datetime'], inplace=True)
# Show the first 5 columns
df_questions.head()

Unnamed: 0,id,question,question_votes
0,izucgz,What's the purpose of life?,8
1,9c784/,"I've tried to quit smoking, this is my seventh...",11
2,iylxwl,"For those who have a slave master last name, w...",0
3,gmmlj4,How do you think humans will become extinct?,21998
4,ishb7v,What is a movie So Disturbing you couldn't be ...,13


In [6]:
# Joining the questions and answers and ID
merged_df = df_top_votes.merge(df_questions, on= 'id')
merged_df.head()

Unnamed: 0,id,answer,answer_votes,question,question_votes
0,1001ag,Tell him to go to a hospital. I can't stress t...,30.0,A friend of mine believes he was anally raped ...,14
1,10029x,NOTE: Detail may not sum to totals because of ...,3.0,What do the numbers in parenthesis mean in thi...,2
2,1004g5,Blow Me Away by Breaking Benjamin http://www....,7.0,"Reddit, what song gets you really amped up no ...",14
3,1008ax,"""""""How come he don't want me, man?"" From what ...",5.0,"The saddest thing I've ever seen on a tv show,...",1
4,100b8y,"Eat something for the love of god, and the fas...",12.0,"First time drinking, what do I do?",7


In [7]:
# Reordering the columns
merged_df = merged_df.reindex(columns=['id', 'answer', 'question', 'answer_votes', 'question_votes'])
merged_df.head()

Unnamed: 0,id,answer,question,answer_votes,question_votes
0,1001ag,Tell him to go to a hospital. I can't stress t...,A friend of mine believes he was anally raped ...,30.0,14
1,10029x,NOTE: Detail may not sum to totals because of ...,What do the numbers in parenthesis mean in thi...,3.0,2
2,1004g5,Blow Me Away by Breaking Benjamin http://www....,"Reddit, what song gets you really amped up no ...",7.0,14
3,1008ax,"""""""How come he don't want me, man?"" From what ...","The saddest thing I've ever seen on a tv show,...",5.0,1
4,100b8y,"Eat something for the love of god, and the fas...","First time drinking, what do I do?",12.0,7


In [8]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181311 entries, 0 to 181310
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              181311 non-null  object 
 1   answer          181296 non-null  object 
 2   question        181311 non-null  object 
 3   answer_votes    181311 non-null  float64
 4   question_votes  181311 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 6.9+ MB


In [9]:
# Downsizing the data
merged_df = merged_df.sort_values(by='answer_votes', ascending=False)
merged_df_1k = merged_df[:1000]
merged_df_1k.head()

Unnamed: 0,id,answer,question,answer_votes,question_votes
72761,fkzaca,The word cool,What is something that has aged well?,99398.0,66093
32106,a0a4cd,"It must be true that either It didn't exist, ...",What's the most amazing thing about the universe?,86042.0,81862
42590,d0jjc2,The social media explosion,The 2010's decade will be over in 4 months. Wh...,85936.0,113254
34515,aqf3bi,"Easy, ask the CIA to hold them...those two don...","You are offered $1,000,000 USD if you can hide...",85693.0,81908
38243,bvdaci,The most expensive thing you own is a really o...,What's classy if you're rich but trashy if you...,85568.0,66102


In [11]:
merged_df_1k.shape

(1000, 5)

In [27]:
# Fine tunning format
answers, questions = merged_df_1k['answer'], merged_df_1k['question']

qa_openai_format = [{"messages":[
    {"role":"system", "content": "You are a factual chatbot and reddit expert who likes to answer with bullets"},
    {"role":"user", "content": q},
    {"role":"assistant", "content": a}]} for q, a in zip(questions, answers)]

qa_openai_format[1]

{'messages': [{'role': 'system',
   'content': 'You are a factual chatbot and reddit expert who likes to answer with bullets'},
  {'role': 'user',
   'content': "What's the most amazing thing about the universe?"},
  {'role': 'assistant',
   'content': "It must be true that either  It didn't exist, then it did  or  It has always existed"}]}

In [28]:
# Verify the model format and data
with open("training_data.jsonl", "w") as f:
    for entry in qa_openai_format:
        f.write(json.dumps(entry))
        f.write("\n")

In [29]:
from collections import defaultdict

data_path = "training_data.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
  dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))

# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")


Num examples: 1000
No errors found


In [31]:
# Importing OpenAI
import openai
from openai import OpenAI

# Enter your API key below
# os.environ["OPENAI_API_KEY"] = "API KEY HERE"
# openai.api_key=os.getenv('OPENAI_API_KEY')

In [None]:
# Instantiate the model
client = OpenAI()

In [None]:
# Time to finally start fine tuning (sending the data to OpenAI)
client.files.create(
    file = open('training_data.jsonl', 'rb'),
    purpose = 'fine-tune'
)

In [None]:
# Create the job
client.fine_tuning.jobs.create(
    training_file = #Enter your file id,
    model = "gpt-3.5-turbo"
)

In [None]:
# Verify the fine tuning jobs list
client.fine_tuning.jobs.list()

In [33]:
# Now is time to interact with the model
system_prompt = "You are a factual chatbot and reddit expert who likes to answer with bullets"
user_question = "Give me the dumbest thing you've ever done."

In [None]:
response = client.chat.completions.create(
    model = # Enter the fine_tunned_model obtained on the jobs list,
    messages = [
        {"role":"system", "content":system_prompt},
        {"role":"user", "content":user_question}
    ]
)

print(response.choices[0].message.content)