In [None]:
import openai
import pandas as pd
import requests
import json
import datetime
import shutil
import os
from datasets import load_dataset
import time

In [None]:

# Load the "llmware/rag_instruct_benchmark_tester" dataset through the `datasets` library.
dataset = load_dataset("llmware/rag_instruct_benchmark_tester")

In [None]:
# Turn the dataset's 'train' split into a pandas DataFrame; the cells below work on this frame.
df = pd.DataFrame(dataset['train'])

In [None]:

# Prefer the OPENAI_API_KEY environment variable so the key never has to be saved inside
# the notebook; the placeholder string is only used when that variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY", "Enter Your Openai Key")

In [None]:
# Preview the first two rows of the DataFrame.
df.head(2)

In [None]:
def text_to_openai_json(data, filename):
    """
    Write a DataFrame of question/answer examples to a JSON Lines (JSONL) file in the
    chat format used for OpenAI fine-tuning.

    Each row becomes one line of the form ``{"messages": [system, user, assistant]}``:
    a fixed system instruction, a user prompt built from the row's 'query' and
    'context' columns, and the row's 'answer' as the assistant reply.

    Args:
        data (DataFrame): Input rows; must provide 'context', 'query' and 'answer' columns.
        filename (str): Path of the JSONL file to write. The file is opened in "w" mode,
            so existing content is replaced.

    Raises:
        KeyError: If a row lacks one of the required columns. All conversations are
            built before the file is opened, so nothing is written in that case.
    """
    # Same instruction for every example. Kept word-for-word identical to the system
    # prompt that generate_10K_responses sends at inference time.
    system_message = {
        "role": "system",
        "content": "You are a factual chatbot that answers questions about for giving text. You only answer with answers you find in the text, no outside information."
    }

    conversations = []
    for _, row in data.iterrows():
        # The user prompt uses the "<query> based on <context>" layout that
        # generate_10K_responses uses when querying the model, so the model is
        # fine-tuned on the same prompt shape it is later asked to answer.
        user_message = {
            "role": "user",
            "content": f"{row['query']} based on {row['context']}"
        }
        assistant_message = {
            "role": "assistant",
            "content": row['answer']
        }
        conversations.append({"messages": [system_message, user_message, assistant_message]})

    # One JSON document per line.
    with open(filename, "w", encoding="utf-8") as json_file:
        for conversation in conversations:
            json.dump(conversation, json_file)
            json_file.write("\n")

In [None]:
# Display the first two rows of df.
df.head(2)

In [None]:
def fine_tune_model(model_id, num_label, pandas_df):
    """
    Start an OpenAI fine-tuning job on the first ``num_label`` rows of ``pandas_df``.

    The rows are written to ``ft_increment_<num_label>.jsonl`` (relative path) with
    text_to_openai_json, uploaded with purpose 'fine-tune', and used as the training
    file of a new fine-tuning job.

    Args:
        model_id (str): Name of the base model to fine-tune.
        num_label (int): Number of leading rows of ``pandas_df`` to train on.
        pandas_df (DataFrame): Source examples, in the layout text_to_openai_json expects.

    Returns:
        The ``id`` of the created fine-tuning job.
    """
    training_rows = pandas_df.iloc[:num_label]
    filename = f'ft_increment_{num_label}.jsonl'
    text_to_openai_json(training_rows, filename)

    # Use a context manager so the upload handle is closed instead of leaked.
    with open(filename, "rb") as training_file:
        uploaded = openai.File.create(file=training_file, purpose='fine-tune')

    # Honour the caller's model_id; it used to be ignored in favour of a hard-coded name.
    fine_tuning_job = openai.FineTuningJob.create(training_file=uploaded.id, model=model_id)
    return fine_tuning_job.id

In [None]:
def wait_for_fine_tuning(job_id, poll_interval=30):
    """
    Poll a fine-tuning job until it yields a model or stops without one.

    Args:
        job_id: Identifier of the job, as returned by fine_tune_model.
        poll_interval (float): Seconds to sleep between two polls. Defaults to 30.

    Returns:
        The job's ``fine_tuned_model`` value once it is set, or None when the job
        reports status 'failed' or 'cancelled' without having produced a model.
    """
    unsuccessful_statuses = {"failed", "cancelled"}
    while True:
        response = openai.FineTuningJob.retrieve(job_id)
        fine_tuned_model = response["fine_tuned_model"]
        if fine_tuned_model:
            print(fine_tuned_model)
            return fine_tuned_model

        # A failed or cancelled job never gets a model name; without this exit the
        # loop would keep polling forever.
        status = response["status"]
        if status in unsuccessful_statuses:
            print(f"Fine-tuning job {job_id} ended with status '{status}'")
            return None

        time.sleep(poll_interval)

In [None]:
def generate_10K_responses(data, model_id):
    """
    Ask ``model_id`` to answer every row of ``data`` and collect the replies.

    For each row one chat completion is requested with a fixed system instruction
    and a user prompt built from the row's 'query' and 'context' columns. Every
    reply is printed as soon as it is received.

    Args:
        data (DataFrame): Rows providing 'context', 'query' and 'answer' columns.
        model_id: Model name passed to the chat completion call.

    Returns:
        DataFrame: Columns 'context' and 'answer' taken from ``data`` plus a
        'syntheses' column holding the model's reply for each row.
    """
    instruction = {
        "role": "system",
        "content": "You are a factual chatbot that answers questions about for giving text. You only answer with answers you find in the text, no outside information.",
    }

    def ask(record):
        # One request per row; the reply text is echoed and handed back.
        question = {
            "role": "user",
            "content": f"{record['query']} based on {record['context']}",
        }
        reply = openai.ChatCompletion.create(model=model_id, messages=[instruction, question])
        reply_text = reply.choices[0].message.content
        print(reply_text)
        return reply_text

    generated = [ask(record) for _, record in data.iterrows()]

    return pd.DataFrame({
        'context': data['context'],
        'answer': data['answer'],
        'syntheses': generated,
    })


In [None]:
# Running number of training rows; the fine-tuning loop below adds 10 per iteration.
count = 0
# Training-set sizes used so far, one entry appended per loop iteration.
label_count = [] 
# Values returned by wait_for_fine_tuning for the jobs started in the loop.
model_ids = []


In [None]:
# Run five fine-tuning rounds, each on 10 more leading rows of df than the previous one.
# For every round that yields a model, generate answers for all rows of df and save
# them to a CSV named after the training-set size.
for i in range(5):
    count += 10
    label_count.append(count)
    ft_id = fine_tune_model(model_id = 'gpt-3.5-turbo-1106', num_label=count, pandas_df=df)

    # Wait once and reuse the result: every additional wait_for_fine_tuning call
    # would query the job again.
    fine_tuned_model = wait_for_fine_tuning(ft_id)
    if fine_tuned_model is None:
        print(f"Fine-tuning job {ft_id} produced no model; skipping the {count}-row round")
        continue

    model_ids.append(fine_tuned_model)
    syntheses_df = generate_10K_responses(data = df, model_id = fine_tuned_model)
    syntheses_df.to_csv(f'syntheses_df{count}.csv',index=False)
       