In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.storage import (
    LocalFileStore,
)
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import json

import pandas as pd
import os



In [None]:
def text_to_openai_json(data,filename):
    """
    Write question/evidence/answer rows as chat-format fine-tuning examples (JSONL).

    Every row of ``data`` becomes one line of the output file. Each line is a
    JSON object with a single ``"messages"`` key holding three chat messages,
    in this order:

    1. ``system``    - a fixed instruction, identical for every row;
    2. ``user``      - the row's ``question``, the text " based on ", the
       row's ``evidence_text`` and two trailing spaces;
    3. ``assistant`` - the row's ``answer`` value, handed to ``json`` as-is.

    Args:
        data: Object exposing ``iterrows()`` (e.g. a pandas DataFrame) whose
            rows provide the ``question``, ``evidence_text`` and ``answer`` fields.
        filename: Path of the JSONL file to create; an existing file is overwritten.
    """
    # The instruction is the same for every example, so it is defined once.
    system_prompt = "You are a factual chatbot that answers questions about for giving text. You only answer with answers you find in the text, no outside information."

    # All examples are built before the output file is opened.
    conversations = [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{record['question']} based on {record['evidence_text']}  "},
                {"role": 'assistant', "content": record['answer']},
            ]
        }
        for _, record in data.iterrows()
    ]

    # JSON Lines: one serialized example per line.
    with open(filename, "w") as out_file:
        for conversation in conversations:
            json.dump(conversation, out_file)
            out_file.write("\n")

In [None]:
import openai
from packaging import version

# Guard cell: stop early when the installed `openai` package is older than
# the minimum release this notebook requires.
required_version = version.parse("1.1.1")
current_version = version.parse(openai.__version__)

if current_version < required_version:
    raise ValueError(
        f"Error: OpenAI version {openai.__version__} is less than the required version 1.1.1"
    )

# Only reached when the check above did not raise.
print("OpenAI version is compatible.")

In [None]:
# -- Now we can get to it
from openai import OpenAI

In [None]:
# Supply the API key through the environment. An OPENAI_API_KEY that is already
# exported (shell, .env loader, secrets manager) is kept as-is; the placeholder
# below is only written when no key is present, so running this cell can no
# longer overwrite a working key. Never commit a real key in this cell.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "Enter openai key"


In [None]:
# Build the OpenAI API client from the key stored in the environment.
# NOTE(review): a later cell runs `from langchain.llms import OpenAI`, which
# rebinds the name `OpenAI`; re-running this cell after that import would no
# longer construct the `openai` package's client.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
def fine_tune_model(model_id,num_label,pandas_df):
    """
    Start a fine-tuning job on the first ``num_label`` rows of ``pandas_df``.

    The rows are written to ``ft_increment_{num_label}.jsonl`` (relative to the
    current working directory), uploaded with purpose ``'fine-tune'`` and used
    as the training file of a new fine-tuning job.

    Args:
        model_id: Currently unused. The base model is fixed to
            ``"gpt-3.5-turbo-1106"``; the parameter is kept so existing calls
            continue to work unchanged.
        num_label: Number of leading rows to train on; also part of the
            generated file name.
        pandas_df: DataFrame holding the training rows.

    Returns:
        The id of the created fine-tuning job.
    """
    training_rows = pandas_df.iloc[:num_label]
    filename = f'ft_increment_{num_label}.jsonl'
    text_to_openai_json(training_rows, filename)

    # Upload inside a context manager so the file handle is closed as soon as
    # the upload call returns (it was previously left open).
    with open(filename, "rb") as training_file:
        uploaded = client.files.create(file=training_file, purpose='fine-tune')

    fine_tuning_job = client.fine_tuning.jobs.create(training_file=uploaded.id, model="gpt-3.5-turbo-1106")
    return fine_tuning_job.id
    

In [None]:
import time

In [None]:
def wait_for_fine_tuning(job_id, poll_interval=30):
    """
    Poll a fine-tuning job until it yields a model or ends without one.

    Args:
        job_id: Id of the fine-tuning job to watch.
        poll_interval: Seconds to sleep between two status requests.

    Returns:
        The name of the fine-tuned model, or ``None`` when the job reports the
        status ``"failed"`` or ``"cancelled"``.
    """
    while True:
        response = client.fine_tuning.jobs.retrieve(job_id)

        if response.fine_tuned_model:
            print(response.fine_tuned_model)
            return response.fine_tuned_model

        # A failed or cancelled job never receives a model name; without this
        # exit the loop would keep polling forever.
        status = getattr(response, "status", None)
        if status in ("failed", "cancelled"):
            print(f"Fine-tuning job {job_id} ended with status '{status}'")
            return None

        # Still in progress: report the current status and try again later.
        print(status)
        time.sleep(poll_interval)

In [None]:
# Load the rows used for both fine-tuning and evaluation.
# Other cells read the 'question', 'evidence_text' and 'answer' columns.
df = pd.read_csv('train.csv')

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import FlareChain
from langchain.llms import OpenAI


In [None]:
def create_vector_db(embedding_model='text-embedding-ada-002',data_path='txt/'):
    """
    Build a FAISS vector store from the ``.txt`` files found under ``data_path``.

    The files are loaded recursively, split into chunks (``chunk_size=750``,
    ``chunk_overlap=50``) and embedded with OpenAI embeddings. Embeddings are
    cached in ``./cache/`` so that repeated runs can reuse them.

    Args:
        embedding_model: Name of the OpenAI embedding model. It is also used as
            the cache namespace, so vectors produced by different models are
            never mixed in the cache.
        data_path: Directory searched (``**/*.txt``) for the source documents.

    Returns:
        The FAISS vector store built from the document chunks.

    Raises:
        ValueError: If no text chunks were produced from ``data_path``.
    """
    # Load every text file below the data folder
    loader = DirectoryLoader(data_path, glob="**/*.txt", show_progress=True)
    docs = loader.load()

    # Split the documents into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=50)
    documents = text_splitter.split_documents(docs)

    # Fail with a clear message instead of handing an empty list to FAISS.
    if not documents:
        raise ValueError(f"No text chunks could be loaded from '{data_path}'")

    # Initialize OpenAI Embeddings
    openai_embedder = OpenAIEmbeddings(model=embedding_model)

    # Cache the embeddings on disk. The namespace is the model name: a fixed
    # namespace would let a different `embedding_model` read vectors that were
    # cached for another model.
    cache_store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(openai_embedder, cache_store, namespace=embedding_model)

    # Create the vector db
    return FAISS.from_documents(documents, cached_embedder)

In [None]:
# Build the vector store once with the default embedding model and data folder.
# The FLARE chains in other cells retrieve from this module-level `db`.
db = create_vector_db()

In [None]:
from langchain_community.chat_models import ChatOpenAI

In [None]:
# Chat model for the baseline chain (plain "gpt-3.5-turbo-16k").
myllm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.30)

# FLARE chain that retrieves from the module-level vector store `db`.
flare = FlareChain.from_llm(
    llm=myllm,
    retriever=db.as_retriever(),
    min_prob=0.15,
    max_generation_len=700,
    instruction="You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information.",
)

In [None]:
def flare_rag_pred(data, model_id):
    """
    Answer every question in ``data`` with a FLARE chain backed by ``model_id``.

    A new chat model and FLARE chain are created on each call; retrieval uses
    the module-level vector store ``db``. The row label and the answer are
    printed for every row.

    Args:
        data: Object exposing ``iterrows()`` whose rows have a ``question`` field.
        model_id: Model name handed to ``ChatOpenAI``.

    Returns:
        List with one chain result per row, in row order.
    """
    chat_model = ChatOpenAI(temperature=0.30, model_name=model_id)
    chain = FlareChain.from_llm(
        llm=chat_model,
        retriever=db.as_retriever(),
        max_generation_len=700,
        min_prob=0.15,
    )

    answers = []
    for row_label, record in data.iterrows():
        answer = chain.run(record['question'])
        print(row_label, answer)
        answers.append(answer)
    return answers
    
    

In [None]:
# Results of the incremental fine-tuning experiment, aligned by position:
# model_ids[k] was trained on the first num_labels[k] rows and produced result[k].
model_ids = []
num_labels = []
result = []

# Fine-tune on growing prefixes of the data (10, 20, ..., 50 rows) and
# evaluate each resulting model on the full frame.
count = 0
for i in range(5):
    count += 10
    ft_id = fine_tune_model(model_id="gpt-3.5-turbo-16k", num_label=count, pandas_df= df)

    # Wait once and reuse the value: every additional call would poll the
    # fine-tuning API again for a job that has already been awaited.
    fine_tuned_model = wait_for_fine_tuning(ft_id)
    if fine_tuned_model is not None:
        model_ids.append(fine_tuned_model)
        num_labels.append(count)
        syntheses = flare_rag_pred(data=df,model_id=fine_tuned_model)
        result.append(syntheses)
        
    



In [None]:
# Display the fine-tuned model names collected so far.
model_ids
    

In [None]:
# Rebind `myllm` to the first fine-tuned model.
# Raises IndexError when `model_ids` is empty.
myllm = ChatOpenAI(model_name=model_ids[0], temperature=0.30)


In [None]:
def flare_rag_pred(data, model_id):
    """
    Answer every question in ``data`` with a FLARE chain backed by ``model_id``.

    NOTE(review): this cell redefines ``flare_rag_pred`` from an earlier cell
    and replaces it for all later calls. Compared with the earlier version it
    passes an ``instruction`` argument to ``FlareChain.from_llm`` and prints
    only the answer - confirm that ``FlareChain`` actually uses ``instruction``.

    Args:
        data: Object exposing ``iterrows()`` whose rows have a ``question`` field.
        model_id: Model name handed to ``ChatOpenAI``.

    Returns:
        List with one chain result per row, in row order.
    """
    chat_model = ChatOpenAI(temperature=0.30, model_name=model_id)
    chain = FlareChain.from_llm(
        llm=chat_model,
        retriever=db.as_retriever(),
        max_generation_len=700,
        min_prob=0.15,
        instruction="You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information.",
    )

    answers = []
    for _, record in data.iterrows():
        answer = chain.run(record['question'])
        print(answer)
        answers.append(answer)
    return answers
    

In [None]:
# Evaluate every fine-tuned model on the full frame; syntheses[k] holds the
# answers produced by model_ids[k].
syntheses = []
for model_id in model_ids:
    print(model_id)
    # `result` is rebound on each pass to the answer list of the current model.
    result = flare_rag_pred(data=df,model_id=model_id)
    syntheses.append(result)


In [None]:
# Display the collected answers (one list per evaluated model).
syntheses

In [None]:
# Preview the first five rows of the loaded data.
df.head(5)

In [None]:
# Write one CSV per evaluated model, pairing its answers with the reference
# answers from `df`. A single loop replaces five copy-pasted cells that indexed
# syntheses[0]..syntheses[4] directly and raised IndexError whenever fewer than
# five models were available.
# File names follow the training-set size (10, 20, ...). This assumes
# syntheses[k] belongs to the (k+1)-th fine-tuning increment of 10 rows.
for increment, predictions in enumerate(syntheses, start=1):
    label_count = increment * 10
    pd.DataFrame({'syntheses': predictions, 'answer': df['answer']}).to_csv(
        f'ft-flare-rag-{label_count}.csv', index=False
    )