In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.storage import (
    LocalFileStore,
)
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

import pandas as pd
#import openai



In [None]:
# Load the training data; later cells read its 'evidence_text', 'question' and 'answer' columns.
df = pd.read_csv('train.csv')

In [None]:
import os

# Dump each row's evidence_text into its own file, txt/text_<index>.txt.
# Assumes df is the DataFrame loaded from train.csv.
directory = "txt"
# Create the output folder once, up front; exist_ok makes re-running the cell safe.
os.makedirs(directory, exist_ok=True)

# Only the evidence_text column is needed, so iterate over it directly
# instead of materialising a full row Series per iteration.
for index, evidence_text in df['evidence_text'].items():
    # pandas represents a missing cell as NaN (a float), which file.write()
    # would reject with a TypeError; skip such rows instead of crashing.
    if pd.isna(evidence_text):
        print(f"Skipping index {index}: evidence_text is missing")
        continue

    # File name is derived from the DataFrame index so each row maps to one file
    filename = f"text_{index}.txt"
    filepath = os.path.join(directory, filename)

    # Explicit UTF-8 so the write does not depend on the platform default encoding
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(str(evidence_text))

    print(f"Text for index {index} stored in {filepath}")


In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import FlareChain
from langchain.llms import OpenAI


In [None]:
import os

In [None]:
# Never hardcode the key in the notebook. Reuse OPENAI_API_KEY when it is
# already set in the environment (so a real key is not overwritten by a
# placeholder); otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("Enter your openai key: ")


In [None]:
def create_vector_db(embedding_model='text-embedding-ada-002', data_path='txt/'):
    """Build a FAISS vector store from the ``.txt`` files under ``data_path``.

    The files are loaded, split into overlapping chunks, embedded with
    OpenAI embeddings and indexed with FAISS. Embeddings are cached on disk
    under ``./cache/`` so repeated runs do not re-embed unchanged text.

    Args:
        embedding_model: Name of the OpenAI embedding model to use.
        data_path: Folder searched recursively for ``*.txt`` files.

    Returns:
        The FAISS vector store built from the chunked documents.

    Raises:
        ValueError: If no text chunks could be produced from ``data_path``.
    """
    # Load all the .txt files stored (at any depth) in the data folder
    loader = DirectoryLoader(data_path, glob="**/*.txt", show_progress=True)
    docs = loader.load()

    # Split the documents into smaller, slightly overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=50)
    documents = text_splitter.split_documents(docs)

    # Fail early with a clear message rather than letting the index
    # construction break on an empty input.
    if not documents:
        raise ValueError(
            f"No text chunks were produced from '{data_path}'; "
            "check that the folder exists and contains non-empty .txt files."
        )

    # Initialize OpenAI Embeddings
    openai_embedder = OpenAIEmbeddings(model=embedding_model)

    # Cache the embeddings for faster loadup. The namespace is the model name
    # so vectors cached for one embedding model are never served for another.
    cache_store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        openai_embedder, cache_store, namespace=embedding_model
    )

    # Create the vector db
    db = FAISS.from_documents(documents, cached_embedder)
    return db

In [None]:
# Build the FAISS index using the function's defaults
# (embedding model 'text-embedding-ada-002', data folder 'txt/').
db = create_vector_db()

In [None]:
from langchain_community.chat_models import ChatOpenAI

In [None]:
# Chat model handed to the FLARE chain.
myllm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3)

# FLARE chain that retrieves from the vector store held in `db`.
# NOTE(review): confirm that the installed FlareChain actually consumes the
# `instruction` keyword rather than ignoring it.
flare = FlareChain.from_llm(
    llm=myllm,
    retriever=db.as_retriever(),
    min_prob=0.15,
    max_generation_len=700,
    instruction=(
        "You are a factual chatbot that answers questions about 10-K documents. "
        "You only answer with answers you find in the text, no outside information."
    ),
)


In [None]:
# Smoke test: run the chain on the question stored at index label 0.
result = flare.run(df.loc[0, 'question'])

In [None]:
# Show the answer returned by the single-question run.
result

In [None]:
# Display the DataFrame loaded from train.csv.
df

In [None]:
# Run the FLARE chain on every question, echoing each answer as it arrives
# and collecting the answers in row order.
results = []
for index, question in df['question'].items():
    result = flare.run(question)
    print(index,result)
    results.append(result)
    

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Pair each generated answer with its reference answer, evidence and question.
data = {
    'synthesis': results,
    'actual': df['answer'],
    'evidence_text': df['evidence_text'],
    'question': df['question'],
}


In [None]:
# Persist the collected columns to CSV. No index argument is passed,
# so pandas' default applies and the DataFrame index is written as well.
pd.DataFrame(data).to_csv('zero_shot_rag_flare_syntheses.csv')