In [23]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.storage import (
    LocalFileStore,
)
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

import pandas as pd

from datasets import load_dataset
 
#import openai



In [24]:
dataset = load_dataset("llmware/rag_instruct_benchmark_tester")


In [25]:
df = pd.DataFrame(dataset['train'])

In [26]:
import os

# Assuming df is your DataFrame
for index, row in df.iterrows():
    evidence_text = row['context']
    # Define the filename based on the index or any other identifier
    filename = f"text_{index}.txt"
    # Specify the directory where you want to save the .txt files
    directory = "txt"
    # If the directory doesn't exist, create it
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Combine the directory and filename to create the full path
    filepath = os.path.join(directory, filename)
    
    # Write the evidence_text to the .txt file
    with open(filepath, 'w') as file:
        file.write(evidence_text)

    print(f"Text for index {index} stored in {filepath}")


Text for index 0 stored in txt\text_0.txt
Text for index 1 stored in txt\text_1.txt
Text for index 2 stored in txt\text_2.txt
Text for index 3 stored in txt\text_3.txt
Text for index 4 stored in txt\text_4.txt
Text for index 5 stored in txt\text_5.txt
Text for index 6 stored in txt\text_6.txt
Text for index 7 stored in txt\text_7.txt
Text for index 8 stored in txt\text_8.txt
Text for index 9 stored in txt\text_9.txt
Text for index 10 stored in txt\text_10.txt
Text for index 11 stored in txt\text_11.txt
Text for index 12 stored in txt\text_12.txt
Text for index 13 stored in txt\text_13.txt
Text for index 14 stored in txt\text_14.txt
Text for index 15 stored in txt\text_15.txt
Text for index 16 stored in txt\text_16.txt
Text for index 17 stored in txt\text_17.txt
Text for index 18 stored in txt\text_18.txt
Text for index 19 stored in txt\text_19.txt
Text for index 20 stored in txt\text_20.txt
Text for index 21 stored in txt\text_21.txt
Text for index 22 stored in txt\text_22.txt
Text for

In [27]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import FlareChain
from langchain.llms import OpenAI


In [28]:
import os

In [29]:
os.environ["OPENAI_API_KEY"] = "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"


In [39]:
def create_vector_db(embedding_model='text-embedding-ada-002',data_path='txt/'):
    # Load all the transcripts stored in the data folder
    loader = DirectoryLoader(data_path, glob="**/*.txt", show_progress=True)
    docs = loader.load()

    # Split the documents into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=50)
    documents = text_splitter.split_documents(docs)

    # Initialize OpenAI Embeddings
    openai_embedder = OpenAIEmbeddings(model=embedding_model)

    # Cache the embeddings for faster loadup
    cache_store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(openai_embedder, cache_store, namespace="sentence")

    # Create the vector db
    db = FAISS.from_documents(documents, cached_embedder)
    return db

In [40]:
db = create_vector_db()

100%|██████████| 200/200 [00:02<00:00, 99.81it/s] 


In [41]:
from langchain_community.chat_models import ChatOpenAI

In [56]:
myllm = ChatOpenAI(temperature=0.30, model_name="gpt-3.5-turbo-16k")

flare = FlareChain.from_llm(
    llm=myllm,
    retriever=db.as_retriever(),
    max_generation_len=700,
    min_prob=0.25,   
    instruction = 'You are a factual chatbot that answers questions about for giving text. You only answer with answers you find in the text, no outside information.'  
    )


In [57]:
result = flare.run(df['query'][0])

In [58]:
result

'The total amount of the invoice is $500. '

In [45]:
df

Unnamed: 0,query,answer,context,sample_number,tokens,category
0,What is the total amount of the invoice?,"$22,500.00",Services Vendor Inc. \n100 Elm Street Pleasant...,0,138,core
1,What is the invoice number?,#0001,Services Vendor Inc. \n100 Elm Street Pleasant...,1,138,core
2,What is a list of the items being purchased?,•Front End Engineering Service;\n•Back End Eng...,Services Vendor Inc. \n100 Elm Street Pleasant...,2,138,core
3,What is the name of the contact for question?,Bia Hermes,Services Vendor Inc. \n100 Elm Street Pleasant...,3,138,core
4,What is the PO number?,#1000,Services Vendor Inc. \n100 Elm Street Pleasant...,4,138,core
...,...,...,...,...,...,...
195,What is a one-line summary?,"Nokia is cutting 14,000 jobs as part of a cost...","Nokia said it would cut up to 14,000 jobs as p...",195,96,summary
196,What is a list of the executive's responsibili...,"•Senior Vice President, Event Planning ('SVP')...",2.1. Duties and Responsibilities and Extent of...,196,340,summary
197,What is a headline description?,NVIDIA Announces Second Quarter Fiscal 2024 Re...,NVIDIA Announces Financial Results for Second ...,197,428,summary
198,What is a summary of the CEO's statement in 15...,Microsoft Cloud is the platform of choice for ...,'The world's most advanced AI models are comin...,198,137,summary


In [59]:
results = []
for index, row in df.iterrows():
    result = flare.run(row['query'])
    print(index,result)
    results.append(result)
    

0 The total amount of the invoice is $500. 
1 The invoice number is a unique identifier for a specific invoice. It is typically located at the top of the invoice and is used for tracking and reference purposes. Do you have a specific invoice number in mind? 
2 The items being purchased are Milk, Cheese, Chocolate, and Coffee. 
3 The contact for any questions regarding this invoice is Bia Hermes. 
4 The PO number for this invoice is 1000. 
5 The payment is due on the date specified in your contract or invoice. If you are unsure, please refer to your contract or contact our customer service team for assistance. 
6 The payment is due on the date specified in your contract or invoice. If you are unsure, please refer to your contract or contact our customer service team for assistance. 
7 The subtotal amount is the total cost of all items before any taxes or discounts are applied. 
8 The total amount can vary depending on the context. Can you provide more information so I can give you an ac

In [60]:
df.head()

Unnamed: 0,query,answer,context,sample_number,tokens,category
0,What is the total amount of the invoice?,"$22,500.00",Services Vendor Inc. \n100 Elm Street Pleasant...,0,138,core
1,What is the invoice number?,#0001,Services Vendor Inc. \n100 Elm Street Pleasant...,1,138,core
2,What is a list of the items being purchased?,•Front End Engineering Service;\n•Back End Eng...,Services Vendor Inc. \n100 Elm Street Pleasant...,2,138,core
3,What is the name of the contact for question?,Bia Hermes,Services Vendor Inc. \n100 Elm Street Pleasant...,3,138,core
4,What is the PO number?,#1000,Services Vendor Inc. \n100 Elm Street Pleasant...,4,138,core


In [None]:
data = {'synthesis': results, 'actual': df['answer'], 'context': df['context'], 'query': df['query']}


In [None]:
pd.DataFrame(data).to_csv('zero_shot_rag_flare_syntheses.csv')