# FinanceBench: Evaluation Playground


<hr style="border-bottom:0.1px solid gray">

##### (1) API Requirements
Add the following API keys into your `.env` file:

```ruby
OPENAI_API_KEY = 'INSERT API KEY HERE'
ANTHROPIC_API_KEY = 'INSERT API KEY HERE'
REPLICATE_API_TOKEN = 'INSERT API KEY HERE'
```

##### (2) Required Folder Structure

```bash
|-- /
|    |-- data/
|    |      | -- financebench_open_source.jsonl
|    |      | -- financebench_document_information.jsonl
|    |-- pdfs/
|           | -- <... provided filings as PDF documents ...>
|    |-- results/
|    |-- vectorstores/
|    |-- evaluation_playground.ipynb
```


<br>
<hr style="border-bottom:0.1px solid gray">

In [7]:
%%capture
!pip install pymupdf

In [8]:
import os
import sys
import json
import pickle
import datetime
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from dotenv import load_dotenv

load_dotenv()

#LangChain Stuff
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA

# LangChain Model Wrappers
from langchain.chat_models import ChatOpenAI
from langchain.chat_models import ChatAnthropic
from langchain.llms.replicate import Replicate

# Model Providers
import openai
import anthropic
import replicate
import tiktoken

# Anthropic client; used below to obtain the tokenizer for token counting
CLIENT = anthropic.Anthropic(api_key=os.environ['ANTHROPIC_API_KEY'])
# NOTE(review): `get_tokenizer()` is presumably only available in older `anthropic` SDK releases — confirm the pinned version
anthropic_tokenizer = CLIENT.get_tokenizer()
# Configure the OpenAI SDK with the key read from the environment (raises KeyError if it is not set)
openai.api_key = os.environ['OPENAI_API_KEY']

In [9]:
##############################################################################
# MODEL CONFIGS
##############################################################################
# Each entry describes one evaluation run; the evaluation cell picks one by index.
#   provider   : "openai" | "anthropic" | "replicate"; any other value (here "") makes
#                get_model() return None, which get_answer() treats as retrieval-only
#   model_name : passed to the provider wrapper (replicate names are resolved through
#                `replicate_model_mapping` below)
#   eval_mode  : closedBook        - question only, no context
#                oracle[_reverse]  - the question's evidence pages are given as context
#                inContext[_reverse] - the (truncated) full filing is given as context
#                singleStore       - retrieval from a vector store built per document
#                sharedStore       - retrieval from one vector store over all documents
#                (the "_reverse" variants put the context before the question)
#   temp / max_tokens : sampling temperature and generation limit passed to the model
configs = [
            {"provider": "openai",     "model_name":"gpt-4o-2024-05-13",   "eval_mode":"singleStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4o-2024-05-13",   "eval_mode":"sharedStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4o-2024-05-13",   "eval_mode":"inContext",          "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4o-2024-05-13",   "eval_mode":"inContext_reverse",  "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4o-2024-05-13",   "eval_mode":"oracle",             "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4o-2024-05-13",   "eval_mode":"oracle_reverse",     "temp":0.01,   "max_tokens":2048},
            # NOTE(review): the next entry duplicates the gpt-4o "sharedStore" entry above;
            # left in place because removing it would shift the indices of all later configs.
            {"provider": "openai",     "model_name":"gpt-4o-2024-05-13",   "eval_mode":"sharedStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4-1106-preview",  "eval_mode":"sharedStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4-1106-preview",  "eval_mode":"singleStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4-1106-preview",  "eval_mode":"inContext",          "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4-1106-preview",  "eval_mode":"closedBook",         "temp":0.01,   "max_tokens":2048},
            {"provider": "anthropic",  "model_name":"claude-2",            "eval_mode":"inContext",          "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4-1106-preview",  "eval_mode":"oracle",             "temp":0.01,   "max_tokens":2048},
            {"provider": "replicate",  "model_name":"llama2",              "eval_mode":"sharedStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "replicate",  "model_name":"llama2",              "eval_mode":"singleStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4",               "eval_mode":"sharedStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4",               "eval_mode":"singleStore",        "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4",               "eval_mode":"closedBook",         "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4",               "eval_mode":"oracle",             "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4-1106-preview",  "eval_mode":"oracle_reverse",     "temp":0.01,   "max_tokens":2048},
            {"provider": "anthropic",  "model_name":"claude-2",            "eval_mode":"oracle_reverse",     "temp":0.01,   "max_tokens":2048},
            {"provider": "openai",     "model_name":"gpt-4-1106-preview",  "eval_mode":"inContext_reverse",  "temp":0.01,   "max_tokens":2048},
            {"provider": "anthropic",  "model_name":"claude-2",            "eval_mode":"inContext_reverse",  "temp":0.01,   "max_tokens":2048},
            {"provider": "",           "model_name":"",                    "eval_mode":"singleStore",        "temp":None,   "max_tokens":None},       # SPECIAL MODE --> RETRIEVAL ONLY MODE (SINGLE STORE)
            {"provider": "",           "model_name":"",                    "eval_mode":"sharedStore",        "temp":None,   "max_tokens":None},       # SPECIAL MODE --> RETRIEVAL ONLY MODE (SHARED STORE)
]

# Maps the short replicate model names used in `configs` to the full
# "<owner>/<model>:<version hash>" identifier handed to the Replicate wrapper.
replicate_model_mapping = dict({
            "llama2": "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
        })

##############################################################################
# DATASET CONFIG
##############################################################################
# All paths are resolved relative to the current working directory of the notebook.
PATH_CURRENT = os.path.abspath(os.getcwd())
PATH_DATASET_JSONL = PATH_CURRENT + "/data/financebench_open_source.jsonl"
PATH_DOCUMENT_INFO_JSONL = PATH_CURRENT + "/data/financebench_document_information.jsonl"
# NOTE: these two directory paths already end with "/"
PATH_RESULTS = PATH_CURRENT + "/results/"
PATH_PDFS = PATH_CURRENT + "/pdfs/"

# Choose DATASET PORTION (compared against the `dataset_subset_label` column; "ALL" disables filtering):
# - ALL: Full Dataset
# - OPEN_SOURCE: Open Source Part (n=150)
# - CLOSED_SOURCE: Closed Source Part --> Request access at contact@patronus.ai
DATASET_PORTION = "OPEN_SOURCE"   

##############################################################################
# VECTOR STORE SETUP
##############################################################################
# Chunking parameters handed to RecursiveCharacterTextSplitter
# (presumably measured in characters, the splitter's default length function — confirm)
VS_CHUNK_SIZE = 1024
VS_CHUNK_OVERLAP = 30
# Root folder under which one sub-folder per vector store is created
VS_DIR_VS = PATH_CURRENT + "/vectorstores"

In [10]:
##############################################################################
# LOAD DATASET
##############################################################################

# Read the questions and the per-document metadata (both JSON Lines files),
# and keep a joined view keyed on the document name.
df_questions = pd.read_json(PATH_DATASET_JSONL, lines=True)
df_meta = pd.read_json(PATH_DOCUMENT_INFO_JSONL, lines=True)
df_full = df_questions.merge(df_meta, on="doc_name")

# Every document referenced by any question (before subset filtering)
df_questions = df_questions.sort_values(by="doc_name")
ALL_DOCS = df_questions["doc_name"].unique().tolist()
print(f"Total number of distinct PDF: {len(ALL_DOCS)}")

# Restrict to the configured dataset portion unless the full dataset is requested
if DATASET_PORTION != "ALL":
    in_portion = df_questions["dataset_subset_label"] == DATASET_PORTION
    df_questions = df_questions.loc[in_portion]
print(f"Number of questions: {len(df_questions)}")

# Documents that are actually needed for the selected questions
df_questions = df_questions.sort_values(by="doc_name")
docs = df_questions["doc_name"].unique().tolist()
print(f"Number of distinct PDF: {len(docs)}")

Total number of distinct PDF: 84
Number of questions: 150
Number of distinct PDF: 84


In [11]:
##############################################################################
# HELPER FUNCTIONS (PDF-PARSING + VECTOR-STORE SETUPS)
##############################################################################
def get_pdf_text(doc):
    """Load the PDF ``<PATH_PDFS>/<doc>.pdf`` with ``PyMuPDFLoader``.

    Args:
        doc: Document name without the ``.pdf`` extension.

    Returns:
        Whatever ``PyMuPDFLoader.load()`` returns for that file; callers in this
        notebook iterate it and read ``.page_content`` from each item.
    """
    loader = PyMuPDFLoader(f"{PATH_PDFS}/{doc}.pdf")
    return loader.load()

def build_vectorstore_retriever(docs, embeddings=None):
    """Return a retriever (and the underlying Chroma store) for the given documents.

    The store is persisted under ``VS_DIR_VS``. If the store's folder already
    exists it is opened as-is; otherwise the folder is created and the PDFs
    are parsed, chunked, embedded and added to a fresh store.

    Args:
        docs: Either the string ``"all"`` (shared store over ``ALL_DOCS``,
            persisted in ``<VS_DIR_VS>/shared``) or a single document name
            (store persisted in ``<VS_DIR_VS>/<doc_name>``).
        embeddings: Embedding function handed to Chroma. Defaults to a new
            ``OpenAIEmbeddings()`` created on each call.

    Returns:
        Tuple ``(retriever, vectordb)``.
    """
    # Create the default lazily: putting `OpenAIEmbeddings()` in the signature would
    # instantiate it once, at function-definition time, even if it is never used.
    if embeddings is None:
        embeddings = OpenAIEmbeddings()

    if docs == "all":
        doc_names = ALL_DOCS
        db_path = VS_DIR_VS + "/shared"
    else:
        doc_names = [docs]
        db_path = VS_DIR_VS + "/" + docs

    # NOTE(review): existence of the folder is the only "already built" marker, so a
    # build that was interrupted half-way is reused as-is — delete the folder to rebuild.
    if os.path.exists(db_path):
        vectordb = Chroma(persist_directory=db_path, embedding_function=embeddings)
        return vectordb.as_retriever(), vectordb

    # makedirs (instead of mkdir) also creates the parent `vectorstores/` folder if missing
    os.makedirs(db_path)

    # Create the vector store itself --> chroma.sqlite3 database
    vectordb = Chroma(persist_directory=db_path, embedding_function=embeddings)
    vectordb.persist()

    # The splitter configuration is identical for every document, so build it once
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VS_CHUNK_SIZE,
        chunk_overlap=VS_CHUNK_OVERLAP,
    )

    for doc in doc_names:
        splitted_texts = text_splitter.split_documents(get_pdf_text(doc))

        # Persist after each document so completed work is written to disk incrementally
        vectordb.add_documents(documents=splitted_texts)
        vectordb.persist()

    return vectordb.as_retriever(), vectordb

##############################################################################
# MODEL + CALL HANDLERS
##############################################################################

def get_max_context_length(prompt, anthropic_cutoff=95000, openai_cutoff=105000):
    """Return how many leading characters of ``prompt`` fit both token budgets.

    The text is tokenized with the Anthropic tokenizer and with the tiktoken
    encoding for ``gpt-4-1106-preview``. For each tokenizer, if the token count
    exceeds its cutoff, the character length of the first ``cutoff`` tokens is
    computed; otherwise the full length of ``prompt`` is used. The smaller of
    the two lengths is returned, so ``prompt[:result]`` can be sent to either
    provider.

    Args:
        prompt: Text to measure.
        anthropic_cutoff: Maximum number of Anthropic tokens to keep.
        openai_cutoff: Maximum number of OpenAI tokens to keep.

    Returns:
        Number of characters (``int``) to keep from the start of ``prompt``.
    """
    total_chars = len(prompt)

    # (0) Check Anthropic Tokenizer
    encoding_anthropic = anthropic_tokenizer.encode(prompt)
    number_of_chars_anthropic = total_chars

    if len(encoding_anthropic) > anthropic_cutoff:
        # NOTE(review): this sums the lengths of the token *strings*; presumably that
        # equals the character count only for plain ASCII text — confirm for filings
        # containing non-ASCII characters.
        kept_tokens = encoding_anthropic.tokens[:anthropic_cutoff]
        number_of_chars_anthropic = sum(len(token) for token in kept_tokens)

    # (1) Check OpenAI Tokenizer
    tokenizer_openai = tiktoken.encoding_for_model("gpt-4-1106-preview")
    tokens_openai = tokenizer_openai.encode(prompt)
    number_of_chars_openai = total_chars

    if len(tokens_openai) > openai_cutoff:
        # Decode the kept tokens back to text and count characters. Summing the byte
        # lengths of the tokens would over-count multi-byte (non-ASCII) characters and
        # let the truncated context exceed the token cutoff. errors="ignore" drops a
        # character that is split across the cutoff boundary.
        kept_text = tokenizer_openai.decode(tokens_openai[:openai_cutoff], errors="ignore")
        number_of_chars_openai = len(kept_text)

    # Cut prompt depending on minimal length limit
    return min(number_of_chars_openai, number_of_chars_anthropic)

def get_model(provider="openai", model_name="gpt-4", temp=0.01, max_tokens=2048):
    """Build the LangChain model wrapper for the requested provider.

    Args:
        provider: ``"openai"``, ``"anthropic"`` or ``"replicate"``.
        model_name: Model identifier; for replicate it must be a key of
            ``replicate_model_mapping``.
        temp: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        A ``ChatOpenAI``, ``ChatAnthropic`` or ``Replicate`` instance, or
        ``None`` when the provider is not one of the three known values.

    Raises:
        ValueError: If ``provider`` is ``"replicate"`` and ``model_name`` is not
            in ``replicate_model_mapping``.
    """
    if provider == "openai":
        openai_kwargs = {
            "model_name": model_name,
            "temperature": temp,
            "max_tokens": max_tokens,
        }
        return ChatOpenAI(**openai_kwargs)

    if provider == "anthropic":
        anthropic_kwargs = {
            "model": model_name,
            "temperature": temp,
            "max_tokens_to_sample": max_tokens,
            "anthropic_api_key": os.environ['ANTHROPIC_API_KEY'],
        }
        return ChatAnthropic(**anthropic_kwargs)

    if provider == "replicate":
        # Guard first: only short names listed in the mapping can be resolved
        if model_name not in replicate_model_mapping:
            raise ValueError("Unknown Model")
        sampling = {'temperature': temp, 'max_new_tokens': max_tokens}
        return Replicate(
            model=replicate_model_mapping[model_name],
            model_kwargs=sampling,
        )

    # Any other provider value yields no model at all
    return None


def get_answer(model, eval_mode, question, context, retriever, retriever_only=False):
    """Answer ``question`` with ``model`` according to ``eval_mode``.

    Args:
        model: Object with a ``predict(prompt)`` method (and usable as the ``llm``
            of a ``RetrievalQA`` chain in the store modes). A falsy value selects
            retrieval-only behaviour in the store modes.
        eval_mode: One of ``"closedBook"``, ``"oracle"``, ``"oracle_reverse"``,
            ``"inContext"``, ``"inContext_reverse"``, ``"singleStore"``,
            ``"sharedStore"``.
        question: The question text.
        context: Evidence / filing text; used by the oracle and inContext modes.
        retriever: Retriever used by the store modes; ignored otherwise.
        retriever_only: If true, the store modes only query the retriever and
            make no LLM call, even when a model is given.

    Returns:
        Tuple ``(answer, retrieved_documents)``. ``retrieved_documents`` is an
        empty list for the non-store modes. In retrieval-only mode ``answer``
        is the empty string.

    Raises:
        ValueError: If ``eval_mode`` is not one of the supported values.
    """
    retrieved_documents = []

    if eval_mode == "closedBook":
        prompt = f"Answer this question: {question}"
        answer = model.predict(prompt)

    elif eval_mode == "oracle":
        prompt = f"Answer this question: {question} \nHere is the relevant evidence that you need to answer the question:\n[START OF FILING] {context} [END OF FILING]"
        answer = model.predict(prompt)

    elif eval_mode == "oracle_reverse":
        # The closing "]" of the end marker was missing here, unlike in every other prompt
        prompt = f"Context:\n[START OF FILING] {context} [END OF FILING]\n\n Answer this question: {question} \n"
        answer = model.predict(prompt)

    elif eval_mode in ("inContext", "inContext_reverse"):

        # Context Cutoff to satisfy max tokens
        max_number_of_chars = get_max_context_length(context)
        context = context[:max_number_of_chars]

        if eval_mode == "inContext":
            prompt = f"Answer this question: {question} \nHere is the relevant filing that you need to answer the question:\n[START OF FILING] {context} [END OF FILING]"
        else:
            prompt = f"Context:\n[START OF FILING] {context} [END OF FILING]\n\n Answer this question: {question}\n"

        answer = model.predict(prompt)

    elif eval_mode in ("singleStore", "sharedStore"):

        # Don't add a question prefix as RetrievalQA will do some automatic prompt wrapping
        # --> This can be replaced by more advanced Retrieval Strategies
        prompt = f"{question}"

        # Retrieval-only mode (no LLM calls, only queries in VectorDB): requested
        # explicitly via `retriever_only`, or implied when no model is available.
        if retriever_only or not model:
            return ("", retriever.invoke(prompt))

        qa = RetrievalQA.from_chain_type(
            llm=model,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
        )
        s = qa(prompt)

        answer = s["result"]
        retrieved_documents = s["source_documents"]

    else:
        # Fail with a clear message instead of an UnboundLocalError on `answer` below
        raise ValueError("Unknown 'eval_mode'!")

    return (answer, retrieved_documents)


In [None]:
##############################################################################
# EVALUATION
##############################################################################
# Runs one configuration from `configs` over the selected questions and writes
# the answers (plus retrieved documents, if any) to a CSV file in PATH_RESULTS.

# Specify evaluation model
model_config = configs[0]
eval_mode = model_config["eval_mode"]

# Set evaluation questions
df_eval = df_questions

# Get the model (None for the retrieval-only configs with an empty provider)
model = get_model(provider=model_config["provider"],
                  model_name=model_config["model_name"],
                  temp=model_config["temp"],
                  max_tokens=model_config["max_tokens"])

print(f"--> Evaluating: {model_config['model_name']} / {model_config['eval_mode']}")

# Key of the document (or "all") whose context / retriever is currently in memory
last_docs = None
results = []

# Run evaluation on the model  --> Sort along doc_name to reuse context / retriever in memory
for idx, row in tqdm(df_eval.sort_values("doc_name").iterrows(), total=len(df_eval)):

    # (A) Setup Context or Retriever
    if eval_mode == "closedBook":
        retriever = None
        context = ""

    elif eval_mode in ["inContext", "inContext_reverse"]:
        retriever = None
        # Use a dedicated name so the notebook-level `docs` list is not overwritten
        doc_key = row["doc_name"]
        if last_docs != doc_key:
            pages = get_pdf_text(doc_key)
            context = "\n\n".join([page.page_content for page in pages])
            # Remember the loaded document; without this the PDF was re-parsed for every question
            last_docs = doc_key

    elif eval_mode in ["oracle", "oracle_reverse"]:
        context = "\n\n".join([evidence["evidence_text_full_page"] for evidence in row["evidence"]])
        retriever = None

    elif eval_mode in ["singleStore", "sharedStore"]:
        context = ""
        doc_key = row["doc_name"] if eval_mode == "singleStore" else "all"

        # Only (re)build / reopen the vector store when the target document changes
        if last_docs != doc_key:
            retriever, _ = build_vectorstore_retriever(docs=doc_key)
            last_docs = doc_key

    else:
        raise ValueError("Unknown 'eval_mode'!")

    # (B) Model Call
    (answer, retrieved_documents) = get_answer(
        model=model,
        eval_mode=eval_mode,
        question=row["question"],
        context=context,
        retriever=retriever,
    )

    # (C) Bookkeeping
    results.append({
        **model_config,
        "financebench_id": row["financebench_id"],
        "question": row["question"],
        "gold_answer": row["answer"],
        "model_answer": answer,
        "retrieved_documents": retrieved_documents,
    })

df_results = pd.DataFrame(results)

# Make sure the output folder exists so a finished run is not lost on the final write
os.makedirs(PATH_RESULTS, exist_ok=True)
df_results.to_csv(os.path.join(PATH_RESULTS, model_config["model_name"] + "_" + model_config["eval_mode"] + ".csv"))