In [1]:
from datetime import datetime
import json
import os

import cohere
import nest_asyncio
from llama_index.core.base.response.schema import Response
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.core.evaluation import (
    BatchEvalRunner,
    CorrectnessEvaluator,
)
from llama_index.llms.cohere import Cohere
from llama_index.llms.openai import OpenAI
import openai
import pandas as pd

from chunker import chunk_text, correct_text, threadpool_map

nest_asyncio.apply()

# Setup and parameters

In [2]:
# Load environment variables

COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
                           
co = cohere.Client(api_key=COHERE_API_KEY)
oai = openai.OpenAI(api_key=OPENAI_API_KEY)

eval_directory = "./datasets/acme_spd/files"

rerank_top_n = 5

generation_llm_model = "command-r-plus"

evaluation_llm_family = "OPENAI"
evaluation_llm_model = "gpt-4-turbo"

Settings.llm = Cohere(api_key=COHERE_API_KEY, model=generation_llm_model, temperature=0)

if evaluation_llm_family == "OPENAI":
    Settings.eval_llm = OpenAI(api_key=OPENAI_API_KEY, model=evaluation_llm_model, temperature=0)
elif evaluation_llm_family == "COHERE":
    Settings.eval_llm = Cohere(api_key=COHERE_API_KEY, model=evaluation_llm_model, temperature=0)

datetime_id = datetime.now().strftime("%Y%m%d_%H%M%S")

print(f"Generating model: {generation_llm_model}")
print(f"LLM judge: {evaluation_llm_family} {evaluation_llm_model}")
print(datetime_id)

Generating model: command-r-plus
LLM judge: OPENAI gpt-4-turbo
20240510_142046


In [3]:
# Load the pdf, each page becomes a document
reader = SimpleDirectoryReader(eval_directory)
documents = reader.load_data()

# There are a few issues with the doc, this corrects them
all_text = correct_text(documents)

# Chunk document according to headings

In [4]:
# Load the headings (i.e. section titles)
# Currently these are manually generated, but exist tools to automate this
headings = json.load(open('./datasets/acme_spd/files/ACME_headings.json', 'r'))

In [5]:
# Chunk the document based on headings
# Each document includes relevant context and parent information
# Result is a dictionary with the headings (section titles) as keys
documents = chunk_text(all_text, headings)

Title not found:  INPATRIATES
Title not found:  MENTAL HEALTH, NEUROBIOLOGICAL DISORDERS  AUTISM SPECTRUM DISORDER SERVICES AND SUBSTANCE-RELATED AND ADDICTIVE DISORDERS SERVICES


# Prepare documents for retrieval
Add the Table of Contents (ToC) metadata to the chunks for retrieval. Note that here, the formatting of the documents when adding the metadata is slightly different between the retrieval step and the generation step. However, whether you choose to do this or keep them the same should not have a big impact, as long as the information contained is similar.

In [6]:
def format_document_for_retrieval(doc_dict):
    result = f"""## Text of this document:\n{doc_dict["text"]}"""
    parents = "\n".join(doc_dict['parents'])
    children = "\n".join(doc_dict['children'])
    siblings = "\n".join(doc_dict['siblings'])
    metadata = (
        f"## This document is contained under the following titles:\n{parents}"
        f"## Documents at the same level as this text:\n{siblings}"
        f"## Subtitles of this document:\n{children}"
    )
    result += f"\n{metadata}"
    return result

In [7]:
docs_for_retrieval = {
    title: format_document_for_retrieval(doc_dict) for title, doc_dict in documents.items() if doc_dict.get("text")
}

# Generate completions
- In this current pipeline, we are not using a embedding index, and instead passing all document chunks (appropriately augmented with Table of Contents metadata) to the Reranker to get the top N document chunks. 
    - This improves accuracy when the corpus of document chunks is small (as in this case), but is not a scalable option for larger datasets.
- In the generation step, we are using the `documents` parameter to supply the document chunks, and the `preamble` parameter to supply general task instructions. 
    - In particular, using the `documents` parameter helps to reduce hallucinations + you get citations for free (see https://docs.cohere.com/docs/documents-and-citations)

In [8]:
preamble = """## Task & Context
You are an expert Human Resources assistant that helps employees answer questions about company policies. \
Use the provided documents to answer questions about an employee's specific situation.

## Style Guide
- Think step by step, provide evidence and/or reasoning first, then the answer."""


def format_documents_for_generation(relevant_titles):
    docs = []
    for _, relevant_title in enumerate(relevant_titles):

        text = documents[relevant_title]["text"]
        # reverse the order of the parents (after reversal, the first element is the top level parent)
        parents = documents[relevant_title]['parents'][::-1]
        # add this section title to parents
        parents.append(relevant_title)
        parents = "\n".join(documents[relevant_title]['parents'])
        children = "\n".join(documents[relevant_title]['children'])
        siblings = "\n".join(documents[relevant_title]['siblings'])
        result = (
            f"## Relevant Document Title:\n{relevant_title}\n"
            f"## Document Text:\n{text}\n"
            f"## This document is contained under the following sections:\n{parents}\n"
            f"## Documents at the same level as this document:\n{siblings}\n"
        ) 
        if children.strip() != "":
            result += f"## This document contains the following subsections:\n{children}\n"
        docs.append({
            "title": relevant_title,
            "snippet": result,
        })

    return docs


def retrieve_docs(query, documents, rerank_top_n):
    results = co.rerank(
        model="rerank-english-v3.0",
        query=query,
        documents=documents, 
        top_n=int(rerank_top_n), 
        return_documents=False,
    )
    top_indices = [doc.index for doc in results.results]
    return top_indices

In [9]:
# Load the evaluation questions
queries = pd.read_excel('datasets/acme_spd/questions/ACME_SPD_Questions.xlsx')
queries["rerank_top_n"] = rerank_top_n

# Format the documents for retrieval by getting mapping of index to document
doc_index_to_title = {}
docs_for_retrieval_list = []
for idx, (heading, doc_dict) in enumerate(docs_for_retrieval.items()):
    doc_index_to_title[idx] = heading
    docs_for_retrieval_list.append(doc_dict)

In [10]:
# Run the RAG pipeline, including retrieval and generation

def run_rag_pipeline(row):
    # Retrieve the top n documents (retry as necessary)
    successful = False
    while not successful:
        try:
            relevant_indices = retrieve_docs(row["query"], docs_for_retrieval_list, row["rerank_top_n"])
            if isinstance(relevant_indices, list):
                successful = True
        except:
            continue

    # Fetch the original documents based on the top n retrieved documents
    relevant_titles = [doc_index_to_title[idx] for idx in relevant_indices]
    documents_for_generation = format_documents_for_generation(relevant_titles)

    # Generate the response (retry as necessary)
    successful = False
    while not successful:
        try:
            resp = co.chat(
                message=row["query"],
                documents=documents_for_generation,
                preamble=preamble,
                model=generation_llm_model,
                temperature=0.0
            )
            completion = resp.text
            if isinstance(completion, str):
                successful = True 
        except:
            continue

    return {
        "query_num": row["query_num"],
        "relevant_indices": relevant_indices,
        "relevant_titles": relevant_titles,
        "relevant_docs": documents_for_generation,
        "response": resp,
        "completion": completion,
    }

results = threadpool_map(run_rag_pipeline, [{"row": item[1]} for item in list(queries.iterrows())])

100%|██████████| 82/82 [01:36<00:00,  1.18s/it]


In [11]:
df = queries.merge(pd.DataFrame(results), on="query_num", how="inner")
assert len(df) == len(queries)  # Ensure that all queries have been processed

# Evaluate results

In [12]:
eval_lidx_c = CorrectnessEvaluator(llm=Settings.eval_llm)

runner = BatchEvalRunner(
    {"correctness": eval_lidx_c},
    workers=16,
)

LI_eval_results = await runner.aevaluate_responses(
    queries=df["query"].tolist(),
    responses=[Response(response=x) for x in df["completion"].tolist()],
    reference=[{"reference": x} for x in df["expected_answer"].tolist()],
)

In [13]:
df["correctness_result"] = LI_eval_results["correctness"]
df["correctness_score"] = df["correctness_result"].map(lambda x: x.score)
print(f"""Average score: {df["correctness_score"].mean()}""")

Average score: 4.2073170731707314


In [14]:
# if directory doesn't exist, create it
output_folder = f"./datasets/acme_spd/files/{generation_llm_model}/"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
df.to_json(f"./datasets/acme_spd/files/{generation_llm_model}/eval_results_{datetime_id}.jsonl", lines=True, orient='records')