S006 Semantic Chunking  for RAG (Cohere version)
- Requires markdown documents
- Use for Cohere only - using Cohere APIs for generation
    - Preamble / Prompt from Cohere
- Use LangChain Markdown parser to semantically chunk documents
- Store the chunks in a vector database (along with necessary metadata)
- Use retriever to fetch chunks from vector database
- Optionally rerank the retrieved chunks
- Modify these chunks to include metadata information 
- Pass the retrieved chunks to LLM for generation


In [1]:
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import TextLoader

In [2]:
from datetime import datetime
import json
import os

import cohere
import nest_asyncio
from llama_index.core.base.response.schema import Response
from llama_index.core import Settings
from llama_index.core.evaluation import (
    BatchEvalRunner,    
    CorrectnessEvaluator,
)
from llama_index.llms.cohere import Cohere
from llama_index.llms.openai import OpenAI
import openai
import pandas as pd

from evaluation_utils import threadpool_map

from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_cohere import CohereEmbeddings

nest_asyncio.apply()

In [3]:
from config import set_environment
set_environment()

Choose the LLM for generation

In [4]:
generation_llm_family = os.environ["GENERATION_LLM_FAMILY"]
generation_llm_model = os.environ["GENERATION_LLM_MODEL"]

co = cohere.Client(api_key=os.environ["COHERE_API_KEY"])

if generation_llm_family == "COHERE":
    Settings.llm = Cohere(api_key=os.environ["COHERE_API_KEY"], model=generation_llm_model,temperature=0)

Choose the LLM for embedding

In [5]:
embedding_llm_family = os.environ["EMBEDDING_LLM_FAMILY"]
embedding_llm_model = os.environ["EMBEDDING_LLM_MODEL"]
embedding_dimensions = int(os.environ["EMBEDDING_DIMESIONS"])

if embedding_llm_family == "COHERE":
    embeddings_model = CohereEmbeddings()


In [6]:
eval_name = os.environ["EVAL_NAME"]
eval_directory = os.environ["EVAL_DIRECTORY"]
eval_file = os.environ["EVAL_FILE"]
eval_questions = os.environ["EVAL_QUESTIONS"]
eval_results_dir = os.environ["EVAL_RESULTS_DIR"]
eval_quick_test = os.environ["EVAL_QUICK_TEST"]

rag_strategy = os.environ["RAG_STRATEGY"]
similarity_top_k = int(os.environ["SIMILARITY_TOP_K"]) 

prompt_template = os.environ["RAG_PROMPT_TEMPLATE"]

Pick the strategy

In [7]:
if rag_strategy == "S006_00":
    isReRank = False
    rag_strategy_desc = "Semantic_Coh"
    run_id = f"{eval_name}_{rag_strategy}_GM_{generation_llm_model}_EM_{embedding_llm_model}_K_{similarity_top_k}_{datetime.today().strftime('%Y-%m-%d')}"
elif rag_strategy == "S006_01": 
    isReRank = True
    rag_strategy_desc = "Semantic_Coh_Rerank"
    reranker = os.environ["RERANKER"]
    reranker_model = os.environ["RERANKER_MODEL"]
    rerank_top_n = int(os.environ["RERANK_TOP_N"])
    run_id = f"{eval_name}_{rag_strategy}_GM_{generation_llm_model}_EM_{embedding_llm_model}_K_{similarity_top_k}_RR_{reranker}_N_{rerank_top_n}_{datetime.today().strftime('%Y-%m-%d')}"

output_file = f"{eval_results_dir}/{run_id}.xlsx"

Read the documents, create chunks, calculate embeddings, store in a vector database

In [8]:
loader = DirectoryLoader(eval_directory, glob="**/*.md", loader_cls=TextLoader)
text_data = loader.load()
page_contents = [item.page_content for item in text_data]
text_concatenated = " ".join(page_contents)

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
    ("####", "Header 5")
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(text_concatenated)

In [9]:
# Embed
vectorstore = Chroma.from_documents(documents=md_header_splits, 
                                    embedding=embeddings_model)

retriever = vectorstore.as_retriever(search_kwargs={"k": similarity_top_k})

In [10]:
import re

def replace_title_in_text(text, title):
    # Create a regex pattern to match one or more # followed by the title
    pattern = re.compile(rf'#+\s+{re.escape(title)}')
    # Replace all matches with the title
    result = pattern.sub(title, text)
    return result

In [11]:
def format_documents(retrieved_chunks, isRerank):
    docs = []
    
    for chunk in retrieved_chunks:

        header_1 = chunk.metadata.get("Header 1", "")
        header_2 = chunk.metadata.get("Header 2", "")
        header_3 = chunk.metadata.get("Header 3", "")
        header_4 = chunk.metadata.get("Header 4", "")
        header_5 = chunk.metadata.get("Header 5", "")

        headers = [header_1, header_2, header_3, header_4, header_5]
        parents = []

        for header in headers:
            if header == "":
                break
            parents.append(header)
    
        # Identify the title as the last non-empty header
        title = parents[-1] if parents else "Untitled"
        text = replace_title_in_text(chunk.page_content, title)
        
        if not isRerank:
            parents = [parents][::-1]
        
        parents_concat = '\n'.join(parents[0])

        if isRerank:
            result = (f"""## Text of this document:\n{text}\n"""
                    f"## This document is contained under the following titles:\n{parents_concat}\n"
        )
        else:
            result = (
                    f"## Relevant Document Title:\n{title}\n"
                    f"## Document Text:\n{text}\n"
                    f"## This document is contained under the following sections:\n{parents_concat}\n"
            ) 
        
        docs.append({
            "title": title,
            "snippet": result,
        })

    return docs


Quick Test

In [12]:
import pprint

retrieved_chunks = retriever.invoke(eval_quick_test)
documents = []

if isReRank:
    documents=format_documents(retrieved_chunks, True)
    results = co.rerank(
            model=reranker_model,
            query=eval_quick_test,
            documents=documents, 
            top_n=int(rerank_top_n), 
            rank_fields = ['snippet'],
            return_documents=True,
        )
    top_indices = [doc.index for doc in results.results]
    reranked_chunks = [retrieved_chunks[i] for i in top_indices if i < len(retrieved_chunks)]
    documents=format_documents(reranked_chunks, False)

    # Printing the new dictionary
    print(reranked_chunks)
else:
    documents=format_documents(retrieved_chunks, False)



[Document(page_content='## Standard Plan  \n| Service                                      | Details                                                      |\n| -------------------------------------------- | ------------------------------------------------------------ |\n| Preventive exam                              | You pay $0 every calendar year                               |\n| Prescription glasses                         | $15 copay                                                    |\n| Frames allowance                             | $155<br>20% savings on the amount of allowance               |\n| Frames frequency                             | Every other calendar year                                    |\n| Lens type                                    | Single vision, lined bifocal, lined trifocal, standard progressive lenses, impact-resistant lenses for dependent children |\n| Contacts allowance*<br />Every calendar year | $145 allowance for contact lenses and contact lens fitt

In [13]:
resp = co.chat(
                message=eval_quick_test,
                documents=documents,
                preamble=prompt_template,
                model=generation_llm_model,
                temperature=0.0
            )
print(resp.text)
pprint.pprint(documents)

According to the Standard Plan, bifocals are covered. The plan provides a frames allowance of $155 every other calendar year.
In addition to the benefits offered by the Standard Plan mentioned above, the VSP KidsCare Plan provides children up to age 26 with two eye exams and one pair of glasses per calendar year. The glasses are eligible for a $15 copayment.
If you require further information on the vision plans, contact VSP using the following link: http://www.vsp.com or by telephone: 800-877-7195.
[{'snippet': '## Relevant Document Title:\n'
             'Standard Plan\n'
             '## Document Text:\n'
             'Standard Plan  \n'
             '| Service                                      | '
             'Details                                                      |\n'
             '| -------------------------------------------- | '
             '------------------------------------------------------------ |\n'
             '| Preventive exam                              

In [14]:
# Run the RAG pipeline, including retrieval and generation

def run_rag_pipeline(row):
    
    # Retrieve the top n chunks 
    retrieved_chunks = retriever.invoke(row["query"])
    documents = []
    
    if isReRank:
        documents = format_documents(retrieved_chunks, True)
        results = co.rerank(
            model=reranker_model,
            query=row["query"],
            documents=documents, 
            top_n=int(rerank_top_n), 
            rank_fields = ['snippet'],
            return_documents=False,
        )
        top_indices = [doc.index for doc in results.results]
        reranked_chunks = [retrieved_chunks[i] for i in top_indices if i < len(retrieved_chunks)]
        documents=format_documents(reranked_chunks, False)
    else:
        documents =format_documents(retrieved_chunks, False)
    
    # Generate the response (retry as necessary)
    successful = False
    while not successful:
        try:
            resp = co.chat(
                message=row["query"],
                documents=documents,
                preamble=prompt_template,
                model=generation_llm_model,
                temperature=0.0
            )
            completion = resp.text
            if isinstance(completion, str):
                successful = True 
        except:
            continue

    return {
        "query_num": row["query_num"],
        "sources": documents,
        "response": resp,
        "generated_answer": completion,
    }



In [15]:
# Load the evaluation questions
queries = pd.read_excel(eval_questions)
if isReRank:        
    queries["rerank_top_n"] = rerank_top_n

In [16]:
results = threadpool_map(run_rag_pipeline, [{"row": item[1]} for item in list(queries.iterrows())])

100%|██████████| 25/25 [00:17<00:00,  1.41it/s]


In [17]:
df = queries.merge(pd.DataFrame(results), on="query_num", how="inner")
assert len(df) == len(queries)  # Ensure that all queries have been processed

AttributeError: 'ApiError' object has no attribute 'keys'

Choose the LLM for evaluations

In [None]:
evaluation_llm_family = os.environ["EVALUATION_LLM_FAMILY"]
evaluation_llm_model = os.environ["EVALUATION_LLM_MODEL"]

if evaluation_llm_family == "OPENAI":
    Settings.eval_llm = OpenAI(temperature=0, model=evaluation_llm_model)

In [None]:
eval_lidx_c = CorrectnessEvaluator(llm=Settings.eval_llm)

runner = BatchEvalRunner(
    {"correctness": eval_lidx_c},
    workers=16,
)

LI_eval_results = await runner.aevaluate_responses(
    queries=df["query"].tolist(),
    responses=[Response(response=x) for x in df["generated_answer"].tolist()],
    reference=[{"reference": x} for x in df["expected_answer"].tolist()],
)

In [None]:
df["correctness_result"] = LI_eval_results["correctness"]
df["correctness_llm"] = df["correctness_result"].map(lambda x: x.score)
df["feedback_llm"] = df["correctness_result"].map(lambda x: x.feedback)
print(f"""Average score: {df["correctness_llm"].mean()}""")

In [None]:
responses_df = pd.DataFrame()
responses_df = df[['query_num', 'query', 'expected_answer', 'generated_answer', 'correctness_llm']]
responses_df['correctness_human'] = responses_df['correctness_llm']
responses_df.loc[:, ['faithfulness_llm', 'faithfulness_human']] = ""
responses_df['rag_strategy'] = rag_strategy
responses_df['rag_strategy_desc'] = rag_strategy_desc
responses_df['parameter_1'] = similarity_top_k
responses_df.loc[:, ['parameter_2', 'parameter_3', 'parameter_4', 'parameter_5']] = ""
responses_df['model'] = generation_llm_model 
responses_df['embed_model'] = embedding_llm_model 
responses_df['eval_model'] = evaluation_llm_model
responses_df['embed_dimensions'] = embedding_dimensions   
if rag_strategy =="S006_00":
   responses_df['reranker'] = ""
elif rag_strategy == "S006_01":
    responses_df['reranker'] = reranker_model
responses_df['run_date'] = datetime.today().strftime('%Y-%m-%d') 
responses_df['eval_name'] = eval_name

In [None]:
columns_to_set_none = [
    'total_tokens',
    'prompt_tokens',
    'completion_tokens',
    'total_cost',
    'prompt_cost',
    'completion_cost',
    'latency',
    'first_token_ms'
]

# Set the specified columns to None
responses_df.loc[:, columns_to_set_none] = None

In [None]:
correctness_sum = df['correctness_llm'].sum()
correctness_mean = df['correctness_llm'].mean()

# Create a new DataFrame for the summary
summary_df = pd.DataFrame({
    'Metric': ['Sum', 'Mean'],
    'Value': [correctness_sum, correctness_mean]
})

In [None]:
correctness_df = pd.DataFrame()
correctness_df = df[['query_num', 'query', 'expected_answer', 'generated_answer', 'correctness_llm', 'feedback_llm']]
correctness_df['correctness_human'] = correctness_df['correctness_llm']
correctness_df['feedback_human'] = ""

In [None]:
sources_df = df[['query_num', 'query', 'sources']]

In [None]:
with pd.ExcelWriter(output_file) as writer:
   responses_df.to_excel(writer, sheet_name="Responses", index=False)
   sources_df.to_excel(writer, sheet_name="Sources", index=False)
   summary_df.to_excel(writer, sheet_name="Summary", index=False)
   correctness_df.to_excel(writer, sheet_name="Correctness", index=False)