Generate the prompts using S007 - Semantic Chunking strategy
- Requires markdown documents
- Use the LangChain Markdown header splitter to semantically chunk the documents
- Store the chunks in a vector database (along with necessary metadata)
- Given a question:
  - Use the retriever to fetch chunks from the vector database
  - Modify these chunks to include metadata information
  - Create a prompt
- Persist the prompt (for downstream analysis)


In [1]:
import os

import sys
sys.path.append("..")

import nest_asyncio

from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_cohere import ChatCohere, CohereEmbeddings
from langchain_together import ChatTogether
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader


from datetime import datetime
import pandas as pd
import random

from evaluation_utils import threadpool_map

nest_asyncio.apply()

In [2]:
# Run the project's environment setup before any later cell reads os.environ.
# NOTE(review): presumably this populates the EMBEDDING_* / EVAL_* / SIMILARITY_TOP_K
# variables consumed below — confirm against config.py.
from config import set_environment
set_environment()

In [3]:

# Instruction text placed at the very start of every generated prompt; the formatted
# retrieved chunks are appended directly after the trailing "Context:" line.
preamble = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Provide a detailed response, but do not invent stuff

   Context:
   
"""

Choose the LLM for embedding

In [4]:
# The embedding provider and model are selected through environment variables.
embedding_llm_family = os.environ["EMBEDDING_LLM_FAMILY"]
embedding_llm_model = os.environ["EMBEDDING_LLM_MODEL"]
# NOTE(review): the key is spelled "EMBEDDING_DIMESIONS" (sic). It must match the name
# used by the environment setup, so it is intentionally left unchanged here.
embedding_dimensions = int(os.environ["EMBEDDING_DIMESIONS"])

if embedding_llm_family == "OPENAI":
    # NOTE(review): embedding_llm_model is not forwarded here, so the library's default
    # OpenAI embedding model is used — confirm this is intended.
    embeddings_model = OpenAIEmbeddings()
elif embedding_llm_family == "GOOGLE":
    embeddings_model = GoogleGenerativeAIEmbeddings(model=embedding_llm_model)
elif embedding_llm_family == "COHERE":
    embeddings_model = CohereEmbeddings(model=embedding_llm_model)
else:
    # Fail fast: without this branch an unknown family left `embeddings_model`
    # undefined and only surfaced later as a confusing NameError.
    raise ValueError(
        f"Unsupported EMBEDDING_LLM_FAMILY {embedding_llm_family!r}; "
        "expected one of 'OPENAI', 'GOOGLE', 'COHERE'"
    )

In [5]:
# Evaluation run settings. Every value is mandatory: a missing variable raises KeyError.
(
    eval_name,
    eval_directory,
    eval_file,
    eval_questions,
    eval_quick_test,
    eval_db,
    eval_prompts_dir,
) = (
    os.environ[key]
    for key in (
        "EVAL_NAME",
        "EVAL_DIRECTORY",
        "EVAL_FILE",
        "EVAL_QUESTIONS",
        "EVAL_QUICK_TEST",
        "EVAL_DB",
        "EVAL_PROMPTS_DIR",
    )
)

# Number of chunks the retriever returns per question.
similarity_top_k = int(os.environ["SIMILARITY_TOP_K"])

# Generated prompts are written twice (Excel and JSON), named after the run and the embedding family.
output_file_xls, output_file_json = (
    f"{eval_prompts_dir}/{eval_name}_{embedding_llm_family}_P.{extension}"
    for extension in ("xlsx", "json")
)

Read the documents, create chunks, calculate embeddings, store in a vector database

In [6]:
if os.path.isdir(eval_db):
    # Reuse the previously persisted store instead of re-embedding every document.
    # NOTE: a persisted store keeps the chunking it was built with; delete the
    # directory to rebuild it after changing the splitter configuration below.
    vectorstore = Chroma(persist_directory=eval_db,
                         embedding_function=embeddings_model)
else:
    # Load every markdown file under the evaluation directory and merge them into
    # one text so the header splitter sees a single document.
    loader = DirectoryLoader(eval_directory, glob="**/*.md", loader_cls=TextLoader)
    text_data = loader.load()
    page_contents = [item.page_content for item in text_data]
    text_concatenated = "\n\n".join(page_contents)

    # One entry per markdown heading level. Level 5 must use five '#' characters;
    # it previously repeated "####", so "Header 5" metadata was never produced.
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]

    # strip_headers=False keeps the heading line inside each chunk's text.
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
    md_header_splits = markdown_splitter.split_text(text_concatenated)
    vectorstore = Chroma.from_documents(documents=md_header_splits,
                                        embedding=embeddings_model,
                                        persist_directory=eval_db)
    vectorstore.persist()

# Retriever returning the top-k most similar chunks for a query.
retriever = vectorstore.as_retriever(search_kwargs={"k": similarity_top_k})

In [7]:
import re

def replace_title_in_text(text, title):
    """Strip the markdown heading markers in front of every occurrence of ``title``.

    Each match of one or more ``#`` characters, whitespace, and the literal title
    is replaced by the bare title.

    Args:
        text: Text that may contain the title written as a markdown heading.
        title: Heading text to look for; matched literally.

    Returns:
        ``text`` with the heading markers removed from the matched titles.
    """
    # re.escape makes the title match literally even if it contains regex metacharacters.
    pattern = re.compile(rf'#+\s+{re.escape(title)}')
    # Use a callable replacement: a plain string would be parsed as a regex template,
    # so a title containing a backslash (e.g. "\1" or "\p") would raise re.error or
    # be treated as a group reference.
    return pattern.sub(lambda _match: title, text)

In [8]:
def format_documents(retrieved_chunks):
    """Render retrieved chunks as the context section of a prompt.

    For every chunk the header metadata ("Header 1" .. "Header 5") is collected,
    the deepest available header is used as the title, and a block with the
    title, the chunk text, and the list of enclosing sections is emitted.

    Args:
        retrieved_chunks: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a string), as returned by the retriever.

    Returns:
        A single string starting with a newline, followed by one formatted block
        per chunk (just ``"\\n"`` when there are no chunks).
    """
    header_keys = [f"Header {level}" for level in range(1, 6)]
    sections = []

    for chunk in retrieved_chunks:
        # Keep every non-empty header level in order. Skipping empty levels
        # (instead of stopping at the first gap) means a chunk whose hierarchy
        # starts below level 1 still gets its real title rather than "Untitled".
        parents = [
            chunk.metadata[key] for key in header_keys if chunk.metadata.get(key, "")
        ]

        # The title is the last (deepest) non-empty header.
        title = parents[-1] if parents else "Untitled"
        text = replace_title_in_text(chunk.page_content, title)

        parents_concat = '\n'.join(parents)

        sections.append(
            f"\n# Relevant Document Title:\n{title}\n"
            f"## Document Text:\n{text}\n"
            f"## This document is contained under the following sections:\n{parents_concat}\n"
        )

    # Build the result once instead of growing a string inside the loop.
    return "\n" + "".join(sections)


Quick Test

In [9]:
# Smoke test: build a single prompt for the configured quick-test question.
context = format_documents(
    retriever.invoke(eval_quick_test)
)
prompt = f"{preamble}{context}\n'Question: '{eval_quick_test}\n"

def replace_newlines(text):
    """Return ``text`` with each literal backslash-n pair turned into a real newline."""
    return "\n".join(text.split("\\n"))


import pprint

# Convert escaped newline sequences into real line breaks, then pretty-print
# the resulting prompt for a visual check.
formatted_data = replace_newlines(prompt)
pprint.pprint(formatted_data)

('You are an assistant for question-answering tasks. \n'
 '    Use the following pieces of retrieved context to answer the question. \n'
 "    If you don't know the answer, just say that you don't know. \n"
 '    Provide a detailed response, but do not invent stuff\n'
 '\n'
 '   Context:\n'
 '   \n'
 '\n'
 '\n'
 '# Relevant Document Title:\n'
 'Vision Hardware and Optical Services.\n'
 '## Document Text:\n'
 'Vision Hardware and Optical Services.  \n'
 'Corrective lenses, eyeglasses, and contact lenses are excluded, unless\n'
 'your Group has purchased an "Adult Vision Hardware and Optical\n'
 'Services Rider" and/or "Pediatric Vision Hardware and Optical Services\n'
 'Rider" or "Pediatric Vision Hardware and Optical Services Enhanced\n'
 'Benefit Rider."\n'
 '## This document is contained under the following sections:\n'
 'EXCLUSIONS AND LIMITATIONS\n'
 'Vision Hardware and Optical Services.\n'
 '\n'
 '# Relevant Document Title:\n'
 'Low-Vision Aids.\n'
 '## Document Text:\n'
 'Low-Vi

In [10]:

def run_prompt_generator(row):
    """Build the prompt for one evaluation question.

    Args:
        row: Mapping-like record providing the keys "query" and "query_num".

    Returns:
        dict with the record's "query_num" and the generated "prompt".
    """
    retrieved = retriever.invoke(row["query"])
    assembled = f"{preamble}{format_documents(retrieved)}\n'Question: '{row['query']}"

    # Literal backslash-n sequences in the assembled text become real newlines.
    return {
        "query_num": row["query_num"],
        "prompt": assembled.replace('\\n', '\n'),
    }


In [11]:
# Load the evaluation questions.
# Later cells read the "query" and "query_num" columns of this frame.
queries = pd.read_excel(eval_questions)

In [12]:
# Generate one prompt per question; each row is passed as the `row` keyword argument.
results = threadpool_map(
    run_prompt_generator,
    [{"row": row} for _, row in queries.iterrows()],
    num_workers=2,
)

100%|██████████| 37/37 [00:03<00:00, 10.14it/s]


In [13]:
# Attach each generated prompt to its question via the shared query_num key.
df = pd.merge(queries, pd.DataFrame(results), how="inner", on="query_num")
# Every question must have produced exactly one prompt.
assert df.shape[0] == queries.shape[0]

In [14]:
# Persist the prompts for downstream analysis: an Excel sheet named "Prompts"
# and a JSON file with one record per line.
df.to_excel(output_file_xls, sheet_name="Prompts", index=False)
df.to_json(output_file_json, orient="records", lines=True)