Generate the prompts using S007 - Semantic Chunking strategy
- Requires markdown documents
- Use LangChain Markdown parser to semantically chunk documents
- Store the chunks in a vector database (along with necessary metadata)
- Given a question:
  - Use the retriever to fetch chunks from the vector database
  - Modify these chunks to include metadata information
  - Create a prompt
- Persist the prompt (for downstream analysis)


In [1]:
import os

import sys
# Add the parent directory to the module search path; presumably that is where
# the project-local `evaluation_utils` and `config` modules live (TODO confirm).
sys.path.append("..")

import nest_asyncio

from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_cohere import ChatCohere, CohereEmbeddings
from langchain_together import ChatTogether
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader


from datetime import datetime
import pandas as pd
import random

from evaluation_utils import threadpool_map

# Patch asyncio so event loops can be nested; presumably needed because the
# notebook kernel already runs its own loop (TODO confirm it is still required).
nest_asyncio.apply()

In [2]:
from config import set_environment
# Project-local helper; presumably populates the os.environ settings
# (EMBEDDING_*, EVAL_*, SIMILARITY_TOP_K) that the following cells read.
# TODO confirm against config.py.
set_environment()

In [3]:

# Instruction text placed at the start of every generated prompt. The formatted
# retrieved chunks are appended directly after it, i.e. below the "Context:" line.
preamble = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Provide a detailed response, but do not invent stuff

   Context:
   
"""

Choose the LLM for embedding

In [4]:
# The embedding provider and model are selected through environment variables.
embedding_llm_family = os.environ["EMBEDDING_LLM_FAMILY"]
embedding_llm_model = os.environ["EMBEDDING_LLM_MODEL"]
# NOTE: the variable name really is spelled "EMBEDDING_DIMESIONS" (sic). It has
# to match whatever sets the environment, so it is not corrected here.
embedding_dimensions = int(os.environ["EMBEDDING_DIMESIONS"])

if embedding_llm_family == "OPENAI":
    embeddings_model = OpenAIEmbeddings(model=embedding_llm_model)
elif embedding_llm_family == "GOOGLE":
    embeddings_model = GoogleGenerativeAIEmbeddings(model=embedding_llm_model)
elif embedding_llm_family == "COHERE":
    embeddings_model = CohereEmbeddings(model=embedding_llm_model)
else:
    # Fail here with an actionable message instead of a NameError on
    # `embeddings_model` when the vector store is opened further down.
    raise ValueError(
        f"Unsupported EMBEDDING_LLM_FAMILY {embedding_llm_family!r}; "
        "expected one of: OPENAI, GOOGLE, COHERE"
    )

In [5]:
# Evaluation settings, each read from the environment variable of the same
# (upper-cased) name. A missing variable raises KeyError.
(
    eval_name,
    eval_directory,
    eval_file,
    eval_questions,
    eval_quick_test,
    eval_db,
    eval_prompts_dir,
) = [
    os.environ[_setting]
    for _setting in (
        "EVAL_NAME",
        "EVAL_DIRECTORY",
        "EVAL_FILE",
        "EVAL_QUESTIONS",
        "EVAL_QUICK_TEST",
        "EVAL_DB",
        "EVAL_PROMPTS_DIR",
    )
]

# Number of chunks the retriever returns per question.
similarity_top_k = int(os.environ["SIMILARITY_TOP_K"])

# Both output files share one stem: <prompts dir>/<eval name>_<embedding family>_P
_output_stem = f"{eval_prompts_dir}/{eval_name}_{embedding_llm_family}_P"
output_file_xls = f"{_output_stem}.xlsx"
output_file_json = f"{_output_stem}.json"

Set up the retriever. The vector store must be created beforehand.

In [6]:
# Open the Chroma collection stored under `eval_db`, using the embedding model
# selected above to embed incoming queries.
vectorstore = Chroma(
    embedding_function=embeddings_model,
    persist_directory=eval_db,
)

# Retriever that returns the `similarity_top_k` best-matching chunks per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=similarity_top_k))

In [None]:
def format_documents(retrieved_chunks):
    """Render retrieved chunks as the context section of a prompt.

    For every chunk the output contains a title, the chunk text and the list
    of markdown sections the chunk sits under. The section path is read from
    the chunk metadata keys "Header 1" .. "Header 5" (presumably written by
    the markdown header splitter when the vector store was built).

    Args:
        retrieved_chunks: iterable of objects exposing a ``metadata`` mapping
            and a ``page_content`` string.

    Returns:
        A string that starts with a newline followed by one formatted entry
        per chunk; just "\n" when no chunks are given.
    """
    header_keys = [f"Header {level}" for level in range(1, 6)]
    entries = ["\n"]

    for chunk in retrieved_chunks:
        # Keep every header level that is present. A missing (or None/empty)
        # level is skipped rather than ending the scan, so a chunk from a
        # document without a top-level heading still gets its real title and
        # section path instead of "Untitled".
        parents = [
            chunk.metadata[key] for key in header_keys if chunk.metadata.get(key)
        ]

        # The title is the deepest header the chunk belongs to.
        title = parents[-1] if parents else "Untitled"
        parents_concat = "\n".join(parents)

        entries.append(
            f"\n# Relevant Document Title:\n{title}\n"
            f"## Document Text:\n{chunk.page_content}\n"
            f"## This document is contained under the following sections:\n{parents_concat}\n"
        )

    # Single join instead of repeated string concatenation in the loop.
    return "".join(entries)

Quick Test

In [None]:
# Smoke test: retrieve chunks for the quick-test question and assemble one prompt.
quick_test_chunks = retriever.invoke(eval_quick_test)
context = format_documents(quick_test_chunks)
# NOTE(review): the single quotes around "Question: " are literal characters in
# the prompt text; confirm that this is intended.
prompt = f"{preamble}{context}\n'Question: '{eval_quick_test}\n"

def replace_newlines(text):
    """Return `text` with each literal backslash-n pair turned into a real newline."""
    pieces = text.split("\\n")
    return "\n".join(pieces)


# Turn any literal backslash-n sequences in the prompt into real line breaks
formatted_data = replace_newlines(prompt)

# Show the quick-test prompt for a visual check.
# NOTE(review): pprint shows the string's repr, so newlines appear escaped;
# a plain print() may be what was intended.
import pprint
pprint.pprint(formatted_data)

In [10]:

def run_prompt_generator(row):
    """Build the prompt for a single evaluation question.

    Args:
        row: mapping-like record providing "query_num" and "query".

    Returns:
        dict with the original "query_num" and "query" plus the assembled
        "prompt" (preamble, formatted retrieved context, then the question).
    """
    question = row["query"]

    retrieved = retriever.invoke(question)
    context = format_documents(retrieved)

    # NOTE(review): the single quotes around "Question: " end up verbatim in
    # the prompt; confirm that this is intended.
    raw_prompt = f"{preamble}{context}\n'Question: '{question}"

    # Literal backslash-n sequences are converted to real line breaks.
    final_prompt = raw_prompt.replace("\\n", "\n")

    return {
        "query_num": row["query_num"],
        "query": question,
        "prompt": final_prompt,
    }


In [11]:
# Load the evaluation questions from the Excel file named by EVAL_QUESTIONS.
# The sheet must provide the "query_num" and "query" columns that the prompt
# generator reads from each row.
queries = pd.read_excel(eval_questions)

In [None]:
# One argument dict per question row; threadpool_map presumably passes each
# dict to run_prompt_generator as keyword arguments (TODO confirm in evaluation_utils).
prompt_jobs = [{"row": row} for _, row in queries.iterrows()]
results = threadpool_map(run_prompt_generator, prompt_jobs, num_workers=2)

In [13]:
# Attach the generated prompts to the question sheet, matching on query_num.
# NOTE(review): both frames carry a "query" column, so the merged frame holds
# it twice under pandas' default _x/_y suffixes.
df = pd.merge(queries, pd.DataFrame(results), how="inner", on="query_num")

# An inner join drops unmatched rows, so equal lengths mean every question got a prompt.
assert df.shape[0] == queries.shape[0]

In [14]:
# Persist the prompts for downstream analysis: an Excel workbook with a single
# "Prompts" sheet (row index omitted) ...
with pd.ExcelWriter(output_file_xls) as writer:
   df.to_excel(writer, sheet_name="Prompts", index=False)
   
# ... and a JSON Lines file (one JSON record per line, despite the .json name).
df.to_json(output_file_json, orient='records', lines=True)