## Documentation Conversational RAG

In [3]:
import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_chroma import Chroma
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_core.runnables import RunnablePassthrough



## 1. File Loading

In [5]:
# Directory holding the Markdown documentation to index.
# The location can be overridden with the MARKDOWN_DOCS_DIR environment variable so the
# notebook is not tied to one machine; the original local path remains the default.
path = os.environ.get(
    "MARKDOWN_DOCS_DIR",
    r"C:\Users\reply\Desktop\testREPO\TestRepo\markdown_docs",
)
# Load every *.md file directly inside `path`, one loader instance per file.
loader = DirectoryLoader(path, glob="./*.md", show_progress=True, loader_cls=UnstructuredMarkdownLoader)
docs = loader.load()

100%|██████████| 7/7 [00:17<00:00,  2.46s/it]


## 2. Model definition

In [6]:
# Chat model shared by the self-query retriever and the chains built later in this notebook.
model = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0.1,
)

# Metadata attributes handed to SelfQueryRetriever.from_llm; only "source" is declared.
metadata_field_info = [
    AttributeInfo(
        type="string",
        name="source",
        description="Filename and location of the source file",
    ),
]


In [7]:
def split_documents(doc, chunk_size=250, chunk_overlap=30):
    """Break one Markdown document into chunks in two passes.

    The text in ``doc.page_content`` is first split on level 1-3 Markdown headers
    (with ``strip_headers=False``), and the resulting sections are then split again
    by a ``RecursiveCharacterTextSplitter``.

    Args:
        doc: Object exposing the Markdown text as ``page_content``.
        chunk_size: ``chunk_size`` given to the character splitter.
        chunk_overlap: ``chunk_overlap`` given to the character splitter.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on the
        header-level sections.
    """
    # ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")
    header_levels = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]

    header_sections = MarkdownHeaderTextSplitter(
        headers_to_split_on=header_levels,
        strip_headers=False,
    ).split_text(doc.page_content)

    size_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return size_splitter.split_documents(header_sections)

In [8]:
def get_chunk_with_source(docs, chunk_size=250, chunk_overlap=30):
    """Split every document into chunks tagged only with their source file name.

    Each chunk's metadata is replaced by ``{'source': <file name>}``, where the file
    name is the basename of the parent document's source path.

    Args:
        docs: Iterable of documents exposing ``page_content`` and a ``metadata`` dict.
        chunk_size: Forwarded to ``split_documents``.
        chunk_overlap: Forwarded to ``split_documents``.

    Returns:
        A flat list with the chunks of all documents, in input order.
    """
    all_splits = []
    for doc in docs:
        splits = split_documents(doc, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        if not splits:
            continue

        # Read the path from the explicit "source" key instead of relying on dict
        # ordering; fall back to the first metadata value only when the key is absent.
        source_path = doc.metadata.get("source")
        if source_path is None:
            source_path = next(iter(doc.metadata.values()), "")
        # The file name is the same for every chunk of this document: compute it once.
        filename = os.path.basename(source_path)

        for doc_split in splits:
            doc_split.metadata = {'source': filename}
        all_splits.extend(splits)
    return all_splits

## 2.1 Classifier

In [None]:
# Labelled questions used to classify an incoming question as "general" (about the
# project as a whole) or "specific" (about a particular function, method or file).
examples = [
    {
        "question": "Explain me what the repository is about",
        "answer": """general""",
    },
    {
        "question": "What is this program doing?",
        "answer": """general""",
    },
    {
        "question": "How does the project work?",
        "answer": """general""",
    },
    {
        "question": "which parameter should be passed to the function?",
        "answer": """specific""",
    },
    {
        "question": "what does the function X rerurns?",
        "answer": """specific""",
    },
    {
        "question": "show me how to use function X",
        "answer": """specific""",
    },
    {
        "question": "how the method X works?",
        "answer": """specific""",
    },
    {
        "question": """explain file X.md""",
        "answer": """specific""",
    },
]


In [None]:
# A selected example is rendered as its label only: the template emits "{answer}"
# and nothing else.
example_prompt = PromptTemplate(
    template="{answer}",
    input_variables=["question", "answer"],
)


In [None]:
# Index the labelled examples in a Chroma store and return only the single most
# similar example (k=1) for each incoming question.
example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples=examples,
    embeddings=OpenAIEmbeddings(),
    vectorstore_cls=Chroma,
    k=1,
)


In [None]:
# Few-shot prompt used as a classifier: formatting it yields the selected example
# (rendered by example_prompt) followed by the "Question: ..." suffix.
prompt = FewShotPromptTemplate(
    input_variables=["input"],
    suffix="Question: {input}",
    example_prompt=example_prompt,
    example_selector=example_selector,
)



In [None]:
# Sanity check: show the formatted classifier prompt for a sample question.
prompt.format(
    input="how does the project work"
).strip()

'general\n\nQuestion: how does the project work'

## 2.2 General

In [None]:
# Filter the list of documents based on their source
def filter_docs(docs, source):
    """Return the documents whose ``metadata['source']`` contains ``source``.

    Args:
        docs: Iterable of documents exposing a ``metadata`` dict.
        source: Substring to look for in each document's source (e.g. a file name).

    Returns:
        A list with the matching documents, in input order. Documents without a
        source entry never match.
    """
    # Use the requested `source` rather than a hard-coded file name.
    return [doc for doc in docs if source in (doc.metadata.get('source') or "")]
# Keep only the documents coming from the summary file.
summary_doc = filter_docs(docs, source="summary.md")

In [None]:
# Chunk the summary with a larger size and overlap than the function defaults (250 / 30).
summary_splits = get_chunk_with_source(
    summary_doc,
    chunk_size=1000,
    chunk_overlap=100,
)

In [None]:
document_content_description = "Code documentation general summary"

# Vector store holding only the summary chunks, in its own "general" collection.
vectorstore_general = Chroma.from_documents(
    documents=summary_splits,
    embedding=OpenAIEmbeddings(),
    collection_name="general",
)

In [None]:
# Prompt text for project-level questions; {context} and {question} are filled in
# when the prompt is formatted.
template = (
    "\n"
    "Given the following context:\n"
    "{context}\n"
    "Provide a comprehensive overview of the project considering the question:\n"
    "{question}\n"
)
# Turn the raw template text into a prompt object usable inside a chain.
custom_rag_prompt = PromptTemplate.from_template(template=template)

In [None]:
def _format_docs(retrieved_docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


general_retriever = vectorstore_general.as_retriever()

# Chain for project-level questions: retrieve summary chunks, fill the prompt, call
# the model and return the answer as a plain string.
# The retriever output is a list of documents; it is piped through _format_docs so
# the prompt receives their text rather than the repr of the list.
general_chain = (
    {"context": general_retriever | _format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | model
    | StrOutputParser()
)


## 2.3 Specific

In [9]:
# Description of the indexed documents; passed as the third argument to
# SelfQueryRetriever.from_llm when the specific retriever is built.
document_content_description = "Code documentation"

In [10]:
# Chunk all loaded documents with the default chunk size and overlap and index them
# in a dedicated "specific" collection.
specific_splits = get_chunk_with_source(docs)
vectorstore_specific = Chroma.from_documents(
    documents=specific_splits,
    embedding=OpenAIEmbeddings(),
    collection_name="specific",
)

# Self-query retriever built from the chat model, the vector store, the content
# description and the declared metadata fields.
specific_retriever = SelfQueryRetriever.from_llm(
    llm=model,
    vectorstore=vectorstore_specific,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)
     

In [11]:
# Maps a session id to its chat history object.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history stored for ``session_id``, creating an empty one on first use."""
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register a fresh history for it.
        history = store[session_id] = ChatMessageHistory()
        return history

In [12]:
# System prompts

# Instructs the model to rewrite the latest user question as a standalone question.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. "
    "Do NOT answer the question, just reformulate it if needed "
    "and otherwise return it as is."
)

# Instructs the model how to answer; the text ends with the {context} placeholder.
qa_system_prompt = (
    "You are an assistant for question-answering tasks "
    "regarding code documentation file. "
    "Use the following pieces of retrieved context to answer the question. "
    "It's also specified the name of the file that contains the functions. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Chat prompt templates: both share the layout
# system message -> chat history -> latest human input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Retriever that uses the model and the contextualize prompt together with the
# self-query retriever.
history_aware_retriever = create_history_aware_retriever(
    llm=model,
    retriever=specific_retriever,
    prompt=contextualize_q_prompt,
)

# Chain that answers from the retrieved documents using the QA prompt.
question_answer_chain = create_stuff_documents_chain(llm=model, prompt=qa_prompt)

# Retrieval + answering combined.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)


# Same chain wrapped with per-session message history: input is read from "input",
# history is injected as "chat_history", and the reply is taken from "answer".
specific_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    output_messages_key="answer",
    history_messages_key="chat_history",
    input_messages_key="input",
)


## 3. Conversation

In [None]:
def get_answer(question, prompt, specific_chain, general_chain, session_id="abc123"):
    """Classify a question and answer it with the matching chain.

    The first line of the formatted classifier prompt is used as the label. A
    "specific" label routes the question to ``specific_chain``; any other label
    routes it to ``general_chain``.

    Args:
        question: The user's question.
        prompt: Object whose ``format(input=...)`` returns text whose first line is
            the classification label.
        specific_chain: Chain invoked with ``{"input": question}`` and a session
            config; its result must expose an ``"answer"`` entry.
        general_chain: Chain invoked with the raw question.
        session_id: Key of the chat history used by ``specific_chain``. Defaults to
            the previously hard-coded "abc123" so existing callers are unaffected.

    Returns:
        The answer produced by the selected chain.
    """
    formatted = prompt.format(input=question).strip()
    # Strip the label so stray whitespace or a carriage return cannot break the match.
    classification = formatted.split('\n')[0].strip()
    print(classification)

    if classification == "specific":
        result = specific_chain.invoke(
            {"input": question},
            config={"configurable": {"session_id": session_id}},
        )
        return result["answer"]

    return general_chain.invoke(question)

def paginate_text(text, page_size=500):
    """Cut ``text`` into consecutive pages of at most ``page_size`` characters.

    Returns a list of pages; the last page may be shorter, and empty text gives
    an empty list.
    """
    pages = []
    for start in range(0, len(text), page_size):
        pages.append(text[start:start + page_size])
    return pages

def print_answer(paginated_response):
    """Print the pages one by one, pausing for Enter between pages.

    Every page is printed as "Page i/n: <text>" and followed by blank lines; the
    Enter prompt is skipped after the last page.
    """
    for page_number, page in enumerate(paginated_response, start=1):
        total_pages = len(paginated_response)
        print(f"Page {page_number}/{total_pages}: {page}")
        if page_number < total_pages:
            input("Press Enter to continue to the next page...")

        print("\n")

In [None]:

# Ask for a question, re-prompting until something non-empty is entered, so the
# re-entered query is actually answered instead of being discarded.
query = input("Enter your query (or 'exit' to quit): ").strip()
while not query:
    print("Please enter a query")
    query = input("Enter your query (or 'exit' to quit): ").strip()

# Honour the 'exit' keyword advertised in the input message rather than sending it
# to the model as a question.
if query.lower() == "exit":
    print("Exiting.")
else:
    print(f"Question: {query}\n")
    response = get_answer(query, prompt, specific_chain, general_chain)
    paginated_response = paginate_text(response)
    print_answer(paginated_response)

Question: which parameters does get_staged_pys need?

specific
Page 1/1: The file that contains the function get_staged_pys is not provided.




In [16]:
# Stream the specific chain for a dummy input under its own session id and print
# every emitted chunk, separated by "|".
for chunk in specific_chain.stream(
    {"input": "foo"},
    config={"configurable": {"session_id": "ab12"}},
):
    print(chunk, end="|", flush=True)

{'input': 'foo'}|{'chat_history': [HumanMessage(content='foo'), AIMessage(content="I don't have enough information to provide an accurate answer.")]}|{'context': [Document(page_content='Code Description:', metadata={'source': 'utilities.md'}), Document(page_content='Code Description:', metadata={'source': 'rules_metrics_par.md'}), Document(page_content='Consolidated Summary:', metadata={'source': 'summary.md'}), Document(page_content='of Rule instances.', metadata={'source': 'rules.md'})]}|{'answer': ''}|{'answer': 'I'}|{'answer': ' don'}|{'answer': "'t"}|{'answer': ' have'}|{'answer': ' enough'}|{'answer': ' information'}|{'answer': ' to'}|{'answer': ' provide'}|{'answer': ' an'}|{'answer': ' accurate'}|{'answer': ' answer'}|{'answer': '.'}|{'answer': ''}|