# Conversational RAG

In [9]:
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-chroma bs4 unstructured 

Note: you may need to restart the kernel to use updated packages.


In [11]:
import getpass
import os
import langchain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_chroma import Chroma
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub
from langchain_core.runnables.history import RunnableWithMessageHistory


## 1. Files Loading

In [16]:
path = r"C:\Users\reply\RepoAgent\markdown_docs\repo_agent"
loader = DirectoryLoader(path, glob="./*.md", show_progress=True, loader_cls=UnstructuredMarkdownLoader)
docs = loader.load()

100%|██████████| 8/8 [00:18<00:00,  2.30s/it]


In [17]:
docs[0].metadata

{'source': 'C:\\Users\\reply\\RepoAgent\\markdown_docs\\repo_agent\\change_detector.md'}

## 2. Split and Preprocess

In [18]:
def split_documents(doc, chunk_size=250, chunk_overlap=30):
    headers_to_split_on = [
         ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )
    md_header_splits = markdown_splitter.split_text("#"+ os.path.basename(doc.metadata['source']) + " \n\n  " + doc.page_content)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # Split
    splits = text_splitter.split_documents(md_header_splits)
   
    return splits

In [19]:
all_splits = []

In [20]:
all_splits = []
for doc in docs:
    splits = split_documents(doc)
    all_splits.extend(splits)


In [21]:
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()


## 3. Prompting

In [22]:
model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature = 0.1)



In [23]:
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    model, retriever, contextualize_q_prompt
)


### Answer question ###
system_prompt = (
    "You are an assistant for question-answering tasks regarding code documentation file. "
    "Use the following pieces of retrieved context to answer "
    "the question. It's also specified the name of the file that contains the functions.  If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(model, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


### Statefully manage chat history ###
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

## 4. Test

In [28]:
def paginate_text(text, page_size=500):
    return [text[i:i + page_size] for i in range(0, len(text), page_size)]
query = input("Enter your query (or 'exit' to quit): ")
response = conversational_rag_chain.invoke(
    {"input": query},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]
print(f"Question: {query}\n")
paginated_response = paginate_text(response)

for i, page in enumerate(paginated_response):
    print(f"Page {i + 1}/{len(paginated_response)}: {page}")
    if i < len(paginated_response) - 1:
            input("Press Enter to continue to the next page...")

print("\n")

Question: wha't the general purpose of you input file 

Page 1/1: The function plays a crucial role in generating detailed information for documentation purposes, aiding in understanding the file's structure and components. It interacts with FileHandler, change_detector, and other components as part of a larger system designed for automated project management and documentation.




In [26]:
splits[0]

Document(page_content='#runner.md  \nFunctionDef make_fake_files  \nmake_fake_files: The function of make_fake_files is to prepare the repository for documentation generation by handling unstaged changes and untracked files according to their status in git.')

In [27]:
rag_chain.invoke({"input":"how get_staged_pys works?"})

KeyError: "Input to ChatPromptTemplate is missing variables {'chat_history'}.  Expected: ['chat_history', 'context', 'input'] Received: ['input', 'context']"

In [None]:
conversational_rag_chain.invoke(
   {"input":"no"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

"I don't have the specific information about the markdown file where the behavior of the last_element function is explained."

In [None]:
rag_chain.invoke({"input":"What file named last does?"})

KeyError: "Input to ChatPromptTemplate is missing variables {'chat_history'}.  Expected: ['chat_history', 'context', 'input'] Received: ['input', 'context']"

In [None]:
rag_chain.invoke("What last.md does?")

'last.md saves and updates the markdown content of an existing file after making structural changes.'

In [None]:
rag_chain.invoke("What prova.md does?")

'The prova.md function assigns values to variables a, b, and c, and then returns the values of b and c. It does not take any parameters and is used for version control operations with the git library.'