## Imports

In [314]:
import getpass
import os
import langchain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_chroma import Chroma
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chains import RetrievalQAWithSourcesChain

     



In [315]:
path = r"C:\Users\reply\Desktop\root-cause-analysis-asset\markdown-gpt-3.5"
loader = DirectoryLoader(path, glob="./*.md", show_progress=True, loader_cls=UnstructuredMarkdownLoader)
docs = loader.load()

100%|██████████| 11/11 [00:03<00:00,  3.58it/s]


In [316]:
def split_documents(doc, chunk_size=250, chunk_overlap=30):
    headers_to_split_on = [
         ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )
    md_header_splits = markdown_splitter.split_text( doc.page_content)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # Split
    splits = text_splitter.split_documents(md_header_splits)
   
    return splits

In [317]:
all_splits = []
for doc in docs:
    splits = split_documents(doc)
    for doc_split in splits:
        filename = os.path.basename(list(doc.metadata.values())[0])
        doc_split.metadata = {'source':filename}        
    all_splits.extend(splits)

In [318]:
model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature = 0.1)
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())


In [319]:
metadata_field_info=[
    AttributeInfo(
        name='source',
        description="Filename and location of the source file", 
        type="string", 
    )]
document_content_description = "Code documentation"


## Specific Model

In [320]:
retriever = SelfQueryRetriever.from_llm(model, 
                                        vectorstore, 
                                        document_content_description, 
                                        metadata_field_info, 
                                        verbose=True,
                                        )
     

In [321]:
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    model, retriever, contextualize_q_prompt
)


### Answer question ###
system_prompt = (
    "You are an assistant for question-answering tasks regarding code documentation file. "
    "Use the following pieces of retrieved context to answer "
    "the question. It's also specified the name of the file that contains the functions.  If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(model, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


### Statefully manage chat history ###
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


specific_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)



In [322]:
# chain({"explain ref_rules.md"},return_only_outputs=False)
# chain({"what is the rule class doing?"},
#       return_only_outputs=False)
# chain({"in which file is computeAccuracyStats?"}, return_only_outputs=False)
# chain({"explain ref_rule_extraction_algorithm.md"}, return_only_outputs=False)
# chain({"how should be created a hillclimb object?"},
#       return_only_outputs=False)
# chain({"where is ruleScore used? in which file?"}, return_only_outputs=False)


## General model

In [323]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough



In [324]:

loader = DirectoryLoader(path, glob="./summary.md", show_progress=True, loader_cls=UnstructuredMarkdownLoader)
doc = loader.load()

100%|██████████| 1/1 [00:00<00:00, 86.83it/s]


In [325]:
doc

[Document(page_content='Hypothetical Project: Rule-Based Classification System\n\nProject Overview\n\nThe purpose of this project is to develop a rule-based classification system that can evaluate the performance of different rule sets against true labels. The project aims to provide a set of functions that calculate various metrics such as rule scores, accuracy, and fault accuracy. The system incorporates mechanisms to penalize rule length to ensure that simpler, more generalizable rules are favored.\n\nProject Goal\n\nThe primary goal of this project is to develop a robust and fair rule-based classification system. By incorporating different metrics and allowing for rule length penalization, the system aims to balance accuracy and simplicity in rule-based models. This approach ensures that the generated rules are not only accurate but also generalizable and easy to interpret.', metadata={'source': 'C:\\Users\\reply\\Desktop\\root-cause-analysis-asset\\markdown-gpt-3.5\\summary.md'})]

In [326]:
from langchain_core.prompts import PromptTemplate

template = """
Given the following context:
{context}
Provide a comprehensive overview of the project considering the question:
{question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

In [327]:
splits = split_documents(doc[0], chunk_size=1000, chunk_overlap=100)
for s in splits:
    filename = os.path.basename(list(doc[0].metadata.values())[0])
    s.metadata = {'source':filename}
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()


In [328]:
general_chain = (
    {"context": retriever , "question": RunnablePassthrough()}
    | custom_rag_prompt
    | model
    | StrOutputParser()
)

## Classifier

In [329]:
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate

examples = [
    {
        "question": "Explain me what the repository is about",
        "answer": "general",
    },
    {
        "question": "What is this program doing?",
        "answer": "general",
    },
    {
        "question": "which parameter should be passed to the function?",
        "answer": "specific",
    },
    {
        "question": "how the method X works?",
        "answer": "specific",
    },
    {
        "question": "explain file X.md",
        "answer": "specific",
    },
]

In [331]:
example_selector = SemanticSimilarityExampleSelector.from_examples(
    # This is the list of examples available to select from.
    examples,
    # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
    OpenAIEmbeddings(),
    # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
    Chroma,
    # This is the number of examples to produce.
    k=1,
)

In [332]:
example_prompt = PromptTemplate(
    input_variables=["question", "answer"], template="{answer}"
)


In [333]:
prompt = FewShotPromptTemplate(
    example_selector=example_selector,
    example_prompt=example_prompt,
    suffix="Question: {input}",
    input_variables=["input"],
)



In [334]:
question = "how the method get_staged_pys works"

In [336]:
response = prompt.format(input=question).strip()
classification = response.split('\n')[0]
classification


KeyError: 'answer'

In [None]:
response

'Explain me what the repository is about general\n\n What is this program doing? general\n\n which parameter should be passed to the function? specific\n\n how the method X works? specific\n\n explain file X.md specific\n\nQuestion: how the method get_staged_pys works'

In [None]:
colon_position = response.find(':')
question = response[colon_position+1:].strip()

In [None]:
print(specific_chain.invoke({"input":question},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
    ))

{'input': 'how the method get_staged_pys works', 'chat_history': [], 'context': [Document(page_content='For each rule in the ruleSet, a remote prediction task is initiated, and the results are collected. The predictions are then stored in the eachRulePredictions array. If verbose is True, a progress bar is updated during the process.', metadata={'source': 'hill_climbing_par.md'}), Document(page_content='For each rule in the ruleSet, a remote prediction task is initiated, and the results are collected. The predictions are then stored in the eachRulePredictions array. If verbose is True, a progress bar is updated during the process.', metadata={'source': 'hill_climbing_par.md'}), Document(page_content='For each rule in the ruleSet, a remote prediction task is initiated, and the results are collected. The predictions are then stored in the eachRulePredictions array. If verbose is True, a progress bar is updated during the process.', metadata={'source': 'hill_climbing_par.md'}), Document(p

In [None]:
if classification == "general":
    print(general_chain.invoke(question))
else:
    print(specific_chain.invoke({"input":question},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
    ))["answer"]

{'input': 'how the method get_staged_pys works', 'chat_history': [HumanMessage(content='how the method get_staged_pys works'), AIMessage(content="I don't know.")], 'context': [Document(page_content='For each rule in the ruleSet, a remote prediction task is initiated, and the results are collected. The predictions are then stored in the eachRulePredictions array. If verbose is True, a progress bar is updated during the process.', metadata={'source': 'hill_climbing_par.md'}), Document(page_content='For each rule in the ruleSet, a remote prediction task is initiated, and the results are collected. The predictions are then stored in the eachRulePredictions array. If verbose is True, a progress bar is updated during the process.', metadata={'source': 'hill_climbing_par.md'}), Document(page_content='For each rule in the ruleSet, a remote prediction task is initiated, and the results are collected. The predictions are then stored in the eachRulePredictions array. If verbose is True, a progres