# Question Answering over mobb.ninja

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate

from langchain.document_loaders import ReadTheDocsLoader

## Ingestion

This command is used to download web pages and other content from a website, and in this case, it will download all the .html files from the https://mobb.ninja/docs/ URL and any subdirectories recursively.

`wget -r -A.html https://mobb.ninja/docs/`

In [17]:
ReadTheDocsLoader

langchain.document_loaders.readthedocs.ReadTheDocsLoader

In [51]:
from pathlib import Path
from typing import Any, List, Optional
import html

from langchain.docstore.document import Document

class MobbNinjaDocsLoader(ReadTheDocsLoader):
    """Loader that loads ReadTheDocs documentation directory dump."""

    def load(self) -> List[Document]:
        """Load documents."""
        from bs4 import BeautifulSoup

        def _clean_data(data: str) -> str:
            soup = BeautifulSoup(data, **self.bs_kwargs)
            # text = soup.find_all("main", {"id": "main-content"})
            article_tag = soup.find("article")
            # text = article_tag.get_text()
            if article_tag and len(article_tag) != 0:
                text = article_tag.get_text()
                # Replace HTML entities with their corresponding characters
                text = html.unescape(text)
                # Format the text for better readability
                text = text.strip().replace('\n', ' ')
            else:
                text = ""
            return "\n".join([t for t in text.split("\n") if t])

        docs = []
        for p in Path(self.file_path).rglob("*"):
            if p.is_dir():
                continue
            with open(p, encoding=self.encoding, errors=self.errors) as f:
                text = _clean_data(f.read())
            metadata = {"source": str(p)}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

In [52]:
loader = MobbNinjaDocsLoader("mobb.ninja/docs/", features='html.parser')

In [53]:
loader

<__main__.MobbNinjaDocsLoader at 0x7fe30630e9a0>

In [54]:
raw_documents = loader.load()

In [55]:
len(raw_documents)

119

In [57]:
raw_documents[0]

Document(page_content='Documentation from the MOBBQuickstarts / Getting StartedRed Hat OpenShift on AWS (ROSA)Azure Red Hat OpenShift (ARO)Advanced Managed OpenShiftROSADeploying ROSA in Private Link modeAdd Public Ingress to Private Link ClusterDeploying ROSA in STS modeDeploying ROSA in STS mode with Private LinkDeploying ROSA in STS mode with custom KMS KeyInstalling the AWS Load Balancer Operator on ROSAAssign Egress IP for External TrafficAdding AWS WAF in front of ROSA / OSDUse AWS Secrets CSI with ROSA in STS modeUse AWS CloudWatch Agent to push prometheus metrics to AWS CloudWatchFederating ROSA metrics to Prometheus with customer alertingConfiguring Alerts for User Workloads in ROSA 4.9.xAWS EFS on ROSAUsing Amazon Web Services Elastic File System (EFS) on ROSAUsing the AWS EFS CSI Driver Operator on ROSA 4.10.xConfiguring a ROSA cluster to pull images from AWS Elastic Container Registry (ECR)Configuring a ROSA cluster to use ECR secret operatorDeploy and use the AWS Kubernete

In [58]:
raw_documents[1]

Document(page_content='Azure Red Hat OpenShiftAzure Red Hat OpenShift is a fully managed, cloud-based service that allows users to quickly and easily deploy and manage containerized applications on the Azure platform. This product is a collaboration between Microsoft Azure and Red Hat, two industry leaders in cloud computing and open source software development. With Azure Red Hat OpenShift, users can leverage the benefits of Azure’s global infrastructure and scalability, as well as Red Hat’s expertise in containerization and open source technologies. This product is ideal for businesses that want to take advantage of the agility and flexibility of containers, but also need the reliability and security of a trusted cloud provider.', metadata={'source': 'mobb.ninja/docs/aro/index.html'})

In [59]:
raw_documents[2]

Document(page_content='Upgrade a disconnected ARO clusterAaron Green & Kevin Collins03/06/2023BackgroundOne of the great features of ARO is that you can create ‘disconnected’ clusters with no connectivity to the Internet. Out of the box, the ARO service mirrors all the code repositories to build OpenShift clusters to Azure Container Registry. This means ARO is built without having to reach out to the Internet as the images to build OpenShift are pulled via the Azure private network.When you upgrade a cluster, OpenShift needs to call out to the Internet to get an upgrade graph to see what options you have to upgrade the cluster. This of course breaks the concept of having a disconnected cluster. This guide goes through how to upgrade ARO without having the cluster reach out to the Internet and maintaining the disconnected nature of an ARO cluster.PrerequisitesA Private Azure Red Hat OpenShift cluster with no Internet ConnectivityCheck upgrade pathNOTE: This step is VERY important. In a 

In [60]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text_splitter

<langchain.text_splitter.RecursiveCharacterTextSplitter at 0x7fe30739f130>

In [61]:
documents = text_splitter.split_documents(raw_documents)

In [62]:
len(documents)

843

In [63]:
documents[0]

Document(page_content='Documentation from the MOBBQuickstarts / Getting StartedRed Hat OpenShift on AWS (ROSA)Azure Red Hat OpenShift (ARO)Advanced Managed OpenShiftROSADeploying ROSA in Private Link modeAdd Public Ingress to Private Link ClusterDeploying ROSA in STS modeDeploying ROSA in STS mode with Private LinkDeploying ROSA in STS mode with custom KMS KeyInstalling the AWS Load Balancer Operator on ROSAAssign Egress IP for External TrafficAdding AWS WAF in front of ROSA / OSDUse AWS Secrets CSI with ROSA in STS modeUse AWS CloudWatch Agent to push prometheus metrics to AWS CloudWatchFederating ROSA metrics to Prometheus with customer alertingConfiguring Alerts for User Workloads in ROSA 4.9.xAWS EFS on ROSAUsing Amazon Web Services Elastic File System (EFS) on ROSAUsing the AWS EFS CSI Driver Operator on ROSA 4.10.xConfiguring a ROSA cluster to pull images from AWS Elastic Container Registry (ECR)Configuring a ROSA cluster to use ECR secret operatorDeploy and use the AWS Kubernete

In [70]:
documents[0].json()

'{"page_content": "Documentation from the MOBBQuickstarts / Getting StartedRed Hat OpenShift on AWS (ROSA)Azure Red Hat OpenShift (ARO)Advanced Managed OpenShiftROSADeploying ROSA in Private Link modeAdd Public Ingress to Private Link ClusterDeploying ROSA in STS modeDeploying ROSA in STS mode with Private LinkDeploying ROSA in STS mode with custom KMS KeyInstalling the AWS Load Balancer Operator on ROSAAssign Egress IP for External TrafficAdding AWS WAF in front of ROSA / OSDUse AWS Secrets CSI with ROSA in STS modeUse AWS CloudWatch Agent to push prometheus metrics to AWS CloudWatchFederating ROSA metrics to Prometheus with customer alertingConfiguring Alerts for User Workloads in ROSA 4.9.xAWS EFS on ROSAUsing Amazon Web Services Elastic File System (EFS) on ROSAUsing the AWS EFS CSI Driver Operator on ROSA 4.10.xConfiguring a ROSA cluster to pull images from AWS Elastic Container Registry (ECR)Configuring a ROSA cluster to use ECR secret operatorDeploy and use the AWS Kubernetes Co

In [64]:
# Initialize embeddings
embeddings = OpenAIEmbeddings()

In [74]:
# Prerequisite: chromadb module is required. install `pip install chromadb`
# Create a Chroma vectorstore from a raw documents.
# If a persist_directory is specified, the collection will be persisted there.
# Otherwise, the data will be ephemeral in-memory.

# also including the source as part of metadata 
docsearch = Chroma.from_documents(documents, embedding=embeddings, collection_name='mobbninja',
                  metadatas=[{"source": f"{i}-pl"} for i in range(len(documents))],
                  persist_directory='chroma_vector_store')

Using embedded DuckDB with persistence: data will be stored in: chroma_vector_store


## Querying

In [75]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

In [76]:
# Question-answering with sources over an index
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

## Demo

### Using the Title 

In [79]:
chain({"question": "How can I create disconnected ARO cluster"}, return_only_outputs=True)

{'answer': ' To create a disconnected ARO cluster, you need to follow the steps outlined in the guide "Upgrade a disconnected ARO cluster" on mobb.ninja/docs/aro/upgrade-disconnected-aro/index.html.\n',
 'sources': 'mobb.ninja/docs/aro/upgrade-disconnected-aro/index.html'}

### Mispelling typo clustere instead of cluster

In [80]:
chain({"question": "How can I create disconnected ARO clustere"}, return_only_outputs=True)

{'answer': " I don't know.\n", 'sources': 'N/A'}

### ROSA - Use Cases around AWS Load Balancer Controller

This is the page containing the info https://mobb.ninja/docs/rosa/aws-load-balancer-operator/

```
AWS Load Balancer Controller is a controller to help manage Elastic Load Balancers for a Kubernetes cluster.

It satisfies Kubernetes Ingress resources by provisioning Application Load Balancers.
It satisfies Kubernetes Service resources by provisioning Network Load Balancers.
Compared with default AWS In Tree Provider, this controller is actively developed with advanced annotations for both ALB and NLB. Some advanced usecases are:

Using native kubernetes ingress with ALB
Integrate ALB with WAF
Specify NLB source IP ranges
Specify NLB internal IP address
```

In [82]:
chain({"question": "What are the advanced use cases related to AWS Load Balancer Controller on ROSA "}, return_only_outputs=True)

{'answer': ' Advanced use cases related to AWS Load Balancer Controller on ROSA include using native Kubernetes ingress with ALB, integrating ALB with WAF, specifying NLB source IP ranges, specifying NLB internal IP address, and configuring TLS and DNS for the ingress.\n',
 'sources': 'mobb.ninja/docs/rosa/aws-load-balancer-operator/index.html, mobb.ninja/docs/index.html, mobb.ninja/docs/rosa/waf/readme-complex/index.html, mobb.ninja/docs/rosa/waf/alb/index.html'}

### ROSA - User trying to configure GPU workloads

In [84]:
chain({"question": "How can I configure ROSA for GPU workloads"}, return_only_outputs=True)

{'answer': " I don't know.\n", 'sources': 'mobb.ninja/docs/index.html'}

In [85]:
chain({"question": "How can I configure ROSA for NVIDIA GPU workloads"}, return_only_outputs=True)

{'answer': ' ROSA does not support NVIDIA GPU workloads.\n',
 'sources': 'mobb.ninja/docs/rosa/gpu/index.html, mobb.ninja/docs/index.html, mobb.ninja/docs/aro/gpu/index.html, mobb.ninja/docs/index.html'}

In [86]:
chain({"question": "Configure ROSA for NVIDIA GPU workloads"}, return_only_outputs=True)

{'answer': ' Configuring ROSA for NVIDIA GPU workloads involves setting up entitlements to use the NVIDIA Operator, installing jq, moreutils, and gettext packages, and requesting GPU quota.\n',
 'sources': 'mobb.ninja/docs/rosa/gpu/index.html, mobb.ninja/docs/index.html'}

It seems providing additional context in the form of prompt is critical.

## Next Steps

In [None]:
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
You can assume the question about the most recent state of the union address.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

template = """You are an AI assistant for answering questions about documentation in mobb.ninja.
You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Politely inform them that you are tuned to only answer questions about .
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])


In [92]:
from langchain.chains.chat_vector_db.prompts import QA_PROMPT
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

In [91]:
QA_PROMPT.template, QA_PROMPT.input_variables

("Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:",
 ['context', 'question'])

In [99]:
# qa_chain = load_qa_with_sources_chain(
#     llm=OpenAI(temperature=0),
#     prompt=QA_PROMPT
# )
# qa = RetrievalQAWithSourcesChain(
#     combine_documents_chain=qa_chain,
#     retriever=docsearch.as_retriever())

# query = "What did the president say about Justice Breyer"
# qa({"input_documents": docsearch.as_retriever().get_relevant_documents(query),
#     "question": query }, 
#    return_only_outputs=True)