# Simple RAG system for Confluence articles, based on Ollama
Assumes the documents were downloaded with [this script](./scrapers/scrapeconfluence.ipynb) and stored in a Postgres database.

In [None]:
%pip install langchain langchain_community pgvector

In [2]:
## configuration
# Base URL of the Ollama server used for both chat and embeddings
ollama_host = "http://ollama:11434"
# Ollama model that answers questions
chatmodel = "llama3"
# Ollama model that computes the embeddings
embedmodel = "mxbai-embed-large"

In [3]:
from dataclasses import dataclass,asdict
from dotenv import dotenv_values
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.prompts.chat import SystemMessagePromptTemplate,HumanMessagePromptTemplate,ChatPromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import pgvector
from psycopg.rows import class_row
from typing import Iterable, Iterator
import psycopg

In [5]:
@dataclass
class Article:
    """Metadata record of an article; ``hasvectors`` defaults to ``False``."""
    id: str
    parent: str
    version: int
    title: str
    status: str
    hasvectors: bool = False

    def dict(self):
        """Return all fields as a dict whose values are converted with ``str``."""
        stringified = {}
        for name, value in asdict(self).items():
            stringified[name] = str(value)
        return stringified

@dataclass
class ArticleText:
    """Id, title and text contents of one article."""
    id: str
    title: str
    contents: str

    def __str__(self) -> str:
        """Render the title on the first line, followed by the contents."""
        heading, body = self.title, self.contents
        return f'{heading}\n{body}'

def connectStr(alchemy=False):
    """Build the connection string for the ``ragtest`` database on host ``pgvector``.

    The password is taken from the ``POSTGRES_PASSWORD`` entry returned by
    ``dotenv_values()`` and escaped for the requested format, so characters
    such as ``@``, ``/``, ``:``, spaces or quotes cannot corrupt the result.

    Args:
        alchemy: When true, return an SQLAlchemy-style URL
            (``postgresql+psycopg://...``); otherwise return a libpq
            ``key=value`` connection string.

    Returns:
        The connection string in the requested format.

    Raises:
        KeyError: If ``POSTGRES_PASSWORD`` is not defined.
    """
    from urllib.parse import quote

    password = dotenv_values()["POSTGRES_PASSWORD"]
    if alchemy:
        # Percent-encode every reserved character so the URL parser cannot
        # mistake part of the password for a delimiter.
        encoded = quote(password, safe="")
        return f'postgresql+psycopg://postgres:{encoded}@pgvector:5432/ragtest'
    # libpq syntax: single-quote the value, backslash-escape \ and ' inside it.
    escaped = password.replace('\\', '\\\\').replace("'", "\\'")
    return f"host=pgvector dbname=ragtest user=postgres password='{escaped}'"

In [None]:
#generate embeddings
class PostgresLoader(BaseLoader):
    """Document loader that reads not-yet-embedded articles from Postgres.

    Selects rows from ``articles`` joined with ``articlescontents`` where
    ``hasvectors`` is false and the contents are longer than ``mincontentlen``
    characters, returning at most ``length`` rows.
    """

    def __init__(self, length: int, mincontentlen=200):
        """Store the query limits.

        Args:
            length: Maximum number of articles to load (SQL ``LIMIT``).
            mincontentlen: Only articles whose contents are longer than this
                many characters are loaded.
        """
        self.length = length
        self.mincontentlen = mincontentlen

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per matching article.

        The page content is the article title followed by its contents; the
        metadata carries the article id as ``source`` and the title as ``title``.
        """
        # Values are passed as bound parameters so they are never spliced
        # into the SQL text.
        query = '''select articles.id, title, contents from articles INNER JOIN articlescontents ON articlescontents.id = articles.id
                   where hasvectors = false and length(contents) > %s LIMIT %s'''
        with psycopg.connect(connectStr()) as conn:
            with conn.cursor(row_factory=class_row(ArticleText)) as cur:
                cur.execute(query, (self.mincontentlen, self.length))
                # Iterating the cursor yields the rows one by one until exhausted.
                for article in cur:
                    yield Document(str(article), path=article.id, metadata={"source": article.id, "title": article.title})
def markasread(id):
    """Set ``hasvectors = true`` on the article with the given id.

    Args:
        id: Value of ``articles.id`` identifying the row to update. It is
            sent in its string form, as the original quoted literal was.
    """
    with psycopg.connect(connectStr()) as conn:
        with conn.cursor() as cur:
            # Bound parameter: an id containing a quote can neither break nor
            # alter the statement.
            cur.execute("update articles set hasvectors = true where id = %s", (str(id),))
            
def getVectorStore():
    """Return a PGVector store that embeds through the configured Ollama model."""
    embedder = OllamaEmbeddings(model=embedmodel, base_url=ollama_host)
    connection = connectStr(True)
    return pgvector.PGVector(connection, embedder)


def loadEmbeddings(n=10):
    """Embed up to ``n`` articles and store their chunks in the vector store.

    Each document produced by ``PostgresLoader`` is split into chunks, an HTML
    link to the source article is appended to every chunk, the chunks are
    added to the vector store, and the article id is then passed to
    ``markasread``.

    Args:
        n: Maximum number of articles to process in this call.

    Raises:
        KeyError: If ``JIRASDBASEURL`` is missing from the dotenv values.
    """
    # Loop-invariant: read the .env file once instead of once per article,
    # and fail before any database work if the setting is missing.
    baseurl = dotenv_values()["JIRASDBASEURL"]
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    vectorstore = getVectorStore()
    for doc in PostgresLoader(n).lazy_load():
        chunks = text_splitter.split_documents([doc])
        # add document link to each chunk. This will likely overflow the chunk size limit
        for chunk in chunks:
            url = f'{baseurl}/servicedesk/customer/portal/6/article/{chunk.metadata.get("source")}'
            chunk.page_content = f'{chunk.page_content}\nfor more information see <a href="{url}">{chunk.metadata.get("title")}</a>'
        vectorstore.add_documents(chunks)
        # Flag the article only after its chunks were stored.
        markasread(doc.metadata.get("source"))

# Process at most 30000 articles in this run
loadEmbeddings(n=30_000)

In [7]:
# raw vector search
retriever = getVectorStore().as_retriever()
# invoke() replaces the deprecated get_relevant_documents() and returns the same list of documents
retriever.invoke("how do I import a transport in different clients?")

  warn_deprecated(


[Document(metadata={'source': '2359337524'}, page_content='HOW DO I: control the distribution of particular transports to particular SAP clients through the landscape?\nQuestion\n: Is it possible within ActiveControl to\xa0distribute particular transports to particular SAP clients through the landscape?\n\xa0\nAnswer\n:\nVersion 6.20 adds the capability to distribute transports to particular system clients of an SAP landscape, based on rules and the source client of the transport.\nThis requirement had previously been delivered as a custom enhancement for a customer wanting to distribute transports to clients in their Production system for seperate business entities, but has now been enhanced and added to standard ActiveControl for any other customers wanting to benefit from this capability.\n\xa0\nConfiguration\n1) Populate /BTI/TE_IMP_CLI table in the Domain Controller with your required transport distribution rules\nField label / [technical field name]\xa0\nPath ID [PATH]\n: the AC 

In [26]:
# create a chatbot
llm = Ollama(base_url=ollama_host, model=chatmodel)

# System prompt; the {context} placeholder is declared here for the combine-documents prompt
general_system_template = r""" 
Given a specific context, please give a concise but exhaustive answer to the question, covering the required advices in general and then provide the names all of relevant(even if it relates a bit) products.
Always report any link you find
 ----
{context}
----
"""
# User prompt; the question is wrapped in a fenced block
general_user_template = "Question:```{question}```"

messages = [
    SystemMessagePromptTemplate.from_template(general_system_template),
    HumanMessagePromptTemplate.from_template(general_user_template),
]
qa_prompt = ChatPromptTemplate.from_messages(messages)

# Conversation history is kept in memory under the 'chat_history' key
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=getVectorStore().as_retriever(),
    chain_type="stuff",
    memory=memory,
    combine_docs_chain_kwargs=dict(prompt=qa_prompt),
)

  warn_deprecated(


In [23]:
# Calling the chain object directly is deprecated; invoke() takes the same input dict
conversation_chain.invoke({"question": "how do I disable form creation checks?"})["answer"]

'Based on the provided context, to disable form creation checks, you can use a feature flag called `EXTERNAL_TARGET_NOPATHCHECK` with a value of \'X\' in the `/BTI/TE_TVARV` table. This flag is related to TE-5627.\n\nAdditionally, if you want to switch off the automatic population of certain fields when creating a new Transport Form or Business Task, you would need to use user exits (610 and 620 respectively) as there is no configuration option available for this. You can refer to the Administration Guide for details on setting up these user exits.\n\nReferences:\n1. ActiveControl Change Notes: TE-5627 : A feature flag to disable some form creation check\n2. Is it possible to switch off the automatic population of certain fields when you create a new Transport Form or Business Task?\n\nPlease note that the "Allow TF Creation By-Pass" option is not recommended for customers and is intended only for internal Basis Technologies development purposes.'

In [28]:
query = "How do I choose the clients where a transport will be imported?"
# Calling the chain object directly is deprecated; invoke() takes the same input dict
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
answer

"It's possible to set Import Modes at an individual transport level, similar to during a manual STMS transport import.\n\nWithin ActiveControl, you can specify the unconditional mode within the Transport Form's [Advanced Options] tab. This allows for more control over the import process.\n\nFor more information, see:\n\nhttps://get.support.basistechnologies.com/servicedesk/customer/portal/6/article/2359995157\n\nAs for doing an en-masse upload of Transport Forms to a Control Point, you can use the backend program `/BTI/TE_RMASS_ADD_TO_CP`.\n\nThis is only available in the standard product as of ActiveControl 7.1.\n\nFor more information, see:\n\nhttps://get.support.basistechnologies.com/servicedesk/customer/portal/6/article/2359696160\n\nTo create rules to import transports into specific client(s) based on custom fields in Transport Form, follow these steps:\n\n4. Creation of Rules for overall master step in `/BTI/TE_RE_RULES`.\n5. Definition of step conditions in `/BTI/TE_RE_STEPC`\n6