# Simple RAG system for Confluence articles, based on Ollama
Assumes the documents were downloaded with [this script](./scrapers/scrapeconfluence.ipynb) and stored in a PostgreSQL database.

In [None]:
%pip install langchain langchain_community pgvector

In [2]:
## configuration
# Base URL of the Ollama server used for both the chat model and the embeddings
ollama_host = "http://ollama:11434"
# Model passed to the Ollama LLM wrapper
chatmodel = "llama3"
# Model passed to the Ollama embeddings wrapper
embedmodel = "mxbai-embed-large"

In [3]:
from dataclasses import dataclass,asdict
from dotenv import dotenv_values
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.prompts.chat import SystemMessagePromptTemplate,HumanMessagePromptTemplate,ChatPromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import pgvector
from psycopg.rows import class_row
from typing import Iterable, Iterator
import psycopg

In [5]:
@dataclass
class Article:
    """Plain record with an article's id, parent, version, title and status.

    `hasvectors` defaults to False.
    NOTE(review): presumably mirrors a row of the `articles` table - confirm against the scraper.
    """
    id: str
    parent: str
    version: int
    title: str
    status: str
    hasvectors: bool = False

    def dict(self):
        """Return all fields as a dictionary whose values are converted with `str`."""
        stringified = {}
        for field_name, field_value in asdict(self).items():
            stringified[field_name] = str(field_value)
        return stringified

@dataclass
class ArticleText:
    """An article's id, title and contents; `str()` gives the title and contents on separate lines."""
    id: str
    title: str
    contents: str

    def __str__(self) -> str:
        # Title first, then the contents, joined by a single newline
        return '{}\n{}'.format(self.title, self.contents)

def connectStr(alchemy=False):
    """Build the connection string for database `ragtest` on host `pgvector` as user `postgres`.

    The password is the `POSTGRES_PASSWORD` entry of `dotenv_values()`;
    a missing entry raises `KeyError`.

    Args:
        alchemy: when true, return a `postgresql+psycopg://` URL; otherwise
            return a `key=value` connection string.

    Returns:
        The connection string, with the password escaped for the chosen format.
    """
    from urllib.parse import quote

    password = dotenv_values()["POSTGRES_PASSWORD"]
    if alchemy:
        # Percent-encode so '@', ':', '/' or spaces in the password cannot be read as URL delimiters
        encoded = quote(password, safe="")
        return f'postgresql+psycopg://postgres:{encoded}@pgvector:5432/ragtest'
    # key=value format: wrap the value in single quotes and backslash-escape \ and '
    escaped = password.replace('\\', '\\\\').replace("'", "\\'")
    return f"host=pgvector dbname=ragtest user=postgres password='{escaped}'"

In [6]:
#generate embeddings
class PostgresLoader(BaseLoader):
    """Document loader that reads articles with `hasvectors = false` from the database.

    Args:
        length: upper bound on the number of rows fetched (used as the SQL `LIMIT`).
        mincontentlen: only rows whose `length(contents)` exceeds this value are fetched.
    """

    def __init__(self, length: int, mincontentlen: int = 200):
        self.length = length
        self.mincontentlen = mincontentlen

    def lazy_load(self) -> Iterator[Document]:
        """Yield one `Document` per selected article.

        The page content is `str(ArticleText)` (title, newline, contents) and
        the article id is stored under `metadata["source"]`.
        """
        # Both limits are bound as query parameters rather than formatted into the SQL text
        query = '''select articles.id, title, contents from articles INNER JOIN articlescontents ON articlescontents.id = articles.id
                   where hasvectors = false and length(contents) > %s LIMIT %s'''
        with psycopg.connect(connectStr()) as conn:
            with conn.cursor(row_factory=class_row(ArticleText)) as cur:
                cur.execute(query, (self.mincontentlen, self.length))
                # Iterating the cursor yields the rows one by one until the result set is exhausted
                for article in cur:
                    # NOTE(review): `path` is passed through unchanged from the original code - confirm Document uses it
                    yield Document(str(article), path=article.id, metadata={"source": article.id})
def markasread(id ):
    """Set `hasvectors = true` on the `articles` row whose id equals `id`.

    Args:
        id: identifier of the article; it is converted with `str` before being sent.
    """
    with psycopg.connect(connectStr()) as conn:
        with conn.cursor() as cur:
            # Bind the id as a parameter instead of splicing it between quotes in the SQL text;
            # str() keeps it a string value, like the quoted literal it replaces.
            cur.execute('update articles set hasvectors = true where id = %s', (str(id),))
            
def getVectorStore():
    """Return a `PGVector` store built from `connectStr(True)` and Ollama embeddings.

    The embeddings use the model `embedmodel` served at `ollama_host`.
    """
    embedder = OllamaEmbeddings(model=embedmodel, base_url=ollama_host)
    connection_url = connectStr(alchemy=True)
    return pgvector.PGVector(connection_url, embedder)


def loadEmbeddings(n=10, chunk_size=1000, chunk_overlap=200):
    """Split up to `n` pending articles into chunks, store them in the vector store and flag them.

    Args:
        n: maximum number of articles requested from `PostgresLoader`.
        chunk_size: `chunk_size` passed to `CharacterTextSplitter`.
        chunk_overlap: `chunk_overlap` passed to `CharacterTextSplitter`.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    vectorstore = getVectorStore()
    for doc in PostgresLoader(n).lazy_load():
        chunks = text_splitter.split_documents([doc])
        vectorstore.add_documents(chunks)
        # Flag the article only after its chunks were added, so an error above leaves it unflagged
        markasread(doc.metadata.get("source"))

# Process at most 30000 articles that are still flagged hasvectors = false
loadEmbeddings(n=30000)
# Manual flagging of a single article, kept for reference: markasread("70516867")

  warn_deprecated(
  warn_deprecated(


In [7]:
# Raw vector search: query the vector store directly, without the chat model
retriever = getVectorStore().as_retriever()
# `invoke` is the replacement for the deprecated `get_relevant_documents`
retriever.invoke("how do I import a transport in different clients?")

  warn_deprecated(


[Document(metadata={'source': '2359337524'}, page_content='HOW DO I: control the distribution of particular transports to particular SAP clients through the landscape?\nQuestion\n: Is it possible within ActiveControl to\xa0distribute particular transports to particular SAP clients through the landscape?\n\xa0\nAnswer\n:\nVersion 6.20 adds the capability to distribute transports to particular system clients of an SAP landscape, based on rules and the source client of the transport.\nThis requirement had previously been delivered as a custom enhancement for a customer wanting to distribute transports to clients in their Production system for seperate business entities, but has now been enhanced and added to standard ActiveControl for any other customers wanting to benefit from this capability.\n\xa0\nConfiguration\n1) Populate /BTI/TE_IMP_CLI table in the Domain Controller with your required transport distribution rules\nField label / [technical field name]\xa0\nPath ID [PATH]\n: the AC 

In [8]:
# Create a chatbot: Ollama chat model + retriever over the vector store + conversation memory
llm = Ollama(base_url=ollama_host, model=chatmodel)

# System prompt; the {context} placeholder is presumably filled by the "stuff" chain with the retrieved documents
# NOTE(review): the prompt asks for a source reference - confirm the documents' `source` metadata actually reaches {context}
general_system_template = r""" 
Given a specific context, please give a short answer to the question, covering the required advices in general and then provide the names all of relevant(even if it relates a bit) products.
Always include a reference to the source document 
 ----
{context}
----
"""
# User prompt; {question} is the placeholder for the question text
general_user_template = "Question:```{question}```"
messages = [
    SystemMessagePromptTemplate.from_template(general_system_template),
    HumanMessagePromptTemplate.from_template(general_user_template),
]
qa_prompt = ChatPromptTemplate.from_messages(messages)

# Buffer memory stored under the key "chat_history"
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=getVectorStore().as_retriever(),
    chain_type="stuff",
    combine_docs_chain_kwargs={"prompt": qa_prompt},
    memory=memory,
)

  warn_deprecated(


In [14]:
# Ask the chain a question and show the text stored under the "answer" key of its result
query = "How do I choose the clients where a transport will be imported?"
# `.invoke` replaces the deprecated direct call `conversation_chain({...})`
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
answer

"To control the distribution of particular transports to particular SAP clients through the landscape, follow these steps:\n\n1. Populate the `/BTI/TE_IMP_CLI` table in the Domain Controller with your required transport distribution rules.\n2. Switch on the `/BTI/TE_EXIT_SAMPLE_0065` exit in the `/BTI/TE_EXITC` table in the Domain Controller.\n\nNotes:\n\n* The import queue is not skipped; it's during the import that the skipping occurs based on the rules defined in the configuration table.\n* Clients will be sorted based on `SORTORDER`.\n* If a client is not defined in the configuration, it will be ignored.\n* If no entries are found for a transport, it will be imported into all clients defined in the config.\n* This functionality is only for transport distribution from Development through the subsequent targets in a path. It's not possible to use this functionality during the standard Merge process.\n\nProducts:\n\n* ActiveControl"