# Simple RAG system for Confluence articles based on Ollama
Assumes the documents were downloaded with [this script](./scrapers/scrapeconfluence.ipynb) and stored in a PostgreSQL database.

In [None]:
%pip install langchain langchain_community pgvector

In [13]:
## configuration
# Base URL of the Ollama server; passed as base_url to both the embedding
# client and the chat LLM below.
ollama_host = "http://ollama:11434"
# Model name given to the chat LLM.
chatmodel = "llama3"
# Model name given to the embedding client.
embedmodel = "mxbai-embed-large"

In [14]:
from dataclasses import dataclass,asdict
from dotenv import dotenv_values
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import pgvector
from psycopg.rows import class_row
from typing import Iterable, Iterator
import psycopg

In [15]:
@dataclass
class Article:
    """One article's metadata plus a flag telling whether vectors exist for it."""
    id: str
    parent: str
    version: int
    title: str
    status: str
    hasvectors: bool = False

    def dict(self):
        """Return all fields as a plain dict whose values are converted with ``str``."""
        stringified = {}
        for field_name, field_value in asdict(self).items():
            stringified[field_name] = str(field_value)
        return stringified

@dataclass
class ArticleText:
    """An article's id and title together with its text contents."""
    id: str
    title: str
    contents: str

    def __str__(self) -> str:
        """Render the title on the first line, followed by the contents."""
        return f'{self.title}\n{self.contents}'

def connectStr(alchemy=False):
    """Build the connection string for the ``ragtest`` database on host ``pgvector``.

    The password is read from the ``POSTGRES_PASSWORD`` entry returned by
    ``dotenv_values()``; the user is always ``postgres``.

    Args:
        alchemy: When true, return a ``postgresql+psycopg://`` URL (port 5432);
            otherwise return a ``key=value`` connection string.

    Returns:
        The connection string, with the password escaped for the chosen format.

    Raises:
        KeyError: If ``POSTGRES_PASSWORD`` is not defined.
        ValueError: If ``POSTGRES_PASSWORD`` is defined but has no value.
    """
    # Local import so this cell does not depend on a change to the import cell.
    from urllib.parse import quote_plus

    password = dotenv_values()["POSTGRES_PASSWORD"]
    if password is None:
        raise ValueError("POSTGRES_PASSWORD is declared but has no value")

    if alchemy:
        # Characters such as '@', ':' or '/' in the password would otherwise be
        # parsed as URL structure, so the password is percent-encoded.
        return f'postgresql+psycopg://postgres:{quote_plus(password)}@pgvector:5432/ragtest'

    # In key=value form a value containing spaces must be single-quoted, and
    # embedded backslashes / single quotes must be backslash-escaped.
    escaped = password.replace('\\', '\\\\').replace("'", "\\'")
    return f"host=pgvector dbname=ragtest user=postgres password='{escaped}'"

In [None]:
#generate embeddings
class PostgresLoader(BaseLoader):
    """Loader that yields articles without vectors (``hasvectors = false``) as Documents.

    Args:
        length: Maximum number of articles to return (used as the SQL ``LIMIT``).
        mincontentlen: Only articles with ``length(contents)`` greater than
            this value are returned.
    """

    def __init__(self,length:int,mincontentlen=200):
        self.length = length
        self.mincontentlen= mincontentlen

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per matching article.

        The page content is ``str(ArticleText)`` (title, newline, contents) and
        the article id is stored under ``metadata["source"]``.  The database
        connection stays open until the generator is exhausted or closed.
        """
        # Values are sent as bound parameters instead of being formatted into
        # the SQL text.
        query = (
            'select articles.id, title, contents from articles '
            'INNER JOIN articlescontents ON articlescontents.id = articles.id '
            'where hasvectors = false and length(contents) > %s LIMIT %s'
        )
        with psycopg.connect(connectStr()) as conn:
            with conn.cursor(row_factory=class_row(ArticleText)) as cur:
                cur.execute(query, (self.mincontentlen, self.length))
                # Iterating the cursor returns rows one by one, like repeated fetchone().
                for article in cur:
                    # NOTE(review): `path=` is passed through as an extra keyword;
                    # confirm Document actually uses it.
                    yield Document(str(article), path=article.id, metadata={"source": article.id})
def markasread(id):
    """Set ``hasvectors = true`` on the article row with the given id.

    Args:
        id: Identifier of the article to update.
    """
    with psycopg.connect(connectStr()) as conn:
        with conn.cursor() as cur:
            # The id is bound as a parameter, so a quote inside it can no longer
            # break or alter the statement.  str() keeps the previous behaviour
            # of handing the server a textual value for any kind of id.
            cur.execute('update articles set hasvectors = true where id = %s', (str(id),))
            
def getVectorStore():
    """Return a PGVector store that embeds text with the configured Ollama model.

    The store is built from the SQLAlchemy-style connection string
    (``connectStr(True)``) and an ``OllamaEmbeddings`` client using
    ``ollama_host`` and ``embedmodel``.
    """
    embedder = OllamaEmbeddings(model=embedmodel, base_url=ollama_host)
    store = pgvector.PGVector(connectStr(True), embedder)
    return store


def loadEmbeddings(n=10):
    """Embed up to ``n`` pending articles and flag each one once it is stored.

    Every document produced by ``PostgresLoader(n)`` is split into chunks
    (chunk_size=1000, chunk_overlap=200), the chunks are added to the vector
    store, and ``markasread`` is then called with the document's ``source``
    metadata value.
    """
    store = getVectorStore()
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pending = PostgresLoader(n).lazy_load()
    for article_doc in pending:
        store.add_documents(splitter.split_documents([article_doc]))
        # Flag the article only after its chunks have been added.
        markasread(article_doc.metadata.get("source"))

# Run the ingestion: embed up to 30000 articles that do not have vectors yet.
# Articles already flagged with hasvectors = true are not selected again.
loadEmbeddings(30000)
# Manual helper call (disabled): flag a single article by id.
# markasread("70516867")

In [37]:
# raw vector search
# Query the vector store directly, without the LLM, to inspect which chunks
# are retrieved for a question.
# NOTE(review): newer LangChain releases prefer `retriever.invoke(...)` over
# `get_relevant_documents(...)` - confirm against the installed version.
retriever = getVectorStore().as_retriever()
retriever.get_relevant_documents("how do I import a transport in different clients?")

[Document(page_content='HOW DO I: control the distribution of particular transports to particular SAP clients through the landscape?\nQuestion\n: Is it possible within ActiveControl to\xa0distribute particular transports to particular SAP clients through the landscape?\n\xa0\nAnswer\n:\nVersion 6.20 adds the capability to distribute transports to particular system clients of an SAP landscape, based on rules and the source client of the transport.\nThis requirement had previously been delivered as a custom enhancement for a customer wanting to distribute transports to clients in their Production system for seperate business entities, but has now been enhanced and added to standard ActiveControl for any other customers wanting to benefit from this capability.\n\xa0\nConfiguration\n1) Populate /BTI/TE_IMP_CLI table in the Domain Controller with your required transport distribution rules\nField label / [technical field name]\xa0\nPath ID [PATH]\n: the AC path for which the rule is being de

In [41]:
# create a chatbot
# Chat model served by the Ollama host from the configuration cell.
llm = Ollama(model=chatmodel,base_url=ollama_host)

# Conversation memory, exposed to the chain under the key 'chat_history'.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Chain combining the LLM, a retriever over the vector store, and the memory.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    chain_type="stuff",
    retriever=getVectorStore().as_retriever(),
    memory=memory
)

In [43]:
# Ask one question through the conversational chain and show its answer.
# NOTE(review): newer LangChain releases prefer `conversation_chain.invoke(...)`
# over calling the chain directly - confirm against the installed version.
query = "How do I distribute a transport to a given set of clients?"
result = conversation_chain({"question": query})
# The chain result is indexed by key; the generated text is under "answer".
answer = result["answer"]
answer

'Based on the provided context, here\'s how you can distribute a transport to a given set of clients:\n\nTo setup automated SCC1 within ActiveControl and distribute a transport to a given set of clients, follow these steps:\n\n**Step 1: Configure SCC1 Automation**\n\n* Go to the Development target in ActiveControl.\n* Enable SCC1 automation by checking the box for "Execute client copy SCC1 automatically on Transport Release" or "Execute client copy SCC1 automatically on Transport Form creation".\n* Choose the correct option based on your organization\'s process and workflow.\n\n**Step 2: Define Clients**\n\n* In the Development target, enter the clients into which you want to distribute the transport.\n* Note that it is not an issue to still configure the main Development client (ie the source client of the transport).\n\n**Step 3: Configure Authorization**\n\n* Ensure that the AC_RFC user has the necessary authorizations in all relevant clients:\n\t+ S_CLNT_IMP\n\t+ S_USER_AGR\n\t+ S_