### 04. RAG med vektordatabas

- mer avancerade uppslag av data som sedan skickas med i prompten

![04 diagram](docs/04.drawio.png)

- med en vektordatabas kan man söka i ostrukturerad data
- en modell skapar 'vector embeddings' som lagras i databasen
- av frågan som ställs skapas också en 'vector embedding' av samma modell
- nu kan man matematiskt avgöra semantiska matchningar via 'nearest neighbour'-algoritmen

![01 vector](docs/vector01.png)

![02 vector](docs/vector02.png)

In [1]:
import os
from dotenv import load_dotenv

from google import genai
from google.genai import types

import chromadb

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Pull variables from a local .env file into the process environment,
# then read the Google API key from there.
load_dotenv()
API_KEY = os.environ.get("GOOGLE_API_KEY")

# Gemini client shared by the cells below.
client = genai.Client(
    api_key=API_KEY,
)

In [2]:
# Load the PDF files into the vector database.

# Chroma client backed by the data/vector_db path.
chroma_client = chromadb.PersistentClient(path="data/vector_db")

collection = chroma_client.get_or_create_collection(name="spoe-data")

loader = PyPDFDirectoryLoader("data/documents")
raw_documents = loader.load()

# Split the loaded pages into chunks of at most 300 characters (length is
# measured with len), with 100 characters of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

chunks = text_splitter.split_documents(raw_documents)

# Fail early with a clear message instead of an opaque validation error
# from upsert() when the document folder yielded no text.
if not chunks:
    raise ValueError(
        "No text chunks were produced from 'data/documents'; nothing to index."
    )

documents = [chunk.page_content for chunk in chunks]
metadata = [chunk.metadata for chunk in chunks]
# IDs are positional ("ID0", "ID1", ...), so re-running this cell overwrites
# the entries with the same IDs.
# NOTE(review): if a later run produces fewer chunks, entries with higher IDs
# from the earlier run stay in the collection — confirm whether that matters.
ids = [f"ID{index}" for index in range(len(chunks))]

# No embeddings are passed here, so computing them is left to the collection
# (presumably Chroma's default embedding function — TODO confirm).
collection.upsert(documents=documents, metadatas=metadata, ids=ids)

In [3]:
# Define the prompt: the user's question, used both for the vector DB lookup
# and as the user message sent to the LLM.

user_prompt = "hur lång tid kommer genomförandet av SPOE-projektet att pågå?"

In [None]:
# Send the question (prompt) to the vector DB first and ask for the 4 best
# matching chunks.

results = collection.query(n_results=4, query_texts=[user_prompt])

# Show the matched chunk texts, then their metadata.
for field in ("documents", "metadatas"):
    print(results[field])

In [None]:
# Define the prompt for the LLM; it includes the search results from the
# vector DB.

# results["documents"] holds one list of chunk texts per query text. Only one
# query text was sent, so take the first list (or nothing if it is missing).
retrieved_chunks = (results["documents"] or [[]])[0]

# Join the chunks as plain text rather than embedding str() of the nested
# list, which would put brackets, quotes and escaped newlines in the prompt.
context = "\n\n".join(retrieved_chunks)

system_prompt = (
    """
You are a helpful assistant. You answer questions about the SPOE project. 
But you only answer based on knowledge I'm providing you. You don't use your internal 
knowledge and you don't make things up. If you don't know the answer, just say: I don't know
--------------------
The data:
"""
    + context
    + """
"""
)

# The conversation itself contains only the user's question. The instructions
# go in as a system instruction instead of a fabricated "model" turn, which
# would present them as something the model itself said earlier.
contents = [types.Content(role="user", parts=[types.Part(text=user_prompt)])]

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=contents,
    config=types.GenerateContentConfig(system_instruction=system_prompt),
)

# response.text is the SDK's accessor for the text of the reply; it avoids
# indexing into candidates/parts by hand.
print(response.text)