# Knowledge Base

## Auswahl der Sprachmodelle 

### llama3:8b
- **Leistung**: Starke Performance für RAG-Anwendungen mit guter Kontextverarbeitung
- **Größe**: Mit 8B Parametern effizient auf Consumer-Hardware lauffähig
- **Lizenz**: Die Meta Llama 3 Community License erlaubt kommerzielle Nutzung (unter den in der Lizenz genannten Auflagen)
- **Aktualität**: Neues Modell mit modernem Trainingsdatenset und verbesserter Instruction-Following-Fähigkeit

### mistral:7b-instruct
- **Effizienz**: Ausgezeichnetes Leistungs-Größen-Verhältnis
- **Spezialisierung**: Optimiert für Instruction-Following und Kontextverständnis
- **Architektur**: Grouped-Query Attention (GQA) und Sliding Window Attention (SWA, im ursprünglichen Mistral 7B) für eine effiziente Verarbeitung langer Dokumente
- **Community-Support**: Breite Nutzerbasis und dokumentierte Anwendungsfälle für RAG

### phi4-mini
- **Ressourcenschonung**: Kleines Modell (3.8B) für Systeme mit begrenzten Ressourcen
- **Effizienz**: Hervorragende Leistung trotz geringer Größe
- **Antwortqualität**: Gute Formulierungsfähigkeit bei unternehmensbezogenen Inhalten
- **Kompatibilität**: Geringer VRAM-Bedarf macht es auf verschiedenen Systemen einsetzbar

Dieser Mix bietet eine gute Balance zwischen Performance, Ressourcenbedarf und verschiedenen Architekturen für einen aussagekräftigen Vergleich.



## Einrichtung der Knowledge Base

### Config

In [3]:
import os
import shutil
from pathlib import Path
import fitz
import re
from typing import List
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.schema import Document
import time
import psutil
import requests
import json

# Folder structure
# All working data lives below BASE_DIR (relative to the current working directory).
BASE_DIR = Path("knowledge-base")
# PDFs waiting to be converted to text are looked up here.
IMPORT_DIR = BASE_DIR / "import"
# PDFs are moved here once their text has been extracted.
PROCESSED_DIR = BASE_DIR / "processed"
# Extracted .txt files; these are the documents that get chunked and embedded.
INPUT_DIR = BASE_DIR / "embeddings-ready"
# Parent folder of every persisted Chroma store created by the experiments.
VECTOR_DB_DIR = BASE_DIR / "vector-stores"


# Query Options
# NOTE: in the visible code CONTEXT_WINDOW and TOKEN_LIMIT are only used to derive
# the chunk size in prepare_chunks(); they are not sent to Ollama.
CONTEXT_WINDOW = 8192
TOKEN_LIMIT = 4096
# Default LLM used by search_and_query_llm() when no model is passed.
OLLAMA_MODEL = "phi4-mini"
# Question asked in every experiment, and the substring an answer must contain
# (compared case-insensitively) to count as correct.
TEST_QUERY = "Which callback function is called during training?"
EXPECTED_ANSWER = "ModelCheckpoint"

# Define Embedding Configs
# Each entry: a label ("name"), the Hugging Face model id ("model") and the
# directory of the Chroma store built with that model ("db_path").
# NOTE(review): the labels word/sentence/document_level are names only; the code
# treats all three models identically - confirm the intended meaning.
# NOTE(review): E5 models are documented to expect "query: "/"passage: " prefixes;
# none are added anywhere in this file - verify whether that matters here.
EMBEDDING_CONFIGS = [
    {
        "name": "word_level",
        "model": "sentence-transformers/all-MiniLM-L6-v2",
        "db_path": VECTOR_DB_DIR / "word_level_db"
    },
    {
        "name": "sentence_level",
        "model": "sentence-transformers/all-mpnet-base-v2",
        "db_path": VECTOR_DB_DIR / "sentence_level_db"
    },
    {
        "name": "document_level",
        "model": "intfloat/multilingual-e5-large",
        "db_path": VECTOR_DB_DIR / "document_level_db"
    }
]

# Embedding Model to use for chunking test
# (also used when the different Ollama models are compared)
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# Chunking methods
# Maps a method label to a pre-configured splitter. No length_function is given,
# so sizes use the splitter's default measure (presumably characters - confirm).
CHUNKING_METHODS = {
    "fixed_size": RecursiveCharacterTextSplitter(
        chunk_size=1000, 
        chunk_overlap=100
    ),
    # Prefers paragraph, line and sentence-punctuation boundaries, without overlap.
    "sentence": RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", "! ", "? ", ";", ":"],
        chunk_size=1000,
        chunk_overlap=0
    ),
    # Splits on blank lines / line breaks only and allows larger chunks.
    "paragraph": RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"],
        chunk_size=2000,
        chunk_overlap=50
    )
}

# Ollama model tags compared in the model test.
OLLAMA_MODELS = ["llama3:8b", "mistral:7b-instruct", "phi4-mini"]

# Default chunking method for model tests
# NOTE: CHUNKING_DB is not derived from CHUNKING_METHOD; it hard-codes the
# "chunking_<method>" folder name used when the chunking stores are created,
# so both values have to be kept in sync by hand.
CHUNKING_METHOD = "paragraph"
CHUNKING_DB = VECTOR_DB_DIR / "chunking_paragraph"

# Ensure Folders are created
VECTOR_DB_DIR.mkdir(exist_ok=True, parents=True)

### Vector-DB und Query Methoden definieren

In [11]:
def load_vector_db(db_path, embedding_model):
    """Open the Chroma vector store persisted under ``db_path``.

    Args:
        db_path: Directory of the persisted store; passed to Chroma as ``str``.
        embedding_model: Model name handed to ``HuggingFaceEmbeddings``. It is
            used as the store's embedding function (e.g. to embed queries).

    Returns:
        The ``Chroma`` instance bound to that directory.
    """
    print(f"Lade Vector DB aus {db_path}")
    return Chroma(
        persist_directory=str(db_path),
        embedding_function=HuggingFaceEmbeddings(model_name=embedding_model),
    )

def query_ollama(prompt, model, timeout=None):
    """Send a single-turn, non-streaming chat request to the local Ollama server.

    The decoded response is additionally written as pretty-printed JSON to
    ``BASE_DIR / "results" / "query_result_<model>.txt"``; the file is
    overwritten on every call for the same model.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Ollama model tag, e.g. ``"llama3:8b"``.
        timeout: Optional timeout forwarded to ``requests.post``. ``None``
            (the default) waits indefinitely, as before.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status
            (for example when the model is not available). Previously the
            error body was returned and only failed later in the caller.
    """
    # Create a results directory if it doesn't exist
    results_dir = BASE_DIR / "results"
    results_dir.mkdir(exist_ok=True, parents=True)

    # Model tags may contain characters that are invalid in file names
    # (':' on Windows, '/' for namespaced models); map all of them to '_'.
    safe_model = re.sub(r"[^\w.\-]", "_", model)
    output_file = results_dir / f"query_result_{safe_model}.txt"

    response = requests.post(
        "http://localhost:11434/api/chat",
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False
        },
        timeout=timeout,
    )
    # Fail here with a clear HTTP error instead of passing an error body on.
    response.raise_for_status()
    result = response.json()

    # Save to file instead of printing; keep umlauts etc. readable in the
    # UTF-8 file rather than writing \uXXXX escapes.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    return result
  
def search_and_query_llm(db_path, query, embedding_model, model=OLLAMA_MODEL, k=3):
    """Retrieve context for ``query`` from a vector store and ask an Ollama model.

    Args:
        db_path: Directory of the persisted Chroma store to search.
        query: The question; used both for retrieval and in the prompt.
        embedding_model: Embedding model name used to open the store.
        model: Ollama model tag that answers the question.
        k: Number of most similar chunks put into the prompt (default 3,
            the previously hard-coded value).

    Returns:
        A dict with ``search_time``, ``answer_time`` and ``total_time`` in
        seconds and the model's ``answer`` text.

    Raises:
        KeyError: If the Ollama response has no ``message.content`` field.
    """
    # perf_counter is monotonic, so the measured durations cannot be distorted
    # by system clock adjustments.
    # NOTE: search_time also covers opening the store (including loading the
    # embedding model), not only the similarity search itself.
    search_started = time.perf_counter()
    print(f"\nSuche in {db_path} nach: '{query}'")

    db = load_vector_db(db_path, embedding_model)
    docs = db.similarity_search(query, k=k)
    search_time = time.perf_counter() - search_started

    context = "\n\n".join(doc.page_content for doc in docs)
    prompt = f"""Basierend auf dem folgenden Kontext, beantworte die Frage.

Kontext:
{context}

Frage: {query}"""

    answer_started = time.perf_counter()
    response = query_ollama(prompt, model)
    antwortzeit = time.perf_counter() - answer_started
    content = response['message']['content']

    return {
        "search_time": search_time,
        "answer_time": antwortzeit,
        "total_time": search_time + antwortzeit,
        "answer": content
    }


def create_vector_db(chunks, model_path, db_path):
    """Embed ``chunks`` into a new persisted Chroma store and measure the cost.

    Args:
        chunks: Documents to embed and store.
        model_path: Model name handed to ``HuggingFaceEmbeddings``.
        db_path: Directory in which the store is persisted.

    Returns:
        A tuple ``(db, stats)``. ``stats`` holds ``db_path``, the number of
        ``chunks``, the ``creation_time`` in seconds and ``memory_usage``,
        the change in this process's resident set size in MB.
    """
    def rss_mb():
        # Resident set size of the current process, converted from bytes to MB.
        return psutil.Process().memory_info().rss / 1024 / 1024

    started = time.time()
    rss_before = rss_mb()

    print(f"\nErstelle Vector DB mit {model_path}")
    db = Chroma.from_documents(
        chunks,
        HuggingFaceEmbeddings(model_name=model_path),
        persist_directory=str(db_path),
    )

    memory_usage = rss_mb() - rss_before
    creation_time = time.time() - started

    print(f"Vector DB in {db_path} gespeichert")
    print(f"Erstellungsdauer: {creation_time:.2f} Sekunden")
    print(f"Speicherverbrauch: {memory_usage:.2f} MB")

    stats = {
        "db_path": db_path,
        "chunks": len(chunks),
        "creation_time": creation_time,
        "memory_usage": memory_usage,
    }
    return db, stats


### Verzeichnisstruktur einlesen und laden

In [5]:
def load_documents(directory: Path):
    """Load every ``*.txt`` file directly inside ``directory`` as a ``Document``.

    Files are read as UTF-8 and processed in sorted path order, so the
    resulting document (and later chunk) order is reproducible across runs
    and platforms instead of depending on the file system's listing order.

    Args:
        directory: Folder containing the text files.

    Returns:
        A list of ``Document`` objects whose ``source`` metadata is the file name.
    """
    documents = [
        Document(
            page_content=file_path.read_text(encoding="utf-8"),
            metadata={"source": file_path.name},
        )
        for file_path in sorted(directory.glob("*.txt"))
    ]
    print(f"Geladen: {len(documents)} Dokumente")
    return documents

# Create all working folders up front (no error if they already exist) so the
# following steps can list, write and move files without further checks.
for dir_path in [BASE_DIR, IMPORT_DIR, PROCESSED_DIR, INPUT_DIR]:
    dir_path.mkdir(exist_ok=True, parents=True)
    
def list_pdf_files(directory):
    """Return the PDF files directly inside ``directory`` in sorted order.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is found on
    case-sensitive file systems too. Directories whose name happens to end in
    ``.pdf`` are skipped because they cannot be opened as documents.

    Args:
        directory: ``pathlib.Path`` of the folder to scan (not recursive).

    Returns:
        A sorted list of ``Path`` objects; empty if nothing matches.
    """
    return sorted(
        entry
        for entry in directory.glob("*")
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )
  
# Collect the PDFs currently waiting in the import folder and report them.
pdf_files = list_pdf_files(IMPORT_DIR)

print(f"PDF-Dateien: {len(pdf_files)}")
for pdf in pdf_files:
    print(f" - {pdf.name}")

def process_pdf(pdf_path):
    """Extract the text of a PDF as one string with page markers.

    For every page the whitespace is collapsed to single spaces. Non-empty
    pages are prefixed with a ``--- Seite N ---`` header (1-based page number)
    and joined with blank lines. Finally a line break is inserted after
    sentence-ending punctuation.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        The processed text; an empty string if no page contains text.
    """
    text_content = []

    document = fitz.open(pdf_path)
    try:
        for page_number, page in enumerate(document, start=1):
            text = re.sub(r'\s+', ' ', page.get_text()).strip()
            if text:
                text_content.append(f"--- Seite {page_number} ---\n{text}")
    finally:
        # Release the file even if text extraction fails on some page.
        document.close()

    processed_text = "\n\n".join(text_content)

    # Put each sentence on its own line. Whitespace after the punctuation is
    # required so that decimals ("3.14") and dotted identifiers
    # ("keras.callbacks.ModelCheckpoint") are not torn apart.
    return re.sub(r'([.!?])\s+(\w)', r'\1\n\2', processed_text)

# Convert every imported PDF to a text file, then archive the PDF.
processed_files = []

for pdf_path in pdf_files:
    print(f"Verarbeite: {pdf_path.name}")

    # The extracted text is stored under the PDF's stem in the embeddings folder.
    processed_text = process_pdf(pdf_path)
    output_file = INPUT_DIR / f"{pdf_path.stem}.txt"
    output_file.write_text(processed_text, encoding="utf-8")

    # Move the source PDF out of the import folder so that a later run does
    # not list it again.
    target_path = PROCESSED_DIR / pdf_path.name
    shutil.move(pdf_path, target_path)

    size_kb = round(output_file.stat().st_size / 1024, 2)
    processed_files.append(
        {
            "original_file": pdf_path.name,
            "processed_file": output_file.name,
            "size_kb": size_kb,
        }
    )


# Load all text files in the folder, including those from earlier runs.
documents = load_documents(INPUT_DIR)

PDF-Dateien: 0
Geladen: 10 Dokumente


## Implementierung der Content Embeddings 

### Leitfragen zur Bewertung

Hier wurden die PDFs manuell analysiert, um eine geeignete Query zu finden, mit der sich die Qualität des Outputs prüfen lässt.  
Da Sprachmodelle wie z. B. phi4 auch ohne Knowledge Base bereits viel Basiswissen mitbringen, muss die Frage so formuliert werden, dass sie nur mithilfe der Dokumente beantwortet werden kann.

> Which callback function is called during training?  
Erwartete Antwort: ModelCheckpoint

### Ausgewählte Embeddings

- word_level
- sentence_level
- document_level

### Vector DB für Embedding erstellen

In [6]:
# Creation statistics per embedding config, keyed by the config's "name".
# Filled while the vector stores are built; read again in the final summary.
embedding_stats = {}

def prepare_chunks(documents):
    """Split ``documents`` into the chunks used for the embedding comparison.

    The chunk size is ``min(CONTEXT_WINDOW // 2, TOKEN_LIMIT)`` with an overlap
    of 50. Both are measured with ``len``, i.e. in characters, even though the
    constants are named after tokens.

    Args:
        documents: The documents to split.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=min(CONTEXT_WINDOW // 2, TOKEN_LIMIT),
        chunk_overlap=50,
        length_function=len,
    )
    pieces = splitter.split_documents(documents)
    print(f"Dokumente in {len(pieces)} Chunks aufgeteilt")
    return pieces

chunks = prepare_chunks(documents)

# Build one vector store per embedding model from the same chunks; only the
# creation statistics are kept, the store itself is reopened from disk later.
for config in EMBEDDING_CONFIGS:
    _, stats = create_vector_db(
        chunks,
        config["model"],
        config["db_path"],
    )
    embedding_stats[config["name"]] = stats


Dokumente in 222 Chunks aufgeteilt

Erstelle Vector DB mit sentence-transformers/all-MiniLM-L6-v2


  from .autonotebook import tqdm as notebook_tqdm


Vector DB in knowledge-base\vector-stores\word_level_db gespeichert
Erstellungsdauer: 58.55 Sekunden
Speicherverbrauch: 615.20 MB

Erstelle Vector DB mit sentence-transformers/all-mpnet-base-v2
Vector DB in knowledge-base\vector-stores\sentence_level_db gespeichert
Erstellungsdauer: 120.61 Sekunden
Speicherverbrauch: 848.27 MB

Erstelle Vector DB mit intfloat/multilingual-e5-large
Vector DB in knowledge-base\vector-stores\document_level_db gespeichert
Erstellungsdauer: 462.88 Sekunden
Speicherverbrauch: 1407.14 MB


### Evaluierung der Embedding Vector DBs

In [12]:
# Ask the same test question against every embedding store (default LLM).
print("\n=== Vergleichstest aller Vector DBs ===")

embedding_results = {}

for config in EMBEDDING_CONFIGS:
    embedding_results[config["name"]] = search_and_query_llm(
        config["db_path"],
        TEST_QUERY,
        config["model"],
    )

# Summary of the timings, one row per embedding config
print("\n=== Zusammenfassung ===")
print("Embedding-Modell | Suchzeit (s) | Antwortzeit (s) | Gesamtzeit (s)")
print("-" * 65)
for model, result in embedding_results.items():
    print(f"{model:15} | {result['search_time']:.4f} | {result['answer_time']:.2f} | {result['total_time']:.2f}")


=== Vergleichstest aller Vector DBs ===

Suche in knowledge-base\vector-stores\word_level_db nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\word_level_db

Suche in knowledge-base\vector-stores\sentence_level_db nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\sentence_level_db

Suche in knowledge-base\vector-stores\document_level_db nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\document_level_db

=== Zusammenfassung ===
Embedding-Modell | Suchzeit (s) | Antwortzeit (s) | Gesamtzeit (s)
-----------------------------------------------------------------
word_level      | 2.7418 | 143.33 | 146.08
sentence_level  | 2.4542 | 160.31 | 162.76
document_level  | 4.9877 | 271.82 | 276.81


## Implementierung des Chunking 

In [13]:
# Build one vector store per chunking method, all with the same embedding model.
chunking_stats = {}

for method_name, splitter in CHUNKING_METHODS.items():
    print(f"\n=== Chunking-Methode: {method_name} ===")

    # Split the documents with this method's splitter
    chunks = splitter.split_documents(documents)
    print(f"Chunks erstellt: {len(chunks)}")

    # The folder name encodes the method; the chunking evaluation and
    # CHUNKING_DB rely on this "chunking_<method>" pattern.
    chunking_db_path = VECTOR_DB_DIR / f"chunking_{method_name}"
    db, stats = create_vector_db(chunks, EMBEDDING_MODEL, chunking_db_path)
    chunking_stats[method_name] = stats




=== Chunking-Methode: fixed_size ===
Chunks erstellt: 759

Erstelle Vector DB mit sentence-transformers/all-mpnet-base-v2
Vector DB in knowledge-base\vector-stores\chunking_fixed_size gespeichert
Erstellungsdauer: 312.23 Sekunden
Speicherverbrauch: 675.29 MB

=== Chunking-Methode: sentence ===
Chunks erstellt: 745

Erstelle Vector DB mit sentence-transformers/all-mpnet-base-v2
Vector DB in knowledge-base\vector-stores\chunking_sentence gespeichert
Erstellungsdauer: 308.94 Sekunden
Speicherverbrauch: 314.41 MB

=== Chunking-Methode: paragraph ===
Chunks erstellt: 407

Erstelle Vector DB mit sentence-transformers/all-mpnet-base-v2
Vector DB in knowledge-base\vector-stores\chunking_paragraph gespeichert
Erstellungsdauer: 216.63 Sekunden
Speicherverbrauch: 358.00 MB



### Evaluierung Chunking

In [14]:

# Ask the test question against each chunking store; only the method names are
# needed here, the splitters themselves are not used.
chunking_results = {}
for method_name in CHUNKING_METHODS:
    chunking_results[method_name] = search_and_query_llm(
        VECTOR_DB_DIR / f"chunking_{method_name}",
        TEST_QUERY,
        EMBEDDING_MODEL,
    )

# Summary of the timings, one row per chunking method
print("\n=== Zusammenfassung der Chunking-Methoden ===")
print("Methode        | Suchzeit (s) | Antwortzeit (s) | Gesamtzeit (s)")
print("-" * 65)
for method, result in chunking_results.items():
    print(f"{method:15} | {result['search_time']:.4f} | {result['answer_time']:.2f} | {result['total_time']:.2f}")


Suche in knowledge-base\vector-stores\chunking_fixed_size nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\chunking_fixed_size

Suche in knowledge-base\vector-stores\chunking_sentence nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\chunking_sentence

Suche in knowledge-base\vector-stores\chunking_paragraph nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\chunking_paragraph

=== Zusammenfassung der Chunking-Methoden ===
Methode        | Suchzeit (s) | Antwortzeit (s) | Gesamtzeit (s)
-----------------------------------------------------------------
fixed_size      | 12.4376 | 29.81 | 42.25
sentence        | 2.7703 | 23.92 | 26.69
paragraph       | 2.4173 | 35.82 | 38.24


### Verschiedene Modelle Testen

In [17]:
print("\n=== Vergleich der Sprachmodelle ===")
model_results = {}

# Every model answers the same question from the same store (CHUNKING_DB).
for model in OLLAMA_MODELS:
    print(f"\nTesting model: {model}")
    result = search_and_query_llm(CHUNKING_DB, TEST_QUERY, EMBEDDING_MODEL, model)

    # An answer counts as correct when it contains the expected term,
    # ignoring case.
    answer_text = result["answer"]
    accuracy = EXPECTED_ANSWER.lower() in answer_text.lower()

    model_results[model] = {
        "answer_time": result["answer_time"],
        "total_time": result["total_time"],
        "accuracy": accuracy,
        "answer": answer_text,
    }

    print(f"Model: {model}")
    print(f"Accuracy: {'✓' if accuracy else '✗'}")
    print(f"Answer time: {result['answer_time']:.2f}s")
    print(f"Total time: {result['total_time']:.2f}s")


=== Vergleich der Sprachmodelle ===

Testing model: llama3:8b

Suche in knowledge-base\vector-stores\chunking_paragraph nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\chunking_paragraph
Model: llama3:8b
Accuracy: ✓
Answer time: 16.48s
Total time: 19.14s

Testing model: mistral:7b-instruct

Suche in knowledge-base\vector-stores\chunking_paragraph nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\chunking_paragraph
Model: mistral:7b-instruct
Accuracy: ✓
Answer time: 92.51s
Total time: 95.68s

Testing model: phi4-mini

Suche in knowledge-base\vector-stores\chunking_paragraph nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\chunking_paragraph
Model: phi4-mini
Accuracy: ✓
Answer time: 41.62s
Total time: 45.77s


### Zusammenfassung

In [18]:
# Final evaluation code block
# Prints three comparison tables from the results collected in the earlier
# cells. "Accuracy" is a single pass/fail mark: whether the answer contains
# EXPECTED_ANSWER, compared case-insensitively.
print("\n=== ZUSAMMENFASSUNG ALLER ERGEBNISSE ===\n")

# Embedding comparison
print("1. EMBEDDING MODELLE")
print("Embedding-Modell | Accuracy | Memory (MB) | Suchzeit (s) | Gesamtzeit (s)")
print("-" * 75)
for config in EMBEDDING_CONFIGS:
    model_name = config["name"]
    result = embedding_results[model_name]  # query result from the embedding evaluation
    memory = embedding_stats[model_name]["memory_usage"]  # RSS change measured while the store was built
    accuracy = "✓" if EXPECTED_ANSWER.lower() in result["answer"].lower() else "✗"
    print(f"{model_name:15} | {accuracy:8} | {memory:10.2f} | {result['search_time']:11.4f} | {result['total_time']:13.2f}")

# Chunking comparison
print("\n2. CHUNKING METHODEN")
print("Methode      | Chunks | Accuracy | Memory (MB) | Suchzeit (s) | Gesamtzeit (s)")
print("-" * 80)
for method_name, result in chunking_results.items():  # query results from the chunking evaluation
    # NOTE: rebinds the module-level name `chunks` to a chunk count (int).
    chunks = chunking_stats[method_name]["chunks"]
    memory = chunking_stats[method_name]["memory_usage"]
    accuracy = "✓" if EXPECTED_ANSWER.lower() in result["answer"].lower() else "✗"
    print(f"{method_name:12} | {chunks:6} | {accuracy:8} | {memory:10.2f} | {result['search_time']:11.4f} | {result['total_time']:13.2f}")

# Model comparison
print("\n3. LLM MODELLE")
print("Modell        | Accuracy | Antwortzeit (s) | Gesamtzeit (s)")
print("-" * 60)
for model, result in model_results.items():
    # Here the pass/fail flag was already computed during the model comparison.
    accuracy = "✓" if result["accuracy"] else "✗"
    print(f"{model:13} | {accuracy:8} | {result['answer_time']:14.2f} | {result['total_time']:13.2f}")


=== ZUSAMMENFASSUNG ALLER ERGEBNISSE ===

1. EMBEDDING MODELLE
Embedding-Modell | Accuracy | Memory (MB) | Suchzeit (s) | Gesamtzeit (s)
---------------------------------------------------------------------------
word_level      | ✗        |     615.20 |      2.7418 |        146.08
sentence_level  | ✓        |     848.27 |      2.4542 |        162.76
document_level  | ✓        |    1407.14 |      4.9877 |        276.81

2. CHUNKING METHODEN
Methode      | Chunks | Accuracy | Memory (MB) | Suchzeit (s) | Gesamtzeit (s)
--------------------------------------------------------------------------------
fixed_size   |    759 | ✓        |     675.29 |     12.4376 |         42.25
sentence     |    745 | ✓        |     314.41 |      2.7703 |         26.69
paragraph    |    407 | ✓        |     358.00 |      2.4173 |         38.24

3. LLM MODELLE
Modell        | Accuracy | Antwortzeit (s) | Gesamtzeit (s)
------------------------------------------------------------
llama3:8b     | ✓        |   