# Knowledge Base

## Auswahl der Sprachmodelle 

### Llama 3.3 (8B Version)
- **Leistung**: Starke Performance für RAG-Anwendungen mit guter Kontextverarbeitung
- **Größe**: Mit 8B Parametern effizient auf Consumer-Hardware lauffähig
- **Lizenz**: Permissive Lizenz ermöglicht kommerzielle Nutzung
- **Aktualität**: Neues Modell mit modernem Trainingsdatenset und verbesserter Instruction-Following-Fähigkeit

### Mistral 7B Instruct
- **Effizienz**: Ausgezeichnetes Leistungs-Größen-Verhältnis
- **Spezialisierung**: Optimiert für Instruction-Following und Kontextverständnis
- **Architektur**: Gruppenweise Rotation der Aufmerksamkeit für verbesserte Verarbeitung langer Dokumente
- **Community-Support**: Breite Nutzerbasis und dokumentierte Anwendungsfälle für RAG

### Phi-4 (Mini)
- **Ressourcenschonung**: Kleines Modell (3.8B) für Systeme mit begrenzten Ressourcen
- **Effizienz**: Hervorragende Leistung trotz geringer Größe
- **Antwortqualität**: Gute Formulierungsfähigkeit bei unternehmensbezogenen Inhalten
- **Kompatibilität**: Geringer VRAM-Bedarf macht es auf verschiedenen Systemen einsetzbar

Dieser Mix bietet eine gute Balance zwischen Performance, Ressourcenbedarf und verschiedenen Architekturen für einen aussagekräftigen Vergleich.



## Einrichtung der Knowledge Base

### Setup

In [None]:
import os
import shutil
from pathlib import Path
import fitz
import re
from typing import List
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.schema import Document
import time
import psutil
import requests

BASE_DIR = Path("knowledge-base")
IMPORT_DIR = BASE_DIR / "import"
PROCESSED_DIR = BASE_DIR / "processed"
INPUT_DIR = BASE_DIR / "embeddings-ready"
VECTOR_DB_DIR = BASE_DIR / "vector-stores"


CONTEXT_WINDOW = 8192
TOKEN_LIMIT = 4096
OLLAMA_MODEL = "phi4-mini"  # Ollama-Modell
TEST_QUERY = "Which callback function is called during training?"

EMBEDDING_CONFIGS = [
    {
        "name": "word_level",
        "model": "sentence-transformers/all-MiniLM-L6-v2",
        "db_path": VECTOR_DB_DIR / "word_level_db"
    },
    {
        "name": "sentence_level",
        "model": "sentence-transformers/all-mpnet-base-v2",
        "db_path": VECTOR_DB_DIR / "sentence_level_db"
    },
    {
        "name": "document_level",
        "model": "intfloat/multilingual-e5-large",
        "db_path": VECTOR_DB_DIR / "document_level_db"
    }
]

EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# Chunking-Methoden
CHUNKING_METHODS = {
    "fixed_size": RecursiveCharacterTextSplitter(
        chunk_size=1000, 
        chunk_overlap=100
    ),
    "sentence": RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", "! ", "? ", ";", ":"],
        chunk_size=1000,
        chunk_overlap=0
    ),
    "paragraph": RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"],
        chunk_size=2000,
        chunk_overlap=50
    )
}


VECTOR_DB_DIR.mkdir(exist_ok=True, parents=True)

for dir_path in [BASE_DIR, IMPORT_DIR, PROCESSED_DIR, INPUT_DIR]:
    dir_path.mkdir(exist_ok=True, parents=True)
    
def list_pdf_files(directory):
    return [f for f in directory.glob("*.pdf")]
  
pdf_files = list_pdf_files(IMPORT_DIR)

print(f"PDF-Dateien: {len(pdf_files)}")
for pdf in pdf_files[:5]:  # Zeige die ersten 5 Dateien
    print(f" - {pdf.name}")

def process_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text_content = []
    
    for page_num in range(len(document)):
        page = document[page_num]
        text = page.get_text()
        
        # Grundlegende Textbereinigung
        text = re.sub(r'\s+', ' ', text)  # Mehrfach-Leerzeichen entfernen
        text = text.strip()
        
        if text:
            text_content.append(f"--- Seite {page_num + 1} ---\n{text}")
    
    document.close()
    
    # Gesamten Text zusammenführen
    processed_text = "\n\n".join(text_content)
    
    # Weitere Bereinigungen für bessere LLM-Verarbeitung
    processed_text = re.sub(r'([.!?])\s*(\w)', r'\1\n\2', processed_text)  # Satzenden mit Zeilenumbrüchen
    
    return processed_text

# 4. Dateien verarbeiten und verschieben
processed_files = []

for pdf_path in pdf_files:
    print(f"Verarbeite: {pdf_path.name}")
    
    # Text extrahieren und aufbereiten
    processed_text = process_pdf(pdf_path)
    
    # Ausgabedatei im embeddings-ready Verzeichnis erstellen
    output_file = INPUT_DIR / f"{pdf_path.stem}.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(processed_text)
    
    # Originaldatei in processed-Verzeichnis verschieben
    target_path = PROCESSED_DIR / pdf_path.name
    shutil.move(pdf_path, target_path)
    
    processed_files.append({
        "original_file": pdf_path.name,
        "processed_file": output_file.name,
        "size_kb": round(output_file.stat().st_size / 1024, 2)
    })
    
def load_vector_db(db_path, embedding_model):
    """Lädt eine existierende Vektordatenbank"""
    print(f"Lade Vector DB aus {db_path}")
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    db = Chroma(
        persist_directory=str(db_path),
        embedding_function=embeddings
    )
    return db

def query_ollama(prompt, model=OLLAMA_MODEL):
    response = requests.post(
        "http://localhost:11434/api/chat",
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False
        }
    )
    return response.json()
  
def search_and_query_llm(db_path, query, embedding_model):
    # DB-Suche durchführen
    start_time = time.time()
    print(f"\nSuche in {db_path} nach: '{query}'")
    
    db = load_vector_db(db_path, embedding_model)
    docs = db.similarity_search(query, k=3)
    suchzeit = time.time() - start_time
    
    # LLM-Antwort generieren
    context = "\n\n".join([doc.page_content for doc in docs])
    prompt = f"""Basierend auf dem folgenden Kontext, beantworte die Frage.

Kontext:
{context}

Frage: {query}"""
    
    start_time = time.time()
    response = query_ollama(prompt)
    antwortzeit = time.time() - start_time
    content = response['message']['content']
    # Ausgabe formatieren
    print("\n" + "="*60)
    print(f"FRAGE:")
    print("-"*60)
    print(query)
    print("="*60)
    
    print(f"ANTWORT:")
    print("-"*60)
    print(content)
    print("="*60)
    
    print(f"METADATEN:")
    print("-"*60)
    print(f"Suchzeit:       {suchzeit:.4f} Sekunden")
    print(f"LLM-Antwortzeit: {antwortzeit:.2f} Sekunden")
    print(f"Gesamtzeit:     {suchzeit + antwortzeit:.2f} Sekunden")
    print("="*60 + "\n")
    
    return {
        "suchzeit": suchzeit,
        "antwortzeit": antwortzeit,
        "gesamtzeit": suchzeit + antwortzeit,
        "antwort": content
    }

# Dokumente laden
def load_documents(directory: Path):
    documents = []
    for file_path in directory.glob("*.txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        doc = Document(page_content=text, metadata={"source": file_path.name})
        documents.append(doc)
    print(f"Geladen: {len(documents)} Dokumente")
    return documents

# Vector DB für ein Embedding-Modell erstellen
def create_vector_db(chunks, model_path, db_path):
    start_time = time.time()
    memory_before = psutil.Process().memory_info().rss / 1024 / 1024  # MB
    
    print(f"\nErstelle Vector DB mit {model_path}")
    embeddings = HuggingFaceEmbeddings(model_name=model_path)
    
    db = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=str(db_path)
    )
    
    memory_after = psutil.Process().memory_info().rss / 1024 / 1024  # MB
    erstellungszeit = time.time() - start_time
    
    print(f"Vector DB in {db_path} gespeichert")
    print(f"Erstellungsdauer: {erstellungszeit:.2f} Sekunden")
    print(f"Speicherverbrauch: {memory_after - memory_before:.2f} MB")
    
    return db, db_path

# Hauptfunktion
# Dokumente laden und vorbereiten
documents = load_documents(INPUT_DIR)




PDF-Dateien: 0
Geladen: 10 Dokumente


## Implementierung der Content Embeddings 

### Leitfragen zur Bewertung

### Ausgewählte Embeddings

### Ergebnis

In [16]:
# Dokumente in Chunks aufteilen
def prepare_chunks(documents):
    max_chunk_size = min(CONTEXT_WINDOW // 2, TOKEN_LIMIT)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=50,
        length_function=len
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Dokumente in {len(chunks)} Chunks aufgeteilt")
    return chunks

chunks = prepare_chunks(documents)

for config in EMBEDDING_CONFIGS:
    create_vector_db(chunks, config["model"], config["db_path"])


Geladen: 10 Dokumente
Dokumente in 222 Chunks aufgeteilt

Erstelle Vector DB mit sentence-transformers/all-MiniLM-L6-v2
Vector DB in knowledge-base\vector-stores\word_level_db gespeichert
Erstellungsdauer: 6.01 Sekunden
Speicherverbrauch: 412.75 MB

Erstelle Vector DB mit sentence-transformers/all-mpnet-base-v2
Vector DB in knowledge-base\vector-stores\sentence_level_db gespeichert
Erstellungsdauer: 67.81 Sekunden
Speicherverbrauch: 712.55 MB

Erstelle Vector DB mit intfloat/multilingual-e5-large
Vector DB in knowledge-base\vector-stores\document_level_db gespeichert
Erstellungsdauer: 191.03 Sekunden
Speicherverbrauch: -617.21 MB

=== Vergleichstest aller Vector DBs ===

Suche in word_level_db nach: 'Was sind die Hauptvorteile von erneuerbaren Energien?'
Suchzeit: 0.0823 Sekunden
LLM-Antwortzeit: 2.08 Sekunden
Gesamtzeit: 2.16 Sekunden



KeyError: 'message'

### Evaluierung Embedding

In [10]:
# Alle drei DBs mit der gleichen Frage testen
print("\n=== Vergleichstest aller Vector DBs ===")

results = {}
for config in EMBEDDING_CONFIGS:
    results[config["name"]] = search_and_query_llm(config["db_path"], TEST_QUERY, config["model"])

# Zusammenfassung der Ergebnisse
print("\n=== Zusammenfassung ===")
print("Embedding-Modell | Suchzeit (s) | Antwortzeit (s) | Gesamtzeit (s)")
print("-" * 65)
for model, result in results.items():
    print(f"{model:15} | {result['suchzeit']:.4f} | {result['antwortzeit']:.2f} | {result['gesamtzeit']:.2f}")


=== Vergleichstest aller Vector DBs ===

Suche in knowledge-base\vector-stores\word_level_db nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\word_level_db


  from .autonotebook import tqdm as notebook_tqdm
  db = Chroma(


Suchzeit: 24.7918 Sekunden
LLM-Antwortzeit: 7.02 Sekunden
Gesamtzeit: 31.82 Sekunden

Antwort: The provided context does not explicitly mention a specific "callback" mechanism being used in the models or their implementations. Callback functions are typically associated with libraries like TensorFlow/Keras when working on machine learning projects and would be defined by the user to perform certain actions (like saving model checkpoints, adjusting hyperparameters during training based on performance metrics) at various stages of the neural network's lifecycle.

In Keras/TensorFlow specifically:
- `ModelCheckpoint` is used for periodically saving models.
- The custom callback functions can also include logic like reducing learning rate when a plateau in loss occurs (`ReduceLROnPlateau`) or stopping training after so many epochs if there's no improvement (Custom Callback).

However, since the provided context does not directly discuss callbacks but rather focuses on explaining different 

## Implementierung des Chunking 

In [None]:
# Chunking-Methoden testen
for method_name, splitter in CHUNKING_METHODS.items():
    print(f"\n=== Chunking-Methode: {method_name} ===")
    
    # Dokumente in Chunks aufteilen
    chunks = splitter.split_documents(documents)
    print(f"Chunks erstellt: {len(chunks)}")
    
    # Vector-DB erstellen mit bestehender Methode
    db, _ = create_vector_db(chunks, EMBEDDING_MODEL, f"chunking_{method_name}")
    



=== Chunking-Methode: fixed_size ===
Chunks erstellt: 759

Erstelle Vector DB mit sentence-transformers/all-mpnet-base-v2



### Evaluierung Embedding

In [24]:

results = {}
for method_name, splitter in CHUNKING_METHODS.items():
    # Mit LLM testen
    results[method_name] = search_and_query_llm(VECTOR_DB_DIR / f"chunking_{method_name}", TEST_QUERY, EMBEDDING_MODEL)
        
# Zusammenfassung
print("\n=== Zusammenfassung der Chunking-Methoden ===")
print("Methode        | Suchzeit (s) | Antwortzeit (s) | Gesamtzeit (s)")
print("-" * 65)
for method, result in results.items():
    print(f"{method:15} | {result['suchzeit']:.4f} | {result['antwortzeit']:.2f} | {result['gesamtzeit']:.2f}")


Suche in knowledge-base\vector-stores\chunking_fixed_size nach: 'Which callback function is called during training?'
Lade Vector DB aus knowledge-base\vector-stores\chunking_fixed_size

FRAGE:
------------------------------------------------------------
Which callback function is called during training?
ANTWORT:
------------------------------------------------------------
During the process of neural network or machine learning model training, a common practice involves monitoring and reacting to various events that occur. One such event where you might want an action triggered at specific moments (like after each batch processing) happens through what are known as "callbacks." Callbacks serve multiple purposes like saving checkpoints periodically for later restoration in case of failures during the long-running process or adjusting learning rates dynamically based on certain conditions.

Specifically, there is a special kind of callback function called `on_train_batch_end`. This part