# 🤖 Abschlussprojekt: RAG-System mit AI Index 2025

In [None]:
# 📦 Benötigte Bibliotheken
!pip install pymupdf langchain tiktoken chromadb sentence-transformers --quiet

In [None]:
# 📁 Upload the PDF file
from google.colab import files
# The return value of the upload call is kept in `uploaded`.
# NOTE(review): no later cell reads `uploaded`; the extraction cell uses a fixed file name instead.
uploaded = files.upload()

In [None]:
# 📄 Text aus PDF extrahieren
import fitz  # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The page texts concatenated in page order, with no separator added
        between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails on some page.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
# Fixed input file name.
# NOTE(review): the name is not derived from `uploaded`; presumably the uploaded file must be called exactly this — confirm.
pdf_path = 'hai_ai_index_report_2025.pdf'
raw_text = extract_text_from_pdf(pdf_path)
# Prints the length of the extracted text in characters (message is Turkish: "Text length: N characters").
print(f"Metin uzunluğu: {len(raw_text)} karakter")

In [None]:
# 🧩 Split the text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured with chunk_size=1000 and chunk_overlap=200.
# NOTE(review): presumably both values are character counts (the splitter's default length measure) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_text(raw_text)
# Prints the number of chunks produced (message is Turkish: "N chunks were generated in total").
print(f"Toplam {len(chunks)} chunk üretildi.")

In [None]:
# 🏷️ Attach metadata to every chunk
from langchain.docstore.document import Document

# One Document per chunk, in chunk order.
# NOTE(review): "section" is derived purely from the chunk position (the first
# 50 chunks are labelled "Investments"), not from the chunk's content.
documents = [
    Document(
        page_content=text,
        metadata={
            "source": "AI Index 2025",
            "chunk_id": idx,
            "section": "Investments" if idx < 50 else "Other",
        },
    )
    for idx, text in enumerate(chunks)
]

In [None]:
# 💾 Indexing with ChromaDB
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformers model.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Build the vector store from the chunk documents, using "rag_index_metadata" as its persistence directory.
db = Chroma.from_documents(documents, embedding=embedding_function, persist_directory="rag_index_metadata")
# Explicitly persist the store.
# NOTE(review): newer Chroma releases reportedly persist automatically and deprecate this call — confirm against the installed version.
db.persist()

In [None]:
# 🔍 Simple retrieval example
query = "What are the main AI investment trends in 2024?"
retrieved_docs = db.similarity_search(query, k=3)
for rank, hit in enumerate(retrieved_docs, start=1):
    # Show at most the first 500 characters of each retrieved chunk.
    preview = hit.page_content[:500]
    print(f"--- Chunk {rank} ---\n{preview}\n")

In [None]:
# 🎯 Retrieval with metadata filtering
# Restrict the search to documents whose "section" metadata is "Investments".
section_filter = {"section": "Investments"}
filtered_docs = db.similarity_search(query, k=3, filter=section_filter)
for rank, hit in enumerate(filtered_docs, start=1):
    section = hit.metadata['section']
    preview = hit.page_content[:500]  # first 500 characters only
    print(f"--- Filtered Chunk {rank} (Section: {section}) ---\n{preview}\n")

In [None]:
# 🔁 Multi-query retrieval
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chat_models import ChatOpenAI
import os
from getpass import getpass

# Credentials are never hard-coded in the notebook: a key already present in
# the environment is reused, otherwise it is requested without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# temperature=0 is passed to the chat model; the same `llm` is reused by later cells.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
multi_retriever = MultiQueryRetriever.from_llm(retriever=db.as_retriever(), llm=llm)
docs = multi_retriever.get_relevant_documents(query)
# Print a 500-character preview of the first three returned documents.
for i, doc in enumerate(docs[:3], 1):
    print(f"--- MultiQuery {i} ---\n{doc.page_content[:500]}\n")

In [None]:
# 📊 LangSmith (optional tracing and monitoring)
import os
from getpass import getpass

# Tracing is switched on only when a key is actually supplied; a placeholder
# key with tracing enabled would presumably make every traced run fail to
# authenticate against the LangSmith endpoint.
langsmith_key = os.environ.get("LANGCHAIN_API_KEY") or getpass(
    "LangSmith API key (leave empty to skip tracing): "
)
if langsmith_key:
    os.environ["LANGCHAIN_API_KEY"] = langsmith_key
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

In [None]:
# 🤖 Answer generation with LLM + RAG
from langchain.chains import RetrievalQA
# Question-answering chain of type "stuff", built from the `llm` created in the multi-query cell and a retriever over `db`.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever(), chain_type="stuff")
response = qa_chain.run(query)
# Label is Turkish for "Model answer".
print("\n🤖 Model Cevabı:\n", response)

In [None]:
# 🧪 Evaluation: retrieval enabled vs. disabled
test_questions = [
    "What was the trend in global AI private investment in 2024?",
    "Which countries led in AI research output in 2024?",
    "How did AI adoption in education evolve in 2024?",
    "What ethical concerns about AI are mentioned in the 2025 report?",
    "Which sectors saw the highest AI implementation growth in 2024?",
]

# For each question, print the bare LLM answer first and the answer of the
# retrieval-backed QA chain second, so the two can be compared side by side.
for number, question in enumerate(test_questions, start=1):
    print(f"\n=== Q{number}: {question} ===")

    print("\n🚫 LLM (no retrieval):")
    baseline_answer = llm.predict(question)
    print(baseline_answer)

    print("\n✅ RAG:")
    rag_answer = qa_chain.run(question)
    print(rag_answer)