# Getting started with LLMs and RAG

**Note:** Before running this notebook, create the filtered dataset by running `filter-dataset.ipynb`.

In [5]:
import sys
import os
import pandas as pd
import json
from dotenv import load_dotenv
import tiktoken
import pickle

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.rate_limiters import InMemoryRateLimiter

from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Put the notebook's parent directory on the import path so the local `src` package can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.llm import get_azure_embeddings_client, get_llm_client, get_gemini_llm_client

# load_dotenv() returns a falsy value when no .env file was loaded; stop early in that case
# rather than failing later when the API clients look for their keys.
dotenv_loaded = load_dotenv()
if not dotenv_loaded:
    raise Exception('Error loading .env file. Make sure to place valid keys in the .env file.')

In [9]:
# Locations of the cleaned articles, the filtered metadata CSV and the serialized vector database.
DATA_DIR = os.path.join("..", "data")
ARTICLES_CLEAN_DIR = os.path.join(DATA_DIR, "articles_clean")
FILTERED_METADATA_PATH = os.path.join(DATA_DIR, "filtered_metadata.csv")
DB_PATH = os.path.join(DATA_DIR, "db", "sample.db")

# Make sure the directory that will hold the database file exists (no-op if it already does).
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

In [10]:
# Read the filtered article metadata and preview its first five rows (head() defaults to n=5).
filtered_metadata = pd.read_csv(FILTERED_METADATA_PATH)
filtered_metadata.head()

Unnamed: 0,id,filename,published_at,author,title,category,section,word_count,financial_crisis,sustainability,fake_news,ai,digitalization,local_journalism,covid,demographics,innovation,valid_indicator
0,fd343f2b-736d-4c86-920d-35fc4e3b4a2e,die-gotter-des-geldes.json,2011-05-10 18:31:00,Reinhard Göweil,Die Götter des Geldes,Leitartikel,Meinung,326,0.8968,0.6328,0.4778,0.5007,0.351,0.3519,0.5301,0.5913,0.2105,True
1,4f35eda4-d8c9-41da-86cf-acde392daced,justiz-und-ewige-skandale.json,2009-10-16 19:29:00,Peter Muzik,Justiz und ewige Skandale,Kommentare,Meinung,235,0.8105,0.1628,0.0716,0.0692,0.0364,0.059,0.0972,0.0809,0.0558,True
2,b0ce07cb-c2da-4dca-be7a-d70640aa0750,saubermacher-sagt-borsegang-ab-wien-sucht-eine...,2008-01-30 18:43:00,Rosa Eder-Kornfeld,Saubermacher sagt Börsegang ab - Wien sucht ei...,Analysen,Archiv,260,0.8644,0.4094,0.2481,0.2804,0.3053,0.2731,0.2425,0.3708,0.2866,True
3,946db10f-cc14-4a60-8319-d95e5f400981,wann-ist-land-in-sicht.json,2010-05-21 19:34:00,Petra Medek,Wann ist Land in Sicht?,Kommentare,Meinung,185,0.8254,0.3482,0.0898,0.1104,0.0169,0.115,0.2387,0.1897,0.0183,True
4,e05e0904-00d2-4a1d-8d8a-7e14cde30aa7,yen-gerat-unter-druck.json,2011-03-14 09:39:00,WZ Online,Yen gerät unter Druck,Wirtschaft,Nachrichten,171,0.8052,0.5291,0.3415,0.4253,0.376,0.5516,0.3966,0.5528,0.3544,True


## Create a simple vector database

In [None]:
def get_documents_from_path(filenames: list[str]) -> list[Document]:
    """Load cleaned article JSON files and wrap each one in a ``Document``.

    Args:
        filenames: File names (not full paths) of article JSON files located
            in ``ARTICLES_CLEAN_DIR``. Any iterable of strings works, e.g. the
            ``filename`` column of the metadata DataFrame.

    Returns:
        One ``Document`` per file name, in input order. ``page_content`` is
        the article's ``text`` field; ``title``, ``author``, ``published_at``
        and ``id`` are copied into the metadata. Fields missing from the JSON
        default to an empty string.

    Raises:
        FileNotFoundError: If a listed file does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    metadata_keys = ("title", "author", "published_at", "id")
    documents = []

    for file_name in filenames:
        file_path = os.path.join(ARTICLES_CLEAN_DIR, file_name)
        # Keep the file handle and the parsed payload under separate names so the
        # handle is not shadowed while the `with` block is still open.
        with open(file_path, "r", encoding="utf-8") as handle:
            article = json.load(handle)

        documents.append(Document(
            page_content=article.get("text", ""),
            metadata={key: article.get(key, "") for key in metadata_keys},
        ))

    return documents

In [None]:
# Load every article referenced in the filtered metadata.
article_filenames = filtered_metadata["filename"]
documents = get_documents_from_path(article_filenames)

In [None]:
# Chunking configuration; the splitter tries the separators in the listed order.
CHUNK_SIZE = 1024
CHUNK_SEPARATORS = ["\n\n", "\n"]

text_splitter = RecursiveCharacterTextSplitter(
    separators=CHUNK_SEPARATORS,
    chunk_size=CHUNK_SIZE,
)

# Split the articles into chunks and create the embedding client used to index them.
texts = text_splitter.split_documents(documents)
embeddings = get_azure_embeddings_client()

In [None]:
# Embed the chunks and build a FAISS vector store from them.
db = FAISS.from_documents(documents=texts, embedding=embeddings)

In [None]:
# Count the tokens (cl100k_base encoding) contained in the chunks that were embedded.
tokenizer = tiktoken.get_encoding("cl100k_base")
build_token_count = sum(len(tokenizer.encode(chunk.page_content)) for chunk in texts)
print(f"Token count: {build_token_count}")

In [None]:
# Persist the vector store: serialize it to bytes and pickle those bytes to DB_PATH.
# NOTE: whoever reads this file back must unpickle it first — only do that with trusted files.
with open(DB_PATH, "wb") as db_file:
    serialized_db = db.serialize_to_bytes()
    pickle.dump(serialized_db, db_file)

## Create a simple RAG pipeline

In [None]:
# TODO: swap to openrouter
# Client-side throttle: at most one request every two seconds.
rate_limiter = InMemoryRateLimiter(
    check_every_n_seconds=0.1,
    requests_per_second=0.5,  # <-- Gemini Free Tier
)

# Load the LLM
llm_settings = {
    "max_tokens": 1024,
    "temperature": 0.2,
    "rate_limiter": rate_limiter,
}
llm = get_gemini_llm_client(**llm_settings)

In [None]:
# Prompt with two placeholders: {context} for the retrieved chunks and {question} for the user query.
system_prompt = """
You are an expert assistant. Use only the following retrieved context to answer the question accurately and concisely. 
If nothing is mentioned in the context, say "I don't know".
Context: {context}
Question: {question}
"""

prompt_template = PromptTemplate(
    template=system_prompt,
    input_variables=["context", "question"],
)

# Wire the vector store retriever and the LLM into a question-answering chain that
# also returns the documents the answer was based on.
retriever = db.as_retriever()
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)

In [None]:
def ask_question(query):
    """Send `query` through the retrieval chain, print the outcome and return the raw response.

    Prints the question and the chain's answer, followed by the metadata of each
    source document the chain returned. The returned mapping is the chain's
    response, containing at least the `result` and `source_documents` entries.
    """
    response = retrieval_chain.invoke({"query": query})

    # Show the answer first, then the metadata of every supporting document.
    print(f"Question: {query}\nAnswer: {response['result']}")
    print("\nSources: \n")
    source_documents = response["source_documents"]
    for source_document in source_documents:
        print(source_document.metadata)

    return response

In [None]:
# Example query against the RAG chain.
question = "What are the current economic threats in Austria?"
response = ask_question(question)