# Getting started with LLMs and RAG

Note: Run `filter-dataset.ipynb` first to create the filtered dataset that this notebook loads.

In [19]:
import sys
import os
import pandas as pd
import json
from dotenv import load_dotenv
import tiktoken
import pickle

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.rate_limiters import InMemoryRateLimiter

from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Put the parent directory on sys.path so the `src` package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.llm import get_azure_embeddings_client, get_llm_client, get_gemini_llm_client

# Read the keys from the .env file; a falsy result is treated as a fatal setup error.
env_loaded = load_dotenv()
if not env_loaded:
    raise Exception('Error loading .env file. Make sure to place valid keys in the .env file.')

In [3]:
# Input data and the persisted vector store, all relative to the notebook's directory.
ARTICLES_CLEAN_DIR = os.path.join(os.pardir, "data", "articles_clean")
FILTERED_METADATA_PATH = os.path.join(os.pardir, "data", "filtered_metadata.csv")
DB_PATH = os.path.join(os.pardir, "rag", "ai_topic.db")

# Make sure the folder that will hold the vector store exists.
# exist_ok=True turns this into a no-op when the folder is already there.
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

In [4]:
# Load the metadata of the articles selected for this notebook.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV was written
# together with its index - consider index_col=0 here (or index=False when writing); confirm.
filtered_metadata = pd.read_csv(FILTERED_METADATA_PATH)
# Preview the first rows (head() returns 5 rows by default).
filtered_metadata.head()

Unnamed: 0,id,filename,published_at,author,title,category,section,word_count,financial_crisis,sustainability,fake_news,ai,digitalization,local_journalism,covid,demographics,innovation,valid_indicator
0,7415582d-2272-44a8-91d2-10fd9ba380ac,schulautonomie-offnet-die-tur-fur-konzentratio...,2014-09-26 18:29:00,Brigitte Pechar,Schulautonomie öffnet die Tür für Konzentratio...,Politik,Nachrichten,376,0.4627,0.5436,0.5818,0.6098,0.4597,0.5763,0.6398,0.5774,0.6911,True
1,a90f1215-56b1-44f4-a5c2-d623d5797d73,keine-bilder-sondern-hass.json,2006-02-17 00:00:00,Michael Schmölzer,"""Keine Bilder, sondern Hass""",Politik,Nachrichten,200,0.0698,0.5032,0.128,0.9302,0.028,0.0111,0.514,0.1343,0.1802,True
2,3b42941e-616f-4f72-a58f-e7bef47f3ea4,newex-die-neue-osteuropaborse-mit-hohem-qualit...,2000-02-04 00:00:00,Rosa Eder,NEWEX · Die neue Osteuropabörse mit hohem Qual...,Wirtschaft,Nachrichten,308,0.567,0.6146,0.5711,0.6185,0.5815,0.4234,0.6053,0.5924,0.5937,True
3,1d7d5e7f-b7e9-4f75-87a0-5fad2d424f80,ich-glaube-an-den-mundigen-burger.json,2013-11-21 19:50:00,Elisabeth Hewson,"""Ich glaube an den mündigen Bürger""",Wissen,Nachrichten,1640,0.4024,0.4625,0.6423,0.6304,0.3601,0.5503,0.6606,0.5526,0.3608,True
4,156e1469-2104-41c4-a06c-7916a7863844,persisches-kebab-als-ablenkung.json,2013-05-02 16:51:00,Selina Nowak,Persisches Kebab als Ablenkung,Politik,Nachrichten,605,0.3752,0.4779,0.5534,0.6336,0.4264,0.4229,0.575,0.5052,0.4895,True


## Create a simple vector database

In [6]:
def get_documents_from_path(filenames: list[str]) -> list[Document]:
    """Load cleaned article JSON files and wrap each one in a ``Document``.

    Args:
        filenames: File names (not full paths) of JSON articles located in
            ``ARTICLES_CLEAN_DIR``. The argument is only iterated over, so any
            iterable of strings (e.g. a pandas Series) works as well.

    Returns:
        One ``Document`` per file name, in input order. ``page_content`` holds the
        article's ``"text"`` field; ``"title"``, ``"author"``, ``"published_at"``
        and ``"id"`` are copied into the metadata. Fields missing from the JSON
        default to an empty string.

    Raises:
        OSError: If an article file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If an article file does not contain valid JSON.
    """
    documents = []

    for file_name in filenames:
        file_path = os.path.join(ARTICLES_CLEAN_DIR, file_name)
        # Keep the file handle and the parsed payload under separate names so the
        # handle is not shadowed by the dict it produced.
        with open(file_path, "r", encoding="utf-8") as article_file:
            article = json.load(article_file)

        documents.append(Document(
            page_content=article.get("text", ""),
            metadata={
                "title": article.get("title", ""),
                "author": article.get("author", ""),
                "published_at": article.get("published_at", ""),
                "id": article.get("id", ""),
            },
        ))

    return documents

In [7]:
# Load one Document per file name listed in the filtered metadata.
article_filenames = filtered_metadata["filename"]
documents = get_documents_from_path(article_filenames)
print(f"Number of articles: {len(documents)}")

Number of articles: 395


In [8]:
# Chunk the articles before embedding: split on blank lines first, then on single
# line breaks, aiming for chunks of about 1024 units (characters, assuming the
# splitter's default length function - TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    chunk_size=1024,
)

# `texts` holds the chunked documents that get embedded into the vector database.
texts = text_splitter.split_documents(documents)

In [9]:
# Embedding client used both to build the index and to embed queries at retrieval time.
embeddings = get_azure_embeddings_client(
    show_progress_bar=True,
    # How many document chunks are processed at once; lower it if you hit rate limits.
    chunk_size=512,
)

In [49]:
# Build the FAISS index from the chunked documents using the embeddings client.
db = FAISS.from_documents(documents=texts, embedding=embeddings)

  0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# Count how many tokens the chunked documents contain (cl100k_base encoding),
# as an estimate of the embedding volume needed to build the index.
tokenizer = tiktoken.get_encoding("cl100k_base")
build_token_count = sum(len(tokenizer.encode(chunk.page_content)) for chunk in texts)
print(f"Token count: {build_token_count}")

Token count: 454318


In [51]:
# Persist the vector database: FAISS serializes itself to bytes, and those bytes
# are pickled into DB_PATH (the loading cell expects exactly this format).
with open(DB_PATH, "wb") as db_file:
    serialized_data = db.serialize_to_bytes()
    pickle.dump(serialized_data, db_file)

## Create a simple RAG chain

In [11]:
# CHECKPOINT: Load vector DB
# NOTE(review): pickle.load and allow_dangerous_deserialization=True can execute arbitrary
# code from a tampered file - only load a DB_PATH file that this notebook wrote itself.
with open(DB_PATH, "rb") as db_file:
    serialized_data = pickle.load(db_file)

# Rebuild the FAISS database from the stored bytes, reusing the same embeddings client.
db = FAISS.deserialize_from_bytes(
    serialized_data,
    embeddings,
    allow_dangerous_deserialization=True,
)

In [12]:
# Alternative: free-tier Gemini LLM - uncomment the block below to use it instead of the default client
# rate_limiter = InMemoryRateLimiter(
#     requests_per_second=0.5,  # <-- Gemini Free Tier
#     check_every_n_seconds=0.1,
# )

# llm = get_gemini_llm_client(
#     max_tokens=1024,
#     temperature=0.2,
#     rate_limiter=rate_limiter,
# )

# Default go-to OpenRouter LLM - check the README for other available models.
# Both arguments are meant to be tuned.
llm = get_llm_client(temperature=0.2, max_tokens=1024)

In [13]:
# Prompt for the RAG chain: {context} and {question} are the two variables
# declared on the PromptTemplate below.
system_prompt = """
You are an expert assistant to find sentences or articles that are about similar topics to that of the inputed sentence. Use only the following retrieved context to answer the question accurately and concisely. 
If you find absolutely no sentences or articles relating to this topic, say "I don't know".
Context: {context}
Question: {question}
"""

prompt_template = PromptTemplate(
    template=system_prompt,
    input_variables=["context", "question"],
)

# Retrieval-augmented QA chain on top of the vector database; the retrieved source
# documents are returned alongside the answer so they can be inspected.
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": prompt_template},
    retriever=db.as_retriever(),
    return_source_documents=True,
)

In [14]:
def ask_question(query: str) -> dict:
    """Run ``query`` through ``retrieval_chain`` and print the answer with its sources.

    Args:
        query: The question passed to the chain under the ``"query"`` key.

    Returns:
        The chain's response unchanged; its ``"result"`` and ``"source_documents"``
        entries are the ones printed here.
    """
    response = retrieval_chain.invoke({"query": query})
    retrieved_docs = response["source_documents"]

    print(f"Question: {query}\nAnswer: {response['result']}")
    print("\nSources: \n")
    # One metadata dict per retrieved source document.
    for doc in retrieved_docs:
        print(doc.metadata)

    return response

In [17]:
# Example query: ask for articles related to a quote and a summary of them.
response = ask_question(
    "Could you find articles that are related to this quote and summarize them: "
    "' Pünktlich zur Internationalen Orchideen- und Tillandsienschau der Blumengärten Hirschstetten "
    "wartet der Botanische Garten der Universität Wien mit einer kleinen Sensation auf'?"
)

  0%|          | 0/1 [00:00<?, ?it/s]

Question: Could you find articles that are related to this quote and summarize them: ' Pünktlich zur Internationalen Orchideen- und Tillandsienschau der Blumengärten Hirschstetten wartet der Botanische Garten der Universität Wien mit einer kleinen Sensation auf'?
Answer: The context mentions that the Botanical Garden of the University of Vienna is showcasing a remarkable event coinciding with the International Orchid and Tillandsia Show at the Blumengärten Hirschstetten. Specifically, it highlights that a spectacular orchid species, Grammatophyllum speciosum, has developed a long flowering shoot and is blooming for the first time since being in the garden's research collection since 2010. This event is part of the orchid exhibition, which offers expert advice on orchid care and the opportunity to purchase plants.

Sources: 

{'title': 'Spektakuläre Orchidee blüht zum ersten Mal', 'author': 'WZOnline', 'published_at': '2016-02-15 16:02', 'id': 'a6fd1e98-d853-460c-a7c8-5d74eaf0c845'}
{'t