### LLM with RAG demo
**Note:** This is just some code to help you get started; using it is in no way mandatory. Feel free to use any other tools, libraries, approaches, etc.

In [None]:
import sys
import os
import pandas as pd
import json
from dotenv import load_dotenv
import tiktoken
import pickle

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings.base import OpenAIEmbeddings
from langchain_core.rate_limiters import InMemoryRateLimiter

from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Put the parent directory on the import path so the `src` package can be
# imported from this notebook's folder.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.llm import get_llm_client, get_gemini_embeddings_client

# load_dotenv() returns a falsy value when it did not load anything (for
# example because no .env file was found), so stop early with a clear error.
# NOTE(review): the message names OPEN_AI_KEY while the rest of the notebook
# talks to Gemini endpoints - confirm which variable the .env file must define.
if not load_dotenv():
    raise RuntimeError('Error loading .env file. Make sure to place a valid OPEN_AI_KEY in the .env file.')

In [None]:
# Locations of the inputs and of the vector-store file, relative to the
# notebook's working directory.
DATA_DIR = os.path.join("..", "data")
METADATA_PATH = os.path.join(DATA_DIR, "metadata.csv")
ARTICLES_CLEAN_DIR = os.path.join(DATA_DIR, "articles_clean")
DB_PATH = os.path.join(DATA_DIR, "db", "sample.db")

# Ensure the folder that will hold the database file exists; exist_ok makes
# this a no-op when it is already there.
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

Filter the metadata df according to your needs

In [None]:
# Read the article metadata table and convert the publication timestamps to
# datetimes so they can be compared against dates when filtering.
metadata = pd.read_csv(METADATA_PATH).assign(
    published_at=lambda frame: pd.to_datetime(frame["published_at"])
)

In [None]:
# Select the articles to work with; adjust the three criteria to your needs.
filtered_metadata = metadata[
    # Date: published on or after 2023-01-01
    (metadata["published_at"] >= "2023-01-01") &
    # Word count: at least 100 words
    (metadata["words_count"] >= 100) &
    # Category: contains "Wirtschaft" (case-insensitive). na=False makes rows
    # without a category count as non-matching instead of yielding NaN in the mask.
    (metadata["category"].str.contains("Wirtschaft", case=False, na=False))
]
print(f"Expected number of articles: {filtered_metadata.shape[0]}")

Get filtered articles

In [None]:
def filter_articles(filtered_metadata, articles_dir):
    """
    Load the JSON article file referenced by each row of the filtered metadata.

    Args:
        filtered_metadata: DataFrame with a "filename" column naming one JSON
            file per row.
        articles_dir: Directory that contains the article JSON files.

    Returns:
        A list with the parsed JSON content of every referenced file, in the
        same order as the rows of ``filtered_metadata``.

    Raises:
        FileNotFoundError: If a referenced file does not exist in ``articles_dir``.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    articles = []
    # Only the filename column is needed, so iterate it directly instead of
    # building a Series for every row with iterrows().
    for filename in filtered_metadata["filename"]:
        article_path = os.path.join(articles_dir, filename)
        with open(article_path, "r", encoding="utf-8") as file:
            articles.append(json.load(file))
    return articles

In [None]:
# Load the article files selected by the metadata filter.
filtered_articles = filter_articles(filtered_metadata, ARTICLES_CLEAN_DIR)
print(f"Number of articles: {len(filtered_articles)}\n")

# Show one example only when the filter matched something; indexing the first
# element of an empty result would raise IndexError.
if filtered_articles:
    print(f"Sample article metadata:\n {filtered_metadata.iloc[0]}\n")
    print(f"Sample article:\n {filtered_articles[0]}")
else:
    print("No articles matched the filter.")

Create simple vector database

In [None]:
def get_documents_from_path(filenames: list[dict]) -> list[Document]:
    """
    Wrap already-loaded article dicts in ``Document`` objects.

    Despite the function and parameter names, nothing is read from disk here:
    every element of ``filenames`` is accessed with ``.get`` and is therefore
    expected to be a dict-like article.

    Args:
        filenames: Article dicts. The keys "text", "title", "author",
            "published_at" and "id" are used; a missing key falls back to "".

    Returns:
        One ``Document`` per article, in input order, with the article text as
        ``page_content`` and the remaining fields as ``metadata``.
    """
    return [
        Document(
            page_content=article.get("text", ""),
            metadata={
                "title": article.get("title", ""),
                "author": article.get("author", ""),
                "published_at": article.get("published_at", ""),
                "id": article.get("id", ""),
            },
        )
        for article in filenames
    ]

In [None]:
# Wrap the loaded articles in Document objects (article text as page content;
# title, author, publication date and id as metadata).
documents = get_documents_from_path(filtered_articles)

In [None]:
# Chunking: split every article on paragraph breaks first and on single line
# breaks second, aiming for chunks of about 1024 characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    chunk_size=1024,
)
texts = text_splitter.split_documents(documents)

# Embedding client used to vectorise the chunks. To switch to OpenAI, use
# OpenAIEmbeddings() instead, see
# https://platform.openai.com/docs/guides/embeddings/embedding-models
embeddings = get_gemini_embeddings_client()

In [None]:
# Build the FAISS vector store from the chunks, using the embeddings client.
# NOTE(review): presumably every chunk is sent to the embedding API here, so
# this is likely the slow / rate-limited step - confirm before running on a
# large article selection.
db = FAISS.from_documents(texts, embeddings)

In [None]:
# Estimate how many tokens were embedded while building the index.
# NOTE(review): cl100k_base is an OpenAI encoding; with the Gemini embeddings
# client this figure is presumably only an approximation - confirm.
tokenizer = tiktoken.get_encoding("cl100k_base")
build_token_count = sum(
    len(tokenizer.encode(chunk.page_content)) for chunk in texts
)
print(f"Token count: {build_token_count}")

In [None]:
# Store the database.
# Serialize before opening the file: "wb" mode empties an existing file right
# away, so a serialization error must not happen after the file was opened.
serialized_db = db.serialize_to_bytes()
with open(DB_PATH, "wb") as f:
    # The serialized bytes are additionally wrapped in a pickle, so loading
    # needs pickle.load() first - only unpickle files you created yourself.
    pickle.dump(serialized_db, f)

Create simple RAG

In [None]:
# Load the LLM
# Client-side throttle: 0.5 requests per second means at most one request
# every two seconds.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.5,  # <-- Gemini Free Tier
    check_every_n_seconds=0.1,
)

# get_llm_client is a project helper from src.llm (its implementation is not
# part of this notebook). The base URL points at the Gemini API's
# OpenAI-compatible endpoint path.
llm = get_llm_client(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    max_tokens=1024,
    temperature=0.2,
    rate_limiter=rate_limiter,
)

# Despite its name, this string is the complete prompt template: the
# {context} and {question} placeholders are filled in for every query.
system_prompt = """
You are an expert assistant. Use only the following retrieved context to answer the question accurately and concisely. 
If nothing is mentioned in the context, say "I don't know".
Context: {context}
Question: {question}
"""

# The declared input variables must match the placeholders in the template.
prompt_template = PromptTemplate(
    input_variables=["context", "question"], 
    template=system_prompt
)

# Retrieval + answer chain on top of the vector store. With
# return_source_documents=True the response also carries the retrieved
# chunks under "source_documents" (next to the answer under "result").
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)

In [None]:
def ask_question(query):
    """
    Send a question through the retrieval chain and print the outcome.

    Prints the question, the generated answer and the metadata of every
    retrieved source document.

    Args:
        query: The question to ask.

    Returns:
        The unmodified response of ``retrieval_chain.invoke``; its "result"
        and "source_documents" entries are the ones printed here.
    """
    response = retrieval_chain.invoke({"query": query})
    answer = response["result"]
    print(f"Question: {query}\nAnswer: {answer}")

    print("\nSources: \n")
    retrieved_documents = response["source_documents"]
    for document in retrieved_documents:
        print(document.metadata)

    return response

In [None]:
# Example query: ask_question prints the answer and its sources and returns
# the full chain response for further inspection.
response = ask_question("What was the Austrian GDP development in recent decades?")