In [None]:
#Author : Tommy Lovaniaina RAMAROKOTO

import os
from dotenv import load_dotenv, find_dotenv

# Load the variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Keep the key in a module-level name and re-export it to the process
# environment, but only when a non-empty value was actually found.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
if gemini_api_key:
    os.environ["GEMINI_API_KEY"] = gemini_api_key

In [None]:
from typing import List, Tuple, Dict
from pypdf import PdfReader
from tqdm import tqdm
import os

def load_pdf(file_path: str) -> Tuple[List[str], List[Dict]]:
    """
    Extract the text of a PDF page by page, skipping pages without text.

    Args:
        file_path (str): Location of the PDF file.

    Returns:
        documents (List[str]): Stripped text of every page that has content.
        metadatas (List[Dict]): For each kept page, a dict with the PDF's base
            filename ("filename") and the page's 1-based position in the
            file ("page_number").
    """
    reader = PdfReader(file_path)
    pages = reader.pages
    pdf_name = os.path.basename(file_path)

    documents: List[str] = []
    metadatas: List[Dict] = []

    progress = tqdm(pages, desc=f"Reading {pdf_name}")
    for page_number, page in enumerate(progress, start=1):
        # A falsy result from extract_text() is handled like a blank page.
        page_text = (page.extract_text() or "").strip()
        if not page_text:
            continue

        documents.append(page_text)
        metadatas.append({"filename": pdf_name, "page_number": page_number})

    return documents, metadatas


# Loading data to vectorstore

In [None]:
from google import genai
import chromadb
from chromadb.utils.embedding_functions import GoogleGenerativeAiEmbeddingFunction
from tqdm import tqdm
import os

def load_data(documents, metadatas, collection_name, batch_size=100):
    """
    Embed the documents and add them to a chromadb collection in batches.

    The collection is fetched (or created) on a ``chromadb.EphemeralClient``.
    New documents receive string ids that continue from the number of
    documents already stored in the collection.

    The Gemini API key is read from the ``GEMINI_API_KEY`` environment
    variable. When it is missing or empty the user is prompted for it without
    echo, and the entered key is exported to ``os.environ`` so the rest of the
    notebook does not have to ask again.

    Args:
        documents (list[str]): Texts to embed and store.
        metadatas (list[dict]): One metadata dict per document, same order.
        collection_name (str): Name of the collection receiving the documents.
        batch_size (int): Number of documents sent per ``collection.add``
            call. Defaults to 100.

    Raises:
        ValueError: If ``documents`` and ``metadatas`` differ in length, or if
            ``batch_size`` is smaller than 1.
    """
    if len(documents) != len(metadatas):
        raise ValueError(
            f"documents ({len(documents)}) and metadatas ({len(metadatas)}) "
            "must have the same length"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    google_api_key = os.environ.get("GEMINI_API_KEY")
    if not google_api_key:
        # getpass keeps the secret out of the notebook's saved output,
        # which a plain input() prompt would not.
        from getpass import getpass

        google_api_key = getpass("Please enter your Google API Key: ")
        if google_api_key:
            # NOTE(review): genai.Client() is created without an explicit key
            # elsewhere in this notebook, so it presumably relies on the
            # environment - exporting the key keeps that path working.
            os.environ["GEMINI_API_KEY"] = google_api_key

    client = chromadb.EphemeralClient()

    embedding_function = GoogleGenerativeAiEmbeddingFunction(
        api_key=google_api_key
    )

    collection = client.get_or_create_collection(
        name=collection_name, embedding_function=embedding_function
    )

    # Continue numbering after the existing documents so ids do not collide.
    count = collection.count()
    print(f"Collection already contains {count} documents")
    ids = [str(i) for i in range(count, count + len(documents))]

    # unit_scale=batch_size is meant to make the bar count documents rather
    # than batches.
    for start in tqdm(
        range(0, len(documents), batch_size),
        desc="Adding documents",
        unit_scale=batch_size,
    ):
        stop = start + batch_size
        collection.add(
            ids=ids[start:stop],
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
        )
    print("Documents loaded successfully")

# Getting DB by collection_name

In [None]:
import chromadb

def get_db(collection_name):
    """
    Fetch an existing chromadb collection set up for query-time embeddings.

    The Gemini API key is read from the ``GEMINI_API_KEY`` environment
    variable. When it is missing or empty the user is prompted for it without
    echo, and the entered key is exported to ``os.environ`` so later calls do
    not have to ask again.

    Args:
        collection_name (str): Name of the collection to fetch.

    Returns:
        The collection returned by ``client.get_collection`` for
        ``collection_name``, bound to a Google embedding function created
        with ``task_type="RETRIEVAL_QUERY"``.
    """
    google_api_key = os.environ.get("GEMINI_API_KEY")
    if not google_api_key:
        # getpass keeps the secret out of the notebook's saved output,
        # which a plain input() prompt would not.
        from getpass import getpass

        google_api_key = getpass("Please enter your Google API Key: ")
        if google_api_key:
            os.environ["GEMINI_API_KEY"] = google_api_key

    client = chromadb.EphemeralClient()

    # Queries are embedded with the RETRIEVAL_QUERY task type.
    embedding_function = GoogleGenerativeAiEmbeddingFunction(
        api_key=google_api_key, task_type="RETRIEVAL_QUERY"
    )

    return client.get_collection(
        name=collection_name, embedding_function=embedding_function
    )

# Retrieval

In [None]:
def get_relevant_passage(query, db, n_results):
    """
    Query the collection for the passages closest to a single question.

    Args:
        query (str): Question text, sent as the only entry of ``query_texts``.
        db: Object exposing a ``query`` method (the collection from ``get_db``).
        n_results (int): Number of results requested from ``db.query``.

    Returns:
        tuple: ``(documents, metadatas)`` - the first entry of the
        "documents" and "metadatas" fields of the query result, i.e. the
        hits belonging to the one query that was sent.
    """
    hits = db.query(
        query_texts=[query],
        n_results=n_results,
        include=["documents", "metadatas"],
    )
    top_documents = hits["documents"][0]
    top_metadatas = hits["metadatas"][0]
    return top_documents, top_metadatas

# Generation

In [None]:
from typing import List, Union

def make_rag_prompt(query: str, relevant_passages: Union[str, List[str]]) -> str:
    """
    Build a RAG prompt from a query and its supporting passages.

    Newlines inside a passage are replaced by spaces so that each passage
    occupies a single line between ``---`` separators. Quotes and apostrophes
    are kept: the passages are not wrapped in quotes in the prompt, and
    deleting them would corrupt the text (e.g. "l'ecole" -> "lecole").

    Args:
        query (str): User query.
        relevant_passages (str | List[str]): Either a single passage or a
            list of passages.

    Returns:
        str: The prompt, one instruction per line with no leading
        indentation, ending with an ``ANSWER:`` line.
    """
    # A bare string is treated as a single passage.
    if isinstance(relevant_passages, str):
        relevant_passages = [relevant_passages]

    flattened = [passage.replace("\n", " ") for passage in relevant_passages]
    combined_passages = "\n---\n".join(flattened)

    # Assemble line by line instead of using an indented triple-quoted
    # literal, so no source-code indentation leaks into the prompt.
    prompt_lines = [
        "You are a helpful and informative bot that answers questions using "
        "text from the reference passages below.",
        "Your response must be direct, no need for preamble or not relevant phrases.",
        "If the passages are irrelevant to the answer, you may ignore them.",
        "",
        f"QUESTION: '{query}'",
        "PASSAGES:",
        combined_passages,
        "",
        "ANSWER:",
        "",
    ]
    return "\n".join(prompt_lines)


In [None]:
from typing import List

def get_gemini_response(
    query: str, context: List[str], model: str = "gemini-2.5-flash-lite"
) -> str:
    """
    Ask a Gemini model to answer the query using the retrieved context.

    The prompt is built with ``make_rag_prompt(query, context)`` and sent
    through a freshly created ``genai.Client()``.

    Args:
        query (str): The original query.
        context (List[str]): Passages returned by the embedding search.
        model (str): Name of the Gemini model to call. Defaults to
            "gemini-2.5-flash-lite".

    Returns:
        str: The text of the model response, or an empty string when the
        response carries no text.
    """
    # NOTE(review): the client is created without an explicit key, so it
    # presumably picks the credentials up from the environment - confirm.
    client = genai.Client()

    response = client.models.generate_content(
        model=model,
        contents=make_rag_prompt(query, context),
    )

    # NOTE(review): response.text is presumably None when no text comes back
    # (e.g. a blocked answer); normalise so the declared str return holds.
    return response.text or ""

# Bringing it all together

In [None]:
# Read the report: load_pdf returns the text of each non-empty page together
# with matching filename / page-number metadata.
mesupres_data, mesupres_metadata = load_pdf("./data/MESUPRES_en_chiffres_MAJ.pdf")

# Store the pages in a named collection, then fetch that collection for querying.
collection_name = 'rag'
load_data(mesupres_data, mesupres_metadata, collection_name)

db = get_db(collection_name=collection_name)

In [None]:
import pandas as pd
import csv

def process_questions_from_csv(
    db,
    csv_path: str,
    output_csv: str = 'submission_file.csv',
    n_results: int = 1,
):
    """
    Answer every question listed in a CSV file and save the results as CSV.

    The input file is read with ``pd.read_csv`` and must provide ``id`` and
    ``question`` columns. For each row the relevant passages are retrieved
    from ``db``, an answer is generated, and one output row is recorded with
    the columns ``id``, ``question``, ``answer``, ``context`` and ``ref_page``.
    The output file is written once, after all questions were processed.

    Args:
        db: Collection passed on to ``get_relevant_passage``.
        csv_path (str): Path to the CSV file containing the questions.
        output_csv (str): Path to the output CSV file.
        n_results (int): Number of passages retrieved per question.
            Defaults to 1.

    Notes:
        ``context`` holds the string form of the retrieved passage list.
        ``ref_page`` is the page number of the first retrieved passage, or
        an empty string when retrieval returned nothing.
    """
    questions = pd.read_csv(csv_path)

    fieldnames = ["id", "question", "answer", "context", "ref_page"]
    results = []

    for _, row in questions.iterrows():
        qid = row["id"]
        question = row["question"]

        print(f"Answering question {qid}: {question}")

        relevant_texts, metadatas = get_relevant_passage(
            question, db, n_results=n_results
        )
        answer = get_gemini_response(question, relevant_texts)
        print(f"Answer: {answer}")

        # Retrieval may come back empty; leave the page blank instead of
        # failing with an IndexError and losing every answer produced so far.
        ref_page = metadatas[0]["page_number"] if metadatas else ""

        results.append({
            "id": qid,
            "question": question,
            "answer": answer,
            "context": str(relevant_texts),
            "ref_page": str(ref_page),
        })

    with open(output_csv, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    print(f"✅ Results written to {output_csv}")

In [None]:
# Answer every question in ./questions.csv; results go to the function's
# default output file.
process_questions_from_csv(db, csv_path='./questions.csv')