In [51]:
# ---- IMPORTS & ENV -----------------------------------------
import os
import re
import sqlite3
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import Optional

from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
from langchain_openai import OpenAIEmbeddings
from langchain_groq import ChatGroq
from langchain_cohere import CohereRerank
from langchain_classic.retrievers.contextual_compression import (
    ContextualCompressionRetriever,
)
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage

load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")
cohere_api_key = os.environ.get("COHERE_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")

COLLECTION_NAME = "compliance_docs"
MODEL_NAME = "openai/gpt-oss-120b"
TOP_K = 6
TOP_N = 3

In [52]:
# ---- PYDANTIC SCHEMA ----------------------------------------
class CitedSource(BaseModel):
    document_name: str = Field(description="Name of the source document")
    page_number: Optional[int] = Field(description="Page number in the document")
    section: Optional[str] = Field(
        description="Section or article reference if mentioned e.g. Section 4.2 "
    )
    law_category: Optional[str] = Field(
        description="Category of the law e.g. Taxation, Data Protection, Corporate/CAC, Employment, General Compliance"
    )


class ComplianceAnswer(BaseModel):
    answer: str = Field(
        description="The compliance answer based strictly on the provided context"
    )
    found_in_docs: bool = Field(
        description="True if the answer was found in the documents, False if not"
    )
    sources: list[CitedSource] = Field(
        description="List of sources that support the answer"
    )
    confidence: str = Field(
        description="How confident the answer is: high, medium, or low"
    )

In [53]:
# ---- LOAD DOCUMENTS -----------------------------------------
loader = DirectoryLoader(
    path="../documents/",
    loader_cls=PyMuPDFLoader,
    show_progress=True,
)
doc = loader.load()
print(f"Loaded {len(doc)} pages")
print(f"Sample: {doc[0].page_content[:100]}...")


# ---- CLEAN DOCUMENTS ----------------------------------------
for d in doc:
    d.page_content = re.sub(
        r"[ \t]+",
        " ",
        re.sub(r"\n{3,}", "\n\n", re.sub(r"\r\n?", "\n", d.page_content)),
    ).strip()

  0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:06<00:00,  2.64it/s]


Loaded 2018 pages
Sample: 2020 No. 3          A 1
Companies and Allied Matters Act, 2020
Federal Republic of Nigeria
Official ...


In [54]:
# ---- CHUNK DOCUMENTS ----------------------------------------
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4",
    chunk_size=900,
    chunk_overlap=150,
    separators=[
        "\nPART ",
        "\nPart ",
        "\nCHAPTER ",
        "\nChapter ",
        "\nSECTION ",
        "\nSection ",
        "\nArticle ",
        "\nARTICLE ",
        "\nSCHEDULE ",
        "\nSchedule ",
        "\n\n",
        "\n(",
        "\n(a)",
        "\n(i)",
        "\n• ",
        "\n- ",
        "\n",
        ". ",
        " ",
        "",
    ],
    add_start_index=True,
)
chunks = splitter.split_documents(doc)
print(f"Created {len(chunks)} chunks")


# Metadata tagging this helps the AI filter by law type during retrieval
for chunk in chunks:
    source_name = chunk.metadata.get("source", "").lower()

    if any(word in source_name for word in ["tax", "nta", "revenue", "firs", "nrs"]):
        chunk.metadata["law_category"] = "Taxation"
    elif any(word in source_name for word in ["data", "ndpa", "privacy", "ndpc"]):
        chunk.metadata["law_category"] = "Data Protection"
    elif any(
        word in source_name for word in ["cama", "cac", "corporate", "annual return"]
    ):
        chunk.metadata["law_category"] = "Corporate/CAC"
    elif any(word in source_name for word in ["labour", "pension", "wage", "employee"]):
        chunk.metadata["law_category"] = "Employment"
    else:
        chunk.metadata["law_category"] = "General Compliance"
print(f"Created {len(chunks)} tagged chunks")

Created 2100 chunks
Created 2100 tagged chunks


In [55]:
dense_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small", api_key=openai_api_key
)
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

print("Ingesting into Qdrant...")
qdrant_store = QdrantVectorStore.from_documents(
    documents=chunks,
    embedding=dense_embeddings,
    sparse_embedding=sparse_embeddings,
    collection_name=COLLECTION_NAME,
    url=qdrant_url,
    api_key=qdrant_api_key,
    retrieval_mode=RetrievalMode.HYBRID,
)
print("Ingestion complete")

Ingesting into Qdrant...
Ingestion complete


In [56]:
# ---- RETRIEVER + COHERE RERANKER ----------------------------
base_retriever = qdrant_store.as_retriever(search_kwargs={"k": TOP_K})

reranker = CohereRerank(
    cohere_api_key=cohere_api_key,
    model="rerank-english-v3.0",
    top_n=TOP_N,
)

retriever = ContextualCompressionRetriever(
    base_compressor=reranker, base_retriever=base_retriever
)

In [57]:
# ---- LLM SETUP + STRUCTURED OUTPUT --------------------------
llm = ChatGroq(model=MODEL_NAME, api_key=groq_api_key, temperature=0)

# Binds the Pydantic schema to the LLM.
structured_llm = llm.with_structured_output(ComplianceAnswer)

In [58]:
# ---- SQLITE CHAT HISTORY SETUP ------------------------------

conn = sqlite3.connect("chat_history.db")
conn.execute(
    """
    CREATE TABLE IF NOT EXISTS messages (
        id        INTEGER PRIMARY KEY AUTOINCREMENT,
        session   TEXT    NOT NULL,
        role      TEXT    NOT NULL,
        content   TEXT    NOT NULL,
        timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    """
)

conn.commit()
conn.close()

In [59]:
# ---- LOAD HISTORY FROM SQLITE -------------------------------
session_id = "session_1"
conn = sqlite3.connect("chat_history.db")
rows = conn.execute(
    "SELECT role, content FROM messages WHERE session = ? ORDER BY id", (session_id,)
).fetchall()
conn.close()


chat_history = []
for role, content in rows:
    if role == "human":
        chat_history.append(HumanMessage(content=content))
    else:
        chat_history.append(AIMessage(content=content))

print(f"Loaded {len(chat_history)} messages from history")

Loaded 4 messages from history


In [60]:
# ---- USER QUESTION ----------
question = "According to the NDPR 2019, what are the specific criteria that mandate a Data Controller to file an annual Data Protection Audit, and what is the annual deadline for this filing?"

CONTEXTUALIZE_PROMPT = """Given the chat history and the latest user question, \
rewrite the question as a fully standalone question that can be understood \
without the chat history. Do NOT answer it, only rewrite it. \
If it is already standalone, return it as is."""

if chat_history:
    contextualize_messages = (
        [SystemMessage(content=CONTEXTUALIZE_PROMPT)]
        + chat_history
        + [HumanMessage(content=question)]
    )
    standalone_question = llm.invoke(contextualize_messages).content
else:
    standalone_question = question

print(f"Standalone question: {standalone_question}")

Standalone question: According to the NDPR 2019, what specific criteria require a Data Controller to file an annual Data Protection Audit, and what is the annual deadline for submitting this audit?


In [61]:
# ---- RETRIEVE + RERANK -------
retrieved_docs = retriever.invoke(standalone_question)
print(f"Retrieved {len(retrieved_docs)} chunks after reranking")

Retrieved 3 chunks after reranking


In [62]:
# ---- FORMAT CONTEXT ----------------------
context_parts = []
for i, d in enumerate(retrieved_docs, 1):
    source = d.metadata.get("source", "Unknown")
    page = d.metadata.get("page", "?")
    category = d.metadata.get("law_category", "General Compliance")
    context_parts.append(
        f"[{i}] Source: {source} | Page: {page} | Category: {category}\n{d.page_content}"
    )

context = "\n\n---\n\n".join(context_parts)

In [63]:
context

"[1] Source: ..\\documents\\NDPR Audit Template.pdf | Page: 19 | Category: General Compliance\n19 | P a g e \n \n✓ to assess the level of compliance with the NDPR \n✓ to evaluate compliance with the organisation's own data protection policy \n✓ to identify potential gaps and weaknesses in organisation’s processes \n✓ to give requisite advice and/or remedial actions for identified gaps \n \n7.1 Audit Periods \nArticle 4.1(7) of the Regulation addresses the period when audit report is to be \nfiled by Data Controllers. The Article provides as follows: \n(7) On annual basis, a Data Controller who processed the Personal \nData of more than 2000 Data Subjects in a period of 12 months shall, \nnot later than the 15th of March of the following year, submit a \nsummary of its data protection audit to the Agency. The data \nprotection audit shall contain information as specified in 4.1(5). \nNon-filing of annual audit report by a Data Controller, as required by NDPR, is a \nprima facie case of 

In [64]:
SYSTEM_PROMPT = """You are a precise compliance assistant. \
Answer using ONLY the provided context from the compliance documents. \
Always cite the source document name, page number, and section if available. \
Set found_in_docs to False and explain that the answer was not found if the context does not support an answer. \
Set confidence based on how clearly the context supports the answer: high, medium, or low."""

messages = (
    [SystemMessage(content=SYSTEM_PROMPT)]
    + chat_history
    + [HumanMessage(content=f"Context:\n{context}\n\nQuestion: {question}")]
)

result = structured_llm.invoke(messages)

print(f"\nAnswer: {result.answer}")
print(f"Found in docs: {result.found_in_docs}")
print(f"Confidence: {result.confidence}")
print("Sources:")
for s in result.sources:
    print(
        f"  - {s.document_name} | Page {s.page_number} | Section {s.section} | Category {s.law_category}"
    )


Answer: A Data Controller must file an annual Data Protection Audit if, in any 12‑month period, it has processed the personal data of more than 2,000 data subjects. The audit summary must be submitted to the Agency no later than 15 March of the following year.
Found in docs: True
Confidence: high
Sources:
  - NDPR Audit Template.pdf | Page 19 | Section 7.1 Audit Periods | Category General Compliance


In [65]:
# ---- SAVE TO SQLITE -------------
conn = sqlite3.connect("chat_history.db")
conn.execute(
    "INSERT INTO messages (session, role, content) VALUES (?, ?, ?)",
    (session_id, "human", question),
)
conn.execute(
    "INSERT INTO messages (session, role, content) VALUES (?, ?, ?)",
    (session_id, "ai", result.answer),
)
conn.commit()
conn.close()

print("\nSaved to history.")


Saved to history.
