In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bills-data/bills.tsv
/kaggle/input/gazettes-data/gazettes.tsv


# Installing Dependencies

In [1]:
!pip install faiss-cpu rank-bm25 google-generativeai

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25, faiss-cpu
Successfully installed faiss-cpu-1.12.0 rank-bm25-0.2.2


# **Preprocessing Text Data**

The data cleaning below reuses the methods from the [previous preprocessing notebook](https://github.com/niduniDK/LegalAI/blob/main/Notebooks/dp-1-preprocess%20bills.ipynb).

In [2]:
import re
import unicodedata
from typing import Optional

def normalize_unicode(text: str) -> str:
    """Fold unicode text down to its closest plain-ASCII representation.

    The text is decomposed with NFKD so that an accented letter splits into a
    base letter plus combining marks; encoding to ASCII with ``errors='ignore'``
    then drops only the marks (``é`` -> ``e``). With the composed NFKC form the
    whole character was discarded instead. Characters with no ASCII equivalent
    at all (e.g. curly quotes) are still removed.

    Args:
        text: Raw document text.

    Returns:
        The ASCII-only text, or ``text`` unchanged when it is not a string.
    """
    if not isinstance(text, str):
        # Best-effort contract: hand non-string cells (e.g. NaN) back untouched.
        return text
    return unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')

def normalize_whitespace(text: str) -> str:
    """Collapse runs of blanks and runs of newlines, then trim both ends.

    Every run of spaces/tabs becomes one space, and every run of two or more
    consecutive newlines becomes a single newline.
    """
    substitutions = (
        (r'[ \t]+', ' '),    # squeeze horizontal whitespace
        (r'\n{2,}', '\n'),   # squeeze stacked newlines down to one
    )
    try:
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
    except re.error:
        # Keep whatever was normalized so far and just trim it.
        pass
    return text.strip()

import re

def clean_legal_metadata(text: str) -> str:
    """Remove repetitive headers, footers, and metadata from Sri Lankan legal bills.

    Each pattern describes one boilerplate line (anchored with ``^``/``$``).
    The substitutions run with ``re.MULTILINE`` so the anchors match at every
    line boundary; without it they only match at the very start/end of the
    whole document and nothing inside the text is ever removed. ``re.DOTALL``
    is deliberately not used, so ``.*?$`` stops at the end of the current line
    instead of being able to run to the end of the document.

    Args:
        text: Document text, one logical line per ``\\n``-separated line.

    Returns:
        The text with matching boilerplate lines removed, every remaining line
        stripped, and blank lines dropped.
    """
    patterns = [
        # Gazette headers
        r'^THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBLIC OF SRI LANKA\s*Part\s+[IVX]+\s*of\s+[A-Za-z]+\s+\d+,\s+\d+\s*SUPPLEMENT\s*$',
        r'^\(Issued\s+on\s+\d+\.\s*\d+\.\s*\d+\)\s*$',
        # Printing and purchase information
        r'^PRINTED AT THE DEPARTMENT OF GOVERNMENT PRINTING.*?$',
        r'^TO BE PURCHASED AT THE GOVERNMENT PUBLICATIONS BUREAU.*?$',
        r'^Price\s*:\s*Rs\.\s*\d+\.\d+\s*Postage\s*:\s*Rs\.\s*\d+\.\d+\s*$',
        # Bill identifiers
        r'^\d+\s*-PL\s+\d+-\d+\s*\(\d+/\d+\)\s*$',
        # Subscription details
        r'^Annual subscription of English Bills and Acts of the Parliament.*?$',
        r'^Payable to the SUPERINTENDENT, GOVERNMENT PUBLICATIONS BUREAU.*?$',
        # Other repetitive metadata
        r'^N\.B\.- Part [A-Z0-9]+\s*of the Gazette No\.\s*\d+[,\d]*\s*of\s*\d{2}\.\d{2}\.\d{4}\s*was not published\.\s*$',
        r'^Published by Authority\s*$',
    ]

    # Line-anchored, case-insensitive removal of every boilerplate pattern.
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)

    # Strip each surviving line and drop the blank lines left behind by the removals.
    return '\n'.join(line.strip() for line in text.split('\n') if line.strip())



def remove_special_characters(text: str) -> str:
    """Strip every character that is not a word character, whitespace, or one of ``.,;:-/()&``.

    The kept punctuation is the set commonly needed to read legal text.
    """
    # Negated class: anything outside word chars, whitespace and the allowed punctuation.
    disallowed = r'[^\w\s.,;:\-/()&]'
    try:
        return re.sub(disallowed, '', text)
    except re.error:
        return text

def preprocess_legal_document(text: str) -> Optional[str]:
    """Clean one legal document while preserving its core content.

    Pipeline: unicode folding -> boilerplate/metadata removal -> special
    character removal -> whitespace normalization.

    Args:
        text: Raw document text. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing cell, or ``None``) is treated as
            "no document".

    Returns:
        The cleaned text, or ``None`` when ``text`` is not a string. An empty
        string is still returned as an empty string.
    """
    if not isinstance(text, str):
        # Missing cells previously crashed with a TypeError raised from the
        # fallback path below; report them as "no document" instead.
        return None

    try:
        # Step 1: Normalize unicode
        text = normalize_unicode(text)

        # Step 2: Clean specific metadata
        text = clean_legal_metadata(text)

        # Step 3: Remove special characters
        text = remove_special_characters(text)

        # Step 4: Normalize whitespace (this also trims both ends)
        return normalize_whitespace(text)
    except Exception:
        # Deliberate best-effort: if any step fails, fall back to whitespace
        # normalization of whatever has been cleaned so far.
        return normalize_whitespace(text)

# Loading and Cleaning Bills

In [4]:
import pandas as pd
import numpy as np

bills = pd.read_csv("/kaggle/input/bills-data/bills.tsv", sep="\t")
bills.head()


Unnamed: 0,filename,content
0,2010-10-16-2010_E.txt,THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBL...
1,2010-10-17-2010_E.txt,PARLIAMENT OF THE DEMOCRATIC SOCIALIST REPUBLI...
2,2010-10-18-2010_E.txt,PARLIAMENT OF THE DEMOCRATIC SOCIALIST REPUBLI...
3,2010-5-01-2010_E.txt,THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBL...
4,2010-5-02-2010_E.txt,THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBL...


In [5]:
bills["content"] = bills["content"].map(preprocess_legal_document)
bills.head()

Unnamed: 0,filename,content
0,2010-10-16-2010_E.txt,THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBL...
1,2010-10-17-2010_E.txt,PARLIAMENT OF THE DEMOCRATIC SOCIALIST REPUBLI...
2,2010-10-18-2010_E.txt,PARLIAMENT OF THE DEMOCRATIC SOCIALIST REPUBLI...
3,2010-5-01-2010_E.txt,THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBL...
4,2010-5-02-2010_E.txt,THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBL...


In [8]:
from kaggle_secrets import UserSecretsClient

user_secret = UserSecretsClient()

GEMINI_API_KEY = user_secret.get_secret("gemini-api-key")
GROQ_API_KEY = user_secret.get_secret("groq_api_key")
GROQ_MODEL = 'llama3-70b-8192'

def query_groq(prompt: str) -> str:
    """Send ``prompt`` as a single user message to the Groq chat-completions API.

    Uses the module-level ``GROQ_API_KEY`` and ``GROQ_MODEL`` and a temperature
    of 0.0.

    Args:
        prompt: The full prompt text.

    Returns:
        The content of the first choice's message, or the sentinel string
        ``"Error in API request"`` when the request fails or the server answers
        with a non-success status (the error is printed in that case).
    """
    # ``requests`` is used here but is not imported anywhere else in the notebook.
    import requests

    try:
        response = requests.post(
            'https://api.groq.com/openai/v1/chat/completions',
            headers={
                'Authorization': f"Bearer {GROQ_API_KEY}",
                'Content-type': 'application/json'
            },
            json={
                'model': GROQ_MODEL,
                'messages': [{'role': 'user', 'content': prompt}],
                'temperature': 0.0
            },
            # Without a timeout a stalled connection blocks the cell forever.
            timeout=60,
        )
    except requests.RequestException as exc:
        print(f"Error: request failed - {exc}")
        return "Error in API request"

    if not response.ok:
        print(f"Error: {response.status_code} - {response.text}")
        return "Error in API request"

    data = response.json()
    return data['choices'][0]['message']['content']


# The function was originally defined under this misspelled name while the rest
# of the notebook calls ``query_groq``; keep the old name working as an alias.
query_qroq = query_groq

# **Chunking**

Source: [15 Chunking Techniques.](https://www.analyticsvidhya.com/blog/2024/10/chunking-techniques-to-build-exceptional-rag-systems/)

In [9]:
def chunk_df(df, method):
    """Split every document in ``df`` into chunks and return one row per chunk.

    Args:
        df: DataFrame with a ``filename`` column and a ``content`` column.
        method: Callable taking one document's text and returning an iterable
            of chunk strings (e.g. ``paragraph_chunk``).

    Returns:
        DataFrame with columns ``name`` (the source row's filename),
        ``chunk_id`` (0-based position of the chunk within its document) and
        ``content`` (the chunk text). Rows whose content is not a string
        (missing documents) are skipped. The three columns are present even
        when no chunk is produced.
    """
    records = []
    for _, row in df.iterrows():
        text = row["content"]
        if not isinstance(text, str):
            continue
        # Chunk only this row's document. The previous version re-mapped the
        # whole column for every row and stored the entire filename Series.
        for chunk_id, chunk in enumerate(method(text)):
            records.append({
                "name": row["filename"],
                "chunk_id": chunk_id,
                "content": chunk
            })

    return pd.DataFrame(records, columns=["name", "chunk_id", "content"])

# *Sentence-based Chunking*

In [41]:
import spacy

nlp = spacy.load("en_core_web_sm")

def sentence_chunk(text):
    """Split ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Returns the text of each sentence span, in document order.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text)
    return sentences


# *Paragraph-wise Chunking*

In [10]:
def paragraph_chunk(text):
    """Split ``text`` into paragraphs separated by one or more blank lines.

    A "blank line" may contain spaces or tabs. Each paragraph is stripped, and
    empty paragraphs (from stacked blank lines or leading/trailing newlines)
    are dropped so they never become chunks.

    NOTE(review): ``normalize_whitespace`` in this notebook collapses every run
    of newlines into a single ``\\n``, so text that went through
    ``preprocess_legal_document`` has no blank lines left and comes back as a
    single paragraph. Confirm whether that is intended.

    Args:
        text: Document text.

    Returns:
        List of non-empty paragraph strings, in document order.
    """
    stripped_parts = (part.strip() for part in re.split(r'\n\s*\n', text))
    return [part for part in stripped_parts if part]


# *Sliding Window Chunking*

In [None]:
def sliding_window_chunk(text, chunk_size=100, overlap=20):
    """Split ``text`` into overlapping windows of whitespace-separated tokens.

    Consecutive windows start ``chunk_size - overlap`` tokens apart. Iteration
    stops as soon as a window reaches the last token, so no trailing window is
    emitted that is entirely contained in the previous one.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per window; must be positive.
        overlap: Number of tokens shared by consecutive windows; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of windows, each a single-space-joined string. Empty for text
        without tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would give a zero or negative step.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(' '.join(tokens[start:start + chunk_size]))
        if start + chunk_size >= len(tokens):
            # This window already covers the tail of the text.
            break
    return chunks


# *Hierarchical Chunking*

In [None]:
def get_section_keywords(text):
    """Ask the LLM which section keywords occur in ``text``.

    Builds a one-shot prompt: an instruction, one worked example (the full text
    of a Local Authorities bill followed by its ``section_keywords`` list), and
    finally ``text`` itself, interpolated by the f-string on the last prompt line.

    Returns whatever ``query_groq(prompt)`` returns.

    NOTE(review): the helper defined earlier in this notebook is spelled
    ``query_qroq``; presumably that is the function meant here - confirm. That
    helper returns the raw response text (a ``str``), not a parsed list.
    """
    # The example document below is part of the prompt sent to the model; it is
    # runtime text, not documentation.
    prompt = f"""
    You are an intelligent agent. Identify the section keywords in the given text.

    Example: 
        text: "
            "THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBLIC OF SRI LANKA Part II of October 01, 2010 SUPPLEMENT (Issued on 04.10.2010) LOCAL AUTHORITIES (SPECIAL PROVISIONS) BILL to amend the Municipal Councils Ordinance, the Urban Councils Ordinance and the Pradeshiya Sabhas Act, No. 15 of 1987 Ordered to be published by the Minister of Local Government and Provincial Councils PRINTED AT THE DEPARTMENT OF GOVERNMENT PRINTING, SRI LANKA TO BE PURCHASED AT THE GOVERNMENT PUBLICATIONS BUREAU, COLOMBO 5 Price : Rs. 8.00 Postage : Rs. 5.00
    
        Local Authorities (Special Provisions) Short title. L.D.-O. 19/2008 AN ACT TO AMEND THE MUNICIPAL COUNCILS ORDINANCE, THE URBAN COUNCILS ORDINANCE AND THE PRADESHIYA SHABHAS ACT, NO. 15 OF 1987. BE it enacted by the Parliament of the Democratic Socialist Republic of Sri Lanka as follows:- This Act may be cited as the Local Authorities (Special Provisions) Act, No. of 2010. PART I AMENDMENTS TO THE MUNICIPAL COUNCILS ORDINANCE (CHAPTER 252) Section 5 of the Minicipal Councils Ordinance (CHAPTER 252) (hereinafter in this Part referred to as the “principal enactment”) is hereby repealed and the following section is substituted therefor :- (1) Each Municipal Council shall consist of :- (a) such number of elected Councillors as determined by the Minister by Order made under section 3c of the Local Authorities Elections Ordinance (Cap. 262); and (b) such number of other Councillors not exceeding thirty per centum of the total number of elected Councillors as determined by the Minister by Order Replacement of section 5 of Chapter 252. “Composition of Municipal Councils.
        
        Local Authorities (Special Provisions) made under section 3C of the Local Authorities Elections Ordinance (Cap. 262), to be returned as Councillors under section 65A of that Ordinance, to represent those electors who have not secured any representation in the Council, at the election held for the election of Councillors.”; and (2) Where the number constituting thirty per centum referred to in paragraph (b) of subsection (1) is an integer and fraction, the integer shall be deemed to be the number which shall constitute such thirty per centum, for the purpose of that subsection.”. Section 13 of the principal enactment is hereby amended in subsection (3) of that section, by the substitution for all the words from “with the provisions of the Local Authorities Elections Ordinance,” to the end of that subsection, and the substitution therefore of the words “with the provisions of section 66A of the Local Authorities Elections Ordinance (Cap. 262), and the person so elected shall hold office as a Councillor, until the next succeeding general election of Councillors of that Council.”. Section 14 of the principal enactment as amended by Law No. 24 of 1977, is hereby further amended as follows:- (1) by the repeal of paragraph (b) of subsection (2) of that section, and the substitution therefor of the following paragraph- “(b) a Mayor or Deputy Mayor who resigns or vacates his office, shall however continue to be a Councillor.”; and Amendment of section 13 of the principal enactment. Amendment of section 14 of the principal enactment.
        
        Local Authorities (Special Provisions) (2) by the repeal of subsection (7) of that section and the substitution therefor of the following subsection:- “(7) Whenever the office of Mayor of a Municipal Council falls vacant, notice of such vacancy shall forthwith be given by the Commissioner to the Commissioner of Local Government and the Commissioner of Local Government shall thereupon proceed to fill such vacancy in the manner provided for the same in the Local Authorities Elections Ordinance (Cap. 262).”. Section 215A of the principal enactment is hereby amended as follows:- (1) by the substitution for all the words beginning from the words “Where a budget or supplementary budget” to the end of that section, of the following words- “Where a budget is not passed by the Council within two weeks after it is resubmitted before such Council, the Mayor shall be deemed, at the expiry of such two weeks period, to have resigned from the office of Mayor.”; and (2) by the substitution for the marginal note to that section of the following marginal note:- “ Effect of not passing the budget by the Council.”. Amendment of section 215A of the principal enactment.
        
        Local Authorities (Special Provisions) PART II AMENDMENTS TO THE URBAN COUNCILS ORDINANCE (CHAPTER 255) Section 5 of the Urban Councils Ordinance (Chapter 255) (hereinafter in this Part referred to as the “principal enactment”) is hereby repealed and the following section is substituted therefore:- (1) Each Urban Council shall consist of :- (a) such number of elected Councillors as determined by the Minister by Order made under section 3C of the Local Authorities Elections Ordinance (Cap. 262) ; and (b) such number of other Councillors not exceeding thirty per centum of the total number of elected Councillors as determined by the Minister by Order made under section 3C of the Local Authorities Elections Ordinance (Cap. 262), to be returned as Councillors under section 65A of that Ordinance, to represent those electors who have not secured any representation in the Council, at the election held for the election of Councillors.”; and (2) Where the number constituting thirty per centum referred to in paragraph (b) of subsection (1) is an integer and fraction, the integer shall be deemed to be the number which shall constitute such thirty per centum, for the purpose of that subsection.”. “Composition of Municipal Councils. Replacement of section 5 of Chapter 255.
        
        Local Authorities (Special Provisions) Section 12 of the principal enactment is hereby amended in subsection (3) of that section, by the substitution for the words “the provisions of written law for the time being applicable in that behalf,” of the words “the provisions of section 66A of the Loacl Authorities Elections Ordinance (Cap. 262)”. Section 19 of the principal enactment as amended by Law No. 24 of 1977, is hereby further amended as follows:- (1) in subsection (1) of that section, by the substitution for the words “in accordance with the provisions of written law for the time being applicable in that behalf.”, of the words “in accordance with the provisions of the Local Authorities Elections Ordinance (Cap. 262).”; (2) in subsection (2) of that section, by the substitution for all the words from the words “vacates such office.”, to the end of that subsection, of the words “vacates such office. A Chairman or Vice-Chairman who resigns or vacates his office shall however continue to be a member of the Council.”; and (3) by the repeal of subsection (7) of that section and the substitution therefore of the following subsection :- “(7) Whenever the office of Chairman of an Urban Council falls vacant, notice of such vacancy shall forthwith be given by the Secretary of the Council to the Commissioner of Local Government and the Commissioner of Local Government shall thereupon proceed to fill such vacancy in the manner provided for the same in the Local Authorities Elections Ordinance (Cap 262).”. Amendment of section 12 of the principal enactment. Amendment of section 19 of the principal enactment.
        
        Local Authorities (Special Provisions) Section 178A of the principal enactment as amended by Law No. 24 of 1977, is hereby further amended as follows:- (1) by the substitution for all the words beginning from the words “Where a budget or supplementary budget,” to the end of that section, of the following words :- “Where a budget is not passed by the Council within two weeks after it is re-submitted before such Council, the Chairman shall be deemed, at the expiry of such two weeks period, to have resigned from the office of Chairman.”; and (2) by the substitution for the marginal note to that section of the following marginal note:- “ Effect of not passing the budget by the Council.”. 10. Section 184 of the principal enactment as amended by Law No. 24 of 1977, is hereby further amended in subsection (3) of that section, by the substitution for the words “and the provisions of written law for the time being applicable in that behalf “, of the words “and the provisions of the Local Authorities Elections Ordinance (Cap. 262)”. 11. Section 249 of the principal enactment as amended by Law No. 24 of 1977, is hereby further amended in the definition of the expression “Chairman and Vice Chairman”, by the substitution for the words “the provisions of written law for the time being applicable in that behalf;”, of the words “the provisions of the Local Authorities Elections Ordinance (Cap. 262);”. Amendment of section 249 of the principal enactment. Amendment of section 184 of the principal enactment. Amendment of section 178A of the principal enactment.
        
        Local Authorities (Special Provisions) PART III AMENDMENTS TO THE PRADESHIYA SABHA ACT 12. Section 4 of the Pradeshiya Sabha Act, No. 15 of 1987 (hereinafter in this Part referred to as the “principal enactment”) is hereby repealed and the following section is subtituted therefore:- 4. (1) A Pradeshiya Sabha constituted by an Order under subsection (1) of section 2, shall consist of:- (a) such number of elected members as determined by the Minister by Order made under section 3C of the Local Authorities Elections Ordinance ( Cap. 262); and (b) such number of other members not exceeding thirty per centum of the total number of elected members as determined under paragraph (a), to be returned as members under the Local Authorities Elections Ordinance (Cap. 262), to represent those electors who have not secured any representation in the Sabha, at an election held for the election of members . (2) Where the number constituting thirty per centum referred to in paragraph (b) of subsection (1) is an integer and fraction, the integer shall be deemed to be the number which shall constitute such thirty per centum, for the purpose of that subsection.”. “Composition of Pradeshiya Sabhas. Amendment of section 4 of Act No. 15 of 1987.
        
        Local Authorities (Special Provisions) 13. Section 169 of the principal enactment is hereby amended as follows:- (1) by the substitution for all the words beginning from the words “Where a budget or supplem entary budget,” to the end of that section, of the following words :- “Where a budget is not passed by the Pradeshiya Sabha within two weeks after it is resubmitted before such Pradeshiya Sabha, the Chairman shall be deemed, at the expiry of such two weeks period, to have resigned from the office of Chairman.”; and (2) by the substitution for the marginal note to that section of the following marginal note:- “Effect of not passing the budget by the Pradeshiya Sabha.”. 14. In the event of any inconsistency between the Sinhala and Tamil texts of this Act, the Sinhala text shall prevail. Amendment of section 169 of the principal enactment. Sinhala text to prevail in case of inconsistency.
        
        Local Authorities (Special Provisions) Annual subscription of English Bills and Acts of the Parliament Rs. 885 (Local), Rs. 1,180 (Foreign), Payable to the SUPERINTENDENT, GOVERNMENT PUBLICATIONS BUREAU, DEPARTMENT OF GOVERNMENT INFORMATION, NO. 163, KIRULAPONA MAWATHA, POLHENGODA, COLOMBO 05 before 15th December each year in respect of the year following."
        
            "
        section_keywords: [
            "PART I", "PART II", "PART III", "Section ", "Replacement of section", "Amendment of section","LOCAL AUTHORITIES", "Short title"
        ]

    Use the {text} as the text
    """

    # One LLM request per call; the response is passed back to the caller unparsed.
    return query_groq(prompt)


In [None]:
def _parse_section_keywords(raw):
    """Turn keyword input (a list, or raw LLM response text) into a list of non-blank keywords."""
    if raw is None:
        return []
    if isinstance(raw, str):
        # The LLM answers with free text such as: section_keywords: ["PART I", "Section "].
        # Only double-quoted items are treated as keywords, so an error message
        # or other unquoted chatter yields no keywords at all.
        candidates = re.findall(r'"([^"\n]+)"', raw)
    else:
        candidates = list(raw)
    # A blank keyword would match every line ("" in line is always True).
    return [keyword for keyword in candidates if isinstance(keyword, str) and keyword.strip()]


def hierarchical_chunk(text, section_keywords=None):
    """Split ``text`` into sections, starting a new section at each line that contains a keyword.

    Args:
        text: Document text; it is processed line by line.
        section_keywords: Optional keywords marking the start of a section,
            either as an iterable of strings or as raw LLM response text
            containing double-quoted keywords. When omitted, the keywords are
            requested via ``get_section_keywords(text)``.

    Returns:
        List of sections, each the ``\\n``-joined lines from one keyword line
        up to (not including) the next. Lines before the first keyword line
        form the first section. With no usable keywords the whole text is one
        section; empty text gives an empty list.
    """
    if section_keywords is None:
        section_keywords = get_section_keywords(text)
    # Previously the raw response string was iterated directly, so every single
    # character of the LLM answer acted as a "keyword".
    keywords = _parse_section_keywords(section_keywords)

    sections = []
    current_section = []
    for line in text.splitlines():
        if any(keyword in line for keyword in keywords):
            if current_section:
                sections.append("\n".join(current_section))
            current_section = [line]
        else:
            current_section.append(line)
    if current_section:
        sections.append("\n".join(current_section))
    return sections

# # Applying Hierarchical Chunking
# section_keywords = ["Introduction", "Overview", "Methods", "Conclusion"]
# hierarchical_chunks = hierarchical_chunk(sample_text, section_keywords)
# for chunk in hierarchical_chunks:
#     print(chunk, '\n---\n')
    

# *Sliding Window + Paragraph*

In [None]:
def hybrid_chunk_sliding_para(text):
    """Chunk ``text`` by paragraph, then cut each paragraph into sliding windows.

    Each paragraph from ``paragraph_chunk`` is passed to ``sliding_window_chunk``
    with 100-token windows overlapping by 20 tokens; the windows of all
    paragraphs are returned as one flat list, in order.
    """
    windows = []
    for paragraph in paragraph_chunk(text):
        windows += sliding_window_chunk(paragraph, chunk_size=100, overlap=20)
    return windows


# *Sliding Window + Sentence*

In [None]:
def hybrid_chunk_sliding_sentence(text):
    """Chunk ``text`` by sentence, then cut each sentence into sliding windows.

    Each sentence from ``sentence_chunk`` is passed to ``sliding_window_chunk``
    with 100-token windows overlapping by 20 tokens; the windows of all
    sentences are returned as one flat list, in order.
    """
    windows = []
    for sentence in sentence_chunk(text):
        windows += sliding_window_chunk(sentence, chunk_size=100, overlap=20)
    return windows

# **Implementing RAG**

In [None]:
# Build one chunk table per chunking strategy from the cleaned bills.
bills_para_chunks = chunk_df(bills, paragraph_chunk)
# sentence_chunk relies on the spaCy pipeline `nlp` loaded in an earlier cell.
bills_sentence_chunks = chunk_df(bills, sentence_chunk)
bills_sliding_chunks = chunk_df(bills, sliding_window_chunk)
# hierarchical_chunk calls get_section_keywords (an LLM request) for each text it is given.
bills_hierarchical_chunks = chunk_df(bills, hierarchical_chunk)
# Hybrid strategies: paragraph or sentence split first, then sliding windows.
bills_hybrid_sp_chunks = chunk_df(bills, hybrid_chunk_sliding_para)
bills_hybrid_ss_chunks = chunk_df(bills, hybrid_chunk_sliding_sentence)

# **Retriever Module Implementation**

In [17]:
pip install sentence_transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

In [20]:
# from sentence_transformers import SentenceTransformers
import google.generativeai as genai

genai.configure(api_key=GEMINI_API_KEY)

# Vanilla RAG Approach

In [None]:
from typing import List, Tuple


class VanilaRAG:
    """Plain RAG: dense (FAISS) and keyword (BM25) retrieval plus Gemini answer generation.

    The FAISS index, the documents/metadata and the BM25 corpus are cached in
    the working directory (``bills.faiss``, ``bills_data.pkl``,
    ``bills_bm25.pkl``) and reused on the next construction unless
    ``reload=True``.

    NOTE: the cache file names are fixed, so they do not depend on
    ``chunk_df``. Pass ``reload=True`` whenever the chunk table changes,
    otherwise the index of the previously cached table is loaded.
    """

    def __init__(
        self,
        model=None,
        gemini_api_key=None,
        chunk_df=None,
        reload=False,
    ):
        """Load the cached indexes, or build them from ``chunk_df``.

        Args:
            model: Embedding model exposing ``encode``; defaults to the
                ``nlpaueb/legal-bert-base-uncased`` SentenceTransformer.
            gemini_api_key: Optional Gemini API key; when given,
                ``genai.configure`` is called with it.
            chunk_df: DataFrame with a ``content`` column (one chunk per row);
                all other columns are kept as per-chunk metadata. Required
                unless a complete cache exists and ``reload`` is false.
            reload: When true, ignore any cache and rebuild everything.

        Raises:
            ValueError: If the indexes must be built but ``chunk_df`` is
                missing or contains no documents.
        """
        # Third-party packages installed by earlier cells; imported here so the
        # class can be defined regardless of the order those cells were run in.
        import faiss
        import joblib
        from rank_bm25 import BM25Okapi
        from sentence_transformers import SentenceTransformer

        index_path = "bills.faiss"
        data_path = "bills_data.pkl"
        bm25_path = "bills_bm25.pkl"

        # The client and embedding model are needed by retrieve() and
        # generate_response() in both the cached and the freshly built case.
        if gemini_api_key:
            genai.configure(api_key=gemini_api_key)
        self.client = genai.GenerativeModel("gemini-2.0-flash")
        self.model = model if model is not None else SentenceTransformer('nlpaueb/legal-bert-base-uncased')
        self.chunk_df = chunk_df

        # Load cached data if it exists and reload not requested
        cache_ready = all(os.path.exists(path) for path in (index_path, data_path, bm25_path))
        if not reload and cache_ready:
            self.index = faiss.read_index(index_path)
            # NOTE(security): joblib.load unpickles the file; only load cache
            # files that this notebook wrote itself.
            data = joblib.load(data_path)
            self.documents = data['documents']
            self.metadata = data['metadata']
            self.bm25_corpus = joblib.load(bm25_path)
            self.bm25 = BM25Okapi(self.bm25_corpus)
            return

        if chunk_df is None:
            raise ValueError("chunk_df is required when no cached index is available")

        self.documents = chunk_df["content"].to_list()
        if not self.documents:
            raise ValueError("chunk_df contains no documents to index")
        # Everything except the chunk text travels along as metadata.
        self.metadata = chunk_df.drop(columns=["content"]).to_dict("records")

        embeddings = np.asarray(
            self.model.encode(self.documents, batch_size=32, show_progress_bar=True),
            dtype=np.float32,
        )
        # The index dimension is taken from the embeddings themselves.
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)

        # Save FAISS index and data
        faiss.write_index(self.index, index_path)
        joblib.dump({'documents': self.documents, 'metadata': self.metadata}, data_path)

        # Build and save BM25 index
        self.build_bm25()
        joblib.dump(self.bm25_corpus, bm25_path)

    def build_bm25(self):
        """Tokenize ``self.documents`` into lowercase word tokens and build the BM25 index."""
        from rank_bm25 import BM25Okapi

        self.bm25_corpus = [re.findall(r"\w+", doc.lower()) for doc in self.documents]
        self.bm25 = BM25Okapi(self.bm25_corpus)

    def retrieve(self, query: str, k: int = 5) -> List[Tuple[str, dict, float]]:
        """Return up to ``k`` nearest chunks by embedding distance.

        Returns:
            ``(document, metadata, score)`` tuples in the order returned by the
            index, where ``score = 1 / (1 + L2 distance)``.
        """
        # FAISS embedding retrieval
        query_embedding = self.model.encode([query])[0]
        distances, indices = self.index.search(np.array([query_embedding], dtype=np.float32), k)

        results = []
        for idx, distance in zip(indices[0], distances[0]):
            # Guard against out-of-range labels; a negative label (used as
            # padding when fewer than k hits exist) would otherwise silently
            # index from the end of the list.
            if 0 <= idx < len(self.documents):
                score = float(1 / (1 + distance))  # Convert L2 distance to similarity score
                results.append((self.documents[idx], self.metadata[idx], score))
        return results

    def bm25_retrieve(self, query: str, k: int = 5) -> List[Tuple[str, dict, float]]:
        """Return up to ``k`` chunks with the highest positive BM25 score, best first."""
        # BM25 keyword retrieval
        tokens = re.findall(r"\w+", query.lower())
        scores = self.bm25.get_scores(tokens)
        top_indices = np.argsort(scores)[::-1][:k]
        results = []
        for i in top_indices:
            if scores[i] > 0:
                results.append((self.documents[i], self.metadata[i], float(scores[i])))
        return results

    def generate_response(self, query: str, retrieved_docs: List[Tuple[str, dict, float]]) -> str:
        """Answer ``query`` with Gemini, using ``retrieved_docs`` as the context.

        Args:
            query: The user question.
            retrieved_docs: ``(document, metadata, score)`` tuples as returned
                by ``retrieve`` or ``bm25_retrieve``.

        Returns:
            The text of the model response.
        """
        context = "\n\n".join([f"Document: {doc[0]}\nMetadata: {doc[1]}" for doc in retrieved_docs])
        prompt = f"""You are a legal assistant powered by a RAG system. Use the following context to answer the query accurately and concisely. If the context doesn't provide enough information, say so.

            Context:
            {context}
            
            Query:
            {query}
            
            Answer:
            """
        response = self.client.generate_content(prompt)
        return response.text

# GraphRAG Approach

In [11]:
pip install langextract

Collecting langextract
  Downloading langextract-1.0.9-py3-none-any.whl.metadata (19 kB)
Collecting async_timeout>=4.0.0 (from langextract)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting exceptiongroup>=1.1.0 (from langextract)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting python-dotenv>=0.19.0 (from langextract)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading langextract-1.0.9-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.2/106.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading async_timeout-5.0.1-py3-none-any.whl (6.2 kB)
Downloading exceptiongroup-1.3.0-py3-none-any.whl (16 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, exceptiongroup, async_timeout, langextract
Successfully installed async_timeout-5.0.1 exceptiongroup-1.3.0 langextract-1.0.9 python-dotenv-1.1.1
Not

In [53]:
import networkx as nx
from pprint import pprint
import textwrap
import langextract as lx

# Neo4j connection settings.
# The password must never be committed in a notebook: it is read from the Kaggle
# secret "neo4j_password" (add it under Add-ons -> Secrets), the same mechanism
# this notebook uses for the Gemini and Groq keys.
# NOTE(security): the password that used to be hardcoded here has been exposed
# and should be rotated in the Neo4j console.
neo4j_username = "neo4j"
neo4j_id = "7c554630"
neo4j_pwd = UserSecretsClient().get_secret("neo4j_password")
neo4j_url = f"neo4j+s://{neo4j_id}.databases.neo4j.io"

class GraphRAG:
    """Knowledge-graph RAG over chunked documents.

    Entity/relation triplets are extracted from every chunk with langextract
    and stored in an undirected networkx graph (nodes are entity strings, edges
    carry a ``relation`` attribute). A query is answered by extracting its
    entities, collecting the matching graph nodes plus their direct neighbors,
    and passing the resulting edges to Gemini as context.

    The graph is built by ``build_kg``. ``generate_response`` builds it
    automatically on first use if that has not happened yet; this sends one
    extraction request per document chunk.
    """

    def __init__(self, chunk_df):
        """Store the chunks and create an empty graph.

        Args:
            chunk_df: DataFrame with a ``content`` column, one chunk per row.
        """
        self.chunk_df = chunk_df
        self.documents = chunk_df["content"].to_list()
        self.G = nx.Graph()
        self.client = genai.GenerativeModel("gemini-2.0-flash")
        self._kg_built = False

    def _extract(self, text, prompt, examples):
        """Run one langextract extraction with the shared model settings."""
        return lx.extract(
            text_or_documents=text,
            prompt_description=prompt,
            examples=examples,
            model_id="gemini-2.0-flash",
            api_key=GEMINI_API_KEY,
        )

    def get_triplets(self, text):
        """Extract entities and relationships from ``text``.

        Returns the langextract result. Relationship extractions are expected
        to carry ``subject`` and ``object`` attributes, as in the example given
        to the model.
        """
        prompt = textwrap.dedent("""
        Extract entity relation triplets from the given text. Use exact text for extraction. Do not paraphrase or overlap entities.
        Provide meaningful attributes to add context.
        """)

        examples = [
            lx.data.ExampleData(
                text="Social Security Board (Amendment) Act, No. 33 of 1999 amends the Social Security Board Act, No. 17 of 1996.",
                extractions=[
                    lx.data.Extraction(
                        extraction_class="law",
                        extraction_text="Social Security Board (Amendment) Act, No. 33 of 1999",
                    ),
                    lx.data.Extraction(
                        extraction_class="law",
                        extraction_text="Social Security Board Act, No. 17 of 1996",
                    ),
                    lx.data.Extraction(
                        extraction_class="relationship",
                        extraction_text="amends",
                        attributes={"subject": "Social Security Board (Amendment) Act, No. 33 of 1999",
                                    "object": "Social Security Board Act, No. 17 of 1996"}
                    ),
                ]
            )
        ]

        return self._extract(text, prompt, examples)

    def build_kg(self):
        """Extract triplets from every document and add them to the graph.

        Only extractions that carry both a ``subject`` and an ``object``
        attribute become triplets; plain entity extractions (which have no such
        attributes) are skipped instead of raising.

        Returns:
            List of ``{"subject", "object", "relation"}`` dicts, in extraction order.
        """
        triplets = []
        for text in self.documents:
            annotated_doc = self.get_triplets(text)
            for ex in annotated_doc.extractions or []:
                attributes = ex.attributes or {}
                subject = attributes.get("subject")
                obj = attributes.get("object")
                if not subject or not obj:
                    continue
                triplets.append({
                    "subject": subject,
                    "object": obj,
                    "relation": ex.extraction_text
                })

        for triplet in triplets:
            self.G.add_node(triplet["subject"], type="entity")
            self.G.add_node(triplet["object"], type="entity")
            self.G.add_edge(triplet["subject"], triplet["object"], relation=triplet["relation"])

        self._kg_built = True
        return triplets

    def retrieve_subgraph(self, query):
        """Return the subgraph around the graph nodes mentioned in ``query``.

        A node is a seed when the text of any entity extracted from the query
        occurs in it (case-insensitive). The subgraph contains the seeds and
        their direct neighbors.
        """
        query_entities = self.extract_entities(query)
        seeds = set()
        for entity in query_entities.extractions or []:
            needle = (entity.extraction_text or "").strip().lower()
            if not needle:
                # An empty needle would match every node.
                continue
            seeds.update(node for node in self.G.nodes if needle in str(node).lower())

        # Expand into a separate set: growing a set while iterating over it
        # raises RuntimeError, and the neighbors must be added one by one
        # rather than as a single iterator object.
        subgraph_nodes = set(seeds)
        for seed in seeds:
            subgraph_nodes.update(self.G.neighbors(seed))

        return self.G.subgraph(subgraph_nodes)

    def extract_entities(self, text):
        """Extract the entities mentioned in ``text`` (used for queries).

        Returns the langextract result; callers read its ``extractions``.
        """
        prompt = textwrap.dedent("""
        Extract entity relation triplets from the given text. Use exact text for extraction. Do not paraphrase or overlap entities.
        Provide meaningful attributes to add context. Include attributes like entity type (law, organization, person, committee, etc.) 
        and relationship type (amends, renames, appoints, establishes, provides, collaborates_with, etc.).
        """)

        examples = [
            lx.data.ExampleData(
                text="Social Security Board (Amendment) Act, No. 33 of 1999 amends the Social Security Board Act, No. 17 of 1996.",
                extractions=[
                    lx.data.Extraction(
                        extraction_class="law",
                        extraction_text="Social Security Board (Amendment) Act, No. 33 of 1999",
                    ),
                    lx.data.Extraction(
                        extraction_class="law",
                        extraction_text="Social Security Board Act, No. 17 of 1996",
                    ),
                ]
            )
        ]

        return self._extract(text, prompt, examples)

    def serialize_subgraph(self, subgraph):
        """Render the edges of ``subgraph`` as ``subject → relation → object`` lines.

        Edges without a ``relation`` attribute use ``related_to``.
        """
        triples = []
        for u, v, data in subgraph.edges(data=True):
            relation = data.get("relation", "related_to")
            triples.append(f"{u} → {relation} → {v}")
        return "\n".join(triples)

    def generate_response(self, query) -> str:
        """Answer ``query`` with Gemini, using the relevant part of the knowledge graph as context.

        Builds the knowledge graph first if ``build_kg`` has not been run yet;
        without it the graph is empty and the model would get no context.
        """
        if not getattr(self, "_kg_built", False):
            self.build_kg()

        subgraph = self.retrieve_subgraph(query)
        kg_context = self.serialize_subgraph(subgraph)
        prompt = f"""You are a legal assistant powered by a RAG system. Use the following knowledge graph context to answer the query accurately and concisely. If the context doesn't provide enough information, say so.

            Context:
            {kg_context}
            
            Query:
            {query}
            
            Answer:
            """

        response = self.client.generate_content(prompt)
        return response.text

In [3]:
# Demo: chunk the cleaned bills by paragraph and wrap the chunks in a GraphRAG instance.
bills_para_chunks = chunk_df(bills, paragraph_chunk)
rag = GraphRAG(bills_para_chunks)

# Sample question to put to the system.
query = "What are the main objectives of the Jayanthipura association in community welfare and environment?"

# Last expression of the cell, so the generated answer is shown as the cell output.
rag.generate_response(query)