In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install pdfplumber



In [3]:
import pdfplumber

# path to your pdf
pdf_path = "/kaggle/input/costitution/constitution.pdf"

# open the pdf
with pdfplumber.open(pdf_path) as pdf:
    all_text = ""
    for page in pdf.pages:
        text = page.extract_text()
        if text:  # check if text exists
            all_text += text + "\n"

In [4]:
text = all_text[20988:]
text[:500]

'CHAPTER I\nTHE PEOPLE, THE STATE AND SOVEREIGNTY\n1. Sri Lanka (Ceylon) is a Free, Sovereign, The State\nIndependent and Democratic Socialist Republic and shall\nbe known as the Democratic Socialist Republic of Sri\nLanka.\n2. The Republic of Sri Lanka is a Unitary State. Unitary State\n3. In the Republic of Sri Lanka sovereignty is Sovereignty\nin the People and is inalienable. Sovereignty includes the of the\nPeople\npowers of government, fundamental rights and the\nfranchise.\n4. The Sovereignty of the P'

In [5]:
!pip install transformers==4.40.0 torch networkx pyvis tqdm

Collecting transformers==4.40.0
  Using cached transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
Using cached transformers-4.40.0-py3-none-any.whl (9.0 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.44.2
    Uninstalling transformers-4.44.2:
      Successfully uninstalled transformers-4.44.2
Successfully installed transformers-4.40.0


In [6]:
import re
from collections import defaultdict
from itertools import combinations
from typing import List, Dict, Tuple, Any
from transformers import pipeline
import networkx as nx
from pyvis.network import Network
import json
from tqdm import tqdm

# --------- 1) chapter splitting / title extraction ----------
chapter_re = re.compile(
    r'^(chapter)\s+([ivxlcdm]+)\s*$\s*(?P<title>[^\n]+)?', re.IGNORECASE | re.MULTILINE
)
# the above matches lines like "CHAPTER III\nFUNDAMENTAL RIGHTS"
# we'll also accept headings like "chapter 3" by converting roman numerals if needed.

roman_to_int_map = {
    'i':1,'ii':2,'iii':3,'iv':4,'v':5,'vi':6,'vii':7,'viii':8,'ix':9,'x':10,
    'xi':11,'xii':12,'xiii':13,'xiv':14,'xv':15,'xvi':16,'xvii':17,'xviii':18,
    'xix':19,'xx':20
}
def roman_to_int(r):
    r = r.lower()
    return roman_to_int_map.get(r, None)

def split_into_chapters(text: str):
    """
    returns list of dicts: {'chapter_no': 'iii' or '3', 'chapter_title': 'fundamental rights', 'content': '...'}
    uses regex to find chapter starts. if a final trailing section exists it will be included.
    """
    # normalize line endings
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # find all chapter headings (positions)
    heads = []
    for m in chapter_re.finditer(text):
        # capture the whole match span start
        start = m.start()
        chap_word = m.group(2).strip()
        title_guess = m.group('title') or ''
        heads.append((start, chap_word, title_guess.strip()))
    chapters = []
    if not heads:
        # no chapter headings -> whole text as single pseudo-chapter
        chapters.append({'chapter_no': '1', 'chapter_title': '', 'content': text.strip()})
        return chapters
    # append end-of-text sentinel
    heads.append((len(text), None, None))
    for i in range(len(heads)-1):
        start_pos = heads[i][0]
        chap_token = heads[i][1]
        title_guess = heads[i][2]
        end_pos = heads[i+1][0]
        chap_block = text[start_pos:end_pos].strip()
        # try to extract the chapter number and the title more robustly from the block
        # first line may be "CHAPTER III" and following line the title
        lines = chap_block.split('\n')
        chap_no = chap_token
        if chap_no and re.fullmatch(r'[ivxlcdm]+', chap_no.lower()):
            numeric = roman_to_int(chap_no.lower())
            chap_no_out = chap_no  # keep roman; you could also use numeric
        else:
            chap_no_out = chap_no
        # find title: either the rest of the heading or next non-empty line
        title = title_guess
        if not title:
            # look at the first few lines after heading for a plausible title (uppercase lines)
            if len(lines) >= 2:
                cand = lines[1].strip()
                # if it's all uppercase words or title-case, accept
                if cand and (cand.isupper() or len(cand.split()) <= 6):
                    title = cand
                else:
                    title = ''
        content = '\n'.join(lines[1:]).strip() if len(lines) > 1 else ''
        # if content is empty, fallback to entire block (after first newline)
        if not content:
            # remove the heading line and treat rest as content
            if '\n' in chap_block:
                content = chap_block.split('\n',1)[1].strip()
            else:
                content = ''
        chapters.append({
            'chapter_no': str(chap_no_out),
            'chapter_title': title.strip(),
            'content': content
        })

    df_chaps = pd.DataFrame(chapters)
    df_chaps.to_csv('chapters', index=False)
    print(df_chaps.head())
    
    return df_chaps

# --------- 2) normalize broken headings inside content ----------
def normalize_inline_headings(content: str) -> str:
    """
    fixes line breaks inside headings like:
    "Freedom of 10. Every person is entitled to freedom of thought,
    thought,
    conscience and religion..."
    heuristics:
     - if a word fragment repeats on next line, join lines
     - if a line ends with a lower-case token and next line starts with lower-case and both are short, join
    this is heuristic; should work for OCR-broken lines like your example.
    """
    lines = content.split('\n')
    out_lines = []
    for i, line in enumerate(lines):
        if i < len(lines)-1:
            next_line = lines[i+1]
            # if the last token of line is a dangling fragment repeated at start of next, merge
            last_tok = line.strip().split()[-1] if line.strip().split() else ''
            next_first_tok = next_line.strip().split()[0] if next_line.strip().split() else ''
            if last_tok and last_tok == next_first_tok:
                # merge them into a single line (drop duplicate)
                merged = line.rstrip() + ' ' + ' '.join(next_line.strip().split()[1:])
                out_lines.append(merged)
                # skip next line
                lines[i+1] = ''  # clear so it won't be appended later
            elif (len(line.strip()) < 60 and len(next_line.strip()) < 60 and line.strip() != "" and next_line.strip() != "" 
                  and line.strip()[-1].islower() and next_line.strip()[0].islower()):
                # probably a broken sentence; join
                out_lines.append(line.rstrip() + ' ' + next_line.strip())
                lines[i+1] = ''
            else:
                if line.strip() != '':
                    out_lines.append(line)
        else:
            if line.strip() != '':
                out_lines.append(line)
    return '\n'.join([l for l in out_lines if l is not None and l != ''])

# --------- 3) split chapter content into subchunks by simple english letters ----------
subchunk_re_patterns = [
    # patterns that indicate a new subpoint at line start:
    r'^\(\s*([a-z])\s*\)\s+',  # (a) ...
    r'^\s*([a-z])\.\s+',       # a. ...
    r'^\s*([a-z])\)\s+',       # a) ...
    r'^\s*([A-Z])\.\s+',       # A. ...
    r'^\s*\(?\s*([ivxlcdm]+)\s*\)\s+',  # (i) ...
]
subchunk_re = re.compile('|'.join('(?:%s)' % p for p in subchunk_re_patterns), re.MULTILINE)

def split_into_subchunks(content: str) -> List[Dict[str,str]]:
    """
    splits content into subchunks using english letter markers.
    returns list of {'id': 'a', 'text': '...'} or autogenerated id if none found.
    if content doesn't contain submarkers, returns the whole content as a single chunk.
    """
    # find positions of markers
    markers = []
    for m in subchunk_re.finditer(content):
        start = m.start()
        # try to find which capturing group matched to get the letter/token
        groups = m.groups()
        token = None
        for g in groups:
            if g:
                token = g
                break
        markers.append((start, token))
    if not markers:
        return [{'id': '1', 'text': content.strip()}]
    markers.append((len(content), None))
    chunks = []
    for i in range(len(markers)-1):
        s = markers[i][0]
        e = markers[i+1][0]
        chunk_text = content[s:e].strip()
        # extract id from the chunk's leading marker
        m0 = subchunk_re.match(chunk_text)
        if m0:
            # remove the leading marker
            # find first newline or space after marker
            marker_len = m0.end()
            id_token = None
            # find actual letter in groups
            for g in m0.groups():
                if g:
                    id_token = g
                    break
            tidy_text = chunk_text[marker_len:].strip()
            chunks.append({'id': id_token or str(i+1), 'text': tidy_text})
        else:
            # fallback whole chunk
            chunks.append({'id': str(i+1), 'text': chunk_text})
    return chunks


def add_sub_chunks(df):
    sec_chunks = []
    for _, row in df.iterrows():
        chunks = split_into_subchunks(row["content"])
        i = 0
        for chunk in chunks:
            sec_chunks.append({
                'chapter_no': row['chapter_no'],
                'chapter_title': row['chapter_title'],
                'id': str(i+1),
                'content': chunk["text"]            
            })
            i += 1

    final_df = pd.DataFrame(sec_chunks)
    final_df.to_csv('constitution.tsv', index=False)

    return final_df

2025-10-02 05:23:16.500592: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759382596.535976     168 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759382596.546575     168 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
chapters = split_into_chapters(text)
final_df = add_sub_chunks(chapters)

  chapter_no                          chapter_title  \
0          I  THE PEOPLE, THE STATE AND SOVEREIGNTY   
1         II                               BUDDHISM   
2        III                     FUNDAMENTAL RIGHTS   
3         IV                               LANGUAGE   
4          V                            CITIZENSHIP   

                                             content  
0  THE PEOPLE, THE STATE AND SOVEREIGNTY\n1. Sri ...  
1  BUDDHISM\n9. The Republic of Sri Lanka shall g...  
2  FUNDAMENTAL RIGHTS\nFreedom of 10. Every perso...  
3  LANGUAGE\n18. 4[(1)]The Official Language of S...  
4  CITIZENSHIP\nCitizenship 26. (1) There shall b...  


In [8]:
final_df.head()

Unnamed: 0,chapter_no,chapter_title,id,content
0,I,"THE PEOPLE, THE STATE AND SOVEREIGNTY",1,the legislative power of the People shall be\n...
1,I,"THE PEOPLE, THE STATE AND SOVEREIGNTY",2,"the executive power of the People, including\n..."
2,I,"THE PEOPLE, THE STATE AND SOVEREIGNTY",3,the judicial power of the People shall be\nexe...
3,I,"THE PEOPLE, THE STATE AND SOVEREIGNTY",4,the fundamental rights which are by the\n2 The...
4,I,"THE PEOPLE, THE STATE AND SOVEREIGNTY",5,the franchise shall be exercisable at the\nele...


In [9]:
!pip install faiss-cpu rank-bm25



In [10]:
# !pip install -U transformers==4.44.2 sentence-transformers==2.7.0 peft==0.11.1 accelerate soundfile

In [11]:
import os
import re
import pandas as pd
import numpy as np
import faiss
import joblib
from typing import List, Tuple
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")

def build_index(docs):
    embeddings = model.encode(docs, batch_size=32, show_progress_bar=True)
    index = faiss.IndexFlatL2(768)
    index.add(np.array(embeddings, dtype=np.float32))
    faiss.write_index(index, "constitution.faiss")

def build_bm25(docs):
    bm25_corpus = [re.findall(r"\w+", doc.lower()) for doc in docs]
    bm25 = BM25Okapi(bm25_corpus)
    joblib.dump(bm25_corpus, "constitution_bm25.pkl")


build_index(final_df["content"])
build_bm25(final_df["content"])



config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]