In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
from PyPDF2 import PdfReader

# Function to load full text from PDF
def load_constitution(path):
    """Extract the text of every page of a PDF and return it as one string.

    Parameters
    ----------
    path : str or path-like
        Location of the PDF file; handed unchanged to ``PdfReader``.

    Returns
    -------
    str
        The extracted text of each page in document order, with a newline
        appended after every page. A page that yields no text contributes
        only that newline; a document with no pages yields ``""``.
    """
    reader = PdfReader(path)
    # Defensive: treat a falsy extract_text() result (None or "") as empty text
    # so a single text-less page cannot abort the whole load with a TypeError.
    # Joining once also avoids rebuilding the growing string on every page.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

# Load the PDF file. The path is relative, so the PDF must be in the
# directory the notebook kernel is running from.
doc_path = "constitution.pdf"  # Make sure it's in the same folder
raw_text = load_constitution(doc_path)

# Preview only the first 1000 characters as a quick sanity check of the
# extraction (the full document is far too long to print)
print(raw_text[:1000])


 
 
 
 
 
 
 
 
 
 
 
 
 
 
THE 
CONSTITUTION  
OF THE  
ISLAMIC REPUBLIC  
OF 
PAKISTAN  
 
 
 
 
 
 
[As modified upto the  31st May , 2018] 
 
 
 
 
 
 
NATIONAL ASSEMBLY OF PAKISTAN  

PREFACE  
 
 The National Assembly of Pakistan passed the Constitution on   
10th April, 1973, the President of the Assembly authenticated it on 12th 
April, 1973 and the Assembly published the Constitution of the  Islamic 
Republic of Pakistan. Since then, a number of amendments have been 
made therein and it has become necessary and expedient that an up -to-date 
and au thentic version of the Constitution be published by the Assembly.  
 
 This Eighth  Edition, which is intended to provide an updated 
version of the Constitution, incorporates all amendments made in it till 
date.  
 
 
 
TAHIR HUSSAIN  
Secretary , 
National Assembly of  Pakistan . 
ISLAMABAD  : 
The 31st May, 2018 
CONSTITUTION OF PAKI STAN   
 i THE CONSTITUTION OF THE ISLAMIC REPUBLIC OF PAKISTAN  
__________  
 
CONTENTS  
____

In [6]:
# Function to split text into smaller chunks
def chunk_text(text, chunk_size=300):
    """Greedily pack the lines of ``text`` into chunks of about ``chunk_size`` characters.

    Lines are taken in order and joined with a single space. A chunk is closed
    as soon as adding the next line would push it past ``chunk_size``.

    Parameters
    ----------
    text : str
        Text to split; line boundaries are the ``"\\n"`` characters.
    chunk_size : int, optional
        Target maximum chunk length in characters (default 300). A single line
        longer than this is NOT split; it becomes one oversized chunk.

    Returns
    -------
    list of str
        Chunks with leading/trailing whitespace stripped. Chunks that are empty
        after stripping are dropped, so blank input yields ``[]``.
    """
    chunks = []
    current = ""
    for line in text.split("\n"):
        if len(current) + len(line) <= chunk_size:
            current += line + " "
            continue

        # The line does not fit: close the running chunk and start a new one.
        # Skip the flush when nothing but whitespace was collected (e.g. the
        # very first line is already too long, or only blank lines were seen),
        # otherwise empty strings would end up being embedded and indexed.
        finished = current.strip()
        if finished:
            chunks.append(finished)
        current = line + " "

    # Flush whatever is left, again ignoring whitespace-only remainders.
    remainder = current.strip()
    if remainder:
        chunks.append(remainder)
    return chunks

# Split the extracted text using the default chunk size
chunks = chunk_text(raw_text)

# Show the first three chunks, numbered from 1
for number, preview in enumerate(chunks[:3], start=1):
    print(f"\n--- Chunk {number} ---\n{preview}")

print(f"\n✅ Total chunks created: {len(chunks)}")



--- Chunk 1 ---
THE  CONSTITUTION   OF THE   ISLAMIC REPUBLIC   OF  PAKISTAN               [As modified upto the  31st May , 2018]              NATIONAL ASSEMBLY OF PAKISTAN    PREFACE      The National Assembly of Pakistan passed the Constitution on

--- Chunk 2 ---
10th April, 1973, the President of the Assembly authenticated it on 12th  April, 1973 and the Assembly published the Constitution of the  Islamic  Republic of Pakistan. Since then, a number of amendments have been  made therein and it has become necessary and expedient that an up -to-date

--- Chunk 3 ---
and au thentic version of the Constitution be published by the Assembly.      This Eighth  Edition, which is intended to provide an updated  version of the Constitution, incorporates all amendments made in it till  date.         TAHIR HUSSAIN   Secretary ,  National Assembly of  Pakistan .

✅ Total chunks created: 1744


In [8]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained sentence-embedding model (MiniLM is small & fast).
# `embedder` is reused later to embed search queries, so queries and chunks
# are encoded by the same model.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all chunks (`chunks` comes from the chunking cell)
embeddings = embedder.encode(chunks, show_progress_bar=True)

# Report how many embeddings were produced (expected: one per chunk)
print(f"✅ Generated {len(embeddings)} embeddings.")


Batches:   0%|          | 0/55 [00:00<?, ?it/s]

✅ Generated 1744 embeddings.


In [9]:
import faiss
import numpy as np

# Convert the embeddings to a float32 NumPy array before handing them to FAISS
embedding_array = np.array(embeddings).astype("float32")

# Create FAISS index: a flat index using L2 distance, sized by the embedding
# dimension (second axis of the array). Vectors are added in chunk order, so
# row i of the index corresponds to chunks[i].
dimension = embedding_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_array)

# index.ntotal is the number of vectors now stored in the index
print(f"✅ FAISS index created with {index.ntotal} vectors.")


✅ FAISS index created with 1744 vectors.


In [10]:
def search_index(query, k=3):
    """Return the text chunks whose embeddings are closest to ``query``.

    Uses the notebook-level ``embedder`` (query encoding), ``index`` (FAISS
    nearest-neighbour search) and ``chunks`` (row id -> text lookup).

    Parameters
    ----------
    query : str
        Natural-language question or search phrase.
    k : int, optional
        Maximum number of chunks to return (default 3).

    Returns
    -------
    list of str
        Up to ``k`` chunks, in the order FAISS returned them. Fewer than ``k``
        are returned when the index cannot supply that many neighbours.
    """
    # Embed the query as a one-row float32 batch, matching how the index was built
    query_embedding = embedder.encode([query]).astype("float32")

    # Only the row ids are needed; the distances are intentionally discarded
    _, indices = index.search(query_embedding, k)

    # FAISS fills missing neighbours with -1 (e.g. k larger than the index).
    # Indexing ``chunks`` with -1 would silently return the LAST chunk as a
    # bogus hit, so negative ids are dropped.
    return [chunks[i] for i in indices[0] if i >= 0]


In [11]:
# Example query about the Constitution
sample_question = "What is the process of constitutional amendment in Pakistan?"
matched_chunks = search_index(sample_question)

# Print each retrieved chunk with its 1-based rank
for rank, hit in enumerate(matched_chunks, start=1):
    print(f"\n--- Retrieved Chunk {rank} ---\n{hit}")



--- Retrieved Chunk 1 ---
Constitution, has become the respon sibility of the Federal  Government, shall devolve upon the Federal Government.     275. Continuance in office of persons in service of Pakistan, etc.     275. (1) Subject to the Constitution and until law is made under

--- Retrieved Chunk 2 ---
be imposed, as if the law had not been repealed.     CHAPTER 6. – TITLE, COMMENCEMENT AND REPEAL       265. Title of Constitution and commencement     265. (1) This Constitution shall be known as the Constitution of the  Islamic Republic of Pakistan.

--- Retrieved Chunk 3 ---
This Constituent Assembly representing the people of Pakistan  resolves to frame a constitution for the sovereign independent State of  Pakistan;     Wherein the State shall exercise its powers and authority through  the chosen representatives of the people;


In [13]:
from transformers import pipeline

# Load a small extractive QA pipeline (distilbert works well for CPU)
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Combine the retrieved chunks into one context passage
context = "\n".join(matched_chunks)

# Ask the same question the chunks were retrieved for. Reusing `sample_question`
# (instead of re-typing the literal) keeps retrieval and answering in sync when
# the question is changed. The pipeline is called with explicit keyword arguments.
response = qa_pipeline(question=sample_question, context=context)

print(f"Answer: {response['answer']}")


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


Answer: frame a constitution for the sovereign independent State of  Pakistan


In [14]:
import pickle

# Persist the chunk texts. They are needed together with the index file below,
# because search results are positions in this list.
with open("chunks.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)

# Persist the FAISS index next to the chunk file
faiss.write_index(index, "faiss_index.bin")
