In [1]:
import fitz  # PyMuPDF
import os

# Folder containing your PDFs
pdf_folder = "./KnowledgeBase"

# Define chunk size
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def split_text(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

# Load and process all PDFs
all_chunks = []

for file in os.listdir(pdf_folder):
    if file.endswith(".pdf"):
        path = os.path.join(pdf_folder, file)
        print(f"Processing: {file}")
        text = extract_text_from_pdf(path)
        chunks = split_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "file": file,
                "chunk_number": i + 1,
                "content": chunk
            })

print(f"\nTotal chunks created: {len(all_chunks)}")

# Show sample chunks
for i in range(min(3, len(all_chunks))):
    print(f"\n--- Chunk {i+1} from {all_chunks[i]['file']} ---")
    print(all_chunks[i]["content"][:500], "...\n")


Processing: 36328.pdf
Processing: Code_of_criminal_procedure_1898.pdf
Processing: Code_of_Criminal_Procedure_1898_incorporating_amendments_to_16_February_2017.pdf
Processing: Domestic Violence (Prevention and Protection) Act, 2013 & Rules, 2016 (Amendments upto date).pdf
Processing: Domestic-Violence-in-Pakistan.pdf
Processing: domestic-violence-laws-and-their-legal-framework-for-women-in-pakistan-an-analysis.pdf
Processing: ha.pdf
MuPDF error: format error: No default Layer config

Processing: Pakistan Penal Code.pdf
Processing: Pakistan_Penal_Code_1860_incorporating_amendments_to_16_February_2017 (1).pdf
Processing: Pakistan_Penal_Code_1860_incorporating_amendments_to_16_February_2017.pdf
Processing: Protection-of-Women-Criminal-Laws-Amendment-Act-2006-Editors-Comment-PLR-Vol-III.pdf
Processing: PUNJAB_PROTECTION_OF_WOMEN_AGAINST_VIOLENCE_ACT_2016 (1).pdf
Processing: PUNJAB_PROTECTION_OF_WOMEN_AGAINST_VIOLENCE_ACT_2016.pdf
Processing: the_code_of_criminal_procedure_1898.pdf
Processin

In [6]:
import chromadb
from sentence_transformers import SentenceTransformer
import uuid

# Initialize Chroma client and collection
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="paklegal_laws")

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Insert chunks
for chunk in all_chunks:
    content = chunk['content']
    metadata = {"file": chunk['file'], "chunk_id": chunk['chunk_number']}
    embedding = model.encode(content).tolist()
    doc_id = str(uuid.uuid4())
    
    collection.add(
        documents=[content],
        embeddings=[embedding],
        metadatas=[metadata],
        ids=[doc_id]
    )

print(f"✅ Stored {len(all_chunks)} chunks into ChromaDB")

✅ Stored 4393 chunks into ChromaDB


In [7]:
import faiss
import numpy as np

# Prepare data
texts = [chunk['content'] for chunk in all_chunks]
metadatas = [{"file": chunk['file'], "chunk_id": chunk['chunk_number']} for chunk in all_chunks]
embeddings = model.encode(texts, convert_to_numpy=True)

# Create FAISS index (dimension must match embedding size)
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)

print(f"✅ FAISS index created with {len(texts)} documents.")

✅ FAISS index created with 4393 documents.


In [8]:
faiss.write_index(faiss_index, "faiss_index.idx")
print("✅ FAISS index saved to faiss_index.idx")

✅ FAISS index saved to faiss_index.idx


In [11]:
# query = "man slapped his wife what is further legal proceedure."
# query = "What is the punishment for kidnapping a person in Pakistan?"
# query = "What law protects women from domestic abuse in Punjab?"
query = "What are the legal provisions and procedural steps under Pakistani law, including the Pakistan Penal Code and Domestic Violence Acts, in the case of a husband committing physical assault or criminal force against his wife?"
query_embedding = model.encode(query).tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=10
)
output_file = "TempOutput.txt"
# Display top 3 results
# for i, doc in enumerate(results["documents"][0]):
#     print(f"\n--- Result {i+1} ---")
#     print(doc[:1000], "...")
#     print("Metadata:", results["metadatas"][0][i])

with open(output_file, "w", encoding="utf-8") as f:
    for i, doc in enumerate(results["documents"][0]):
        metadata = results["metadatas"][0][i]
        f.write(f"--- Result {i+1} ---\n")
        f.write(f"Source File: {metadata.get('file', 'N/A')}\n")
        f.write(f"Chunk ID: {metadata.get('chunk_id', 'N/A')}\n")
        f.write("Content:\n")
        f.write(doc.strip() + "\n")
        f.write("\n" + "-"*80 + "\n\n")

print(f"✅ All {len(results['documents'][0])} results saved to {output_file}")

✅ All 10 results saved to TempOutput.txt


In [36]:
# Sanity query test
query1 = "What law protects women from domestic abuse in Punjab?"
query_embedding = model.encode([query1], convert_to_numpy=True)

# Perform search (top 3)
D, I = faiss_index.search(query_embedding, k=10)

# Save results to file
output_file_faiss = "TempOutput_FAISS.txt"
with open(output_file_faiss, "w", encoding="utf-8") as f:
    for rank, idx in enumerate(I[0]):
        doc = texts[idx]
        metadata = metadatas[idx]
        f.write(f"--- Result {rank+1} ---\n")
        f.write(f"Source File: {metadata.get('file', 'N/A')}\n")
        f.write(f"Chunk ID: {metadata.get('chunk_id', 'N/A')}\n")
        f.write("Content:\n")
        f.write(doc.strip() + "\n")
        f.write("\n" + "-"*80 + "\n\n")

print(f"✅ Top 3 FAISS results saved to {output_file_faiss}")


✅ Top 3 FAISS results saved to TempOutput_FAISS.txt


In [None]:
import chromadb
from sentence_transformers import SentenceTransformer
import uuid

# Use a better embedding model (higher quality)
model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# New Chroma collection for this model
chroma_client = chromadb.Client()
collection_mpnet = chroma_client.create_collection(name="paklegal_laws_mpnet")

# Embed and insert
for chunk in all_chunks:
    content = chunk['content']
    metadata = {"file": chunk['file'], "chunk_id": chunk['chunk_number']}
    embedding = model.encode(content).tolist()
    doc_id = str(uuid.uuid4())

    collection_mpnet.add(
        documents=[content],
        embeddings=[embedding],
        metadatas=[metadata],
        ids=[doc_id]
    )

print(f"✅ Stored {len(all_chunks)} chunks into ChromaDB using {model_name}")


In [None]:
query2 = "What law protects women from domestic abuse in Punjab?"
query_embedding2 = model.encode(query2).tolist()

results2 = collection_mpnet.query(
    query_embeddings=[query_embedding2],
    n_results=5
)

# Print top results
for i, doc in enumerate(results2["documents"][0]):
    print(f"\n--- Result {i+1} ---")
    print(doc[:1000], "...")
    print("Metadata:", results2["metadatas"][0][i])