In [1]:
import pdfplumber
import os
import json

def extract_pdf_to_json(pdf_path):
    data = {
        "pdf_file": os.path.basename(pdf_path),
        "content": []
    }

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            tables = page.extract_tables()
            table_data = []
            for table in tables:
                table_data.append(table)

            data["content"].append({
                "page_number": page.page_number,
                "text": page_text.strip(),
                "tables": table_data
            })

    return data

def extract_all_pdfs_to_json(pdf_folder, output_json="pdf_data.json"):
    all_data = []
    for file in os.listdir(pdf_folder):
        if file.lower().endswith(".pdf"):
            full_path = os.path.join(pdf_folder, file)
            print(f"Processing {file}...")
            result = extract_pdf_to_json(full_path)
            all_data.append(result)

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Data saved to: {output_json}")


extract_all_pdfs_to_json("PDFs/")

Processing handbook.pdf...
Processing handbook_c.pdf...
Processing hbi-2025-faculty-of-arts.pdf...
Processing hbi-2025-faculty-of-business.pdf...


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

Processing hbi-2025-faculty-of-design-fine-arts.pdf...
Processing hbi-2025-faculty-of-education.pdf...
Processing hbi-2025-faculty-of-engineering.pdf...
Processing hbi-2025-faculty-of-laws.pdf...
Processing hbi-2025-faculty-of-medical-science.pdf...
Processing hbi-2025-faculty-of-pharmacy.pdf...
Processing hbi-2025-faculty-of-science.pdf...
Processing hbi-2025-final-faculty-of-languages.pdf...
Processing hbi-2025-multi-faculty-depts.pdf...

✅ Data saved to: pdf_data.json


In [10]:
import pandas as pd
import json

# Load the JSON file
with open("pdf_data.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Flatten into rows per page
records = []
for doc in raw_data:
    for page in doc["content"]:
        records.append({
            "pdf_file": doc["pdf_file"],
            "page_number": page["page_number"],
            "text": page.get("text", "").strip(),
            "tables": page.get("tables", [])
        })

df = pd.DataFrame(records)


In [11]:
df.shape

(363, 4)

In [12]:
df

Unnamed: 0,pdf_file,page_number,text,tables
0,handbook.pdf,1,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025\nHo...,[]
1,handbook.pdf,2,FRONT PAGES 2 HANDBOOK OF INFORMATION 2025\nVi...,[]
2,handbook.pdf,3,FRONT PAGES 3 HANDBOOK OF INFORMATION 2025\nPA...,[]
3,handbook.pdf,4,FRONT PAGES 4 HANDBOOK OF INFORMATION 2025\nOF...,[]
4,handbook.pdf,5,FRONT PAGES 5 HANDBOOK OF INFORMATION 2025\nTA...,[]
...,...,...,...,...
358,hbi-2025-multi-faculty-depts.pdf,20,MULTI FACULTY DEPARTMENTS 185 HANDBOOK OF INFO...,"[[[Choice Based Physics Course + Practical, Ob..."
359,hbi-2025-multi-faculty-depts.pdf,21,MULTI FACULTY DEPARTMENTS 186 HANDBOOK OF INFO...,"[[[Course, Seats, Duration, Eligibility*, Admi..."
360,hbi-2025-multi-faculty-depts.pdf,22,MULTI FACULTY DEPARTMENTS 187 HANDBOOK OF INFO...,"[[[Semester-I, Semester-II], [English-I, Engli..."
361,hbi-2025-multi-faculty-depts.pdf,23,MULTI FACULTY DEPARTMENTS 188 HANDBOOK OF INFO...,[[[Paper IV & V-\n(a) Information Technology A...


In [13]:
import neattext.functions as nfx
import re

def clean_text(x):
    if not isinstance(x, str):
        return ""
    x = nfx.remove_multiple_spaces(x)
    x = re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z0-9\-\(\)\s]', ' ', x))
    return x.strip()

In [14]:
df['cleaned_text'] = df['text'].apply(clean_text)

In [25]:
df

Unnamed: 0,pdf_file,page_number,text,tables,cleaned_text
0,handbook.pdf,1,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025\nHo...,[],FRONT PAGES 1 HANDBOOK OF INFORMATION 2025 Hon...
1,handbook.pdf,2,FRONT PAGES 2 HANDBOOK OF INFORMATION 2025\nVi...,[],FRONT PAGES 2 HANDBOOK OF INFORMATION 2025 Vic...
2,handbook.pdf,3,FRONT PAGES 3 HANDBOOK OF INFORMATION 2025\nPA...,[],FRONT PAGES 3 HANDBOOK OF INFORMATION 2025 PAN...
3,handbook.pdf,4,FRONT PAGES 4 HANDBOOK OF INFORMATION 2025\nOF...,[],FRONT PAGES 4 HANDBOOK OF INFORMATION 2025 OFF...
4,handbook.pdf,5,FRONT PAGES 5 HANDBOOK OF INFORMATION 2025\nTA...,[],FRONT PAGES 5 HANDBOOK OF INFORMATION 2025 TAB...
...,...,...,...,...,...
358,hbi-2025-multi-faculty-depts.pdf,20,MULTI FACULTY DEPARTMENTS 185 HANDBOOK OF INFO...,"[[[Choice Based Physics Course + Practical, Ob...",MULTI FACULTY DEPARTMENTS 185 HANDBOOK OF INFO...
359,hbi-2025-multi-faculty-depts.pdf,21,MULTI FACULTY DEPARTMENTS 186 HANDBOOK OF INFO...,"[[[Course, Seats, Duration, Eligibility*, Admi...",MULTI FACULTY DEPARTMENTS 186 HANDBOOK OF INFO...
360,hbi-2025-multi-faculty-depts.pdf,22,MULTI FACULTY DEPARTMENTS 187 HANDBOOK OF INFO...,"[[[Semester-I, Semester-II], [English-I, Engli...",MULTI FACULTY DEPARTMENTS 187 HANDBOOK OF INFO...
361,hbi-2025-multi-faculty-depts.pdf,23,MULTI FACULTY DEPARTMENTS 188 HANDBOOK OF INFO...,[[[Paper IV & V-\n(a) Information Technology A...,MULTI FACULTY DEPARTMENTS 188 HANDBOOK OF INFO...


In [29]:
def flatten_tables_verbose(tables):
    if not tables or not isinstance(tables, list):
        return ""
    
    flat = []
    for table in tables:
        if not table or len(table) < 2:
            continue
        headers = table[0]
        for row in table[1:]:
            row_text = ", ".join(
                f"{headers[i]}: {cell.strip() if cell else ''}" 
                for i, cell in enumerate(row) if i < len(headers)
            )
            flat.append(row_text)
    
    return "\n".join(flat)


In [30]:
df["tables_text"] = df["tables"].apply(flatten_tables_verbose)

In [33]:
df["tables_text"] = df["tables_text"].str.replace('\n', ' ', regex=False)

Unnamed: 0,pdf_file,page_number,text,tables,cleaned_text,tables_text
0,handbook.pdf,1,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025\nHo...,[],FRONT PAGES 1 HANDBOOK OF INFORMATION 2025 Hon...,
1,handbook.pdf,2,FRONT PAGES 2 HANDBOOK OF INFORMATION 2025\nVi...,[],FRONT PAGES 2 HANDBOOK OF INFORMATION 2025 Vic...,
2,handbook.pdf,3,FRONT PAGES 3 HANDBOOK OF INFORMATION 2025\nPA...,[],FRONT PAGES 3 HANDBOOK OF INFORMATION 2025 PAN...,
3,handbook.pdf,4,FRONT PAGES 4 HANDBOOK OF INFORMATION 2025\nOF...,[],FRONT PAGES 4 HANDBOOK OF INFORMATION 2025 OFF...,
4,handbook.pdf,5,FRONT PAGES 5 HANDBOOK OF INFORMATION 2025\nTA...,[],FRONT PAGES 5 HANDBOOK OF INFORMATION 2025 TAB...,
...,...,...,...,...,...,...
358,hbi-2025-multi-faculty-depts.pdf,20,MULTI FACULTY DEPARTMENTS 185 HANDBOOK OF INFO...,"[[[Choice Based Physics Course + Practical, Ob...",MULTI FACULTY DEPARTMENTS 185 HANDBOOK OF INFO...,Choice Based Physics Course + Practical: Works...
359,hbi-2025-multi-faculty-depts.pdf,21,MULTI FACULTY DEPARTMENTS 186 HANDBOOK OF INFO...,"[[[Course, Seats, Duration, Eligibility*, Admi...",MULTI FACULTY DEPARTMENTS 186 HANDBOOK OF INFO...,Course: B.A.LL.B (Hons.) 5Years Integrated Cou...
360,hbi-2025-multi-faculty-depts.pdf,22,MULTI FACULTY DEPARTMENTS 187 HANDBOOK OF INFO...,"[[[Semester-I, Semester-II], [English-I, Engli...",MULTI FACULTY DEPARTMENTS 187 HANDBOOK OF INFO...,"Semester-I: English-I, Semester-II: English-II..."
361,hbi-2025-multi-faculty-depts.pdf,23,MULTI FACULTY DEPARTMENTS 188 HANDBOOK OF INFO...,[[[Paper IV & V-\n(a) Information Technology A...,MULTI FACULTY DEPARTMENTS 188 HANDBOOK OF INFO...,Paper IV & V- (a) Information Technology Act &...


In [37]:
def chunk_text_token_based(text, tokenizer, max_tokens=512, stride=50):
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunk_tokens = tokens[start:end]
        chunk = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk)
        start += max_tokens - stride  # shift start index
    return chunks

In [38]:
df['chunks'] = df['cleaned_text'].apply(lambda x: chunk_text_token_based(x, tokenizer))

In [39]:
df['chunks_table'] = df['tables_text'].apply(lambda x: chunk_text_token_based(x, tokenizer))

In [40]:
df

Unnamed: 0,pdf_file,page_number,text,tables,cleaned_text,tables_text,chunks,chunks_table
0,handbook.pdf,1,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025\nHo...,[],FRONT PAGES 1 HANDBOOK OF INFORMATION 2025 Hon...,,[FRONT PAGES 1 HANDBOOK OF INFORMATION 2025 Ho...,[]
1,handbook.pdf,2,FRONT PAGES 2 HANDBOOK OF INFORMATION 2025\nVi...,[],FRONT PAGES 2 HANDBOOK OF INFORMATION 2025 Vic...,,[FRONT PAGES 2 HANDBOOK OF INFORMATION 2025 Vi...,[]
2,handbook.pdf,3,FRONT PAGES 3 HANDBOOK OF INFORMATION 2025\nPA...,[],FRONT PAGES 3 HANDBOOK OF INFORMATION 2025 PAN...,,[FRONT PAGES 3 HANDBOOK OF INFORMATION 2025 PA...,[]
3,handbook.pdf,4,FRONT PAGES 4 HANDBOOK OF INFORMATION 2025\nOF...,[],FRONT PAGES 4 HANDBOOK OF INFORMATION 2025 OFF...,,[FRONT PAGES 4 HANDBOOK OF INFORMATION 2025 OF...,[]
4,handbook.pdf,5,FRONT PAGES 5 HANDBOOK OF INFORMATION 2025\nTA...,[],FRONT PAGES 5 HANDBOOK OF INFORMATION 2025 TAB...,,[FRONT PAGES 5 HANDBOOK OF INFORMATION 2025 TA...,[]
...,...,...,...,...,...,...,...,...
358,hbi-2025-multi-faculty-depts.pdf,20,MULTI FACULTY DEPARTMENTS 185 HANDBOOK OF INFO...,"[[[Choice Based Physics Course + Practical, Ob...",MULTI FACULTY DEPARTMENTS 185 HANDBOOK OF INFO...,Choice Based Physics Course + Practical: Works...,[MULTI FACULTY DEPARTMENTS 185 HANDBOOK OF INF...,[Choice Based Physics Course + Practical: Work...
359,hbi-2025-multi-faculty-depts.pdf,21,MULTI FACULTY DEPARTMENTS 186 HANDBOOK OF INFO...,"[[[Course, Seats, Duration, Eligibility*, Admi...",MULTI FACULTY DEPARTMENTS 186 HANDBOOK OF INFO...,Course: B.A.LL.B (Hons.) 5Years Integrated Cou...,[MULTI FACULTY DEPARTMENTS 186 HANDBOOK OF INF...,[Course: B.A.LL.B (Hons.) 5Years Integrated Co...
360,hbi-2025-multi-faculty-depts.pdf,22,MULTI FACULTY DEPARTMENTS 187 HANDBOOK OF INFO...,"[[[Semester-I, Semester-II], [English-I, Engli...",MULTI FACULTY DEPARTMENTS 187 HANDBOOK OF INFO...,"Semester-I: English-I, Semester-II: English-II...",[MULTI FACULTY DEPARTMENTS 187 HANDBOOK OF INF...,"[Semester-I: English-I, Semester-II: English-I..."
361,hbi-2025-multi-faculty-depts.pdf,23,MULTI FACULTY DEPARTMENTS 188 HANDBOOK OF INFO...,[[[Paper IV & V-\n(a) Information Technology A...,MULTI FACULTY DEPARTMENTS 188 HANDBOOK OF INFO...,Paper IV & V- (a) Information Technology Act &...,[MULTI FACULTY DEPARTMENTS 188 HANDBOOK OF INF...,[Paper IV & V- (a) Information Technology Act ...


In [41]:
from llama_index.core import Document

documents = []

for idx, row in df.iterrows():
    # Process text chunks
    for i, chunk in enumerate(row['chunks']):
        documents.append(Document(
            text=chunk,
            metadata={
                "source": row["pdf_file"],
                "chunk_type": "text",
                "chunk_id": f"text_{i}"
            }
        ))
    
    # Process table chunks
    for j, chunk in enumerate(row['chunks_table']):
        documents.append(Document(
            text=chunk,
            metadata={
                "source": row["pdf_file"],
                "chunk_type": "table",
                "chunk_id": f"table_{j}"
            }
        ))


In [42]:
len(documents)

2424

In [43]:
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import TextNode
from chromadb import PersistentClient
from tqdm import tqdm

# Step 1: Create embedding model
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 2: Setup Chroma vector store
persist_dir = "./chroma_db2"
client = PersistentClient(path=persist_dir)
collection = client.get_or_create_collection("rag-collection")
vector_store = ChromaVectorStore(chroma_collection=collection, persist_dir=persist_dir)

# Step 3: Create storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Step 4: Convert your DataFrame to Document list (text + table chunks)
documents = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="📄 Creating documents"):
    # Text chunks
    for i, chunk in enumerate(row['chunks']):
        documents.append(Document(
            text=chunk,
            metadata={
                "source": row["pdf_file"],
                "chunk_type": "text",
                "chunk_id": f"text_{i}"
            }
        ))
    # Table chunks
    for j, chunk in enumerate(row['chunks_table']):
        documents.append(Document(
            text=chunk,
            metadata={
                "source": row["pdf_file"],
                "chunk_type": "table",
                "chunk_id": f"table_{j}"
            }
        ))

# Step 5: Parse documents to nodes
parser = SimpleNodeParser()
nodes = []
for doc in tqdm(documents, desc="📄 Parsing documents"):
    nodes.extend(parser.get_nodes_from_documents([doc]))

# Step 6: Embed each node (with progress bar)
for node in tqdm(nodes, desc="🔍 Embedding nodes"):
    node.embedding = embed_model.get_text_embedding(node.text)




📄 Creating documents: 100%|███████████████████████████████████████████████████████| 363/363 [00:00<00:00, 3213.56it/s]
📄 Parsing documents: 100%|███████████████████████████████████████████████████████| 2424/2424 [00:02<00:00, 970.85it/s]
🔍 Embedding nodes: 100%|██████████████████████████████████████████████████████████| 2424/2424 [02:40<00:00, 15.11it/s]


In [46]:
# Step 7: Build and persist the index
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model
)
index.storage_context.persist(persist_dir="./index2")