In [9]:
import pdfplumber
import os
import pandas as pd
from tqdm import tqdm

def extract_text_and_tables(pdf_path):
    """Return the text of every page of a PDF, followed by that page's tables.

    For each page, the result of ``page.extract_text()`` is emitted first,
    then every table returned by ``page.extract_tables()`` is appended with
    one line per row and cells separated by tabs (falsy cells such as
    ``None`` become empty strings).

    Args:
        pdf_path: Path of the PDF file to open with ``pdfplumber``.

    Returns:
        One string with all pieces joined by newlines, with leading and
        trailing whitespace stripped.
    """
    # Collect the pieces and join once at the end; repeated ``+=`` on a
    # growing string re-copies the accumulated text on every append.
    pieces = []
    with pdfplumber.open(pdf_path) as pdf:
        print(f"\n📄 Extracting from: {os.path.basename(pdf_path)}")
        for page in tqdm(pdf.pages, desc="Processing pages", unit="page"):
            # Plain page text first; ``or ""`` guards against a falsy result.
            pieces.append(page.extract_text() or "")

            # Then every table row as one tab-separated line.
            # NOTE(review): table cells are presumably also part of the
            # extract_text() output, so table content may appear twice in the
            # result — confirm this duplication is intended.
            for table in page.extract_tables():
                for row in table:
                    pieces.append("\t".join(cell if cell else "" for cell in row))
    return "\n".join(pieces).strip()

def extract_all_pdfs_to_csv(pdf_folder, output_csv="pdf_data.csv"):
    """Extract every PDF in a folder and write one CSV row per file.

    Args:
        pdf_folder: Directory to scan (non-recursively) for files whose name
            ends in ``.pdf``; the match is case-insensitive.
        output_csv: Destination CSV path. Written as UTF-8, without the index,
            with the columns ``filename`` and ``text``.
    """
    # os.listdir() returns entries in arbitrary order; sort so that the row
    # order is reproducible across runs and machines.
    pdf_files = sorted(
        name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")
    )

    data = []
    for name in pdf_files:
        pdf_path = os.path.join(pdf_folder, name)
        data.append({"filename": name, "text": extract_text_and_tables(pdf_path)})

    # Pin the columns so a folder without PDFs still produces a CSV with a
    # header row; a column-less frame writes a file pd.read_csv cannot parse.
    df = pd.DataFrame(data, columns=["filename", "text"])
    df.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"\n✅ Extraction complete. Data saved to: {output_csv}")

# Run the extraction over the local "PDFs/" folder and store the result in the
# CSV that the following cells load.
extract_all_pdfs_to_csv(pdf_folder="PDFs/", output_csv="rag_pdf_data.csv")



📄 Extracting from: handbook.pdf


Processing pages: 100%|██████████████████████████████████████████████████████████████| 45/45 [00:31<00:00,  1.42page/s]



📄 Extracting from: handbook_c.pdf


Processing pages: 100%|████████████████████████████████████████████████████████████| 129/129 [03:30<00:00,  1.63s/page]



📄 Extracting from: hbi-2025-faculty-of-arts.pdf


Processing pages: 100%|██████████████████████████████████████████████████████████████| 33/33 [00:36<00:00,  1.10s/page]



📄 Extracting from: hbi-2025-faculty-of-business.pdf


Processing pages:  38%|████████████████████████                                       | 8/21 [00:08<00:17,  1.34s/page]Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set


📄 Extracting from: hbi-2025-faculty-of-design-fine-arts.pdf


Processing pages: 100%|████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.05s/page]



📄 Extracting from: hbi-2025-faculty-of-education.pdf


Processing pages: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:08<00:00,  1.11page/s]



📄 Extracting from: hbi-2025-faculty-of-engineering.pdf


Processing pages: 100%|██████████████████████████████████████████████████████████████| 15/15 [00:16<00:00,  1.10s/page]



📄 Extracting from: hbi-2025-faculty-of-laws.pdf


Processing pages: 100%|████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.30s/page]



📄 Extracting from: hbi-2025-faculty-of-medical-science.pdf


Processing pages: 100%|████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.07page/s]



📄 Extracting from: hbi-2025-faculty-of-pharmacy.pdf


Processing pages: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.29page/s]



📄 Extracting from: hbi-2025-faculty-of-science.pdf


Processing pages: 100%|██████████████████████████████████████████████████████████████| 49/49 [00:54<00:00,  1.11s/page]



📄 Extracting from: hbi-2025-final-faculty-of-languages.pdf


Processing pages: 100%|██████████████████████████████████████████████████████████████| 19/19 [00:20<00:00,  1.10s/page]



📄 Extracting from: hbi-2025-multi-faculty-depts.pdf


Processing pages: 100%|██████████████████████████████████████████████████████████████| 24/24 [00:19<00:00,  1.21page/s]


✅ Extraction complete. Data saved to: rag_pdf_data.csv





In [11]:
import pandas as pd

# keep_default_na=False: a PDF with no extractable text is stored as an empty
# field, which the default NA parsing would read back as NaN (a float); the
# string-cleaning cell further down would then fail on that row.
df = pd.read_csv('rag_pdf_data.csv', keep_default_na=False)

In [15]:
# Display the frame loaded from rag_pdf_data.csv.
df

Unnamed: 0,filename,text
0,handbook.pdf,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025\nHo...
1,handbook_c.pdf,PART-C 1 HANDBOOK OF INFORMATION 2025\nFEE STR...
2,hbi-2025-faculty-of-arts.pdf,FACULTY OF ARTS 1 HANDBOOK OF INFORMATION 2025...
3,hbi-2025-faculty-of-business.pdf,FACULTY OF BUSINESS MANAGEMENT AND COMMERCE 34...
4,hbi-2025-faculty-of-design-fine-arts.pdf,FACULTY OF DESIGN AND FINE ARTS 55 HANDBOOK OF...
5,hbi-2025-faculty-of-education.pdf,FACULTY OF EDUCATION 59 HANDBOOK OF INFORMATIO...
6,hbi-2025-faculty-of-engineering.pdf,FACULTY OF ENGINEERING AND TECHNOLOGY 68 HANDB...
7,hbi-2025-faculty-of-laws.pdf,FACULTY OF LAWS 102 HANDBOOK OF INFORMATION 20...
8,hbi-2025-faculty-of-medical-science.pdf,FACULTY OF MEDICAL SCIENCES 109 HANDBOOK OF IN...
9,hbi-2025-faculty-of-pharmacy.pdf,FACULTY OF PHARMACEUTICAL SCIENCES 112 HANDBOO...


In [16]:
import neattext.functions as nfx
import re

# Everything except ASCII letters, digits, '-', '(', ')' and whitespace is
# replaced by a space. The hyphen sits last in the class so it is read as a
# literal character rather than as a range operator.
_DISALLOWED_CHARS = r'[^A-Za-z0-9()\s-]'

# Build the cleaned column from the raw text; the raw `text` column is kept.
df['cleaned_text'] = (
    df['text']
    .fillna('')                                        # rows with no text may load as NaN
    .astype(str)
    .map(nfx.remove_multiple_spaces)
    .str.replace(_DISALLOWED_CHARS, ' ', regex=True)   # drop punctuation/symbols
    .str.replace(r'\s+', ' ', regex=True)              # newlines/tabs/space runs -> one space
    .str.strip()
)

df.head()

Unnamed: 0,filename,text,cleaned_text
0,handbook.pdf,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025\nHo...,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025 Hon...
1,handbook_c.pdf,PART-C 1 HANDBOOK OF INFORMATION 2025\nFEE STR...,PART-C 1 HANDBOOK OF INFORMATION 2025 FEE STRU...
2,hbi-2025-faculty-of-arts.pdf,FACULTY OF ARTS 1 HANDBOOK OF INFORMATION 2025...,FACULTY OF ARTS 1 HANDBOOK OF INFORMATION 2025...
3,hbi-2025-faculty-of-business.pdf,FACULTY OF BUSINESS MANAGEMENT AND COMMERCE 34...,FACULTY OF BUSINESS MANAGEMENT AND COMMERCE 34...
4,hbi-2025-faculty-of-design-fine-arts.pdf,FACULTY OF DESIGN AND FINE ARTS 55 HANDBOOK OF...,FACULTY OF DESIGN AND FINE ARTS 55 HANDBOOK OF...


In [17]:
# Lower-case the cleaned text using the vectorised string accessor.
df['cleaned_text'] = df['cleaned_text'].str.lower()

In [18]:
# Display the frame after cleaned_text has been lower-cased.
df

Unnamed: 0,filename,text,cleaned_text
0,handbook.pdf,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025\nHo...,front pages 1 handbook of information 2025 hon...
1,handbook_c.pdf,PART-C 1 HANDBOOK OF INFORMATION 2025\nFEE STR...,part-c 1 handbook of information 2025 fee stru...
2,hbi-2025-faculty-of-arts.pdf,FACULTY OF ARTS 1 HANDBOOK OF INFORMATION 2025...,faculty of arts 1 handbook of information 2025...
3,hbi-2025-faculty-of-business.pdf,FACULTY OF BUSINESS MANAGEMENT AND COMMERCE 34...,faculty of business management and commerce 34...
4,hbi-2025-faculty-of-design-fine-arts.pdf,FACULTY OF DESIGN AND FINE ARTS 55 HANDBOOK OF...,faculty of design and fine arts 55 handbook of...
5,hbi-2025-faculty-of-education.pdf,FACULTY OF EDUCATION 59 HANDBOOK OF INFORMATIO...,faculty of education 59 handbook of informatio...
6,hbi-2025-faculty-of-engineering.pdf,FACULTY OF ENGINEERING AND TECHNOLOGY 68 HANDB...,faculty of engineering and technology 68 handb...
7,hbi-2025-faculty-of-laws.pdf,FACULTY OF LAWS 102 HANDBOOK OF INFORMATION 20...,faculty of laws 102 handbook of information 20...
8,hbi-2025-faculty-of-medical-science.pdf,FACULTY OF MEDICAL SCIENCES 109 HANDBOOK OF IN...,faculty of medical sciences 109 handbook of in...
9,hbi-2025-faculty-of-pharmacy.pdf,FACULTY OF PHARMACEUTICAL SCIENCES 112 HANDBOO...,faculty of pharmaceutical sciences 112 handboo...


In [19]:
from transformers import AutoTokenizer
# Load the tokenizer that is passed to chunk_text_token_based to measure and
# cut chunks by token count.
# NOTE(review): the chunks are later embedded with
# "sentence-transformers/all-MiniLM-L6-v2", which uses a different tokenizer
# and presumably a shorter maximum input length than the 512-token chunks
# built with this one (its model card mentions 256 word pieces) — confirm
# whether chunk tails get truncated at embedding time, and consider sizing
# chunks with the embedding model's own tokenizer.
tokenizer = AutoTokenizer.from_pretrained("google/long-t5-tglobal-base")

In [20]:
def chunk_text_token_based(text, tokenizer, max_tokens=512, stride=50):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    Args:
        text: The string to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            truncation=...)`` and ``decode(tokens, skip_special_tokens=...)``.
        max_tokens: Maximum number of tokens per chunk.
        stride: Number of tokens shared by two consecutive chunks (the
            overlap). Must be smaller than ``max_tokens``.

    Returns:
        The decoded chunk strings in document order; an empty list when
        ``text`` encodes to no tokens.

    Raises:
        ValueError: If ``stride >= max_tokens``; the window could then never
            advance.
    """
    step = max_tokens - stride
    if step <= 0:
        raise ValueError(
            f"stride ({stride}) must be smaller than max_tokens ({max_tokens})"
        )

    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunks.append(tokenizer.decode(tokens[start:end], skip_special_tokens=True))
        # Stop once the window has reached the end of the text; advancing
        # further would only emit a chunk made of overlap tokens that the
        # chunk just produced already contains.
        if end >= len(tokens):
            break
        start += step
    return chunks

In [21]:
# Chunk every cleaned document with the function's default window settings;
# each cell of `chunks` holds the list of chunk strings for that row.
df['chunks'] = df['cleaned_text'].map(lambda text: chunk_text_token_based(text, tokenizer))

In [23]:
# Preview the first rows, now including the `chunks` column.
df.head()

Unnamed: 0,filename,text,cleaned_text,chunks
0,handbook.pdf,FRONT PAGES 1 HANDBOOK OF INFORMATION 2025\nHo...,front pages 1 handbook of information 2025 hon...,[front pages 1 handbook of information 2025 ho...
1,handbook_c.pdf,PART-C 1 HANDBOOK OF INFORMATION 2025\nFEE STR...,part-c 1 handbook of information 2025 fee stru...,[part-c 1 handbook of information 2025 fee str...
2,hbi-2025-faculty-of-arts.pdf,FACULTY OF ARTS 1 HANDBOOK OF INFORMATION 2025...,faculty of arts 1 handbook of information 2025...,[faculty of arts 1 handbook of information 202...
3,hbi-2025-faculty-of-business.pdf,FACULTY OF BUSINESS MANAGEMENT AND COMMERCE 34...,faculty of business management and commerce 34...,[faculty of business management and commerce 3...
4,hbi-2025-faculty-of-design-fine-arts.pdf,FACULTY OF DESIGN AND FINE ARTS 55 HANDBOOK OF...,faculty of design and fine arts 55 handbook of...,[faculty of design and fine arts 55 handbook o...


### Flatten chunks for indexing

In [24]:
from llama_index.core import Document

# One Document per chunk. Besides the row-based "source" tag and the chunk's
# position within its row, record the PDF filename so a retrieved chunk can be
# traced back to the file it came from.
documents = [
    Document(
        text=chunk,
        metadata={
            "source": f"row_{idx}",
            "filename": row["filename"],
            "chunk_id": i,
        },
    )
    for idx, row in df.iterrows()
    for i, chunk in enumerate(row['chunks'])
]

In [25]:
# Number of chunk-level Documents built in the previous cell.
len(documents)

1049

### Building the Vector Database

In [26]:
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import TextNode
from chromadb import PersistentClient
from tqdm import tqdm

# Step 1: Create embed model
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 2: Prepare Chroma vector store
# NOTE(review): get_or_create_collection reuses an existing collection, so
# re-running this notebook presumably adds the same chunks a second time —
# confirm, and clear the collection first if duplicates matter.
persist_dir = "./chroma_db1"
client = PersistentClient(path=persist_dir)
collection = client.get_or_create_collection("rag-collection")
vector_store = ChromaVectorStore(chroma_collection=collection, persist_dir=persist_dir)

# Step 3: Create storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Step 4: Parse documents to nodes (single call; the parser shows its own progress bar)
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents, show_progress=True)

# Step 5: Embed all node texts through the batch API instead of one call per
# node, then attach each vector to its node.
embeddings = embed_model.get_text_embedding_batch(
    [node.text for node in nodes],
    show_progress=True,
)
for node, embedding in zip(nodes, embeddings):
    node.embedding = embedding




📄 Parsing documents: 100%|███████████████████████████████████████████████████████| 1049/1049 [00:01<00:00, 557.51it/s]
🔍 Embedding nodes: 100%|██████████████████████████████████████████████████████████| 1049/1049 [02:07<00:00,  8.26it/s]


In [27]:
# Build the index from the nodes prepared above. `storage_context` wraps the
# Chroma vector store created earlier, and `embed_model` is the same model used
# to embed the nodes.
# NOTE(review): the nodes already carry `.embedding` from the previous cell, so
# the index presumably does not embed them again — confirm.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model
)
# Persist the index's storage context under ./index.
index.storage_context.persist(persist_dir="./index")