In [1]:
import pandas as pd
import os

# Folder where your 18 CSV files are stored
folder_path = 'C:/Users/Hari/RAG-Based Chatbot PU/Playwright/pu_scraped'

# List all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Load and concatenate all CSVs
df_list = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

# Check result
print(f"Total rows: {df.shape[0]}")
df.head()


Total rows: 3427


Unnamed: 0,URL,Text
0,https://puchd.ac.in/,Official Website of Panjab University Chandiga...
1,https://pumail.puchd.ac.in/,WebMail Panjab University :: Welcome to WebMai...
2,https://puchd.ac.in/contactus.php,University Contacts - Panjab University Chandi...
3,https://puchd.ac.in/screen-reader.php,Screen Reader : Official Website of Panjab Uni...
4,https://puchd.ac.in/,Official Website of Panjab University Chandiga...


In [2]:
df.shape

(3427, 2)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3427 entries, 0 to 3426
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     3427 non-null   object
 1   Text    3427 non-null   object
dtypes: object(2)
memory usage: 53.7+ KB


In [4]:
df.describe()

Unnamed: 0,URL,Text
count,3427,3427
unique,530,392
top,https://puchd.ac.in,Official Website of Panjab University Chandiga...
freq,176,207


### There are no null values.

### Data Cleaning

In [5]:
import neattext.functions as nfx
import re

df['Cleaned_text'] = df['Text'].apply(lambda x: nfx.remove_multiple_spaces(x))
df['Cleaned_text'] = df['Cleaned_text'].apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z0-9-(-)\s]', ' ', x)).strip())
df['Cleaned_text'] = df['Cleaned_text'].apply(lambda x: x.strip())

df.head()

Unnamed: 0,URL,Text,Cleaned_text
0,https://puchd.ac.in/,Official Website of Panjab University Chandiga...,Official Website of Panjab University Chandiga...
1,https://pumail.puchd.ac.in/,WebMail Panjab University :: Welcome to WebMai...,WebMail Panjab University Welcome to WebMail P...
2,https://puchd.ac.in/contactus.php,University Contacts - Panjab University Chandi...,University Contacts - Panjab University Chandi...
3,https://puchd.ac.in/screen-reader.php,Screen Reader : Official Website of Panjab Uni...,Screen Reader Official Website of Panjab Unive...
4,https://puchd.ac.in/,Official Website of Panjab University Chandiga...,Official Website of Panjab University Chandiga...


In [6]:
df['Cleaned_text'] = df['Cleaned_text'].apply(lambda x: x.lower())

In [7]:
df.head()

Unnamed: 0,URL,Text,Cleaned_text
0,https://puchd.ac.in/,Official Website of Panjab University Chandiga...,official website of panjab university chandiga...
1,https://pumail.puchd.ac.in/,WebMail Panjab University :: Welcome to WebMai...,webmail panjab university welcome to webmail p...
2,https://puchd.ac.in/contactus.php,University Contacts - Panjab University Chandi...,university contacts - panjab university chandi...
3,https://puchd.ac.in/screen-reader.php,Screen Reader : Official Website of Panjab Uni...,screen reader official website of panjab unive...
4,https://puchd.ac.in/,Official Website of Panjab University Chandiga...,official website of panjab university chandiga...


In [8]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/long-t5-tglobal-base")

In [9]:
max_tokens = 512
stride = 50  # Optional overlap between chunks

def chunk_text_token_based(text, tokenizer, max_tokens=512, stride=50):
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
    chunks = []
    for i in range(0, len(tokens), max_tokens - stride):
        chunk_tokens = tokens[i:i + max_tokens]
        chunk = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk)
    return chunks

In [10]:
df['Chunks'] = df['Cleaned_text'].apply(lambda x: chunk_text_token_based(x, tokenizer))

In [11]:
df.head()

Unnamed: 0,URL,Text,Cleaned_text,Chunks
0,https://puchd.ac.in/,Official Website of Panjab University Chandiga...,official website of panjab university chandiga...,[official website of panjab university chandig...
1,https://pumail.puchd.ac.in/,WebMail Panjab University :: Welcome to WebMai...,webmail panjab university welcome to webmail p...,[webmail panjab university welcome to webmail ...
2,https://puchd.ac.in/contactus.php,University Contacts - Panjab University Chandi...,university contacts - panjab university chandi...,[university contacts - panjab university chand...
3,https://puchd.ac.in/screen-reader.php,Screen Reader : Official Website of Panjab Uni...,screen reader official website of panjab unive...,[screen reader official website of panjab univ...
4,https://puchd.ac.in/,Official Website of Panjab University Chandiga...,official website of panjab university chandiga...,[official website of panjab university chandig...


### Flatten the Chunks for Indexing

In [16]:
from llama_index.core import Document

documents = []
for idx, row in df.iterrows():
    for i, chunk in enumerate(row['Chunks']):
        documents.append(Document(text=chunk, metadata={"source": f"row_{idx}", "chunk_id": i}))

In [17]:
len(documents)

41974

### Build and Persist the Index

In [18]:
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import TextNode
from chromadb import PersistentClient
from tqdm import tqdm

# Step 1: Create embed model
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 2: Prepare Chroma vector store
persist_dir = "./chroma_db"
client = PersistentClient(path=persist_dir)
collection = client.get_or_create_collection("rag-collection")
vector_store = ChromaVectorStore(chroma_collection=collection, persist_dir=persist_dir)

# Step 3: Create storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Step 4: Parse documents to nodes
parser = SimpleNodeParser()
nodes = []
for doc in tqdm(documents, desc="📄 Parsing documents"):
    nodes.extend(parser.get_nodes_from_documents([doc]))

# Step 5: Embed documents (with progress)
for node in tqdm(nodes, desc="🔍 Embedding nodes"):
    node.embedding = embed_model.get_text_embedding(node.text)

📄 Parsing documents: 100%|█████████████████████████████████████████████████████| 41974/41974 [01:13<00:00, 573.82it/s]
🔍 Embedding nodes: 100%|██████████████████████████████████████████████████████| 41974/41974 [1:21:04<00:00,  8.63it/s]


In [19]:
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model
)
index.storage_context.persist(persist_dir="./index")
