In [1]:
from pathlib import Path
import os

def find_project_root(start, marker="src"):
    """Locate the project root relative to ``start``.

    The notebook is expected to be launched from a sub-folder of the project,
    so the root is normally the parent of the working directory. Because this
    cell also changes the working directory, blindly taking ``.parent`` would
    climb one level higher on every re-run. To keep the cell idempotent,
    ``start`` itself is returned when it already contains the ``marker``
    directory (the ``src`` package imported later in this notebook).

    Args:
        start: Directory to start from (str or Path).
        marker: Name of a directory that sits directly under the project root.

    Returns:
        Path: ``start`` (resolved) if it contains ``marker``, else its parent.
    """
    start = Path(start).resolve()
    if (start / marker).is_dir():
        return start
    return start.parent


PROJECT_ROOT = find_project_root(Path.cwd())
os.chdir(PROJECT_ROOT)

print("Project root:", PROJECT_ROOT)


Project root: D:\Visual Studio practice\aviation-chatbot


In [2]:
import sys
import json

# Make the project root importable so that `src.*` modules resolve.
# Appending only when absent keeps sys.path free of duplicates on re-runs.
project_root_entry = str(PROJECT_ROOT)
if project_root_entry not in sys.path:
    sys.path.append(project_root_entry)

from src.config import CHUNKS_PATH


D:\Visual Studio practice\aviation-chatbot


In [3]:
# Read the previously generated chunk records (a JSON array) from disk.
chunks = json.loads(Path(CHUNKS_PATH).read_text(encoding="utf-8"))

# Sanity check: how many chunks there are and how the first one starts.
print("Total existing chunks:", len(chunks))
print("Sample chunk:")
first_chunk_text = chunks[0]["text"]
print(first_chunk_text[:400])


Total existing chunks: 4434
Sample chunk:
Airport Operations
About the Authors
Norman J. Ashford was Professor of Transport Planning at the Loughborough University
of Technology, England, from 1972 to 1997. He holds bachelor’s, master’s, and doctoral
degrees in civil engineering. Dr. Ashford worked as a civil engineer in Canada and taught
at the Georgia Institute of Technology and Florida State University. He served as the


In [4]:
def inspect_chunks(chunks, n=5):
    """Print a short preview of the first ``n`` chunks.

    For each chunk a separator line, its index, source document, page number
    and the first 600 characters of its text are printed.

    Args:
        chunks: List of dicts with ``"text"``, ``"document_name"`` and
            ``"page_number"`` keys.
        n: Maximum number of chunks to show. If fewer chunks exist, all of
            them are shown instead of raising ``IndexError``. Values <= 0
            print nothing.
    """
    # Slicing clamps to the available length; max() keeps a negative n from
    # being interpreted as "all but the last |n|" by the slice.
    for i, chunk in enumerate(chunks[:max(n, 0)]):
        print("=" * 80)
        print(f"Chunk {i}")
        print("Document:", chunk["document_name"])
        print("Page:", chunk["page_number"])
        print(chunk["text"][:600])

# Preview the first three existing chunks (document, page, leading text).
inspect_chunks(chunks, n=3)


Chunk 0
Document: airport_operations.pdf
Page: 2
Airport Operations
About the Authors
Norman J. Ashford was Professor of Transport Planning at the Loughborough University
of Technology, England, from 1972 to 1997. He holds bachelor’s, master’s, and doctoral
degrees in civil engineering. Dr. Ashford worked as a civil engineer in Canada and taught
at the Georgia Institute of Technology and Florida State University. He served as the
Chunk 1
Document: airport_operations.pdf
Page: 2
Director of the Transportation Institute for the State of Florida. Dr. Ashford runs an aviation
consulting company and has been active in the areas of airport planning, design,
operations, and privatization for more than 100 airports in more than 40 countries.
H. P. Martin Stanton (deceased) was an airport operations expert of international renown
Chunk 2
Document: airport_operations.pdf
Page: 2
who worked for the International Civil Airports Association in Paris, the Frankfort Airport
Authority, and the Ministr

In [5]:
# NOTE(review): this cell redefines inspect_chunks, which an earlier cell of
# this notebook already defines; consider deleting one of the two cells.
def inspect_chunks(chunks, n=5):
    """Print a short preview of the first ``n`` chunks.

    For each chunk a separator line, its index, source document, page number
    and the first 600 characters of its text are printed.

    Args:
        chunks: List of dicts with ``"text"``, ``"document_name"`` and
            ``"page_number"`` keys.
        n: Maximum number of chunks to show. If fewer chunks exist, all of
            them are shown instead of raising ``IndexError``. Values <= 0
            print nothing.
    """
    # Slicing clamps to the available length; max() keeps a negative n from
    # being interpreted as "all but the last |n|" by the slice.
    for i, chunk in enumerate(chunks[:max(n, 0)]):
        print("=" * 80)
        print(f"Chunk {i}")
        print("Document:", chunk["document_name"])
        print("Page:", chunk["page_number"])
        print(chunk["text"][:600])

# NOTE(review): repeats the three-chunk preview already shown by the previous cell.
inspect_chunks(chunks, n=3)


Chunk 0
Document: airport_operations.pdf
Page: 2
Airport Operations
About the Authors
Norman J. Ashford was Professor of Transport Planning at the Loughborough University
of Technology, England, from 1972 to 1997. He holds bachelor’s, master’s, and doctoral
degrees in civil engineering. Dr. Ashford worked as a civil engineer in Canada and taught
at the Georgia Institute of Technology and Florida State University. He served as the
Chunk 1
Document: airport_operations.pdf
Page: 2
Director of the Transportation Institute for the State of Florida. Dr. Ashford runs an aviation
consulting company and has been active in the areas of airport planning, design,
operations, and privatization for more than 100 airports in more than 40 countries.
H. P. Martin Stanton (deceased) was an airport operations expert of international renown
Chunk 2
Document: airport_operations.pdf
Page: 2
who worked for the International Civil Airports Association in Paris, the Frankfort Airport
Authority, and the Ministr

In [6]:
# Re-chunking starts from the page-level extraction rather than from the
# existing chunks. Change the file name below if your page-level dump is
# stored under a different name.
RAW_PAGES_PATH = PROJECT_ROOT / "data" / "pages.json"

pages = json.loads(RAW_PAGES_PATH.read_text(encoding="utf-8"))

# Sanity check: page count and the beginning of the first page's text.
print("Total pages loaded:", len(pages))
print("Sample page text:")
first_page_text = pages[0]["text"]
print(first_page_text[:500])


Total pages loaded: 807
Sample page text:
Airport Operations
About the Authors
Norman J. Ashford was Professor of Transport Planning at the Loughborough University
of Technology, England, from 1972 to 1997. He holds bachelor’s, master’s, and doctoral
degrees in civil engineering. Dr. Ashford worked as a civil engineer in Canada and taught
at the Georgia Institute of Technology and Florida State University. He served as the
Director of the Transportation Institute for the State of Florida. Dr. Ashford runs an aviation
consulting company 


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 400      # target size of each chunk
CHUNK_OVERLAP = 100   # overlap between neighbouring chunks for continuity
# Separators handed to the splitter, from coarsest (blank line) to finest
# (empty string).
SPLIT_SEPARATORS = ["\n\n", "\n", ".", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=SPLIT_SEPARATORS,
)


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Split every page and tag each resulting piece with the document name and
# page number of the page it came from.
new_chunks = [
    {
        "text": piece,
        "document_name": source_page["document_name"],
        "page_number": source_page["page_number"],
    }
    for source_page in pages
    for piece in text_splitter.split_text(source_page["text"])
]

print("New chunks created:", len(new_chunks))


New chunks created: 5011


In [9]:
# Show the first old chunk and the first new chunk one after the other,
# separated by a ruler line, for a quick visual comparison.
ruler = "\n" + "=" * 80 + "\n"

old_sample = chunks[0]["text"][:400]
print("OLD chunk sample:")
print(old_sample)

print(ruler)

print("NEW chunk sample:")
new_sample = new_chunks[0]["text"][:400]
print(new_sample)


OLD chunk sample:
Airport Operations
About the Authors
Norman J. Ashford was Professor of Transport Planning at the Loughborough University
of Technology, England, from 1972 to 1997. He holds bachelor’s, master’s, and doctoral
degrees in civil engineering. Dr. Ashford worked as a civil engineer in Canada and taught
at the Georgia Institute of Technology and Florida State University. He served as the


NEW chunk sample:
Airport Operations
About the Authors
Norman J. Ashford was Professor of Transport Planning at the Loughborough University
of Technology, England, from 1972 to 1997. He holds bachelor’s, master’s, and doctoral
degrees in civil engineering. Dr. Ashford worked as a civil engineer in Canada and taught
at the Georgia Institute of Technology and Florida State University. He served as the


In [11]:
NEW_CHUNKS_PATH = CHUNKS_PATH  # overwrite intentionally

# Opening the target directly with "w" truncates it immediately, so a failure
# during json.dump would leave the chunks file empty or half-written. Write to
# a sibling temp file first and swap it into place only after the dump has
# completed successfully.
tmp_chunks_path = Path(f"{NEW_CHUNKS_PATH}.tmp")
try:
    with open(tmp_chunks_path, "w", encoding="utf-8") as f:
        json.dump(new_chunks, f, indent=2, ensure_ascii=False)
    os.replace(tmp_chunks_path, NEW_CHUNKS_PATH)
except Exception:
    # Do not leave a stray partial file behind; the original stays untouched.
    if tmp_chunks_path.exists():
        tmp_chunks_path.unlink()
    raise

print("Updated chunks saved to:", NEW_CHUNKS_PATH)


Updated chunks saved to: D:\Visual Studio practice\aviation-chatbot\data\chunks.json


In [12]:
from collections import Counter

# Tally how many of the new chunks come from each source document.
document_names = [record["document_name"] for record in new_chunks]
doc_counts = Counter(document_names)

print("Chunk distribution by document:")
doc_counts


Chunk distribution by document:


Counter({'airport_operations.pdf': 3825, 'scada_manual.pdf': 1186})