In [2]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [3]:
class Chunk:
    def __init__(self,header,text,images,page_start,page_end):
        self.text = text
        self.images = images
        self.header = header
        self.valid = True
        self.page_start = page_start
        self.page_end = page_end

    def is_valid(self):
        if self.text.count(".") > 40:
            self.valid = False

        

        return self.valid

    def to_json(self):
        return {
            "images": self.images,
            "header": self.header,
            "text": self.text,
            
            "page_start": self.page_start,
            "page_end": self.page_end,
            "validity": self.valid
            
        }
    
    def get_text(self):
        return self.text
    
    def get_images(self):
        return self.images
    
    def get_header(self):
        return self.header
    



In [4]:
import fitz
import os

#Now to read the pdfs



path = "..//corpus//"
image_folder = "images/"
os.makedirs(image_folder,exist_ok=True)
buffer = ""
data_frames = []

for pdf_file in os.listdir(path):
    if not pdf_file.endswith(".pdf"):
        continue
    doc = fitz.open(path + pdf_file)
    file_name = os.path.splitext(pdf_file)[0]

    header = ""
    text_buffer = ""
    images = []
    last_images = [] #Found buffering images leads to them being on the correct chunk more often
    at_header = True
    page_start = 0
    
    for page_num, page in enumerate(doc):
        for img_index, img in enumerate(page.get_images(full=True), start=1):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]




            if base_image["width"] > 250 and base_image["height"] > 250:
                image_filename = f"{file_name}_p{page_num+1}_{img_index}.{image_ext}"
                image_path = os.path.join(image_folder, image_filename)
                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                # Store the image path in the current section's image list
                images.append(image_path)
        
        
        blocks = page.get_text("dict")["blocks"]
        

        for block in blocks:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    size = span["size"]  # font size
                    if not text:
                        continue
                    
                    if size >= 15:  # heuristic: large font = header
                        if at_header:
                            header += text
                        if not at_header:
                            if last_images == []:
                                last_images.append(images)
                            data_frame = {
                                "Text": text_buffer,
                                "metadata": {
                                    "Header": header,
                                    "Images": last_images,
                                    "page_start": page_start,
                                    "page_end": page_num
                                }

                            }
                            data_frames.append(data_frame)
                            
                            header = text
                            text_buffer = ""
                            last_images = images
                            images = []
                            at_header = True
                            page_start = page_num
                        

                    else:
                        text_buffer += text
                        at_header = False

#Dont forget the last one
if text_buffer:
    data_frame = {
        "Text": text_buffer,
        "metadata": {
        "Header": header,
        "Images": images,
        "page_start": page_start,
        "page_end": page_num
        }
    }  
    data_frames.append(data_frame)


   

        
        

    

In [5]:
#Convert Headers to Chunks for validation
#print(data_frames)
chunks = [Chunk(d["metadata"]["Header"], d["Text"], d["metadata"]["Images"],d["metadata"]["page_start"],d["metadata"]["page_end"]) for d in data_frames]

for chunk in chunks:
    print(chunk.to_json())
    




{'images': [[[[[['images/router-setup_p8_1.png']]]]]], 'header': '', 'text': '© 2025 TP-Link     1910013875    REV2.0.0', 'page_start': 0, 'page_end': 0, 'validity': True}
{'images': [[[[['images/router-setup_p8_1.png']]]]], 'header': 'User Guide', 'text': 'AX1800 Dual Band Wi-Fi 6 Router', 'page_start': 0, 'page_end': 1, 'validity': True}
{'images': [[[['images/router-setup_p8_1.png']]]], 'header': 'Contents', 'text': 'About This Guide . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1Chapter 1.\t Get to Know About Your Router . . . . . . . . . . . . . . . . . . . . . . . . . . . 31. 1.Product Overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .41. 2.Appearance . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .41. 2. 1.\tTop Panel . . . . . . . . . . . . . . . . . . . . . . 

In [6]:
for chunk in chunks:
    print(chunk.to_json())

{'images': [[[[[['images/router-setup_p8_1.png']]]]]], 'header': '', 'text': '© 2025 TP-Link     1910013875    REV2.0.0', 'page_start': 0, 'page_end': 0, 'validity': True}
{'images': [[[[['images/router-setup_p8_1.png']]]]], 'header': 'User Guide', 'text': 'AX1800 Dual Band Wi-Fi 6 Router', 'page_start': 0, 'page_end': 1, 'validity': True}
{'images': [[[['images/router-setup_p8_1.png']]]], 'header': 'Contents', 'text': 'About This Guide . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1Chapter 1.\t Get to Know About Your Router . . . . . . . . . . . . . . . . . . . . . . . . . . . 31. 1.Product Overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .41. 2.Appearance . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .41. 2. 1.\tTop Panel . . . . . . . . . . . . . . . . . . . . . . 

In [10]:
#Now to embed the data
model = SentenceTransformer("all-MiniLM-L6-v2")

text = [(f"Section: {t.get_header()} Text: {t.get_text()}") for t in chunks]


doc_embeddings = model.encode(text, convert_to_tensor=True)

#query = "what does the TP-Link AX router look like?"
query = "how should i position my router?"
query_embedding = model.encode(query,convert_to_tensor=True)

similarities = util.cos_sim(query_embedding, doc_embeddings)[0]


best_idx = int(similarities.argmax())
print(similarities.argmax())
print(best_idx)
print("🔍 Most relevant:", chunks[best_idx].get_text())
print(f"relavent images {chunks[best_idx].get_images()}" )

tensor(8, device='cuda:0')
8
🔍 Most relevant: •	 The product should not be located in a place where it will be exposed to moisture orexcessive heat.•	 Place the router in a location where it can be connected to multiple devices as well asto a power source.•	 Make sure the cables and power cord are safely placed out of the way so they do notcreate a tripping hazard.•	 The router can be placed on a shelf or desktop.•	 Keep the router away from devices with strong electromagnetic interference, such asBluetooth devices, cordless phones and microwaves.•	 Generally, the router is placed on a horizontal surface, such as on a shelf or desktop.The device also can be mounted on the wall as shown in the following figure.Note:The diameter of the screw head, 4.5mm < D < 6.6mm, and the distance of two screws is 211mm. The screw that projectfrom the wall need around 6.48 mm based, and the length of the screw need to be at least 9.5mm to withstand the weightof the product.
relavent images [[[['images/