In [None]:
def processed_chunk(chunks):
    # Split by newline and store in list
    lines = chunk.split('\n')
    search_texts = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.endswith('-') and i + 1 < len(lines):
            # Remove hyphen and append next line directly
            merged = line[:-1] + lines[i + 1].lstrip()
            search_texts.append(merged)
            i += 2  # Skip next line since we already used it
        else:
            search_texts.append(line)
            i += 1
    return search_texts

In [54]:
def process_page_chunks(data, real_page_number):

    page_label = f"page_{real_page_number}"
    
    # Find the corresponding key (e.g., "0") for this page label
    matching_key = None
    for k, v in data["Pages"].items():
        if v == page_label:
            matching_key = k
            break

    if matching_key is None:
        raise ValueError(f"Page {real_page_number} ('{page_label}') not found in data['Pages'].")

    chunks = data["Chunks"].get(matching_key, [])
    all_lines = []
    for chunk in chunks:
            raw_lines = chunk.split('\n')
            i = 0
            while i < len(raw_lines):
                line = raw_lines[i]
                if line.endswith('-') and i + 1 < len(raw_lines):
                    merged = line[:-1] + raw_lines[i + 1].lstrip()
                    all_lines.append(merged)
                    i += 2
                else:
                    all_lines.append(line)
                    i += 1

    return all_lines



chunks = {'Pages': {'0': 'page_8', '1': 'page_9'}, 'Chunks': {'0': ['feature rates from 0.1 to 0.9. We can observe that\nTopoRAG consistently outperforms other strategies in both node\nclassification and link prediction across all rates of missing features.\nThis underscores the benefits of incorporating additional context\nin handling missing feature issues on graphs.\n\n# 6.4 Feature Imputation with TopoRAG\nMany machine learning models assume a fully observed feature\nmatrix. However, in practice, each feature is only observed for a\nsubset of nodes due to constraints like privacy concerns or limited\nresources for data annotation [67]. In all these scenarios, the missing\nfeature issues could catastrophically compromise the capability of\nmachine learning models [47], which motivates many previous\nworks developing solutions to handling missing feature issue [68].\n\nSince our proposed TopoRAG can naturally generate node fea-\ntures in graph-based datasets, in this section, we evaluate its effec-\ntiveness in handling missing features by comparing its performance\n\n#', 'al\nmethods by equipping it with topology awareness. Different from\nKG-based RAG where topology information is incorporated by re-\ntrieving triples from subgraphs around entities mentioned in the\nquestion [61], we explicitly consider proximity and role-based topo-\nlogical relations in guiding the retrieval, the related works of which\nare reviewed next.\n\n'], '1': ['ce domains. This\nfurther justifies why TopoRAG achieves almost consistently higher\nperformance than other baselines on all these datasets.\n\n', '-RAG in\naddressing missing feature issues in graph machine learning tasks.\n\nRecognizing the importance of not only considering the quantity\nbut also the structure of input knowledge in text generation [62],\nfuture work will focus on optimizing input formats by leverag-\ning topological signals for question-answering and text-generation\ntasks. Moreover, we plan to assess the robustness of the TopoRAG\nframework by exploring the potential of attacking/defending over\ngraphs to compromise/strengthen the capability of LLMs in com-\npleting downstream tasks.\n\nFigure 8: (a) The textual similarity of sent emails by pairs of\nemployees grouped based on their job titles; (b) The distribu-\ntion of passage length for each dataset.\n\nFigure 9: Correlation analysis between the textual similarity\nand proximity-based topological similarity over six datasets.\nThe correlation shown beside the dataset name is positive\nacross different datasets from different domains.\n\n# Acknowledgements\nThis research is suppo']}}
search_texts = process_page_chunks(chunks,9)
print(search_texts)

['ce domains. This', 'further justifies why TopoRAG achieves almost consistently higher', 'performance than other baselines on all these datasets.', '', '', '-RAG in', 'addressing missing feature issues in graph machine learning tasks.', '', 'Recognizing the importance of not only considering the quantity', 'but also the structure of input knowledge in text generation [62],', 'future work will focus on optimizing input formats by leveraging topological signals for question-answering and text-generation', 'tasks. Moreover, we plan to assess the robustness of the TopoRAG', 'framework by exploring the potential of attacking/defending over', 'graphs to compromise/strengthen the capability of LLMs in completing downstream tasks.', '', 'Figure 8: (a) The textual similarity of sent emails by pairs of', 'employees grouped based on their job titles; (b) The distribution of passage length for each dataset.', '', 'Figure 9: Correlation analysis between the textual similarity', 'and proximity-base

In [47]:
import fitz  # PyMuPDF
from PIL import Image
import io

def create_highlighted_page_image(pdf_path, page_number, texts_to_highlight, 
                                  dpi=300, highlight_color=(1, 0.8, 0)):
    # Open the PDF document
    doc = fitz.open(pdf_path)
    
    # Make sure page number is valid
    if page_number < 0 or page_number >= len(doc):
        raise ValueError(f"Page number {page_number} is out of range (document has {len(doc)} pages)")
    
    page = doc[page_number]
    
    if isinstance(texts_to_highlight, str):
        texts_to_highlight = [texts_to_highlight]
    
    highlight_count = 0
    
    for text in texts_to_highlight:
        text_instances = page.search_for(text)
        if text_instances:
            for inst in text_instances:
                highlight = page.add_highlight_annot(inst)
                highlight.set_colors(stroke=highlight_color)
                highlight.update()
                highlight_count += 1
    
    # Generate a pixmap (image) of the page
    zoom = dpi / 72  # Standard PDF is 72 dpi
    matrix = fitz.Matrix(zoom, zoom)
    pixmap = page.get_pixmap(matrix=matrix, alpha=False)
    
    # Convert pixmap to PIL Image
    img_data = pixmap.tobytes("png")
    img = Image.open(io.BytesIO(img_data))
    
    return img

# Example usage
if __name__ == "__main__":
    pdf_file = "Topology-aware-RAG.pdf"
    page_num = 8  # Page number (0-based indexing)
    
    output_file = "highlighted_page.png"
    
    image_path = create_highlighted_page_image(
        pdf_file, 
        page_num, 
        search_texts,
        output_file,
        highlight_color=(1, 1, 0)  # Yellow highlight
    )
    
    print(f"Created image with highlighted text: {image_path}")


TypeError: in method 'JM_search_stext_page', argument 2 of type 'char const *'