In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from rich.console import Console
from rich.markdown import Markdown

from docsearch.doc_search import DocSearch

console = Console()

  from .autonotebook import tqdm as notebook_tqdm


## Path Setup

In [2]:
ROOT_DIR = Path(os.path.abspath("."))
DATA_DIR = ROOT_DIR / "data"

## DocSearch Directory
DOCSEARCH_DIR = DATA_DIR / "PhiDocSearch"

## Raw Phi Docs Directory. Put your raw phi pdfs here.
RAW_PHI_DOCS_DIR = DATA_DIR / "phi-docs"

print(f"Root Directory: {ROOT_DIR}")
print(f"Data Directory: {DATA_DIR}")
print(f"DocSearch Directory: {DOCSEARCH_DIR}")
print(f"Raw Phi Docs Directory: {RAW_PHI_DOCS_DIR}")



Root Directory: c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox
Data Directory: c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data
DocSearch Directory: c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch
Raw Phi Docs Directory: c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\phi-docs


## Initialize DocSearch

In [3]:
from docsearch.doc_search import DocSearch

docsearch = DocSearch(base_path=DOCSEARCH_DIR)

## Add PDFs to DocSearch

In [4]:
pdf_paths = list(RAW_PHI_DOCS_DIR.glob("*.pdf"))
docsearch.add_pdfs(pdf_paths, auto_load=True)

Extracting pages from PDF at 150 DPI...
Extracted 9 pages

0: 1024x800 5 titles, 9 plain texts, 9 abandons, 3 figures, 968.0ms
Speed: 7.0ms preprocess, 968.0ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 800)

0: 1024x800 3 plain texts, 5 abandons, 3 figures, 3 figure_captions, 945.6ms
Speed: 5.0ms preprocess, 945.6ms inference, 1.1ms postprocess per image at shape (1, 3, 1024, 800)

0: 1024x800 7 plain texts, 5 abandons, 927.0ms
Speed: 6.0ms preprocess, 927.0ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 800)

0: 1024x800 5 plain texts, 6 abandons, 1 figure, 1 figure_caption, 937.2ms
Speed: 6.0ms preprocess, 937.2ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 800)

0: 1024x800 5 plain texts, 5 abandons, 1 figure, 1 figure_caption, 910.3ms
Speed: 5.0ms preprocess, 910.3ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 800)

0: 1024x800 4 plain texts, 6 abandons, 1 figure, 1 figure_caption, 924.0ms
Speed: 6.0ms preprocess,

  _warn_reuse()


Processing async: <PIL.Image.Image image mode=RGB size=35x16 at 0x1DADB06CB80>
Processing async: <PIL.Image.Image image mode=RGB size=121x16 at 0x1DADB06CF40>
Processing async: <PIL.Image.Image image mode=RGB size=271x23 at 0x1DADB06D060>
Processing async: <PIL.Image.Image image mode=RGB size=494x463 at 0x1DADB06CEE0>
Processing async: <PIL.Image.Image image mode=RGB size=477x70 at 0x1DADB06FF10>
Processing async: <PIL.Image.Image image mode=RGB size=501x205 at 0x1DADB06CDF0>
Processing async: <PIL.Image.Image image mode=RGB size=503x226 at 0x1DADB06F610>
Processing async: <PIL.Image.Image image mode=RGB size=502x350 at 0x1DADB06EBC0>
Processing async: <PIL.Image.Image image mode=RGB size=269x28 at 0x1DADB06C8E0>
Processing async: <PIL.Image.Image image mode=RGB size=74x26 at 0x1DADB06CCD0>
Processing async: <PIL.Image.Image image mode=RGB size=208x20 at 0x1DADB479BA0>
Processing async: <PIL.Image.Image image mode=RGB size=502x41 at 0x1DADB47B640>
Processing async: <PIL.Image.Image ima

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00,  4.08it/s]
Generating embeddings: 100%|██████████| 3/3 [00:01<00:00,  2.86it/s]


Loading index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default
Refreshing index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default
Loading index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default
Refreshing index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default
Loading index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default
Refreshing index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default
Loading index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default
Refreshing index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default
Loading inde

In [5]:
docsearch.query("What three products does phi offer?",
save_response=True,
similarity_top_k=5)

Loading index c:\Users\lllang\Desktop\Current_Projects\DocSearch\examples\sandbox\data\PhiDocSearch\vector_stores\default


Response(response='PHI offers several products, including the PHI nanoTOF 3 mass spectrometer, which is known for its unique ion beam technology and high-precision analysis capabilities [2]. Additionally, they provide optional accessories such as dual anode X-ray sources and a high magnification sample observation microscope [3]. Another product mentioned is the PHI 700Xi Scanning Auger NanoProbe, which is used for advanced materials analysis [5].', source_nodes=[NodeWithScore(node=TextNode(id_='f8b0d6b7-4466-434b-9fbf-59f4a5732714', embedding=None, metadata={'pdf_id': 69, 'page_id': 11, 'pdf_path': 'c:\\Users\\lllang\\Desktop\\Current_Projects\\DocSearch\\examples\\sandbox\\data\\phi-docs\\phinanotof3-compressed.pdf', 'pdf_name': 'phinanotof3-compressed'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='pdf_id-69_page_id-11', node_type='4', metadata={'pdf_id': 69, 'page_id': 11, 'pdf_path': 'c:\\Us