In [4]:
%pip install -qU pypdf langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [None]:
import base64
import io

import fitz
from PIL import Image


def pdf_page_to_base64(pdf_path: str, page_number: int) -> str:
    """Render one page of a PDF as a base64-encoded PNG string.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: One-indexed number of the page to render.

    Returns:
        The rendered page as PNG bytes, base64-encoded and decoded to a
        UTF-8 string.

    Raises:
        ValueError: If ``page_number`` is less than 1.
    """
    # Reject non-positive page numbers up front: ``page_number - 1`` would
    # otherwise become a negative index, which PyMuPDF resolves from the end
    # of the document and would silently render the wrong page.
    if page_number < 1:
        raise ValueError(
            f"page_number is one-indexed and must be >= 1, got {page_number!r}"
        )

    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as pdf_document:
        page = pdf_document.load_page(page_number - 1)  # input is one-indexed
        # alpha=False keeps the samples at 3 bytes per pixel, matching the
        # "RGB" mode used to decode them below.
        pix = page.get_pixmap(alpha=False)
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")

    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [2]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "../sample_data/chart_notes/B3YQT42Q_No PHI.pdf"

loader = PyPDFLoader(file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [3]:
# Show the first loaded page: its metadata, a blank line, then its text.
print(f"{pages[0].metadata}\n", pages[0].page_content, sep="\n")

{'source': '../sample_data/chart_notes/B3YQT42Q_No PHI.pdf', 'page': 0}

6/17/24, 5:42AM 
MRN:  
Office Visit 6/4/2024 
Bon Secours Neurology Clinic 
Westchester 
Provider: APRN - NP (Neurology) 
Primary diagnosis: Chronic migraine without aura, intractable, with status 
migrainosus 
Reason for Visit: New Patient 
Progress Notes __________ APRN - NP (Nurse Practitioner)• Neurology
�t,N SKOURS MEDICAL GROUP 
27 y.o. female who presents with the following 
Chief Complaint 
Patient presents with 
• New Patient
Patient was referred by Dr. for headaches. Patient reports having almost daily headaches. 
HPI 
Patient comes in as a new patient for chronic migraines 
She is having a migraine every single day 
She was diagnosed in her teenage years but then had a unfortunate bad car accident in 2019 which made 
them significantly worse 
Since then she has been having debilitating migraines 
Hypersensitivity light sound and smell 
Nausea but no vomiting 
Some dizziness and lightheadedness 
She has

In [7]:
import getpass
import os


def _set_if_undefined(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}=")


# Make sure both settings are present, prompting for any that are missing.
for _env_var in ("OPENAI_API_KEY", "OPENAI_API_BASE"):
    _set_if_undefined(_env_var)

In [10]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import AzureOpenAIEmbeddings

# Embed every loaded page into an in-memory vector store.
embeddings = AzureOpenAIEmbeddings(model="embedding")
vector_store = InMemoryVectorStore.from_documents(pages, embeddings)

# Retrieve the two pages most similar to the question and preview the first
# 300 characters of each.
docs = vector_store.similarity_search("What is chief complaint?", k=2)
for doc in docs:
    print(f'Page {doc.metadata["page"]}: {doc.page_content[:300]}\n')

Page 2: 6/17/24, 5:42AM 
Past Medical History: 
Diagnosis 
• Anxiety
• Asthma
• Bilateral pneumothorax
• Cervical spine fracture (HCC)
C3-C5
• Chorioretinitis
• Chronic pain
• Colostomy present (HCC)
• Depression
• Ectopic pregnancy
• Foot drop, left
• H/O skin graft
Allograft to left thigh and posterior tr

Page 0: 6/17/24, 5:42AM 
MRN:  
Office Visit 6/4/2024 
Bon Secours Neurology Clinic 
Westchester 
Provider: APRN - NP (Neurology) 
Primary diagnosis: Chronic migraine without aura, intractable, with status 
migrainosus 
Reason for Visit: New Patient 
Progress Notes __________ APRN - NP (Nurse Practitioner)•

