In [1]:
!pip install --upgrade --quiet accelerate langchain-unstructured verovio unstructured-client unstructured "unstructured[pdf]" python-magic bitsandbytes tiktoken
!pip install --upgrade transformers torch -y -q
!pip install --upgrade setuptools -y -q
!pip install chromadb -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-proto 1.27.0 requires protobuf<5.0,>=3.19, but you have protobuf 5.28.2 which is incompatible.[0m[31m
[0m
Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -y

Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -y
[31mERROR: pip's dependency r

#### Use this to read PDF documents

In [1]:
import re
import os
import uuid
import chromadb
import tempfile
from dotenv import load_dotenv
from pdf2image import convert_from_path
import torch
from typing import List
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline, QuantoConfig, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
radiology_sample = "Radiology_example-chest_report.pdf"
physiotherapy_sample = "Physical Therapy Progress Note.pdf"
physexam_sample = "Physical_Exam_Sample.pdf"

In [3]:
%%time

ocr_model_id = 'ucaslcl/GOT-OCR2_0'

tokenizer = AutoTokenizer.from_pretrained(ocr_model_id, trust_remote_code=True)

model = AutoModel.from_pretrained(
    ocr_model_id, 
    trust_remote_code=True, 
    low_cpu_mem_usage=True, 
    device_map='cuda', 
    use_safetensors=True, 
    pad_token_id=tokenizer.eos_token_id,
    quantization_config=QuantoConfig(weights="int8"),
)
model = model.eval().cuda()

file_path = f"./data/pdf-samples/{physexam_sample}"

images = convert_from_path(file_path)

pdf_data = []
for image in images:
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
        # Save your image to this temporary file
        temp_file_path = temp_file.name
        image.save(temp_file_path)
        res = model.chat(tokenizer, temp_file_path, ocr_type='format')
        pdf_data.append(res)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask

CPU times: user 1min 33s, sys: 1.92 s, total: 1min 35s
Wall time: 1min 28s


In [4]:
joined_pdf = " ".join(pdf_data)

In [5]:
document_content = "\n".join(pdf_data)

In [6]:
document_sections = re.sub(
    r'\\\((.*?)\\\)',
    '', 
    document_content.replace("\n", " ")
).split("\\section")

In [9]:
import uuid

In [13]:
collection_id = str(uuid.uuid4().hex)

In [14]:
chroma_client = chromadb.Client()
try: chroma_client.delete_collection(collection_id)
except: pass
chroma_collection = chroma_client.get_or_create_collection(name = collection_id)
chroma_client.list_collections()

[Collection(id=0bdef1a3-1f26-450d-82e4-be8a54a63117, name=temp),
 Collection(id=1aa41f42-a08d-4439-b41b-9a91693103bf, name=5c6a76b283a34bb5974b7778ab036784)]

In [15]:
chroma_collection.add(
    documents=document_sections,
    ids=[uuid.uuid4().hex for _ in range(len(document_sections))],
)

#### Use LLM

>> How can we have fast PDF Document retrieval? 

In [20]:
class InitiateLLM:
    
    def __init__(self, history_enabled: bool = False):
        
        self.history_enabled = history_enabled
        self.model_id = "microsoft/Phi-3-mini-128k-instruct"
        self.generation_args = { 
            "max_new_tokens": 2000, 
            "return_full_text": False,
            "temperature": 0.0, 
            "do_sample": False, 
        } 

        # Load model with quantization for reduced memory usage
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            device_map="auto",  # Automatically use GPU
            trust_remote_code=True, 
            low_cpu_mem_usage=True, 
            quantization_config=QuantoConfig(weights="int8"),
        )

        model.gradient_checkpointing_enable()
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        
        self.pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.float16,
        )

        self.prompt_acc = [
            {"role": "system", "content": (
                "Answer the following question using only the provided context. Do not assume or add information beyond what is in the context. \n"
                "If the context does not contain sufficient information to answer the question or no context is provided at all, explicitly state that the context is insufficient. \n"
                "In your own understanding, if the description of the issue is vague, ask for clarifications \n"
            )}
        ]

    def _create_query(self, query: str, context: str):
        # - combine with keyword search for more semantic similarity
        return {
            "role": "user",
            "content": (
                f"Question: {query}\n"
                "Here is the context:\n"
                f"{context}"
            )
        }

    def _rerank_documents(self, query, relevant_documents: List[str]) -> List[str]:
        
        reranker_model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
        reranker_tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
        
        pairs = [[query, passage] for passage in relevant_documents]
        
        # We add torch no grad to prevent gradient from being calculated unecessarily which will lead to OOM error
        with torch.no_grad():
            inputs = reranker_tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
            outputs = reranker_model(**inputs, return_dict=True)
            scores = outputs.logits.squeeze().float()  # Adjust based on model output shape
            
        torch.cuda.empty_cache()
        # Pair the scores with the other list
        paired_list = list(zip(scores, relevant_documents))
        # Sort the pairs based on the scores in descending order
        sorted_pairs = sorted(paired_list, key=lambda x: x[0], reverse=True)
        # Separate the sorted pairs back into two lists
        _, l = zip(*sorted_pairs)
        
        return l

    def prompt(self, query: str):

        query_results = chroma_collection.query(query_texts=[query], n_results = 5)['documents'][0]
        reranked_results = self._rerank_documents(query, query_results)
        
        context = "\n\n".join(reranked_results)
        
        generated_prompt = self._create_query(query, context)

        if self.history_enabled:
            self.prompt_acc.append(generated_prompt)
            with torch.no_grad():
                output = self.pipe(self.prompt_acc, **self.generation_args)
            output = output[0]['generated_text'].strip()
            self.prompt_acc.append({ "role": "system", "content": output })
        else:
            with torch.no_grad():
                output = self.pipe(self.prompt_acc + [generated_prompt], **self.generation_args)
            output = output[0]['generated_text'].strip()
                
        return output

In [21]:
llm = InitiateLLM()

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.87s/it]


In [22]:
%%time
llm.prompt("what is the patient_name")

OutOfMemoryError: CUDA out of memory. Tried to allocate 208.00 MiB. GPU 0 has a total capacity of 23.61 GiB of which 236.38 MiB is free. Including non-PyTorch memory, this process has 14.24 GiB memory in use. Process 665506 has 7.18 GiB memory in use. Of the allocated memory 13.07 GiB is allocated by PyTorch, and 370.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [23]:
%%time
llm.prompt("What is the name of the referring physician?")

CPU times: user 1min 16s, sys: 6.7 s, total: 1min 23s
Wall time: 7.77 s


'The context provided does not contain sufficient information to identify the name of the referring physician.'

In [24]:
%%time
llm.prompt("Summarise the important results of the physical examination for me in non medical terms")

CPU times: user 46.1 s, sys: 974 ms, total: 47.1 s
Wall time: 17 s


"The physical examination results show that the patient's vital signs are stable, and there are no abnormalities in the rectal area. The patient's stool is brown and does not contain blood. The pelvic examination revealed no abnormalities. The external genitalia, vagina, and cervix appear normal upon speculum examination. The bimanual examination did not reveal any palpable uterus, ovaries, or masses.\n\nThe chest examination showed clear lungs with crackles in the lung bases bilaterally. A grade 2/6 systolic decrescendo murmur was heard at the second right inter-costal space, which radiates to the neck. No fourth heart sound or rub were detected. Cystic changes were noted in the breasts bilaterally, but no masses or nipple discharge was observed.\n\nThe past medical history includes a list of potential problems, such as chest pain, family history of early heart disease, early surgical menopause, difficulty breathing, recent onset high blood pressure, abdominal bruit, systolic ejection