In [1]:
import gc
import os
import re
import tempfile
import uuid
from typing import List

import chromadb
import torch
from dotenv import load_dotenv
from pdf2image import convert_from_path
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline, QuantoConfig, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# File names of the sample PDFs; the OCR cell resolves the chosen one
# against ./data/pdf-samples/.
radiology_sample, physiotherapy_sample, physexam_sample = (
    "Radiology_example-chest_report.pdf",
    "Physical Therapy Progress Note.pdf",
    "Physical_Exam_Sample.pdf",
)

In [3]:
%%time

# OCR every page of one sample PDF with GOT-OCR2.0 and collect the per-page
# text in `pdf_data` (one string per page, in page order).
ocr_model_id = 'ucaslcl/GOT-OCR2_0'

tokenizer = AutoTokenizer.from_pretrained(ocr_model_id, trust_remote_code=True)

# int8 weight quantization keeps the OCR model's memory footprint down.
model = AutoModel.from_pretrained(
    ocr_model_id,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
    quantization_config=QuantoConfig(weights="int8"),
)
# NOTE(review): device_map='cuda' above already targets the GPU, so the
# trailing .cuda() looks redundant — confirm before removing.
model = model.eval().cuda()

file_path = f"./data/pdf-samples/{physexam_sample}"

# pdf2image renders each PDF page to an image object.
images = convert_from_path(file_path)

pdf_data = []
# model.chat() is given a file path, so each page is written to disk first.
# A TemporaryDirectory removes every page image when the block exits; the
# previous NamedTemporaryFile(delete=False) left one PNG per page behind in
# the system temp directory on every run.
with tempfile.TemporaryDirectory() as page_dir:
    for page_number, image in enumerate(images):
        page_path = os.path.join(page_dir, f"page_{page_number}.png")
        image.save(page_path)
        # NOTE(review): `chat` is presumably provided by the checkpoint's
        # remote code (hence trust_remote_code=True) — confirm.
        pdf_data.append(model.chat(tokenizer, page_path, ocr_type='format'))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask

CPU times: user 1min 29s, sys: 2.07 s, total: 1min 31s
Wall time: 1min 25s


In [4]:
# Release the OCR model before the LLM is loaded.
# `del` only drops the notebook's reference: gc.collect() clears any
# lingering reference cycles, and empty_cache() returns PyTorch's cached
# CUDA blocks to the driver so the freed memory is actually available.
del model
gc.collect()
torch.cuda.empty_cache()

In [5]:
# All OCR'd pages as one space-separated string.
joined_pdf = " ".join(pdf_data)

# Flatten the pages onto a single line, then delete every `\( ... \)` span
# (non-greedy), which is presumably inline LaTeX emitted by the OCR model.
document_content = "\n".join(pdf_data).replace("\n", " ")
document_content = re.sub(r'\\\((.*?)\\\)', '', document_content)

In [28]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

# Seed torch's global RNG. The generation settings used in this cell set
# do_sample=False, so this is presumably only a safeguard — TODO confirm it
# is still needed.
torch.random.manual_seed(0) 

class InitiateLLM:
    """Answer questions about a fixed context with a quantized Phi-3 chat model.

    Attributes:
        context: Text every question is answered against.
        history_enabled: Whether previous turns are replayed on later prompts.
        model_id: Hugging Face model identifier that is loaded.
        generation_args: Keyword arguments forwarded to the pipeline call.
        prompt_acc: Chat messages sent before each new question. Starts with
            the system message and grows only when ``history_enabled`` is True.
    """

    def __init__(self, context: str, history_enabled: bool = False):
        """Load the model and tokenizer and build the text-generation pipeline.

        Args:
            context: Text that every question is answered against.
            history_enabled: When True, each question/answer pair is appended
                to ``self.prompt_acc`` and included in subsequent prompts.
        """
        self.context = context
        self.history_enabled = history_enabled
        self.model_id = "microsoft/Phi-3-mini-128k-instruct"
        self.generation_args = {
            "max_new_tokens": 500,
            "return_full_text": False,
            "temperature": 0.0,
            "do_sample": False,
        }

        # Load model with quantization for reduced memory usage.
        # Gradient checkpointing is intentionally not enabled: it trades
        # compute for memory during training and does not help inference.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            device_map="auto",
            trust_remote_code=False,
            quantization_config=QuantoConfig(weights="float8"),
        )

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)

        # Define the pipeline. It must use this model's own tokenizer; the
        # notebook-level `tokenizer` name may belong to a different model.
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
        )

        self.prompt_acc = [
            {"role": "system", "content": "You are a helpful AI assistant."},
        ]

    def _create_query(self, query: str, context: str) -> dict:
        """Build the user chat message that wraps a question and its context.

        Args:
            query: The question to ask.
            context: The text the answer must be grounded in.

        Returns:
            A chat message dict with ``role`` set to ``"user"`` and the
            instruction, question and context in ``content``.
        """
        return {
            "role": "user",
            "content": (
                "Answer the following question using only the provided context. Do not assume or add information beyond what is in the context. \n"
                "If the context does not contain sufficient information to answer the question or no context is provided at all, explicitly state that the context is insufficient. \n"
                "In your own understanding, if the description of the issue is vague, ask for clarifications \n"
                # Newline keeps the question from running straight into the
                # "Context:" label.
                f'Question: {query}\n'
                f'Context: {context}'
            )
        }

    def prompt(self, query: str) -> str:
        """Ask a question against ``self.context`` and return the model's reply.

        Args:
            query: The question to ask.

        Returns:
            The generated text of the first pipeline result.
        """
        user_message = self._create_query(query, self.context)
        conversation = self.prompt_acc + [user_message]

        with torch.no_grad():
            result = self.pipe(conversation, **self.generation_args)
        answer = result[0]['generated_text']

        if self.history_enabled:
            # History is committed only after a successful generation, so a
            # failed call cannot leave an unanswered user turn behind. Model
            # replies are stored under the "assistant" role.
            self.prompt_acc.extend(
                [user_message, {"role": "assistant", "content": answer}]
            )

        return answer

In [29]:
llm = InitiateLLM("TEST")

Loading checkpoint shards: 100%|████████████████████████| 2/2 [00:03<00:00,  1.73s/it]


In [31]:
%%time
llm.prompt("what is the patient_name")

CPU times: user 6.21 s, sys: 1.74 ms, total: 6.21 s
Wall time: 6.21 s


' I\'m sorry, but the context provided is insufficient to answer any question regarding the patient\'s name. The context only contains the word "TEST," which does not provide any information about a patient\'s name.'

In [9]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

# Stand-alone smoke test of the Phi-3 chat pipeline with a fixed multi-turn
# conversation. Note that this cell rebinds the notebook-level `model` and
# `tokenizer` names.
torch.random.manual_seed(0)

# trust_remote_code=False matches how InitiateLLM loads this same checkpoint
# and avoids executing Hub-hosted Python.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# Chat history passed to the pipeline; the last user turn is the one answered.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# do_sample=False selects greedy decoding; return_full_text=False returns
# only the newly generated reply.
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|████████████████████████| 2/2 [00:01<00:00,  1.92it/s]
You are not running the flash-attention implementation, expect numerical differences.


 To solve the equation 2x + 3 = 7, follow these steps:

1. Subtract 3 from both sides of the equation:
   2x + 3 - 3 = 7 - 3
   2x = 4

2. Divide both sides of the equation by 2:
   2x/2 = 4/2
   x = 2

So, the solution to the equation 2x + 3 = 7 is x = 2.
