In [1]:
import os
import json 

# Load the environment configuration JSON data
json_path = 'env_config.json'
with open(json_path, 'r') as file:
    env_config = json.load(file)

# Set the HF_HOME environment variable
os.environ['HF_HOME'] = env_config['HF_HOME']
# Set the access token to huggingface hub
access_token = env_config['access_token']

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torchvision.transforms as T
from PIL import Image

from torchvision.transforms.functional import InterpolationMode


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=6):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

path = "OpenGVLab/InternVL-Chat-V1-5"
# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
model_ocr = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    load_in_8bit=True
    ).eval()#.cuda()
# Otherwise, you need to set device_map='auto' to use multiple GPUs for inference.
# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# model = AutoModel.from_pretrained(
#     path,
#     torch_dtype=torch.bfloat16,
#     low_cpu_mem_usage=True,
#     trust_remote_code=True,
#     device_map='auto').eval()

ocr_tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
# set the max number of tiles in `max_num`


  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 11/11 [00:42<00:00,  3.88s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
# set the max number of tiles in `max_num`
pixel_values = load_image('handwritten_prescription.jpg', max_num=6).to(torch.bfloat16).cuda()

generation_config = dict(
    num_beams=1,
    max_new_tokens=512,
    do_sample=False,
)

# single-round single-image conversation
question = "Extract and format the text from the document in detail" # Please describe the picture in detail
response = model_ocr.chat(ocr_tokenizer, pixel_values, question, generation_config)
print(question, response)

dynamic ViT batch size: 7
Extract and format the text from the document in detail The text from the document is as follows:

HOSPITAL INFANTIL DE MEXICO
FEDERICO GOMEZ

CONSULTA EXTERNA
SERV. ONCOLOGIA
REG. NUM. 84938

NOMBRE:
SEXO:
EDAD:
FC:
TIA:
SC:

SELLOS, ORDENES Y TRATAMIENTO
NOTAS DE EVOLUCION

$5 AGO 2011

Edad 18a 5/12
Peso 110.5
Talla 134
FC 66
TIA 119/84
SC.

Temp. 35.6

En mantenimiento sem S6
a la ef sin datos de actividad
neoplasica ni infectiosa
cupciente.

Se le propone cateter pureto y rehusar
Plau 0 con hacer mantenimiento -
sem 57-60
-mtx 90mg 1M semundarx2
- Puntetho 150mg x2sm
- mtx 90mg 1M x4
- arac 1000mg IV dosssylica
- VCR 2mg IV
- lasp 1730000 u1m du Baja & Baja
- dexa 1.8mg/96hr
x isemana
- ondawseton
- 8mg Wprenio alarac
-TMP/SMX propiacto

Utilicen una nueva hora de evolucion hasta haberse cerrado que la anterior esta llena.
Numerense las horas de evolucion para facilitar la revision del expediente


In [18]:
# multi-round single-image conversation
question = "Explain this document to me in detail." # Please describe the picture in detail
response, history = model_ocr.chat(ocr_tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(question, response)

question = "What is the phase of the current treatment?" # Please write a poem according to the picture
response, history = model_ocr.chat(ocr_tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(question, response)

question = "List the medications should be taken daily?" # Please write a poem according to the picture
response, history = model_ocr.chat(ocr_tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(question, response)


dynamic ViT batch size: 7




Explain this document to me in detail. This document is a medical record from the Hospital Infantil de Mexico Federico Gomez, specifically from the Oncology department. The record is for a patient who is undergoing treatment for cancer. The document is written in Spanish and contains various sections with handwritten notes.

At the top, the hospital's name and department are mentioned, along with a reference number for the patient's file. The date of the consultation is listed as "5 Ago 2011."

The patient's personal information is provided, including their name, age, and weight. The document also includes the patient's medical history, noting that the child has been diagnosed with cancer and is currently undergoing treatment. The treatment plan is detailed in the "SELLOS, ORDENES Y TRATAMIENTO" section, which includes a list of medications and dosages, as well as instructions for the patient's care.

The "NOTAS DE EVOLUCIÓN" section contains notes on the patient's progress, including 

# How to prompt Llama 3
The base models have no prompt format. Like other base models, they can be used to continue an input sequence with a plausible continuation or for zero-shot/few-shot inference. They are also a great foundation for fine-tuning your own use cases. The Instruct versions use the following conversation structure:
```bash
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>

{{ user_msg_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{{ model_answer_1 }}<|eot_id|>

```

This format has to be exactly reproduced for effective use. We’ll later show how easy it is to reproduce the instruct prompt with the chat template available in transformers.

In [7]:
import transformers 
print(transformers.__version__)

from transformers import pipeline
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM

accelerator = Accelerator()
device = accelerator.device

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

4.41.0


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.69s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
prompt = r"""
This document appears to be a medical record from a hospital visit at "Hospital Infantil de México Federico Gómez." It includes patient details and notes from an oncology consultation. Here’s a detailed explanation:
Header Information
Hospital Name: Hospital Infantil de México Federico Gómez
Department: Oncología (Oncology)
Consultation Type: Consulta Externa (Outpatient Consultation)
Date: 15 Aug 2021
Record Number: 847388
Patient Information
Age: 18 years
Weight: 110.5 kg
Height: 174 cm
Heart Rate (FC): 66 bpm
Respiratory Rate (FR): (not clearly mentioned)
Blood Pressure (TA): 119/84 mmHg
Temperature: 35.6°C
Clinical Notes
The patient is known for their age.
Currently in maintenance (sem S6, likely week 6 of a treatment regimen).
Attended an asymptomatic review.
Physical examination showed no signs of active neoplastic (cancerous) or infectious diseases.
The proposed treatment includes maintaining the current plan and additional measures (likely related to a catheter, port, and chemotherapy drugs).
Treatment Plan
Current Medications and Dosages:
Methotrexate (MTX): 90 mg IM weekly
Purineethol (Mercaptopurine): 150 mg/m² daily
Methotrexate (MTX): 90 mg IM weekly
Ara-C (Cytarabine): 1000 mg intrathecally
VCR (Vincristine): 2 mg IV
L-asparaginase (L-ASP): 173,000 UI IM daily
Dexamethasone (DEXA): 1.8 mg/m²/day for 6 days weekly
Ondansetron: 8 mg for nausea prophylaxis
TMP/SMX (Trimethoprim/Sulfamethoxazole): Prophylactic dose
Additional Notes
The next important date for the patient (likely a significant treatment or appointment) is scheduled for August 9.
Other unspecified medical or supportive measures.
Prescribing Doctors
Signatures or names of doctors involved in the treatment (Dr. Rafael Ballon).
Instructions
Use a new sheet for further progress notes.
Number the evolution sheets for easier review.
"""
messages = [
    {"role": "system", "content": response},
    {"role": "user", "content": "Explain this document to me in detail."},
    # {"role": "user", "content": "List the medications should be taken daily "},
    # {"role": "user", "content": "What is the phase of the current treatment?"},
]

chat_template = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
input_dict = tokenizer.encode_plus(chat_template)

input_ids = torch.tensor(input_dict["input_ids"]).unsqueeze(0).to(device)
attention_mask = torch.tensor(input_dict["attention_mask"]).unsqueeze(0).to(device)
# input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     return_tensors="pt"
# ).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]



In [33]:
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=512,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
assistant_response = tokenizer.decode(outputs[0])
print(assistant_response)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

The text from the document is as follows:

HOSPITAL INFANTIL DE MEXICO
FEDERICO GOMEZ

CONSULTA EXTERNA
SERV. ONCOLOGIA
REG. NUM. 84938

NOMBRE:
SEXO:
EDAD:
FC:
TIA:
SC:

SELLOS, ORDENES Y TRATAMIENTO
NOTAS DE EVOLUCION

$5 AGO 2011

Edad 18a 5/12
Peso 110.5
Talla 134
FC 66
TIA 119/84
SC.

Temp. 35.6

En mantenimiento sem S6
a la ef sin datos de actividad
neoplasica ni infectiosa
cupciente.

Se le propone cateter pureto y rehusar
Plau 0 con hacer mantenimiento -
sem 57-60
-mtx 90mg 1M semundarx2
- Puntetho 150mg x2sm
- mtx 90mg 1M x4
- arac 1000mg IV dosssylica
- VCR 2mg IV
- lasp 1730000 u1m du Baja & Baja
- dexa 1.8mg/96hr
x isemana
- ondawseton
- 8mg Wprenio alarac
-TMP/SMX propiacto

Utilicen una nueva hora de evolucion hasta haberse cerrado que la anterior esta llena.
Numerense las horas de evolucion para facilitar la revision del expediente<|eot_id|><|start_header_id|>user<|end_header_id|>

Explain this

# Modeling explanation for Llama 3

In [None]:
import torch.nn as nn 
from models import MLP

class SimilarityMeasure(nn.Module):
    def __init__(self, input_ids, attention_mask, embed_size=512):
        super(SimilarityMeasure, self).__init__()

        self.pred_map = MLP(pred_hidden_size, 128, embed_size, num_blocks=2, bottleneck_dim=64)
        self.explain_map = MLP(explain_hidden_size, 128, embed_size, num_blocks=2, bottleneck_dim=64)

        self.logit_scale = nn.Parameter(torch.tensor(1.0))
    
    def forward(self, pred_feature, explain_features):
        """
        Forward pass of the model.

        Args:
            q (torch.Tensor): Query tensor of shape [N, pred_hidden_size].
            k (torch.Tensor): Key tensor of shape [N, L, explain_hidden_size].

        Returns:
            torch.Tensor: Similarity tensor of shape [N, L].
        """
        pred_feature = F.normalize(self.pred_map(pred_feature), p=2, dim=-1).unsqueeze(1)  # [N, 1, embed_size]
        explain_features = F.normalize(self.explain_map(explain_features), p=2, dim=-1)  # [N, L, embed_size]


        logit_scale = self.logit_scale.exp()

        similarity = torch.matmul(explain_features, pred_feature.transpose(-1, -2)).squeeze(-1) * logit_scale  # [N, L]

        return similarity  # [N, L]


class MaskGeneratingModel(nn.Module):
    def __init__(self, hidden_size):
        """ 
        hidden_size: int
            The hidden size of the output of the generative model
        """
        super().__init__()

        self.hidden_size = hidden_size
        # self.similarity_measure = SimilarityMeasure()
        self.explain_map = MLP(input_dim=hidden_size, 
                               hidden_dim=128, 
                               output_dim=1, 
                               num_blocks=2, 
                               bottleneck_dim=64) # takes [N, L, hidden_size] outputs [N, L, 1]
        
        self.similarity = MLP(input_dim=hidden_size, 
                               hidden_dim=128, 
                               output_dim=1, 
                               num_blocks=2, 
                               bottleneck_dim=64) # takes [N, hidden_size] outputs [N, 1]

        self.bce_loss = nn.BCELoss(reduction='none')
    
    def forward(self, pred_features):
        """ 
        pred_features: torch.Tensor of shape [N, L, hidden_size]
        """
        mask_logits = self.explain_map(pred_features) # [N, L, 1]
        return mask_logits 
    
    def compute_similarity(self, masked_concat_features):
        """ 
        masked_concat_features: torch.Tensor of shape [N, hidden_size]
        """
        similarity_logit = self.similarity(masked_concat_features) # [N, 1]
        return similarity_logit



In [34]:
tokenizer.encode_plus("<mask>")


{'input_ids': [128000, 27, 11508, 29], 'attention_mask': [1, 1, 1, 1]}

In [3]:
import easyocr
reader = easyocr.Reader(['es','en']) # this needs to run only once to load the model into memory
result = reader.readtext('handwritten_prescription.jpg')

In [18]:
prompt2 = ' '.join([str(result[i]) for i in range(len(result))])

In [6]:
response

'The document is in Spanish and contains medical information. Here is the text extracted from the document:\n\nHOSPITAL INFANTIL DE MEXICO\nFEDERICO GOMEZ\n\nCONSULTA EXTERNA\nSERV. ONCOLOGIA\nREG. NUM. 84938\n\nNOMBRE:\nSEXO:\nEDAD:\nFC:\nTIA:\nSC.:\nTEMP.:\nNOTAS DE EVOLUCION\n\nSELLOS, ORDENES Y TRATAMIENTO\n$5 AGO. 2021\n\nEdad 18a 5/12\nPeso 110.5\nTalla 134\nFC 66\nTIA 119/84\nSC. -\nTemp. 35.6\n\nEn mantenimiento sem S6\na la ef sin datos de actividad\nneoplasica ni infectiosa\ncupciente.\n\nSe le propone cateter pureto y rehuscar\nPlau 0 con hacer mantenimiento -\nsem 57-60\n-mtx 90mg 1M semundarx2\n- Puntetho 150mg x 2.5sm\n- mtx 90mg 1M x 4\n- arac 1000mg ldoossylica\n- VCR 2mg lV\n- lasp 17.3000 lU lM du Baja & Baja\n- dexa 1.8mg/96hr x Isemana\n- ondawseton\n- 8mg Wprenio alarac\n- TMP/SMX propiacto\n\nsem 57-60\n-mtx 90mg 1M semundarx2\n- Puntetho 150mg x 2.5sm\n- mtx 90mg 1M x 4\n- arac 1000mg ldoossylica\n- VCR 2mg lV\n- lasp 17.3000 lU lM du Baja & Baja\n- dexa 1.8mg/96