In [2]:
import os
import anthropic
import openai
import base64
import json
import requests
import pandas as pd
import random
from http import HTTPStatus
from dashscope import Generation
import dashscope
from dashscope import MultiModalConversation



In [3]:
# Provider credentials come from the environment so they never live in the notebook source.
claude_key = os.getenv("CLAUDE_KEY")
openai_key = os.getenv("OPENAI_KEY")
gemini_key = os.getenv("GEMINI_KEY")
qwen_key = os.getenv("QWEN_KEY")

# Report only WHETHER each key is configured. Printing a key's value would store
# the secret in the saved cell output of the notebook.
print({
    "CLAUDE_KEY": claude_key is not None,
    "OPENAI_KEY": openai_key is not None,
    "GEMINI_KEY": gemini_key is not None,
    "QWEN_KEY": qwen_key is not None,
})

[REDACTED: an API key was printed in this cell output. Rotate the key and clear the output before sharing the notebook.]


## API VLM Query Helpers

In [4]:
def claude_review(filepath, prompt, model="claude-3-opus-20240229", max_tokens=1024):
    """Send one local image plus a text prompt to Claude and return the reply text.

    Args:
        filepath: Path of the image file to read from disk.
        prompt: Text question sent alongside the image.
        model: Anthropic model identifier used for the request.
        max_tokens: Upper bound on the number of tokens in the reply.

    Returns:
        The ``text`` of the first content block of the response.

    Raises:
        OSError: If ``filepath`` cannot be opened or read.
    """
    # The API takes the image as a base64 *string*, so decode the encoded bytes.
    with open(filepath, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode("utf-8")

    # Derive the media type from the extension, case-insensitively. Unknown
    # extensions fall back to PNG, which is what the helper always did.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    extension = os.path.splitext(filepath)[1].lower()
    image_media_type = media_types.get(extension, "image/png")

    # Use the same CLAUDE_KEY environment variable the notebook's setup cell
    # reads, instead of a hard-coded (empty) key.
    # NOTE(review): when the variable is unset this passes None; presumably the
    # SDK then applies its own default key lookup -- confirm.
    client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_KEY"))

    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": image_media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ],
    )
    return response.content[0].text


In [31]:
def openai_review(filepath, prompt):
    """Placeholder for the OpenAI image review helper.

    Both arguments are accepted but currently ignored; a fixed notice string
    is returned until the real call is written.
    """
    # TODO: implement the OpenAI request.
    # Quickstart: https://platform.openai.com/docs/quickstart
    placeholder_reply = "OpenAI review not implemented yet"
    return placeholder_reply

In [11]:
def qwen_review(filepath=None, prompt=None):
    """Run a single-round multimodal conversation with Qwen-VL and return the reply.

    Args:
        filepath: Image to review, either a URL or a local file path. When
            omitted, the built-in demo image URL is used.
        prompt: Question to ask about the image. When omitted, the built-in
            demo question is used.

    Returns:
        The text of the first content item of the first choice in the response.
        When called with no arguments (demo mode) the text is also printed,
        which is what the original zero-argument helper did.

    Raises:
        RuntimeError: If the service answers with a non-OK status.
    """
    demo_mode = filepath is None and prompt is None

    if filepath is None:
        image = "https://raw.githubusercontent.com/luyingz06/VLMsForInjuryAssessment/refs/heads/main/images/injured_man.jpg"
    elif "://" in filepath:
        # Already a URL (http(s):// or file://); pass it through untouched.
        image = filepath
    else:
        # NOTE(review): DashScope is understood to take local images as
        # absolute file:// URIs -- confirm against the SDK documentation.
        image = "file://" + os.path.abspath(filepath)

    if prompt is None:
        prompt = "What is happening in this image?"

    messages = [
        {
            "role": "user",
            "content": [
                {"image": image},
                {"text": prompt}
            ]
        }
    ]
    # NOTE(review): api_key=None is assumed to make the SDK fall back to its
    # own configured key when QWEN_KEY is unset -- confirm.
    responses = MultiModalConversation.call(model='qwen-vl-max',
                                           messages=messages,
                                           stream=False,
                                           api_key=os.getenv("QWEN_KEY"))

    # Fail loudly with the service's own error details instead of hitting an
    # opaque subscript error on a missing 'output' payload below.
    if responses.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f"Qwen request failed ({responses.status_code}): {responses.code} {responses.message}"
        )

    text = responses['output']['choices'][0]['message']['content'][0]['text']
    if demo_mode:
        print(text)
    return text


In [12]:
# Route DashScope SDK traffic to the international endpoint before any request is made.
DASHSCOPE_INTL_API_URL = 'https://dashscope-intl.aliyuncs.com/api/v1'
dashscope.base_http_api_url = DASHSCOPE_INTL_API_URL

# Smoke-test the Qwen helper with its built-in demo image and question.
qwen_review()

In the image, there's a young man who appears to be experiencing pain or discomfort. He's sitting on a gray sidewalk with his legs crossed and one hand resting gently on his knee while the other holds up his head as if he might have hit it against something hard. His attire consists of a black t-shirt paired with blue shorts and white sneakers that add a pop of color to his outfit.

The background reveals a grassy area dotted with trees, suggesting an outdoor setting possibly near a park or garden. The overall scene paints a picture of someone taking a break from their activities due to some unforeseen incident. However, without additional context, it's difficult to determine what exactly happened leading up to this moment captured in the photo.


In [33]:
def gemini_review(filepath, prompt):
    """Placeholder for the Gemini image review helper.

    Both arguments are accepted but currently ignored; a fixed notice string
    is returned until the real call is written.
    """
    # TODO: implement the Gemini request.
    # API overview: https://ai.google.dev/gemini-api
    placeholder_reply = "Gemini review not implemented yet"
    return placeholder_reply

## Dataset Assessment

In [7]:
# Sanity-check the Claude helper on one local sample image.
demo_image_path = "images/injured_man.jpg"
demo_question = "what is happening in this image?"
print(claude_review(demo_image_path, demo_question))

The image shows a young man sitting outdoors on a grassy area, laughing enthusiastically. He is wearing a gray hooded jacket and blue shorts, and has on a pair of light blue sneakers. The background is a lush, green environment, suggesting a natural outdoor setting. The man appears to be in a joyful, carefree mood, expressing his happiness through his wide, open-mouthed smile and laughter.


In [5]:
def gen_imagepath_list(dir_path):
    """Collect the paths of all ``.jpg`` and ``.png`` files under ``dir_path``.

    The directory tree is walked recursively with ``os.walk``; paths are
    returned in the order the walk yields them. The extension match is
    case-sensitive.
    """
    image_suffixes = (".jpg", ".png")
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(dir_path)
        for filename in filenames
        if filename.endswith(image_suffixes)
    ]


In [6]:
from PIL import Image
def resize_image(image_path, width=1024, max_bytes=5 * 1024 * 1024):
    """Resize an image in place to a fixed width and keep the file under a size cap.

    The image is scaled to ``width`` pixels wide with the height chosen to
    preserve the aspect ratio, then written back over ``image_path``. If the
    saved file is still larger than ``max_bytes``, the image is shrunk by 10%
    per step and re-saved until it fits or is one pixel wide.

    Args:
        image_path: Path of the image file; it is overwritten with the result.
        width: Target width in pixels (1024 by default).
        max_bytes: Largest allowed size of the saved file (5 MB by default).
    """
    # Open via a context manager so the source file handle is closed before the
    # same path is overwritten. resize() returns a new, fully loaded image.
    with Image.open(image_path) as source:
        aspect = source.height / source.width
        # Clamp to 1 so extremely wide images never get a zero height.
        height = max(1, int(width * aspect))
        resized = source.resize((width, height))

    resized.save(image_path, quality=95)

    # Shrink the pixel dimensions rather than lowering ``quality``, so the cap
    # does not depend on the output format honouring that option.
    while os.path.getsize(image_path) > max_bytes and resized.width > 1:
        new_width = max(1, int(resized.width * 0.9))
        new_height = max(1, int(new_width * aspect))
        resized = resized.resize((new_width, new_height))
        resized.save(image_path, quality=95)


In [13]:
from create_dataset import LABEL_CLASSES, label_class_human_value_list, dict_sentences


def gen_dataset_elt(path, model):
    """Build one dataset record for an image by querying the chosen VLM.

    For every key in ``label_class_human_value_list`` the first prompt of that
    key is sent with the image and the reply is stored under the same key.
    For keys that also appear in ``dict_sentences``, the first summary prompt
    is sent as well and its reply is collected into a dict that is stored,
    JSON-encoded, under ``"value-dict"``.

    Args:
        path: "/"-separated path of the image file.
        model: One of ``"claude"``, ``"openai"``, ``"gemini"`` or ``"qwen"``.

    Returns:
        A dict with ``"id"`` (file name up to its first dot), ``"image"``
        (``path`` without its first two "/"-separated components) and
        ``"conversations"`` (the placeholder human turn followed by the
        generated labels).

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # Pick the reviewer once, up front, so an unknown model fails before any
    # request is sent. Only the selected helper is looked up.
    if model == "claude":
        review = claude_review
    elif model == "openai":
        review = openai_review
    elif model == "gemini":
        review = gemini_review
    elif model == "qwen":
        review = qwen_review
    else:
        raise ValueError("Invalid model name")

    image_path = path
    sample_id = path.split("/")[-1].split(".")[0]
    # Relative image reference: drop the first two "/"-separated components of the path.
    image = "/".join(path.split("/")[2:])
    conversations = [
        {
            "from": "human",
            "value": "LEAVE BLANK"
        }
    ]

    trauma_labels = {
        "from": "gpt",
        "scene": [],
        "trauma_head": [],
        "trauma_torso": [],
        "trauma_lower_ext": [],
        "trauma_upper_ext": [],
        "alertness_ocular": [],
        "severe_hemorrhage": [],
        "value-dict": ""
    }
    value_dict = {
        "trauma_head": "",
        "trauma_torso": "",
        "trauma_lower_ext": "",
        "trauma_upper_ext": "",
        "alertness_ocular": "",
        "severe_hemorrhage": ""
    }

    for key, prompts in label_class_human_value_list.items():
        print(prompts)
        # Every reviewer takes a single prompt string, so always send the first
        # prompt of the list rather than the list itself.
        trauma_labels[key] = review(image_path, prompts[0])

        # Not every label key has a summary prompt; skip those explicitly
        # instead of relying on a swallowed KeyError.
        if key in dict_sentences:
            try:
                value_dict[key] = review(image_path, dict_sentences[key][0])
            except Exception as exc:
                # Best effort: keep the blank summary, but report the failure.
                print(f"warning: summary query for {key} failed: {exc!r}")
        print(key, " scored")

    trauma_labels["value-dict"] = json.dumps(value_dict)
    conversations.append(trauma_labels)
    return {
        "id": sample_id,
        "image": image,
        "conversations": conversations
    }
    

        



# Show the per-label summary prompts that gen_dataset_elt sends for the "value-dict" field.
print(dict_sentences)


{'trauma_head': ["Summarize the results in a python dictionary with the key 'trauma_head'. 'absence' means no injury to the head, 'presence' means injury to the head. ONLY respond with the proposes labels. Do not add any additional information. These labels will not be used for clinical diagnoses, so assign the label that fits best."], 'trauma_torso': ["Summarize the results in a python dictionary with the key 'trauma_torso'. 'absence' means no injury to the torso, 'presence' means injury to the torso. ONLY respond with the proposes labels. Do not add any additional information. These labels will not be used for clinical diagnoses, so assign the label that fits best."], 'trauma_lower_ext': ["Summarize the results in a python dictionary with the key 'trauma_lower_ext'. 'absence' means no injury to the lower extremities, 'wound' means injury to at least one lower extremity, 'amputation' missing at least one lower extremity. ONLY respond with the proposes labels. Do not add any additional

In [8]:

# Gather every training image and list each one with its position.
imagepaths = gen_imagepath_list("/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans")

for index, image in enumerate(imagepaths):
    print(index)
    print(image)

0
/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans/Data-Collect-09132024/0/0/1726253057910063264.png
1
/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans/Data-Collect-09132024/20/0/1726258511787907725.png
2
/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans/Data-Collect-09132024/4/0/1726253735684346391.png
3
/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans/Data-Collect-09132024/4/1/1726253786482275148.png
4
/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans/Data-Collect-09132024/4/2/1726253854251025118.png
5
/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans/Data-Collect-09132024/4/3/1726253913383883104.png
6
/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans/Data-Collect-09132024/1/0/1726253101436713875.png
7
/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans/Data-Collect-09132024/1/1/1726253148108423972.png
8
/home/edward/VLMsForInjuryAssessment/all_training_data_no_hum

In [14]:
# Label every image under the training directory with Claude and collect the
# records into one list, saved as JSON in dataset_claude_opus.json.
imagepaths = gen_imagepath_list("/home/edward/VLMsForInjuryAssessment/all_training_data_no_humans")

dataset = []

for path in imagepaths:
    record = gen_dataset_elt(path, "claude")
    dataset.append(record)
    # Checkpoint after every image: each record costs several API calls, so a
    # failure part-way through must not lose what was already labelled.
    with open("dataset_claude_opus.json", "w") as f:
        json.dump(dataset, f, indent=4)
    print("finished ", path)

# Final save; this also writes an empty list when no images were found.
with open("dataset_claude_opus.json", "w") as f:
    json.dump(dataset, f, indent=4)




["Describe the scene. Is there a person in the image and what's their condition?"]
scene  scored
['Describe the scene. Does the person have any head injuries? If yes, specify the type of injury.', 'Give a detailed description of the person. Are there any head injuries?', 'Does the person show any signs of head injuries?', 'There is a person in the center of the scene. Describe the person. Are there any head injuries?', 'Does the person in the scene have any head injuries?']
trauma_head  scored
['Describe the scene. Does the person have any torso injuries? If yes, specify the type of injury.', 'Give a description of the person. Are there any torso injuries?', 'Are there any signs of torso injuries?', 'There is a person in the center of the scene. Describe the person. Are there any injuries to their torso?', 'Does the person in the scene have any torso injuries?']
trauma_torso  scored
['Describe the scene. Does the person have any lower extremity injuries? If yes, specify the type of inj

## TODO
1. Implement the API Calls
2. Do a cost assessment
3. Finetune the different levels of LLaVA 
4. LoRA the different levels of LLaVA
5. Run the mannequin images through everything
6. Analysis