In [None]:
# Print the version of the `python` executable available in the runtime's shell.
!python --version

Python 3.11.13


In [None]:
# Mount Google Drive under /content/drive so files stored there are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

# Make the mounted "My Drive" folder the working directory.
%cd /content/drive/My Drive

Mounted at /content/drive
/content/drive/My Drive


In [None]:
!pip install transformers huggingface_hub datasets evaluate unsloth

In [None]:
!pip install bitsandbytes ultralytics

In [None]:
# Work from the yolov11 folder on Drive; the relative paths used later in this notebook
# (detector weights, image folder) are resolved against this directory.
%cd /content/drive/My Drive/yolov11

In [None]:
from unsloth import FastVisionModel
from ultralytics import YOLO
from pathlib import Path
from PIL import Image
import torch
import gc
from IPython.display import display

In [None]:
# Load the YOLO detector from the weights file below (path is relative to the working directory).
det_model = YOLO("runs/detect/train2/weights/best.pt")

In [None]:
# Load the 4-bit Llama 3.2 Vision Instruct checkpoint through Unsloth.
# NOTE: the second return value is named `tokenizer`, but later in this notebook it is called
# with both an image and text, i.e. it is used as a combined image/text processor.
model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit"
model, tokenizer = FastVisionModel.from_pretrained(
    model_name,
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth"
)
# Unsloth documents `for_inference` as the step to run before calling `generate`;
# `eval()` alone does not enable Unsloth's inference mode.
FastVisionModel.for_inference(model)
# Keep the explicit eval() from the original cell. The trailing semicolon stops the notebook
# from echoing the full module repr as the cell output.
model.eval();


In [None]:
def _relative_position(cx: float, cy: float, width: float, height: float) -> str:
    """Name the cell of a 3x3 grid laid over the image that contains the point (cx, cy).

    The result is "center" for the middle cell, otherwise "<vertical>-<horizontal>" where
    vertical is top/center/bottom and horizontal is left/center/right
    (e.g. "top-left", "top-center", "center-right").
    """
    hor = "left" if cx < width / 3 else "right" if cx > 2 * width / 3 else "center"
    ver = "top" if cy < height / 3 else "bottom" if cy > 2 * height / 3 else "center"

    if hor == "center" and ver == "center":
        return "center"
    if hor == "center":
        return ver + "-center"
    if ver == "center":
        return "center-" + hor
    return ver + "-" + hor


def get_detections_with_pos(img_path: str) -> list:
    """Run the detector on one image and describe where each detection lies.

    Args:
        img_path: Path of the image file to analyse.

    Returns:
        A list with one ``(label, box, rel)`` tuple per detection, where ``label`` is the
        class name looked up in ``det_model.names``, ``box`` is ``[x1, y1, x2, y2]`` rounded
        to one decimal, and ``rel`` is the 3x3-grid position of the box centre.
        The list is empty when nothing is detected.
    """
    # Only the size is needed; the context manager releases the file handle right away
    # instead of leaving it open until garbage collection.
    with Image.open(img_path) as img:
        w, h = img.size

    res = det_model(img_path)[0]
    labels = [det_model.names[int(c)] for c in res.boxes.cls]
    boxes = [xyxy.tolist() for xyxy in res.boxes.xyxy]

    result = []
    for lab, box in zip(labels, boxes):
        x1, y1, x2, y2 = box
        cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
        rel = _relative_position(cx, cy, w, h)
        result.append((lab, [round(x, 1) for x in box], rel))
    return result

In [None]:
from transformers import TextStreamer

# Detector class name -> wording used for that finding in the prompt.
_FINDING_DESCRIPTIONS = {
    "air": "ectopic gas",
    "thick": "fascia edematous changes",
    "water": "fluid accumulation",
    "low attenuation": "soft tissue non-enhancement"
}


def _build_instruction(dets) -> str:
    """Build the user prompt text from ``(label, coords, rel)`` detections.

    Returns a fixed "no lesions" sentence when ``dets`` is empty; otherwise a bullet list of
    the detections followed by the request for one sentence per detection.
    """
    if not dets:
        return "No necrotizing soft tissue infection lesions were detected in this image."

    lines = ["Below are the detections found by the model:"]
    for label, coords, rel in dets:
        # Labels without a mapping entry are described by their own name.
        desc = _FINDING_DESCRIPTIONS.get(label, label)
        lines.append(f"- {label} ({desc}), coordinates: {coords}, relative position: {rel}")

    lines.append(
        "\nNow **for each** of the above detections, "
        "write **one complete English sentence** describing the finding, its bounding box, "
        "and its relative location. "
        "Make sure you cover **all** detections, **one sentence per detection**, each on its own line."
    )
    return "\n".join(lines)


def evaluate_with_fastvision(img_path: str):
    """Detect findings in an image and stream the vision model's description of them.

    The detections from ``get_detections_with_pos`` are turned into a text prompt, sent to
    the vision-language model together with the image, and the generated text is streamed
    to stdout through a ``TextStreamer``. Nothing is returned.

    Args:
        img_path: Path of the image file to analyse.
    """
    dets = get_detections_with_pos(img_path)
    instruction = _build_instruction(dets)

    # Decode the image once and reuse it for both the chat message and the processor call;
    # the context manager closes the underlying file as soon as the RGB copy exists.
    with Image.open(img_path) as raw:
        image = raw.convert("RGB")

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text",  "text": instruction}
        ]
    }]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        # add_special_tokens=False: the chat-templated text is passed as-is.
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    # skip_prompt=True: only the newly generated text is printed, not the prompt.
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    with torch.inference_mode():
        _ = model.generate(
            **inputs,
            streamer=streamer,
            max_new_tokens=256,
            use_cache=True,
            do_sample=False,  # greedy decoding
        )


In [None]:
# For every PNG/JPEG in the folder: show the annotated detections, then stream the
# vision model's description of them.
ct_folder = Path("LLM_2")
for img_path in sorted(ct_folder.glob("*.*")):
    if img_path.suffix.lower() not in {".png", ".jpg", ".jpeg"}:
        continue

    res = det_model(str(img_path))[0]
    # Results.plot() returns a BGR array while PIL expects RGB, so reverse the channel
    # axis; otherwise the annotation colours are displayed channel-swapped.
    annotated = res.plot()
    annotated_img = Image.fromarray(annotated[..., ::-1])

    display(annotated_img)

    print(f"\n=== {img_path.name} Analysis ===")
    evaluate_with_fastvision(str(img_path))

    # Release Python garbage and cached CUDA memory between images.
    gc.collect()
    torch.cuda.empty_cache()
