# Inference Code

In [None]:
!pip install -q fastapi uvicorn nest_asyncio transformers torch pillow pyngrok python-multipart bitsandbytes accelerate

In [1]:
# Register the ngrok authtoken without storing it in the notebook.
# The token is read from the NGROK_AUTHTOKEN environment variable; when that
# is unset, it is requested through a hidden prompt so it never appears in the
# cell source or its output.
# NOTE(review): any token that was previously committed in this cell should be
# revoked in the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not ngrok_authtoken:
    raise RuntimeError("An ngrok authtoken is required to open the public tunnel.")

# Writes the token to ngrok's configuration file.
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import nest_asyncio
nest_asyncio.apply()

import os
import io
import torch
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
from fastapi.middleware.cors import CORSMiddleware
from pyngrok import ngrok
import uvicorn

# ASGI application that exposes the inference endpoint.
app = FastAPI()

# Enable CORS so a browser front-end served from another origin can call the API.
# Credentials are disabled: a wildcard origin must not be combined with
# credentialed requests, and the /predict handler in this notebook reads only
# form fields (no cookies or auth headers).
# If credentialed requests are ever needed, list the allowed origins
# explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # or ["http://localhost:5173"]
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Checkpoint to serve, and the device it runs on (GPU when one is visible to
# torch, otherwise CPU).
MODEL_ID = "p4rzvl/Llama-3.2-11B-Vision-Radiology-mini"
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Load the processor (chat template, tokenizer, image preprocessing) and the
# model once at cell execution time so every request reuses the same objects.
print(f"Loading {MODEL_ID} on {DEVICE}…")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)
model = model.to(DEVICE)
print("Model loaded.")

@app.post("/predict")
async def predict(
    instruction: str = Form(...),
    file: UploadFile = File(...)
):
    """Generate a text report for an uploaded image.

    Args:
        instruction: Prompt text sent to the model together with the image
            as a single user turn.
        file: Uploaded image. It is opened with PIL and converted to RGB.

    Returns:
        JSONResponse with one key, ``"report"``, holding only the newly
        generated text (the prompt is not echoed back).

    Raises:
        HTTPException: 400 if the upload cannot be opened as an image.
    """
    # read image
    try:
        img = Image.open(io.BytesIO(await file.read())).convert("RGB")
    except Exception as e:
        # Chain the cause so the original decoding error stays in the traceback.
        raise HTTPException(status_code=400, detail=f"Invalid image: {e}") from e

    # chat template
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text",  "text": instruction}
        ]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    # tokenize multimodal inputs
    # The Llama 3.2 Vision chat template already emits the begin-of-text token,
    # so the tokenizer must not prepend special tokens a second time.
    inputs = processor(
        text=prompt,
        images=img,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(DEVICE)

    # generate
    # NOTE(review): this is a synchronous call inside an async handler, so the
    # event loop is blocked while a report is generated and requests are
    # effectively served one at a time.
    # NOTE(review): temperature=1.5 gives highly varied samples; confirm that
    # is intended for report generation.
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=1.5,
        top_p=0.9,
    )

    # generate() returns the prompt tokens followed by the continuation;
    # decode only the continuation so the instruction is not repeated in the report.
    prompt_len = inputs["input_ids"].shape[-1]
    report = processor.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return JSONResponse({"report": report})


# Port shared by the local uvicorn server and the ngrok tunnel that fronts it.
PORT = 8000

# Open the tunnel first: uvicorn.run() blocks, so the public URL has to be
# printed before the server starts.
public_url = ngrok.connect(PORT, "http").public_url
print(">>> ngrok tunnel available at:", public_url)

try:
    # Blocks until the server is stopped (e.g. by interrupting the kernel).
    uvicorn.run(app, host="0.0.0.0", port=PORT, log_level="info")
finally:
    # Close the tunnel when the server exits so re-running this cell does not
    # leave stale tunnels open.
    ngrok.disconnect(public_url)

2025-04-18 16:52:24.844160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744995144.866826     118 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744995144.873590     118 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading p4rzvl/Llama-3.2-11B-Vision-Radiology-mini on cuda…


model.safetensors.index.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

Model loaded.
>>> ngrok tunnel available at: https://705b-34-58-14-166.ngrok-free.app


INFO:     Started server process [118]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     14.139.122.100:0 - "POST /predict HTTP/1.1" 200 OK
INFO:     14.139.122.100:0 - "POST /predict HTTP/1.1" 200 OK
INFO:     14.139.122.100:0 - "POST /predict HTTP/1.1" 200 OK
INFO:     14.139.122.100:0 - "POST /predict HTTP/1.1" 200 OK
INFO:     14.139.122.100:0 - "POST /predict HTTP/1.1" 200 OK
INFO:     14.139.122.100:0 - "POST /predict HTTP/1.1" 200 OK
INFO:     14.139.122.100:0 - "POST /predict HTTP/1.1" 200 OK
INFO:     14.139.122.100:0 - "POST /predict HTTP/1.1" 200 OK


In [None]:
import requests

# Example client for the /predict endpoint.
# replace with the Kaggle-provided URL/port if different
# 0.0.0.0 is a bind address rather than a destination, so the loopback
# address is used to reach a server running on the same machine.
url = "http://127.0.0.1:8000/predict"
image_path = "path/to/img"
data  = {"instruction": "You are an expert radiographer. Describe accurately what you see in this image."}

# Open the image in a context manager so the file handle is closed after upload.
with open(image_path, "rb") as image_file:
    files = {"file": image_file}
    # (connect, read) timeouts: generation can be slow, but the call should
    # not hang indefinitely if the server is unreachable or stalls.
    resp = requests.post(url, files=files, data=data, timeout=(10, 300))

# Surface HTTP errors (e.g. 400 for an invalid image) instead of trying to
# read a report from an error response.
resp.raise_for_status()
print(resp.json())