**Model training on a small dataset and GUI testing**

In [None]:

!pip install transformers datasets accelerate bitsandbytes gradio pillow

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [None]:

from datasets import load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments
from PIL import Image
import torch
import gradio as gr

In [None]:

# Load only the first 1,000 rows of the train split to keep the experiment small.
dataset = load_dataset("jspetrisko/slake-simplified", split="train[:1000]")

# Preview the column names and the first example.
print(dataset.column_names)
print(dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

data/train-00000-of-00003.parquet:   0%|          | 0.00/37.3M [00:00<?, ?B/s]

data/train-00001-of-00003.parquet:   0%|          | 0.00/25.5M [00:00<?, ?B/s]

data/train-00002-of-00003.parquet:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/17.4M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9835 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2099 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2094 [00:00<?, ? examples/s]

['image', 'query', 'answers']
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=256x256 at 0x7A9728A03770>, 'query': 'What modality is used to take this image?', 'answers': 'MRI'}


In [None]:

# Load the processor and the model weights from the same Hub checkpoint.
# NOTE(review): this is a "vqa" checkpoint loaded through the captioning class
# BlipForConditionalGeneration, while the pretrained-model demo cell further
# down loads the same checkpoint with BlipForQuestionAnswering — confirm which
# class is intended for fine-tuning.
model_name = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)


def preprocess(example):
    """Turn one dataset row into model-ready features.

    Args:
        example: A row with an ``image`` (PIL image), a ``query`` string and
            an ``answers`` string.

    Returns:
        dict: The processor outputs for the image/question pair plus a
        ``labels`` entry holding the tokenized answer, each with the leading
        batch dimension removed.
    """
    image = example["image"].convert("RGB")
    tokenizer = processor.tokenizer

    inputs = processor(
        images=image,
        text=example["query"],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    labels = tokenizer(
        example["answers"],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).input_ids
    # Padding positions must not contribute to the loss; -100 is the index
    # that cross-entropy ignores. Without this, the few answer tokens are
    # drowned out by the padding that fills the rest of the sequence.
    labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    # squeeze(0) removes only the batch dimension the processor added.
    features = {name: tensor.squeeze(0) for name, tensor in inputs.items()}
    features["labels"] = labels.squeeze(0)
    return features

# Drop the raw columns (PIL image and strings): TrainingArguments below sets
# remove_unused_columns=False, so any column left here is handed to the
# collator, which cannot convert a JpegImageFile to a tensor.
processed_dataset = dataset.map(preprocess, remove_columns=dataset.column_names)
# Have the dataset return torch tensors instead of nested Python lists.
processed_dataset.set_format(type="torch")



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")


def preprocess(batch):
    """Run the processor over a batch of rows.

    Args:
        batch: Column-wise batch with ``image`` and ``query`` entries.

    Returns:
        The processor output for the batch, padded and returned as PyTorch
        tensors.
    """
    return processor(
        images=batch["image"],
        text=batch["query"],
        padding=True,
        return_tensors="pt",
    )


dataset = dataset.map(preprocess, batched=True)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from torchvision import transforms
from PIL import Image

# Resize every image to 224x224 and convert it to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)


def preprocess(batch):
    """Add a ``pixel_values`` column built from the batch's ``image`` column.

    PIL images are passed through ``transform``; any other value is kept
    unchanged.
    """
    batch["pixel_values"] = [
        transform(img) if isinstance(img, Image.Image) else img
        for img in batch["image"]
    ]
    return batch


dataset = dataset.map(preprocess, batched=True)


NameError: name 'dataset' is not defined

**Model Training and parameters adjusting**

In [None]:
# =====================================
# STEP 6: Training Arguments
# =====================================
import os
os.environ["WANDB_DISABLED"] = "true"


training_args = TrainingArguments(
    output_dir="./blip-medvqa",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=5e-5,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # on CPU-only runtimes instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    save_total_limit=1,
    logging_steps=20,
    remove_unused_columns=False,
    # Turn off logging integrations explicitly; the WANDB_DISABLED
    # environment variable is deprecated in favour of report_to.
    report_to="none",
)
# =====================================
# STEP 7: Trainer Setup
# =====================================
# Only a training set is configured: no eval dataset and no custom
# data_collator are passed, so the Trainer falls back to its defaults.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
)

# =====================================
# STEP 8: Train the Model
# =====================================
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


RuntimeError: Could not infer dtype of JpegImageFile

In [None]:
# =====================================
# STEP 9: Save Model & Processor
# =====================================
# Save model and processor into the SAME directory so the fine-tuned
# checkpoint can be reloaded from a single path.
model.save_pretrained("./blip-medvqa_mymodel")
processor.save_pretrained("./blip-medvqa_mymodel")


**Gradio GUI Testing**

In [None]:
# =====================================
# STEP 10: Inference Function
# =====================================
def answer_question(image, question):
    """Answer a question about an image with the loaded BLIP model.

    Args:
        image: PIL image supplied by the Gradio image component, or ``None``
            when the user submitted without uploading an image.
        question: Question text.

    Returns:
        str: The decoded model output, or a prompt to upload an image when
        ``image`` is ``None``.
    """
    # Gradio passes None for an empty image input; without this guard the
    # call would fail on image.convert().
    if image is None:
        return "Please upload an image."

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = processor(images=image.convert("RGB"), text=question, return_tensors="pt").to(device)
    generated = model.generate(**inputs, max_length=50)
    return processor.decode(generated[0], skip_special_tokens=True)

# =====================================
# STEP 11: Gradio Interface
# =====================================
# Two inputs (image + question) mapped to a single text output.
interface = gr.Interface(
    fn=answer_question,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Ask a medical question")],
    outputs=gr.Textbox(label="Model's Answer"),
    # The emoji in the title was stored as mis-decoded bytes; restored here.
    title="🧠 Medical Visual Question Answering (BLIP-MedVQA)",
    description="Upload a radiology image (e.g., X-ray, MRI) and ask a question. The model will provide an answer."
)

# share=True exposes the app through a temporary public URL.
interface.launch(share=True)


***New code after fixing errors***

In [None]:
!pip install transformers datasets accelerate bitsandbytes gradio pillow

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [None]:
# =====================================
# STEP 1: Imports
# =====================================
from datasets import load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments
from PIL import Image
from torchvision import transforms
import torch
import gradio as gr
import gc

In [None]:
# =====================================
# STEP 2: Disable wandb logging (optional)
# =====================================
import os
os.environ["WANDB_DISABLED"] = "true"

# =====================================
# STEP 3: Load SLAKE Dataset (limit to 1000 rows)
# =====================================
# Only the first 1,000 rows of the train split are requested.
dataset = load_dataset("jspetrisko/slake-simplified", split="train[:1000]")

# Show the available columns and one example row.
print("Columns:", dataset.column_names)
print("Sample:", dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Columns: ['image', 'query', 'answers']
Sample: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=256x256 at 0x7FF1ED4870E0>, 'query': 'What modality is used to take this image?', 'answers': 'MRI'}


In [None]:
# =====================================
# STEP 4: Define Image Preprocessing
# =====================================
# Resize to 224x224 and convert the PIL image to a tensor.
# NOTE(review): MedVQADataset below passes the raw item["image"] to the BLIP
# processor, so this transform looks redundant for training — confirm.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

def preprocess(example):
    """Add the text fields that the training dataset wrapper reads.

    Copies ``query`` to ``text`` and ``answers`` to ``labels_text``. The
    ``image`` column is left untouched.

    Args:
        example: A single dataset row.

    Returns:
        dict: The same row with ``text`` and ``labels_text`` added.
    """
    # No pixel tensor is stored here: MedVQADataset runs the BLIP processor on
    # the raw image at access time, so a serialized 3x224x224 float tensor per
    # row would only inflate the mapped dataset without ever being read.
    example["text"] = example["query"]
    example["labels_text"] = example["answers"]
    return example

processed_dataset = dataset.map(preprocess, batched=False)  # rows are mapped one at a time

# Drop the reference to the original dataset and run a garbage collection.
# NOTE(review): after `del dataset` this cell cannot be re-run unless the
# dataset-loading cell is executed again first.
del dataset
# gc.collect() is the cell's last expression, so its return value is shown as
# the cell output.
gc.collect()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

335

In [None]:
# =====================================
# STEP 5: Load Pretrained BLIP Model
# =====================================
# Load the processor and model, placing the model on the GPU when available.
# NOTE(review): the next cell loads the same checkpoint again as BlipWrapper
# and rebinds `model`, so the instance created here is only used if that cell
# is skipped — confirm whether this load is still needed.
model_name = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:


# =====================================
# STEP 6: Prepare TrainingArguments
# =====================================
training_args = TrainingArguments(
    output_dir="./blip-medvqa",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    learning_rate=5e-5,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # on CPU-only runtimes instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    save_total_limit=1,
    # Keep every key the dataset returns; the Trainer must not filter them
    # against the model's forward() signature.
    remove_unused_columns=False,
    # Turn off logging integrations explicitly; the WANDB_DISABLED
    # environment variable is deprecated in favour of report_to.
    report_to="none",
)

from transformers import BlipForConditionalGeneration

class BlipWrapper(BlipForConditionalGeneration):
    """BLIP model whose ``forward`` discards the ``num_items_in_batch`` keyword.

    Every other positional and keyword argument is forwarded unchanged to the
    parent implementation.
    """

    def forward(self, *args, **kwargs):
        accepted = {
            key: value
            for key, value in kwargs.items()
            if key != "num_items_in_batch"
        }
        return super().forward(*args, **accepted)


model = BlipWrapper.from_pretrained("Salesforce/blip-vqa-base")
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# =====================================
# STEP 7: Prepare Dataset for Trainer
# =====================================
from torch.utils.data import Dataset

class MedVQADataset(Dataset):
    """Torch dataset that encodes (image, question, answer) rows on access.

    Args:
        dataset: Indexable collection of rows with ``image``, ``text`` and
            ``labels_text`` entries.
        processor: Callable processor that also exposes a ``tokenizer``.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the encoded features for row ``idx``.

        The result holds every tensor produced by the processor plus a
        ``labels`` tensor, each without the leading batch dimension.
        """
        item = self.dataset[idx]
        tokenizer = self.processor.tokenizer

        encoding = self.processor(images=item["image"],
                                  text=item["text"],
                                  return_tensors="pt",
                                  padding="max_length",
                                  truncation=True)
        labels = tokenizer(item["labels_text"],
                           padding="max_length",
                           truncation=True,
                           return_tensors="pt").input_ids
        # Padding positions must not contribute to the loss; -100 is the index
        # that cross-entropy ignores.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        # squeeze(0) removes only the batch dimension added by return_tensors.
        features = {name: tensor.squeeze(0) for name, tensor in encoding.items()}
        features["labels"] = labels.squeeze(0)
        return features

# Wrap the mapped dataset so each row is encoded by the processor on access.
train_dataset = MedVQADataset(processed_dataset, processor)

# =====================================
# STEP 8: Trainer Setup
# =====================================
# Only a training set is configured; no eval dataset or custom collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# =====================================
# STEP 9: Train Model
# =====================================
trainer.train()

NameError: name 'TrainingArguments' is not defined

In [None]:
# =====================================
# STEP 10: Save Model & Processor
# =====================================
# Model and processor are written to the same directory.
model.save_pretrained("./blip-medvqa_mymodel")
processor.save_pretrained("./blip-medvqa_mymodel")




In [None]:
# =====================================
# STEP 11: Inference Function
# =====================================
def answer_question(image, question):
    """Answer a question about an image with the loaded BLIP model.

    Args:
        image: PIL image supplied by the Gradio image component, or ``None``
            when the user submitted without uploading an image.
        question: Question text.

    Returns:
        str: The decoded model output, or a prompt to upload an image when
        ``image`` is ``None``.
    """
    # Gradio passes None for an empty image input; without this guard the
    # call would fail on image.convert().
    if image is None:
        return "Please upload an image."

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = processor(images=image.convert("RGB"), text=question, return_tensors="pt").to(device)
    generated = model.generate(**inputs, max_length=50)
    return processor.decode(generated[0], skip_special_tokens=True)

# =====================================
# STEP 12: Gradio Interface
# =====================================
# Two inputs (image + question) mapped to a single text output.
interface = gr.Interface(
    fn=answer_question,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Ask a medical question")],
    outputs=gr.Textbox(label="Model Answer"),
    # The emoji in the title was stored as mis-decoded bytes; restored here.
    title="🧠 Medical Visual Question Answering (BLIP-MedVQA)",
    description="Upload a radiology image (e.g., X-ray, MRI) and ask a question. The model will provide an answer."
)

# share=True exposes the app through a temporary public URL.
interface.launch(share=True)

**Gradio GUI for the pretrained model (demo)**

In [None]:
pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-xjwavhc8
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-xjwavhc8
  Resolved https://github.com/huggingface/transformers.git to commit bb65d2d953a512609a86727b6de64035717b1d45
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<2.0,>=1.0.0 (from transformers==5.0.0.dev0)
  Downloading huggingface_hub-1.1.1-py3-none-any.whl.metadata (13 kB)
Collecting typer-slim (from transformers==5.0.0.dev0)
  Downloading typer_slim-0.20.0-py3-none-any.whl.metadata (16 kB)
Downloading huggingface_hub-1.1.1-py3-none-any.whl (514 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [None]:


from transformers import BlipProcessor, BlipForQuestionAnswering
import gradio as gr
from PIL import Image
import torch

# Publicly available model checkpoint.
model_name = "Salesforce/blip-vqa-base"

# Processor and question-answering model come from the same checkpoint; the
# model is placed on the GPU when one is available.
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForQuestionAnswering.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

def med_vqa(image, question):
    """Answer a question about an uploaded image with the BLIP VQA model.

    Args:
        image: PIL image from the Gradio image component, or ``None`` when
            nothing was uploaded.
        question: Question text entered by the user.

    Returns:
        str: The decoded answer, or a prompt to upload an image when
        ``image`` is ``None``.
    """
    # Gradio passes None for an empty image input; return a message instead
    # of letting the processor fail on it.
    if image is None:
        return "Please upload an image."

    # Prefix the question with a fixed medical-context sentence.
    prompt = f"This is a medical image. {question}"
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    return processor.decode(outputs[0], skip_special_tokens=True)

# Gradio GUI
demo = gr.Interface(
    fn=med_vqa,
    inputs=[
        gr.Image(type="pil", label="Upload Medical Image"),
        gr.Textbox(label="Ask a Medical Question")
    ],
    outputs=gr.Textbox(label="Answer"),
    # The emoji in the title was stored as mis-decoded bytes; restored here.
    title="🩺 Medical Visual QA Demo (BLIP Base)",
    description="Upload an X-ray or CT image and ask a medical question. Model: Salesforce/blip-vqa-base"
)

# share=True exposes the app through a temporary public URL.
demo.launch(share=True)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://765db3d49bb44fd8cf.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# **Testing a new, more general descriptive VLM (20/11/25)**

In [None]:
!pip install transformers torch torchvision pillow gradio --quiet

In [None]:


from transformers import BlipProcessor, BlipForConditionalGeneration
import gradio as gr
from PIL import Image
import torch

# Use the GPU when available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLIP image captioning checkpoint and move the model to `device`.
# Note: this rebinds the notebook-level `processor` and `model` names.
caption_model_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(caption_model_name)
model = BlipForConditionalGeneration.from_pretrained(caption_model_name).to(device)



Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
def describe_image(image):
    """Produce a caption for an uploaded image, seeded with a fixed prompt.

    Args:
        image: PIL image from the Gradio image component, or ``None`` when
            nothing was uploaded.

    Returns:
        str: The decoded caption, or a prompt to upload an image when
        ``image`` is ``None``.
    """
    # Gradio passes None for an empty image input; answer with the same
    # message the other captioning function in this notebook uses.
    if image is None:
        return "Please upload an image."

    # Text prefix given to the model together with the image.
    prompt = "A medical scan shows: "
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    out_ids = model.generate(**inputs, max_new_tokens=50)
    return processor.decode(out_ids[0], skip_special_tokens=True)

# Gradio app for demonstration
demo = gr.Interface(
    fn=describe_image,
    inputs=[gr.Image(type="pil", label="Upload Medical Image")],
    outputs=gr.Textbox(label="Description of Image"),
    title="Medical Image Description Demo",
    # The dash in the description was stored as mis-decoded bytes; restored here.
    description="Upload a radiology image (X-ray, CT, etc.) — the model will describe what it sees."
)

# share=True exposes the app through a temporary public URL.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9ddf507be24308fc0a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr
from transformers import AutoProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# ---------------------------
# Load Model
# ---------------------------
# Hub checkpoint used for captioning in this cell.
model_name = "WafaaFraih/blip-roco-radiology-captioning"

# Use the GPU when available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Note: this rebinds the notebook-level `processor` and `model` names.
processor = AutoProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)


# ---------------------------
# Caption Function
# ---------------------------
def describe_medical_image(image):
    """Generate a caption for an uploaded image.

    Args:
        image: PIL image from the Gradio image component, or ``None`` when
            nothing was uploaded.

    Returns:
        str: The first decoded caption with surrounding whitespace removed,
        or a prompt to upload an image when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    # Beam search with a repetition penalty, capped at 70 new tokens.
    generation_kwargs = {
        "max_new_tokens": 70,
        "num_beams": 5,
        "repetition_penalty": 1.2,
    }
    model_inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(**model_inputs, **generation_kwargs)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0].strip()





preprocessor_config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [None]:

# ---------------------------
# Gradio UI
# ---------------------------
demo = gr.Interface(
    fn=describe_medical_image,
    inputs=gr.Image(type="pil", label="Upload Medical Image (X-ray, CT, MRI, etc.)"),
    outputs=gr.Textbox(label="AI-Generated Medical Description"),
    # The emoji in the title was stored as mis-decoded bytes; restored here.
    title="🩺 Medical Image Captioning (Radiology)",
    description=(
        "Upload any medical image and the model will generate a radiology-style "
        "caption trained on the ROCO dataset. Model: WafaaFraih/blip-roco-radiology-captioning"
    ),
    # NOTE(review): these paths are relative to the working directory —
    # confirm both files exist there before launching.
    examples=[
        ["example_xray.png"],
        ["example_ct.png"]
    ],
)

# share=True exposes the app through a temporary public URL.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e5af7d81fd2d2138b0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# **Medical caption - QA + Explanation**

In [None]:
import gradio as gr
from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
from PIL import Image
import torch

# ---------------------------
# Load Captioning Model (BLIP ROCO Radiology)
# ---------------------------
# Hub checkpoint for the captioning stage of the pipeline.
caption_model_name = "WafaaFraih/blip-roco-radiology-captioning"
# Use the GPU when available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Dedicated names keep the captioning model separate from the text-expansion
# model loaded in the next cell.
caption_processor = AutoProcessor.from_pretrained(caption_model_name)
caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_name).to(device)

In [None]:
# ---------------------------
# Load Text Expansion LLM (Small, Open-Source)
# ---------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

expand_model_name = "tiiuae/Falcon-H1-0.5B-Instruct"

expand_tokenizer = AutoTokenizer.from_pretrained(expand_model_name)
expand_model = AutoModelForCausalLM.from_pretrained(
    expand_model_name,
    # `torch_dtype` is deprecated in the installed transformers version (see
    # the warning in this cell's output); `dtype` is the replacement keyword.
    dtype=torch.bfloat16,
    # Device placement is delegated to the library.
    device_map="auto"
)

def expand_caption_to_paragraph(caption: str) -> str:
    """Expand a short radiology caption into a paragraph with the text LLM.

    Args:
        caption: Caption text to expand.

    Returns:
        str: Only the newly generated text; the instruction prompt is not
        included.
    """
    prompt = (
        "You are a medical expert. Expand this radiology caption into a clinically relevant paragraph:\n"
        f"Caption: {caption}\n\nExplanation:"
    )
    inputs = expand_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(expand_model.device)
    outputs = expand_model.generate(
        **inputs,
        max_new_tokens=180,
        temperature=0.7,
        top_p=0.95,
        do_sample=True
    )
    # A causal LM returns the prompt tokens followed by the continuation;
    # drop the prompt so callers receive just the explanation.
    prompt_length = inputs["input_ids"].shape[1]
    return expand_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()






# ---------------------------
# Generate Caption
# ---------------------------
def generate_caption(image):
    """Return the captioning model's caption for ``image``.

    The image is encoded with ``caption_processor``, decoded with beam search
    (5 beams, repetition penalty 1.2, at most 70 new tokens), and the first
    decoded sequence is returned with surrounding whitespace removed.
    """
    generation_kwargs = {
        "max_new_tokens": 70,
        "num_beams": 5,
        "repetition_penalty": 1.2,
    }
    model_inputs = caption_processor(images=image, return_tensors="pt").to(device)
    generated_ids = caption_model.generate(**model_inputs, **generation_kwargs)
    decoded = caption_processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0].strip()

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d


generation_config.json:   0%|          | 0.00/145 [00:00<?, ?B/s]

In [None]:




# ---------------------------
# Expand Caption into a 4–6 Line Medical Paragraph
# ---------------------------
def expand_caption(caption):
    """Rewrite a caption as a longer clinical paragraph with the text LLM.

    Args:
        caption: Caption text to expand.

    Returns:
        str: Only the newly generated paragraph; the instruction prompt is
        not included.
    """
    prompt = (
        "Rewrite the following medical image caption into a detailed 4-6 sentence "
        "clinical paragraph with medically relevant explanations:\n\n"
        f"Caption: {caption}\n\nExpanded Explanation:"
    )

    # The model was loaded with device_map="auto", so follow the model's own
    # device rather than the captioning model's `device` string.
    inputs = expand_tokenizer(prompt, return_tensors="pt").to(expand_model.device)
    # Plain beam search: `temperature` only applies when sampling is enabled,
    # so it is not passed here.
    outputs = expand_model.generate(
        **inputs,
        max_new_tokens=200,
        num_beams=5,
    )
    # A causal LM returns the prompt tokens followed by the continuation;
    # drop the prompt so the UI shows just the explanation.
    prompt_length = inputs["input_ids"].shape[1]
    paragraph = expand_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return paragraph.strip()

# ---------------------------
# Pipeline: Image → Caption → Expanded Paragraph
# ---------------------------
def full_pipeline(image):
    """Caption an image, then expand the caption into a paragraph.

    Returns:
        tuple: ``(caption, expanded)``. When ``image`` is ``None`` the first
        element is a prompt to upload an image and the second is empty.
    """
    if image is not None:
        short_caption = generate_caption(image)
        return short_caption, expand_caption(short_caption)

    return "Please upload an image.", ""



In [None]:
# ---------------------------
# Gradio Interface
# ---------------------------

import gradio as gr
from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
from PIL import Image
import torch
# One image input mapped to two text outputs (caption, expanded paragraph).
demo = gr.Interface(
    fn=full_pipeline,
    inputs=gr.Image(type="pil", label="Upload Medical Image (X-ray, CT, MRI)"),
    outputs=[
        gr.Textbox(label="Model Caption (BLIP-Radiology)"),
        # The dash and emoji below were stored as mis-decoded bytes; restored here.
        gr.Textbox(label="Expanded 4–6 Line Medical Explanation")
    ],
    title="🩺 Medical Image Explanation AI",
    description=(
        "This tool uses BLIP Radiology Captioning to detect medical findings and "
        "new model to expand it into a clinical paragraph explanation."
    ),
)

# debug=True keeps the cell running so errors and logs are shown in the notebook.
demo.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://4060687ba99b03b593.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/protocols/http/h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/fastapi/applications.py", line 1134, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py",

In [None]:
# ---------------------------
# Gradio Interface
# ---------------------------
# One image input mapped to two text outputs (caption, expanded paragraph).
demo = gr.Interface(
    fn=full_pipeline,
    inputs=gr.Image(type="pil", label="Upload Medical Image (X-ray, CT, MRI)"),
    outputs=[
        gr.Textbox(label="Model Caption (BLIP-Radiology)"),
        # The dash and emoji below were stored as mis-decoded bytes; restored here.
        gr.Textbox(label="Expanded 4–6 Line Medical Explanation")
    ],
    title="🩺 Medical Image Explanation AI",
    # The expansion model loaded in this notebook is Falcon-H1-0.5B-Instruct,
    # not FLAN-T5, so the description names the model actually used.
    description=(
        "This tool uses BLIP Radiology Captioning to detect medical findings and "
        "Falcon-H1-0.5B-Instruct to expand it into a clinical paragraph explanation."
    ),
)

# share=True exposes the app through a temporary public URL.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fafd52e1405bce4f1a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


