In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch
import torch._dynamo.config

# Set TorchDynamo's recompile limit to 512 for this session.
# NOTE(review): presumably needed because the patched Gemma 3n model triggers
# many recompilations — confirm whether the default limit is actually hit.
torch._dynamo.config.recompile_limit = 512

# Checkpoint to load: the fine-tuned garbage-classification model.
# "unsloth/gemma-3n-E2B-it" can be substituted to try the base instruct model.
MODEL_ID = "qizunlee/gemma3n_E4B_it_ft_3RGarbageClassification"

# `from_pretrained` returns two objects; the second is bound to `tokenizer`
# and is what later cells use to build the chat prompt and the model inputs.
model, tokenizer = FastVisionModel.from_pretrained(
    MODEL_ID,
    load_in_4bit=True,  # 4-bit weights to reduce memory use; False gives 16-bit LoRA
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
    # token="",  # optional access token (left disabled)
)

==((====))==  Unsloth 2025.7.11: Fast Gemma3N patching. Transformers: 4.54.1.
   \\   /|    NVIDIA A100 80GB PCIe. Num GPUs = 1. Max memory: 79.151 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
from PIL import Image
from transformers import TextStreamer

# Switch the model loaded in the previous cell into Unsloth's inference mode.
FastVisionModel.for_inference(model) # Enable for inference!

# Instruction sent together with the image. It asks for a JSON object with a
# single 'material' array whose entries carry 'part_name' and 'answer'.
PROMPT_FOR_VISION = (
    "You are a garbage classification assistant. Based on the image, identify and classify all distinct parts of the object. "
    "For each part, determine the type of garbage from the following options: A: Cardboard, B: Glass, C: Metal, D: Paper, E: Plastic, F: Trash. "
    "Your response must be in a JSON format. The JSON should contain a single key, 'material', which holds an array of objects. "
    "Each object in the array must have two keys: 'part_name' (a brief description of the item) and 'answer' (the classification from the provided options, in the format 'A: Cardboard'). "
    "If the image contains multiple distinct parts made of different materials, list each part as a separate object in the 'material' array. "
    "For example, if the image shows a paper coffee cup with a plastic lid, you should output two separate objects in the array. "
    "The cup should be classified as 'D: Paper' and the lid as 'E: Plastic'. "
    "If a part is not classified into a specific category, consider it as 'F: Trash'."
)

# Tunables for this cell.
image_path = "WhatsApp Image 2025-08-01 at 23.50.15_38d0037a.jpg"
IMAGE_SIZE = (512, 512)  # (width, height) the photo is resized to before preprocessing
MAX_NEW_TOKENS = 128     # generation budget; raise it if the JSON answer gets cut off

# Open the file inside a context manager so the handle is released immediately.
# convert("RGB") already normalises grayscale / palette / RGBA inputs and returns
# a new in-memory image, so no separate mode check is needed afterwards.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB").resize(IMAGE_SIZE)

# Single user turn: an image placeholder followed by the text instruction.
# The actual pixels are supplied separately in the `tokenizer(...)` call below.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": PROMPT_FOR_VISION}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)

# Build the model inputs from the image and the templated prompt.
# add_special_tokens=False: presumably the chat template already emitted the
# special tokens — TODO confirm against the template.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# Stream the decoded answer to the cell output as it is generated, without
# echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

# do_sample=True is passed explicitly: temperature / top_p / top_k are sampling
# parameters and are only honoured when sampling is enabled, so this no longer
# depends on whatever the checkpoint's stored generation config says.
# The return value is discarded because the streamer already prints the text.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = MAX_NEW_TOKENS,
                   use_cache = True, do_sample = True,
                   temperature = 1.0, top_p = 0.95, top_k = 64)

```json
{
  "material": [
    {
      "part_name": "Coffee cup body",
      "answer": "A: Cardboard"
    },
    {
      "part_name": "Lid",
      "answer": "E: Plastic"
    }
  ]
}
```<end_of_turn>
