In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from hulumed_qwen3 import disable_torch_init, model_init, mm_infer
from hulumed_qwen3 import disable_torch_init, model_init, mm_infer
from hulumed_qwen3.model import load_pretrained_model
from hulumed_qwen3.mm_utils import load_images, process_images, load_video, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria
from hulumed_qwen3.model.processor import HulumedProcessor
import os

# Optional: restrict which GPU(s) are visible before the checkpoint is loaded.
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
model_path = "./Model/Hulu-Med-14B"
model_name = get_model_name_from_path(model_path)

# Load the tokenizer, model and image processor from the local checkpoint.
# The second positional argument is passed as None and device_map="auto" is
# forwarded to the loader; `context_len` is returned but not used in this cell.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path,
    None,
    model_name,
    device_map="auto",
)

# Bundle the image processor and tokenizer into the processor used by the
# inference cells.
processor = HulumedProcessor(image_processor, tokenizer)

# Switch the token-compression option off on the loaded model's config.
model.config.use_token_compression = False



In [None]:
# Video demo: load frames from a local clip and ask the model to describe it.
# fps=1 and max_frames=3000 are passed to load_video (presumably the sampling
# rate and an upper bound on frame count -- confirm against load_video).
# `timestamps` is returned alongside the frames but is not used below.
frames, timestamps = load_video("./Inference/demo/1min_demo.mp4", fps=1, max_frames=3000)

# Single-turn request: one video placeholder followed by the text prompt.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video", "num_frames": len(frames)},
            {"type": "text", "text": "Please describe this video in detail."},
        ],
    }
]
modal = "video"

# Name the target device once so the model and its inputs cannot drift apart.
device = "cuda:0"
model = model.to(device)

inputs = processor(
    images=[frames] if modal != "text" else None,
    text=conversation,
    merge_size=2 if modal == "video" else 1,  # 2 for video input, 1 otherwise
    return_tensors="pt",
)

# Move every tensor straight to the target device in a single transfer;
# non-tensor entries are passed through unchanged.
inputs = {
    key: value.to(device) if isinstance(value, torch.Tensor) else value
    for key, value in inputs.items()
}
if "pixel_values" in inputs:
    # Pixel data is cast to bfloat16 -- presumably to match the model weights'
    # dtype; TODO confirm.
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

# Sampled generation (temperature 0.6); EOS doubles as the padding token.
with torch.inference_mode():
    output_ids = model.generate(
        **inputs,
        do_sample=True,
        modals=[modal],
        temperature=0.6,
        max_new_tokens=8192,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the first (only) sequence of the batch and show it.
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)

In [None]:
# 3D medical demo: load slices from a NIfTI volume and ask the model for a
# report. The slices are fed through the same "video" path as a frame sequence.
slices = load_images(
    "./Inference/demo/amos_0013.nii",
    nii_num_slices=160,  # number of slices requested from the volume
)

# Single-turn request: one video placeholder (one frame per slice) followed by
# the text prompt.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video", "num_frames": len(slices)},
            {"type": "text", "text": "This is a medical 3D scenario. Please generate a medical report for the given 3D medical images, including both findings and impressions."},
        ],
    }
]
modal = "video"

# Name the target device once so the model and its inputs cannot drift apart.
device = "cuda:0"
model = model.to(device)

inputs = processor(
    images=[slices] if modal != "text" else None,
    text=conversation,
    merge_size=2 if modal == "video" else 1,  # 2 for video input, 1 otherwise
    return_tensors="pt",
)

# Move every tensor straight to the target device in a single transfer;
# non-tensor entries are passed through unchanged.
inputs = {
    key: value.to(device) if isinstance(value, torch.Tensor) else value
    for key, value in inputs.items()
}
if "pixel_values" in inputs:
    # Pixel data is cast to bfloat16 -- presumably to match the model weights'
    # dtype; TODO confirm.
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

# Sampled generation (temperature 0.6); EOS doubles as the padding token.
with torch.inference_mode():
    output_ids = model.generate(
        **inputs,
        do_sample=True,
        modals=[modal],
        temperature=0.6,
        max_new_tokens=8192,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the first (only) sequence of the batch and show it.
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)

In [None]:
# 2D image demo: load a single picture and ask the model to describe it.
# NOTE: the variable is still called `slices` (carried over from the 3D cell),
# but here it holds whatever load_images returns for one JPEG file.
slices = load_images(
    "./Inference/demo/demo.jpg",
)

# Single-turn request: one image placeholder followed by the text prompt.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image in detail."},
        ],
    }
]
modal = "image"

# Name the target device once so the model and its inputs cannot drift apart.
device = "cuda:0"
model = model.to(device)

inputs = processor(
    images=[slices] if modal != "text" else None,
    text=conversation,
    merge_size=2 if modal == "video" else 1,  # 2 for video input, 1 otherwise
    return_tensors="pt",
)

# Move every tensor straight to the target device in a single transfer;
# non-tensor entries are passed through unchanged.
inputs = {
    key: value.to(device) if isinstance(value, torch.Tensor) else value
    for key, value in inputs.items()
}
if "pixel_values" in inputs:
    # Pixel data is cast to bfloat16 -- presumably to match the model weights'
    # dtype; TODO confirm.
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

# Sampled generation (temperature 0.6); EOS doubles as the padding token.
with torch.inference_mode():
    output_ids = model.generate(
        **inputs,
        do_sample=True,
        modals=[modal],
        temperature=0.6,
        max_new_tokens=8192,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the first (only) sequence of the batch and show it.
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)