In [1]:
!pip install decord

Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: decord
Successfully installed decord-0.6.0


In [2]:
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoConfig

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

def split_model(model_name):
    device_map = {}
    world_size = torch.cuda.device_count()
    config = AutoConfig.from_pretrained("OpenGVLab/InternVL3-2B", trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

# If you set `load_in_8bit=True`, you will need two 80GB GPUs.
# If you set `load_in_8bit=False`, you will need at least three 80GB GPUs.
path = 'OpenGVLab/InternVL3-2B'
device_map = split_model('InternVL3-2B')
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    use_flash_attn=False,
    trust_remote_code=True,
    device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

config.json:   0%|          | 0.00/6.33k [00:00<?, ?B/s]

configuration_internvl_chat.py:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

configuration_intern_vit.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3-2B:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3-2B:
- configuration_internvl_chat.py
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internvl_chat.py:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

modeling_intern_vit.py:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3-2B:
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


conversation.py:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3-2B:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3-2B:
- modeling_internvl_chat.py
- modeling_intern_vit.py
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
2025-05-27 09:47:17.259303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748339237.469625      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:174833

FlashAttention2 is not installed.


model.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.38M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [3]:
# set the max number of tiles in `max_num`
# pixel_values = load_image('/kaggle/input/test-image/basketball.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

In [4]:
# prompt = """You are a vision-language model that, given a food image and a noisy or vague dish title, generates a concise 2–3-step cooking instruction capturing the essence of the dish.

# Few-Shot Examples

# Example 1:
# Image: <image>
# Vague Title: “green swirl bake”
# Instruction: “Boil pasta with peas and asparagus until just tender, then drain (reserving some pasta water). Sauté seasoned chicken pieces until golden, then set aside. Make a creamy sauce in the same pot, toss in pasta, veggies, chicken, transfer to a baking dish, top with cheese and breadcrumbs, and bake until bubbly.” 

# Example 2:
# Image: <image>
# Vague Title: “noodly tangle”
# Instruction: “Cook and chill the macaroni until firm, then rinse under cold water. Whisk together mayo, sugar, vinegar, mustard and seasonings, toss with pasta and chopped veggies. Refrigerate for several hours (or overnight) to meld flavors before serving.” 

# Example 3:
# Image: <image>
# Vague Title: “creamy pink layer”
# Instruction: “Bake a buttery pretzel crust until golden and let it cool. Blend cream cheese with sugar and whipped topping, spread over crust and chill. Pour strawberry-gelatin mixture with berries on top, then refrigerate until set.” 

# Example 4:
# Image: <image>
# Vague Title: “spiced shred stack”
# Instruction: “Simmer a whole chicken in seasoned water until cooked through, then shred and reserve the broth. Reduce the broth to concentrate flavor, return shredded chicken to simmer until saucy. Season to taste with extra spices and lemon juice, then stack in bread or wraps.” 

# Example 5:
# Image: <image>
# Vague Title: “tart berry crumble”
# Instruction: “Toss strawberry and rhubarb with sugar, cornstarch, and salt, then fill a baking dish. Mix flour, almonds, sugar, butter, and zest into a crumbly topping and chill briefly. Cover the fruit with topping and bake until bubbly and golden, then rest before serving.” 

# Example 6:
# Image: <image>
# Vague Title: “veggie garden layers”
# Instruction: “Press crescent dough into a pan, bake until golden, then cool completely. Whip together sour cream, cream cheese, ranch mix and seasonings, spread over crust. Top with chopped veggies, chill until set, then cut into squares to serve.” 

# Example 7:
# Image: <image>
# Vague Title: “charred sea pops”
# Instruction: “Mash garlic with salt, spices, oil and lemon juice into a paste, then toss with shrimp. Grill shrimp 2–3 minutes per side until opaque. Plate with lemon wedges and serve hot.” 

# Example 8:
# Image: <image>
# Vague Title: “folded veggie bundle”
# Instruction: “Blend mayo, yogurt and chipotles into a spicy spread. Warm tortillas, layer half the veggies, cheese, bacon, onion, tomato and chicken, then fold edges and roll tight.” 

# Example 9:
# Image: <image>
# Vague Title: “layered olive loaf”
# Instruction: “Mix chopped olives, veggies, seasonings and oils into a jar; refrigerate overnight. Hollow out loaves, spread both halves with olive salad, layer meats and cheeses, reassemble and slice.” 

# Example 10:
# Image: <image>
# Vague Title: “poppy crunch tube”
# Instruction: “Poach the hot dog, steam the poppy-seed bun. Nestle the dog in the bun and pile on mustard, relish, onion, tomato, pickle spear, sport peppers and a dash of celery salt—no ketchup.” 

# Example 11:
# Image: <image>
# Vague Title: “spiced beef bun”
# Instruction: “Mix mayo with a teaspoon of Cajun seasoning for the spread, then season beef with the rest of the spice mix, onion, jalapeño, garlic and Worcestershire; form patties. Grill burgers 5 minutes per side, topping with pepper jack at the end. Assemble on buns with lettuce, tomato and Cajun mayo.” 

# Example 12:
# Image: <image>
# Vague Title: “fiery crunch pieces”
# Instruction: “Marinate chicken in buttermilk, brine and hot sauce for a few hours. Dredge in seasoned flour twice, fry at 325 °F until chicken reaches 160 °F inside, then drain. Brush with cayenne-butter sauce and serve immediately.” 

# New Input
# Image: <image>
# Vague Title: “cheesy wheel”
# Instruction:"""

In [5]:
prompt = """You are a vision-language model that, given a food image and a noisy or vague dish title, generates a concise 2–3-step cooking instruction capturing the essence of the dish.

Few-Shot Examples

Example 1:
Image: <image>
Vague Title: “green swirl bake”
Instruction: “Boil pasta with peas and asparagus until just tender, then drain (reserving some pasta water). Sauté seasoned chicken pieces until golden, then set aside. Make a creamy sauce in the same pot, toss in pasta, veggies, chicken, transfer to a baking dish, top with cheese and breadcrumbs, and bake until bubbly.” 

Example 2:
Image: <image>
Vague Title: “noodly tangle”
Instruction: “Cook and chill the macaroni until firm, then rinse under cold water. Whisk together mayo, sugar, vinegar, mustard and seasonings, toss with pasta and chopped veggies. Refrigerate for several hours (or overnight) to meld flavors before serving.” 

Example 3:
Image: <image>
Vague Title: “creamy pink layer”
Instruction: “Bake a buttery pretzel crust until golden and let it cool. Blend cream cheese with sugar and whipped topping, spread over crust and chill. Pour strawberry-gelatin mixture with berries on top, then refrigerate until set.” 

Example 4:
Image: <image>
Vague Title: “spiced shred stack”
Instruction: “Simmer a whole chicken in seasoned water until cooked through, then shred and reserve the broth. Reduce the broth to concentrate flavor, return shredded chicken to simmer until saucy. Season to taste with extra spices and lemon juice, then stack in bread or wraps.” 

New Input
Image: <image>
Vague Title: “cheesy wheel”
Instruction:"""

In [6]:
# multi-image multi-round conversation, separate images (多图多轮对话，独立图像)
pixel_values1 = load_image('/kaggle/input/recipie-resized-images/recipie_resized_images/Chicken Primavera Pasta Bake Recipe.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('/kaggle/input/recipie-resized-images/recipie_resized_images/Classic Macaroni Salad Recipe with Video.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values3 = load_image('/kaggle/input/recipie-resized-images/recipie_resized_images/Judys Strawberry Pretzel Salad Recipe.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values4 = load_image('/kaggle/input/recipie-resized-images/recipie_resized_images/Pulled Chicken Shawarma Sandwich Recipe.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values5 = load_image('/kaggle/input/recipes-dataset/recipes_output/Strawberry Rhubarb Crumble Recipe/main_image.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values6 = load_image('/kaggle/input/recipes-dataset/recipes_output/Veggie Pizza Recipe/main_image.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values7 = load_image('/kaggle/input/recipes-dataset/recipes_output/Spicy Grilled Shrimp Recipe/main_image.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values8 = load_image('/kaggle/input/recipes-dataset/recipes_output/California Club Chicken Wraps Recipe/main_image.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values9 = load_image('/kaggle/input/recipes-dataset/recipes_output/Real Nawlins Muffuletta Recipe/main_image.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values10 = load_image('/kaggle/input/recipes-dataset/recipes_output/Chicago-Style Hot Dog Recipe/main_image.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values11 = load_image('/kaggle/input/recipes-dataset/recipes_output/Tex-Mex Burger with Cajun Mayo Recipe/main_image.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values12 = load_image('/kaggle/input/recipes-dataset/recipes_output/Chef Johns Nashville Hot Chicken Recipe/main_image.jpg', max_num=12).to(torch.bfloat16).cuda()

pixel_values_input = load_image('/kaggle/input/recipie-resized-images/recipie_resized_images/Campfire Pepperoni Pizza Recipe.jpg', max_num=12).to(torch.bfloat16).cuda()

# pixel_values = torch.cat((pixel_values1, pixel_values2, pixel_values3, pixel_values4, pixel_values5, pixel_values6, pixel_values7, pixel_values8, pixel_values9, pixel_values10, pixel_values11, pixel_values12, pixel_values_input), dim=0)
# num_patches_list = [pixel_values1.size(0), pixel_values2.size(0), pixel_values3.size(0), pixel_values4.size(0), pixel_values5.size(0), pixel_values6.size(0), pixel_values7.size(0), pixel_values8.size(0), pixel_values9.size(0), pixel_values10.size(0), pixel_values11.size(0), pixel_values12.size(0), pixel_values_input.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2, pixel_values3, pixel_values4, pixel_values_input), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0), pixel_values3.size(0), pixel_values4.size(0),pixel_values_input.size(0)]

In [7]:
question = prompt
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.33 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.45 GiB is free. Process 2791 has 12.29 GiB memory in use. Of the allocated memory 11.03 GiB is allocated by PyTorch, and 1.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
torch.cuda.empty_cache()
torch.cuda.ipc_collect()