In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import csv
from tqdm import tqdm
import json

In [3]:
# Single source of truth for the checkpoint, so the model weights and the
# processor are always loaded from the same repository.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

# NOTE(review): no dtype is passed here, so the library default precision is
# used. Consider passing torch_dtype="auto" to load the checkpoint's native
# precision and reduce memory use -- confirm against the installed
# transformers version before changing, since it alters numerical results.
qwen2VL = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto")
qwen_processor = AutoProcessor.from_pretrained(MODEL_ID)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
def qwen2vl_text_only_gen(question, max_new_tokens=64, top_k=50, top_p=0.9, temperature=0.8):
    """Generate one sampled reply from Qwen2-VL for a text-only question.

    Uses the notebook-level ``qwen2VL`` model and ``qwen_processor``. The
    question is sent as a single-turn user message with no image attached.
    Because sampling is enabled, repeated calls with the same question can
    return different text.

    Args:
        question: Text of the user message.
        max_new_tokens: Maximum number of tokens to generate for the reply.
        top_k: Sample only from the ``top_k`` most likely next tokens.
        top_p: Nucleus-sampling cumulative probability cutoff.
        temperature: Softmax temperature; values below 1.0 make sampling
            less random, values above 1.0 make it more random.

    Returns:
        The decoded reply as a string, with special tokens removed.
    """
    conversation = [
        {
            "role": "user",
            "content": [{"type": "text", "text": question}],
        }
    ]

    # Render the conversation into the model's chat format, ending with the
    # assistant turn marker so generation starts at the reply.
    text_prompt = qwen_processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = qwen_processor(text=[text_prompt], images=None, padding=True, return_tensors="pt")
    # Follow the model's own device instead of hard-coding 'cuda': the model is
    # loaded with device_map="auto", so its placement is decided at load time.
    inputs = inputs.to(qwen2VL.device)

    # Sampled (non-deterministic) decoding.
    output_ids = qwen2VL.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
    )

    # Each returned sequence starts with the prompt tokens; slice them off so
    # only the newly generated continuation is decoded.
    generated_ids = [
        full_sequence[len(prompt_sequence):]
        for prompt_sequence, full_sequence in zip(inputs.input_ids, output_ids)
    ]
    output_text = qwen_processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return output_text[0]

In [7]:
import json
from tqdm import tqdm

output_file = "prompts.json"
objects_file = "interesting_objects_v3.txt"
gen_types = ["easy", "medium", "hard"]

# Instruction templates keyed by difficulty; "{obj}" is filled in per object.
base_prompts = {
    "easy": "Generate a very short prompt for an image that includes the given object. Generate the prompt and nothing else. Object: {obj}. Prompt:",
    "medium": "Generate a description for an image that includes the given object along with other objects that can be with it. Generate the prompt and nothing else. Object: {obj}. Prompt:",
    "hard": "Generate a prompt for a surreal or imaginary scenario for an image that includes the given object. Generate the prompt and nothing else. Object: {obj}. Prompt:",
}

# Read one object name per line. Blank lines are skipped so that no prompts
# are generated (and no "" key is stored) for an empty object name.
with open(objects_file, "r", encoding="utf-8") as f:
    objects = [line.strip() for line in f if line.strip()]

# Maps each object name to a list of {"type": ..., "text": ...} entries.
prompts_dict = {}

# Process objects with tqdm progress bar
for obj in tqdm(objects, desc="Generating Prompts", unit="obj"):
    prompts_dict[obj] = []

    for prompt_type, prompt in base_prompts.items():
        formatted_prompt = prompt.format(obj=obj)
        generated_prompt = qwen2vl_text_only_gen(formatted_prompt)

        # Store prompt type along with the generated text; double quotes are
        # removed from the model output before saving.
        prompts_dict[obj].append({
            "type": prompt_type,
            "text": generated_prompt.replace('"', "")
        })

    # Checkpoint: rewrite the JSON file after every object so an interrupted
    # run keeps the results for all objects completed so far.
    with open(output_file, "w", encoding="utf-8") as jsonfile:
        json.dump(prompts_dict, jsonfile, indent=4)

print("All prompts generated and saved successfully!")

Generating Prompts: 100%|██████████| 159/159 [07:52<00:00,  2.97s/obj]

All prompts generated and saved successfully!





In [8]:
import json

# File containing the generated prompts
output_file = "prompts.json"

# Number of objects to display; printing every entry floods the cell output
# and bloats the saved notebook. Raise this to inspect more entries.
PREVIEW_COUNT = 5

# Load the JSON data
with open(output_file, "r", encoding="utf-8") as jsonfile:
    prompts_dict = json.load(jsonfile)

# Print a size summary followed by the first few entries in file order
shown_count = min(PREVIEW_COUNT, len(prompts_dict))
print(f"Loaded prompts for {len(prompts_dict)} objects; showing the first {shown_count}:")
preview = dict(list(prompts_dict.items())[:PREVIEW_COUNT])
print(json.dumps(preview, indent=4))

{
    "food": [
        {
            "type": "easy",
            "text": "A plate of food sits on a table, inviting someone to indulge in a delicious meal."
        },
        {
            "type": "medium",
            "text": "A plate of delicious food with a fork and knife on the side."
        },
        {
            "type": "hard",
            "text": "In a dreamlike landscape, a floating, glowing hamburger with a sizzling patty and dripping condiments dances in the air, surrounded by shimmering, liquid-like vegetables and fruits."
        }
    ],
    "part": [
        {
            "type": "easy",
            "text": "Close-up shot of a metal part in a workshop."
        },
        {
            "type": "medium",
            "text": "A part of a larger object surrounded by various tools and materials for assembly or repair."
        },
        {
            "type": "hard",
            "text": "A part of a human body, floating in a dreamscape, surrounded by vibrant colors and s