# Generative Prompts Exploration

This notebook explores the prompts used to generate synthetic images for the SynthBuster+ and SynthCLIC datasets.

These prompts were created from real images (Raise1K for SynthBuster+, CLIC2020 for SynthCLIC) and then used to generate synthetic versions with various generative models.

## Setup

In [None]:
import textwrap
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_from_disk

## Load Prompt Files

In [None]:
# Read the SynthCLIC prompt table from its parquet file
synthclic_prompts_path = Path("../data/datasets/synthclic/synthclic_prompts.parquet")
synthclic_prompts = pd.read_parquet(synthclic_prompts_path)

# Banner, then row count, column names and the number of rows per split
print("SynthCLIC Prompts", "=" * 60, sep="\n")
print(f"Total prompts: {synthclic_prompts.shape[0]:,}")
print(f"Columns: {synthclic_prompts.columns.to_list()}")
print(f"\nSplit distribution:")
# end="\n\n" leaves one blank line after the table
print(synthclic_prompts['split'].value_counts(), end="\n\n")

In [None]:
# Read the SynthBuster+ prompt table from its parquet file
synthbuster_prompts_path = Path("../data/datasets/synthbuster-plus/synthbuster_plus_prompts.parquet")
synthbuster_prompts = pd.read_parquet(synthbuster_prompts_path)

# Banner, then row count, column names and the number of rows per split
print("SynthBuster+ Prompts", "=" * 60, sep="\n")
print(f"Total prompts: {synthbuster_prompts.shape[0]:,}")
print(f"Columns: {synthbuster_prompts.columns.to_list()}")
print(f"\nSplit distribution:")
# end="\n\n" leaves one blank line after the table
print(synthbuster_prompts['split'].value_counts(), end="\n\n")

## Example Prompts

Let's look at some example prompts from each dataset.

### SynthCLIC Prompts

In [None]:
# Show first 10 prompts from SynthCLIC
print("First 10 SynthCLIC Prompts:")
print("=" * 80)
# Number the rows by position instead of by index label: `label + 1` is only
# correct when the frame carries a default 0-based integer index, and fails
# outright when the index is not numeric.
for position, (_index_label, row) in enumerate(synthclic_prompts.head(10).iterrows(), start=1):
    print(f"\n{position}. Image ID: {row['image_id']}")
    print(f"   Split: {row['split']}")
    print(f"   Size: {row['width']}x{row['height']}")
    print(f"   Prompt: {row['prompt']}")

### SynthBuster+ Prompts

In [None]:
# Show first 10 prompts from SynthBuster+
print("First 10 SynthBuster+ Prompts:")
print("=" * 80)
# Number the rows by position instead of by index label: `label + 1` is only
# correct when the frame carries a default 0-based integer index, and fails
# outright when the index is not numeric.
for position, (_index_label, row) in enumerate(synthbuster_prompts.head(10).iterrows(), start=1):
    print(f"\n{position}. Image ID: {row['image_id']}")
    print(f"   Split: {row['split']}")
    print(f"   Size: {row['width']}x{row['height']}")
    print(f"   Prompt: {row['prompt']}")

## Prompt Statistics

In [None]:
# Add a character-count column to each prompt table
for prompts_df in (synthclic_prompts, synthbuster_prompts):
    prompts_df['prompt_length'] = prompts_df['prompt'].str.len()

# Summary statistics of the prompt lengths, one table per dataset
print("Prompt Length Statistics", "=" * 60, sep="\n")
length_reports = (
    ("\nSynthCLIC:", synthclic_prompts),
    ("\nSynthBuster+:", synthbuster_prompts),
)
for report_heading, prompts_df in length_reports:
    print(report_heading)
    print(prompts_df['prompt_length'].describe())

In [None]:
# Side-by-side histograms of prompt length, one panel per dataset
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, lengths, panel title, extra hist styling); the first panel keeps the
# default bar colour, the second one is drawn in orange
hist_panels = (
    (axes[0], synthclic_prompts['prompt_length'],
     'SynthCLIC Prompt Length Distribution', {}),
    (axes[1], synthbuster_prompts['prompt_length'],
     'SynthBuster+ Prompt Length Distribution', {'color': 'orange'}),
)

for ax, lengths, panel_title, extra_style in hist_panels:
    ax.hist(lengths, bins=30, alpha=0.7, edgecolor='black', **extra_style)
    ax.set_xlabel('Prompt Length (characters)', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## Visualize Prompts with Real Images

Let's load the datasets and show some prompts alongside their corresponding real images.

### SynthCLIC: CLIC2020 Real Images with Prompts

In [None]:
# Load SynthCLIC dataset
synthclic_dataset = load_from_disk("../data/datasets/synthclic")

# Get CLIC2020 real images from train split.
# NOTE: `train_data` is rebound to the SynthBuster+ train split by a later cell.
train_data = synthclic_dataset['train']

# Only the label and source columns are needed to pick the rows, so the
# image_id column is no longer read here.
clic2020_indices = [
    idx
    for idx, (label, source) in enumerate(zip(train_data['label'], train_data['source']))
    if label == 0 and source == 'clic2020'  # Real images from CLIC2020
]

print(f"Found {len(clic2020_indices)} CLIC2020 real images in train split")

In [None]:
# Select up to 6 random CLIC2020 images. The legacy global seed is kept so the
# same images are drawn as before whenever at least 6 are available.
np.random.seed(123)
n_examples = min(6, len(clic2020_indices))
selected_indices = np.random.choice(clic2020_indices, n_examples, replace=False)

# Index the SynthCLIC train split directly: the shared `train_data` name is
# rebound to SynthBuster+ by a later cell, so relying on it here would pair
# these indices with the wrong dataset when cells are run out of order.
synthclic_train = synthclic_dataset['train']

# Create visualization
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# Maximum number of characters per title line
prompt_wrap_length = 90

for ax, idx in zip(axes, selected_indices):
    example = synthclic_train[int(idx)]
    image_id = example['image_id']

    # Get prompt for this image (first match wins; placeholder when missing)
    prompt_row = synthclic_prompts[synthclic_prompts['image_id'] == image_id]
    if len(prompt_row) > 0:
        prompt = prompt_row.iloc[0]['prompt']
    else:
        prompt = "(Prompt not found)"

    # Display image
    ax.imshow(example['image'])
    ax.axis('off')

    # Wrap on word boundaries so words are not cut in half mid-line
    wrapped_prompt = textwrap.fill(prompt, width=prompt_wrap_length)
    ax.set_title(f"ID: {image_id}\n\n{wrapped_prompt}",
                 fontsize=9, pad=20, loc='left', wrap=True)

# Hide panels left empty when fewer than 6 images are available
for ax in axes[n_examples:]:
    ax.axis('off')

plt.suptitle('SynthCLIC: CLIC2020 Real Images with Generation Prompts',
             fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()

# Create the output directory if needed; savefig does not create it
synthclic_figure_path = Path("../docs/images/synthclic_clic2020_real_images_with_prompts.png")
synthclic_figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(synthclic_figure_path, dpi=300)

### SynthBuster+: Raise1K Real Images with Prompts

In [None]:
# Load SynthBuster+ dataset
synthbuster_dataset = load_from_disk("../data/datasets/synthbuster-plus")

# Get Raise1K real images from train split.
# NOTE: this rebinds `train_data`, which the SynthCLIC cells above also assign.
train_data = synthbuster_dataset['train']

# Only the label and source columns are needed to pick the rows, so the
# image_id column is no longer read here.
raise1k_indices = [
    idx
    for idx, (label, source) in enumerate(zip(train_data['label'], train_data['source']))
    if label == 0 and source == 'raise1k'  # Real images from Raise1K
]

print(f"Found {len(raise1k_indices)} Raise1K real images in train split")

In [None]:
# Select up to 6 random Raise1K images. The legacy global seed is kept so the
# same images are drawn as before whenever at least 6 are available.
np.random.seed(123)
n_examples = min(6, len(raise1k_indices))
selected_indices = np.random.choice(raise1k_indices, n_examples, replace=False)

# Index the SynthBuster+ train split directly: the shared `train_data` name is
# also assigned by the SynthCLIC cells, so relying on it here would pair these
# indices with the wrong dataset when cells are run out of order.
synthbuster_train = synthbuster_dataset['train']

# Create visualization
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# Maximum number of characters per title line
prompt_wrap_length = 90

for ax, idx in zip(axes, selected_indices):
    example = synthbuster_train[int(idx)]
    image_id = example['image_id']

    # Get prompt for this image (first match wins; placeholder when missing)
    prompt_row = synthbuster_prompts[synthbuster_prompts['image_id'] == image_id]
    if len(prompt_row) > 0:
        prompt = prompt_row.iloc[0]['prompt']
    else:
        prompt = "(Prompt not found)"

    # Display image
    ax.imshow(example['image'])
    ax.axis('off')

    # Wrap on word boundaries so words are not cut in half mid-line
    wrapped_prompt = textwrap.fill(prompt, width=prompt_wrap_length)
    ax.set_title(f"ID: {image_id}\n\n{wrapped_prompt}",
                 fontsize=9, pad=20, loc='left', wrap=True)

# Hide panels left empty when fewer than 6 images are available
for ax in axes[n_examples:]:
    ax.axis('off')

plt.suptitle('SynthBuster+: Raise1K Real Images with Generation Prompts',
             fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()

# Create the output directory if needed; savefig does not create it
synthbuster_figure_path = Path("../docs/images/synthbuster_raise1k_real_images_with_prompts.png")
synthbuster_figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(synthbuster_figure_path, dpi=300)

## Summary

This notebook demonstrates:
1. The prompts used to generate synthetic images for both datasets
2. Prompt statistics (length distribution, etc.)
3. Visual examples of real images paired with the prompts used to generate their synthetic counterparts

These prompts were carefully crafted to describe the content of real photographs, enabling the generation of semantically paired synthetic images for more meaningful evaluation of detection models.