# Install Dependencies and Load CLIP

In [1]:
!pip install diffusers==0.11.1
!pip install transformers scipy ftfy accelerate

Collecting diffusers==0.11.1
  Downloading diffusers-0.11.1-py3-none-any.whl (524 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.9/524.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting filelock
  Downloading filelock-3.12.0-py3-none-any.whl (10 kB)
Collecting huggingface-hub>=0.10.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting importlib-metadata
  Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)
Collecting regex!=2019.12.17
  Downloading regex-2023.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (769 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.7/769.7 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting fsspec
  Downloading fsspec-2023.5.0-py3-none-any.whl (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import numpy as np
import torch
import random
from transformers import CLIPTextModel, CLIPTokenizer
# Attack helpers called in this cell and in the attack cells further down.
# NOTE(review): assumed to live in the project's `utils` module, alongside the
# `object_key` and `image_grid` helpers imported from it later — confirm.
from utils import genetic, get_char_table, search_min_sentence_iteration, train

device = 'cuda'

# Passed as `len_prompt` to every attack routine; presumably the length of the
# perturbation suffix that gets appended to the prompt — TODO confirm.
len_prompt = 5

# Tokenizer and text encoder are loaded from the same CLIP checkpoint.
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-large-patch14')
text_encoder = CLIPTextModel.from_pretrained('openai/clip-vit-large-patch14')
text_encoder = text_encoder.to(device)

# Candidate table handed to the search routines as their character list.
char_table = get_char_table()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.2.layer_norm1.bias', 'vision_model.encoder.layers.21.self_attn.v_proj.weight', 'vision_model.encoder.layers.3.self_attn.q_proj.bias', 'vision_model.encoder.layers.18.self_attn.out_proj.bias', 'vision_model.encoder.layers.23.self_attn.k_proj.weight', 'vision_model.encoder.layers.16.mlp.fc1.weight', 'vision_model.encoder.layers.17.self_attn.v_proj.bias', 'vision_model.encoder.layers.15.self_attn.out_proj.weight', 'vision_model.encoder.layers.4.self_attn.q_proj.weight', 'vision_model.encoder.layers.12.mlp.fc1.bias', 'vision_model.encoder.layers.3.self_attn.k_proj.weight', 'vision_model.encoder.layers.5.layer_norm2.weight', 'vision_model.encoder.layers.3.self_attn.k_proj.bias', 'vision_model.encoder.layers.18.mlp.fc2.bias', 'vision_model.encoder.layers.22.mlp.fc1.weight', 'vision_model.encoder.la

# Generate Untargeted Perturbation Prompts

In [2]:
# Prompt that each search strategy below tries to perturb.
attack_sentence = "a snake and a young man"

# Keyword arguments shared by all three attack routines.
clip_kwargs = dict(tokenizer=tokenizer, text_encoder=text_encoder)

# --- Greedy: a single run ---
greedy_sentence = search_min_sentence_iteration(
    attack_sentence,
    char_table,
    len_prompt,
    1,
    **clip_kwargs,
)
print("greedy_prompt:", greedy_sentence)

# --- Genetic: five runs ---
for i in range(5):
    genetic_prompt = genetic(attack_sentence, char_table, len_prompt, **clip_kwargs)
    # The first element of the first returned entry is used as the suffix.
    genetic_sentence = ' '.join((attack_sentence, genetic_prompt[0][0]))
    print("genetic_prompt:", genetic_sentence)

# --- PGD: five runs of 100 iterations each ---
for i in range(5):
    max_tensor, loss_list, pgd_prompt, max_loss = train(
        init_per_sample=1,
        sentence=attack_sentence,
        len_prompt=len_prompt,
        char_list=char_table,
        model=text_encoder.text_model,
        iter_num=100,
        eta_u=1,
        **clip_kwargs,
    )
    pgd_sentence = ' '.join((attack_sentence, pgd_prompt))
    print("pgd_prompt:", pgd_sentence)

KeyboardInterrupt: 

# Targeted QF Attack

## Step 1: Generate mask

In [None]:
from utils import object_key

# Phrase that closes every probe sentence below; it is also handed to
# object_key as its second argument.
target_phrase = 'and a young man'

# Ten different scenes, each ending with the same target phrase.
sentence_list = [
    "A black panther lying in a jungle and a young man",
    "A fishing boat on a lake at sunrise and a young man",
    "A tea cup on a saucer with a teapot and a young man",
    "A man playing guitar on a street corner and a young man",
    "A group of flamingos standing in a pond and a young man",
    "A fireflies in a field at dusk and a young man",
    "A train chugging through a countryside and a young man",
    "A butterfly on a colorful flower and a young man",
    "A soccer game being played on a stadium and a young man",
    "A man kayaking down a river through rapids and a young man"
]

# The resulting mask is consumed by the targeted attack in the next step.
mask = object_key(
    sentence_list,
    target_phrase,
    thres=9,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
)

## Step 2: Generate perturbation prompts

In [None]:
# Prompt that each search strategy below tries to perturb.
attack_sentence = "a snake and a young man"

# Flatten the mask from Step 1 to one dimension before handing it to the attacks.
mask = mask.view(-1)

# Keyword arguments shared by all three attack routines, now including the mask.
attack_kwargs = dict(tokenizer=tokenizer, text_encoder=text_encoder, mask=mask)

# --- Greedy: a single run ---
greedy_sentence = search_min_sentence_iteration(
    attack_sentence,
    char_table,
    len_prompt,
    1,
    **attack_kwargs,
)
print("greedy_prompt:", greedy_sentence)

# --- Genetic: ten runs ---
for i in range(10):
    genetic_prompt = genetic(attack_sentence, char_table, len_prompt, **attack_kwargs)
    # The first element of the first returned entry is used as the suffix.
    genetic_sentence = ' '.join((attack_sentence, genetic_prompt[0][0]))
    print("genetic_prompt:", genetic_sentence)

# --- PGD: ten runs of 100 iterations each ---
for i in range(10):
    max_tensor, loss_list, pgd_prompt, max_loss = train(
        init_per_sample=1,
        sentence=attack_sentence,
        len_prompt=len_prompt,
        char_list=char_table,
        model=text_encoder.text_model,
        iter_num=100,
        eta_u=1,
        **attack_kwargs,
    )
    pgd_sentence = ' '.join((attack_sentence, pgd_prompt))
    print("pgd_prompt:", pgd_sentence)

# Evaluation

## Load Stable Diffusion Model v1.4

In [None]:
from diffusers import StableDiffusionPipeline
from torch import autocast
from utils import image_grid

# Options for loading the half-precision weights of Stable Diffusion v1.4.
# (Alternative left by the original author: omit `revision` and `use_auth_token`
# and pass only torch_dtype=torch.float16.)
sd_load_options = dict(
    revision='fp16',
    torch_dtype=torch.float16,
    use_auth_token=True,
)

# Build the pipeline and move it to the device chosen when CLIP was loaded.
pipe = StableDiffusionPipeline.from_pretrained(
    'CompVis/stable-diffusion-v1-4', **sd_load_options
).to(device)

## Generate Images and Show Results

In [None]:
# Fixed seed so the sampled images are repeatable. The generator is created on
# `device`, the same device the pipeline was moved to.
generator = torch.Generator(device).manual_seed(27)

# Full prompt = original sentence + one space + perturbation suffix.
original_sentence = 'a snake and a young man'
perturbation_prompt = '-08=*'
sentence = original_sentence + ' ' + perturbation_prompt

# The same prompt is repeated so the pipeline returns `num_images` samples in one call.
num_images = 5
prompt = [sentence] * num_images
with autocast('cuda'):
    images = pipe(prompt, generator=generator, num_inference_steps=50).images

# Single-row grid with one column per image; the column count is derived from
# num_images so the two values cannot drift apart.
grid = image_grid(images, rows=1, cols=num_images)
grid
