# Preparation


Install necessary packages and import CLIP model

In [None]:
# Mount Google Drive at /content/drive/ so later cells can read and write files there
from google.colab import drive
drive.mount('/content/drive/')

# Shell commands (notebook "!" syntax): install ftfy, regex and tqdm,
# then install the CLIP package straight from its GitHub repository
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

In [None]:
import os
import cv2
import clip
import torch
import random
import requests
import numpy as np
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from pkg_resources import packaging
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the "RN50x4" CLIP checkpoint together with its matching image transform,
# move the model to the GPU and switch it to evaluation mode
model, preprocess = clip.load("RN50x4")
model.cuda().eval()

# Metadata exposed by the loaded model
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar weights across all parameter tensors
parameter_count = sum(p.numel() for p in model.parameters())

print("Model parameters:", f"{parameter_count:,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

# Image preprocessing


Prepare different image versions

In [None]:
def add_text(img, text, coordinates, font_scale, font_thickness, outline_thickness):
    """Draw `text` onto `img` in place as white lettering over a black outline.

    Args:
        img: image array that cv2.putText draws into (modified in place).
        text: string to render.
        coordinates: (x, y) anchor handed to cv2.putText for both passes.
        font_scale: scale factor passed to cv2.putText.
        font_thickness: stroke thickness of the white foreground pass.
        outline_thickness: stroke thickness of the black background pass.
    """
    typeface = cv2.FONT_HERSHEY_DUPLEX

    # Two passes at the same anchor: the black stroke first, then the white
    # stroke on top of it, so the black shows as an outline.
    passes = (
        ((0, 0, 0), outline_thickness),
        ((255, 255, 255), font_thickness),
    )
    for stroke_color, stroke_thickness in passes:
        cv2.putText(img, text, coordinates, typeface, font_scale, stroke_color, stroke_thickness)

def calculate_changed_pixels(original_img, modified_img):
    """Return the percentage of pixels that differ between two images.

    A pixel counts as changed when at least one of its channel values differs,
    so the result always lies in [0, 100]. (Counting differing channel values
    individually while dividing by height * width would report up to 300% for
    a three-channel image.)

    Args:
        original_img: array of shape (H, W) or (H, W, C).
        modified_img: array with the same shape as `original_img`.

    Returns:
        float: changed pixels / (H * W) * 100.

    Raises:
        ValueError: if the two images do not have the same shape.
    """
    original = np.asarray(original_img)
    modified = np.asarray(modified_img)
    if original.shape != modified.shape:
        raise ValueError(
            f"Image shapes differ: {original.shape} vs {modified.shape}"
        )

    differs = original != modified
    if differs.ndim > 2:
        # Collapse all channel axes: one flag per pixel position
        differs = differs.reshape(differs.shape[0], differs.shape[1], -1).any(axis=-1)

    changed_pixels = int(np.count_nonzero(differs))
    total_pixels = original.shape[0] * original.shape[1]
    return changed_pixels * 100 / total_pixels

def calculate_text_size(img, text, font_scale, font_thickness):
    """Measure `text` as rendered in FONT_HERSHEY_DUPLEX.

    Args:
        img: not used by the measurement; kept in the signature for callers.
        text: string to measure.
        font_scale: scale factor passed to cv2.getTextSize.
        font_thickness: stroke thickness passed to cv2.getTextSize.

    Returns:
        The first element of cv2.getTextSize's result (the text size); the
        second element (the baseline) is discarded.
    """
    size_and_baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_DUPLEX, font_scale, font_thickness)
    return size_and_baseline[0]

def calculate_coordinates(img, text, font_scale, font_thickness, num_rows, num_cols, buffer=50):
    """Pick one random text anchor inside every cell of a grid laid over `img`.

    The image, minus a `buffer`-pixel margin on every side, is divided into
    `num_rows` x `num_cols` equally sized cells. For each cell (row by row,
    left to right) a random (x, y) anchor is drawn such that x leaves room for
    the text width inside the cell and y stays one text height away from the
    top and bottom of the cell.

    Args:
        img: image array; only `img.shape[0]` (height) and `img.shape[1]`
            (width) are read here.
        text: string that will later be drawn at the anchors.
        font_scale: font scale used to measure the text.
        font_thickness: font thickness used to measure the text.
        num_rows: number of grid rows.
        num_cols: number of grid columns.
        buffer: margin in pixels kept free on each image border.

    Returns:
        list of (x, y) integer tuples, one per grid cell.

    Raises:
        ValueError: if the text is wider than a cell or taller than half a
            cell, i.e. no valid anchor exists.
    """
    text_width, text_height = calculate_text_size(img, text, font_scale, font_thickness)
    cell_width = (img.shape[1] - 2 * buffer) // num_cols
    cell_height = (img.shape[0] - 2 * buffer) // num_rows

    # random.randint would fail with an unhelpful "empty range" error in
    # exactly these situations; report the actual cause instead.
    if text_width > cell_width or 2 * text_height > cell_height:
        raise ValueError(
            f"Text '{text}' ({text_width}x{text_height} px) does not fit into a "
            f"{cell_width}x{cell_height} px grid cell; use fewer rows/columns, "
            f"a smaller font or a smaller buffer."
        )

    coordinates = []
    for row in range(num_rows):
        cell_y = buffer + row * cell_height
        for col in range(num_cols):
            cell_x = buffer + col * cell_width
            rand_x = random.randint(cell_x, cell_x + cell_width - text_width)
            rand_y = random.randint(cell_y + text_height, cell_y + cell_height - text_height)
            coordinates.append((rand_x, rand_y))

    return coordinates

def process_images(input_directory, text_to_add, coordinates = None):
    """Overlay `text_to_add` on every .JPEG in a directory and preprocess both versions.

    For each file in `input_directory` whose name ends with ".JPEG", the text
    is drawn at every coordinate, the result is written under the same file
    name to "<input_directory>_<text_to_add>", and both the original file and
    the written file are run through the module-level `preprocess` transform.

    The reported percentage of changed pixels compares the in-memory images
    (before the modified image is encoded to disk), whereas the preprocessed
    "text" images are read back from the written files.

    Args:
        input_directory: directory containing the source images.
        text_to_add: text drawn onto the images; also the output-folder suffix.
        coordinates: optional list of (x, y) anchors. When missing or empty,
            anchors are generated once from the first readable image and then
            reused for all images.

    Returns:
        (coordinates, images, images_text): the anchors that were used, the
        preprocessed originals, and the preprocessed text-overlaid images, in
        matching order (sorted by file name).

    Raises:
        FileNotFoundError: if no readable .JPEG image is found.
        OSError: if a modified image cannot be written.
    """
    images = []
    images_text = []

    output_directory = f"{input_directory}_{text_to_add}"
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Grid used when anchors have to be generated
    num_rows, num_cols = 4, 2

    # Text rendering parameters
    font_scale = 0.6
    font_thickness = 1
    outline_thickness = 3

    percentage, count = 0, 0

    # Only image files are processed; sorted so the output order is reproducible
    filenames = sorted(
        name for name in os.listdir(input_directory) if name.endswith(".JPEG")
    )

    for filename in tqdm(filenames):
        input_image_path = os.path.join(input_directory, filename)
        input_image = cv2.imread(input_image_path)
        if input_image is None:
            # cv2.imread signals an unreadable file by returning None
            print(f"Skipping unreadable image: {input_image_path}")
            continue

        # Generate anchors once, from the first image that actually loads
        if not coordinates:
            coordinates = calculate_coordinates(
                input_image, text_to_add, font_scale, font_thickness, num_rows, num_cols
            )

        count += 1

        # Add text to a copy of the image at the shared coordinates
        modified_image = input_image.copy()
        for coord in coordinates:
            add_text(modified_image, text_to_add, coord, font_scale, font_thickness, outline_thickness)

        # Save the modified image; fail loudly instead of later reading back a
        # missing or stale file
        output_image_path = os.path.join(output_directory, filename)
        if not cv2.imwrite(output_image_path, modified_image):
            raise OSError(f"Could not write modified image to {output_image_path}")

        # Accumulate the percentage of changed pixels
        percentage += calculate_changed_pixels(input_image, modified_image)

        # Preprocess both versions for CLIP; the context managers close the
        # underlying files once the pixel data has been converted
        with Image.open(input_image_path) as original_file:
            images.append(preprocess(original_file.convert('RGB')))

        with Image.open(output_image_path) as modified_file:
            images_text.append(preprocess(modified_file.convert('RGB')))

    if count == 0:
        raise FileNotFoundError(f"No readable .JPEG images found in {input_directory}")

    average_percent_changed = percentage/count
    print(f"Processed {count} images. Average percentage of changed pixels: {average_percent_changed:.2f}%")

    return coordinates, images, images_text

In [None]:
# Source image folder on the mounted Drive and the word to overlay on every image
input_directory = "/content/drive/MyDrive/patrec/ImageNetValidation"
text_to_add = "radio"

# Fixed anchor points: passing them explicitly (instead of letting
# process_images generate random ones) places every attack word at the same
# positions, so the same coordinates were used for different words.
coords = [(41, 72), (167, 149), (278, 96), (447, 76), (15, 349), (153, 310), (293, 327), (409, 315)]
# `images` holds the preprocessed originals, `images_text` the preprocessed
# text-overlaid versions; `coords` is rebound to the anchors that were used.
coords, images, images_text = process_images(input_directory, text_to_add, coordinates=coords)

# Typographic Attacks


Using Zero-Shot Classification

In [None]:
# Download the human-readable ImageNet class names (a JSON list of strings).
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of trying to parse an error page.
_labels_response = requests.get(
    "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json",
    timeout=30,
)
_labels_response.raise_for_status()
imagenet_labels = _labels_response.json()

# Number of images sent through the model per forward pass
batch_size = 100

# Holds the most recently computed label-prompt embeddings so that they are
# not re-tokenized and re-encoded for every image batch.
_label_text_feature_cache = {}


def _get_label_text_features():
    """Return L2-normalized text features for one prompt per entry of `imagenet_labels`.

    The result is recomputed only when the module-level `model` object or the
    contents of `imagenet_labels` have changed since the previous call.
    """
    label_key = tuple(imagenet_labels)
    cache_is_valid = (
        _label_text_feature_cache.get("model") is model
        and _label_text_feature_cache.get("labels") == label_key
    )
    if not cache_is_valid:
        text_descriptions = [f"This is a photo of a {label}" for label in label_key]
        text_tokens = clip.tokenize(text_descriptions).cuda()

        with torch.no_grad():
            text_features = model.encode_text(text_tokens).float()
            text_features /= text_features.norm(dim=-1, keepdim=True)

        _label_text_feature_cache.update(
            model=model, labels=label_key, features=text_features
        )
    return _label_text_feature_cache["features"]


# Define a function to process a batch of images
def process_batch(images):
    """Zero-shot classify a batch of preprocessed images against `imagenet_labels`.

    Args:
        images: non-empty sequence of preprocessed images that can be stacked
            into a single array (one entry per image).

    Returns:
        (top_probs, top_labels): CPU tensors with one row per image holding
        the five highest softmax probabilities and the corresponding indices
        into `imagenet_labels`.
    """
    image_input = torch.tensor(np.stack(images)).cuda()

    with torch.no_grad():
        # Extract and L2-normalize image features
        image_features = model.encode_image(image_input).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Similarity of every image to every label prompt, scaled by 100
        # before the softmax over labels
        text_features = _get_label_text_features()
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    top_probs, top_labels = text_probs.cpu().topk(5, dim=-1)

    return top_probs, top_labels

# Classify the original and the text-overlaid images batch by batch,
# collecting the per-batch top-5 results for each set separately
clean_probs, clean_labels = [], []
attacked_probs, attacked_labels = [], []

for start in range(0, len(images), batch_size):
    stop = start + batch_size

    # Original images
    batch_probs, batch_labels = process_batch(images[start:stop])
    clean_probs.append(batch_probs)
    clean_labels.append(batch_labels)

    # Same slice of the text-overlaid images
    batch_probs, batch_labels = process_batch(images_text[start:stop])
    attacked_probs.append(batch_probs)
    attacked_labels.append(batch_labels)

# Join the per-batch tensors along the image dimension
all_top_probs = torch.cat(clean_probs, dim=0)
all_top_labels = torch.cat(clean_labels, dim=0)
all_top_probs_text = torch.cat(attacked_probs, dim=0)
all_top_labels_text = torch.cat(attacked_labels, dim=0)

In [None]:
category = "radio"

# Top-1 class index of every image, without and with the overlaid text
clean_top1 = all_top_labels[:, 0].tolist()
attacked_top1 = all_top_labels_text[:, 0].tolist()

# The attack succeeds on an image when the original is NOT classified as
# `category` but the text-overlaid version is
success = sum(
    imagenet_labels[clean_top1[idx]] != category
    and imagenet_labels[attacked_top1[idx]] == category
    for idx in range(len(images))
)
print(f"Processed {len(images)} images. Success rate of typographic attack with text '{text_to_add}': {success/len(images)*100:.2f}%")

In [None]:
# Used to find possible attack texts that are short: print every ImageNet
# label with at most five characters.
# The labels are fetched again so this cell can be run on its own; the timeout
# and status check avoid hanging on, or parsing, a failed download.
_short_labels_response = requests.get(
    "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json",
    timeout=30,
)
_short_labels_response.raise_for_status()
imagenet_labels = _short_labels_response.json()
for label in imagenet_labels:
    if len(label) <= 5:
        print(label)

# Stroop Effect


Using zero-shot classification

In [None]:
output_directory = "/content/drive/MyDrive/patrec/StroopEffect"
# Create the output directory if it doesn't exist; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(output_directory, exist_ok=True)

def create_pattern_image(text, text_color):
    """Render `text`, centred, on a white 400x200 three-channel uint8 image.

    Args:
        text: the word to draw.
        text_color: colour tuple handed unchanged to cv2.putText (the calling
            cell supplies BGR values).

    Returns:
        The image array of shape (200, 400, 3) with the text drawn on it.
    """
    canvas_width, canvas_height = 400, 200
    typeface = cv2.FONT_HERSHEY_SIMPLEX
    scale = 2
    stroke = 2

    # White background
    canvas = np.full((canvas_height, canvas_width, 3), 255, dtype=np.uint8)

    # Measure the text so it can be centred; the anchor's y is offset by the
    # text height because it is computed from (height + text height) // 2
    (text_width, text_height), _baseline = cv2.getTextSize(text, typeface, scale, stroke)
    anchor = (
        (canvas_width - text_width) // 2,
        (canvas_height + text_height) // 2,
    )

    cv2.putText(canvas, text, anchor, typeface, scale, text_color, stroke, cv2.LINE_AA)
    return canvas

def save_image(image, text, color_name):
    """Write `image` to "<output_directory>/<text>/<color_name>.png".

    Uses the module-level `output_directory` as the root folder and creates
    the per-word sub-folder when needed.

    Args:
        image: image array to write with cv2.imwrite.
        text: word shown in the image; used as the sub-folder name.
        color_name: name of the text colour; used as the file name.
    """
    target_dir = os.path.join(output_directory, text)
    os.makedirs(target_dir, exist_ok=True)

    filename = f"{color_name}.png"
    filepath = os.path.join(target_dir, filename)

    # cv2.imwrite reports failure through its return value rather than by
    # raising, so only announce success when the write actually succeeded.
    if cv2.imwrite(filepath, image):
        print(f"Image with {color_name} text saved as {filename}")
    else:
        print(f"Failed to save image with {color_name} text to {filepath}")

# Colour names mapped to their BGR values
colors = {
    "red": (0, 0, 255),
    "blue": (255, 0, 0),
    "green": (0, 255, 0),
    "yellow": (0, 255, 255),
    "orange": (0, 165, 255),
    "purple": (128, 0, 128),
    "black": (0, 0, 0),
    "pink": (203, 192, 255),
}

# Render the same word once in every colour and store each rendering
text = "purple"
for name, bgr in colors.items():
    save_image(create_pattern_image(text, bgr), text, name)


In [None]:
# Colour names offered to the model as candidate answers
labels = ['black', 'red', 'green', 'blue', 'orange', 'yellow', 'purple', 'pink']
output_directory = "/content/drive/MyDrive/patrec/StroopEffect"
batch_size = 8

# Load and preprocess the images
original_images = []
images = []

# NOTE(review): this reads the images of the word "pink"; that folder only
# exists if the generation cell was run with text = "pink" - confirm.
directory = os.path.join(output_directory, "pink")
# Sorted so the image order (and therefore the plot order) is reproducible
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".png"):  # Process only image files
        # convert() yields an independent in-memory image, so the file can be
        # closed right away instead of leaking an open handle per image
        with Image.open(os.path.join(directory, filename)) as image_file:
            image = image_file.convert("RGB")

        original_images.append(image)
        images.append(preprocess(image))

image_input = torch.tensor(np.stack(images)).cuda()

# Extract and L2-normalize image features
with torch.no_grad():
    image_features = model.encode_image(image_input).float()
    image_features /= image_features.norm(dim=-1, keepdim=True)

# One prompt per candidate colour
text_descriptions = [f"My favorite word, written in the color {label}" for label in labels]
text_tokens = clip.tokenize(text_descriptions).cuda()

with torch.no_grad():
    text_features = model.encode_text(text_tokens).float()
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Softmax over the colour prompts, then keep the three most probable per image
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
top_probs, top_labels = text_probs.cpu().topk(3, dim=-1)

In [None]:
# One grid row per image (the grid is fixed at 8 rows): the picture on the
# left, its top predictions as a horizontal bar chart on the right
plt.figure(figsize=(4, 14))

for row, picture in enumerate(original_images):
    picture_ax = plt.subplot(8, 2, 2 * row + 1)
    picture_ax.imshow(picture)
    picture_ax.axis("off")

    chart_ax = plt.subplot(8, 2, 2 * row + 2)
    row_probs = top_probs[row]
    positions = np.arange(top_probs.shape[-1])
    chart_ax.grid()
    bar_container = chart_ax.barh(positions, row_probs, color='gray')

    # Highlight the bar with the highest probability
    bar_container[np.argmax(row_probs)].set_color('darkgreen')

    # Best prediction at the top, grid lines drawn behind the bars
    chart_ax.invert_yaxis()
    chart_ax.set_axisbelow(True)
    chart_ax.set_yticks(positions)
    chart_ax.set_yticklabels([labels[index] for index in top_labels[row]])
    chart_ax.set_xlabel("probability")

plt.subplots_adjust(wspace=0.5)
plt.show()