In [1]:
# https://stackoverflow.com/a/71567609

from sentence_transformers import SentenceTransformer, util
from PIL import Image
import glob
import os

# Load the OpenAI CLIP Model
print('Loading CLIP Model...')
model = SentenceTransformer('clip-ViT-B-32')

# Next we compute the embeddings
image_names = list(glob.glob('./stimulus_set_final_all/*.jpg'))
print("Images:", len(image_names))
encoded_images = model.encode(
    [Image.open(filepath) for filepath in image_names], 
    batch_size=128, 
    convert_to_tensor=True, 
    show_progress_bar=True
    )

processed_images = util.paraphrase_mining_embeddings(encoded_images)
NUM_SIMILAR_IMAGES = 5

# =================
# DUPLICATES
# =================
print('Finding duplicate images...')
# Filter list for duplicates. Results are triplets (score, image_id1, image_id2) and is scorted in decreasing order
# A duplicate image will have a score of 1.00
duplicates = [image for image in processed_images if image[0] >= 1]

# Output the top X duplicate images
for score, image_id1, image_id2 in duplicates[0:NUM_SIMILAR_IMAGES]:
    print("\nScore: {:.3f}%".format(score * 100))
    print(image_names[image_id1])
    print(image_names[image_id2])

# =================
# NEAR DUPLICATES
# =================
print('Finding near duplicate images...')
# Use a threshold parameter to identify two images as similar. By setting the threshold lower, 
# you will get larger clusters which have less similar images in it. Threshold 0 - 1.00
# A threshold of 1.00 means the two images are exactly the same. Since we are finding near 
# duplicate images, we can set it at 0.99 or any number 0 < X < 1.00.
threshold = 0.95
near_duplicates = [image for image in processed_images if image[0] < threshold]

for score, image_id1, image_id2 in near_duplicates[0:NUM_SIMILAR_IMAGES]:
    print("\nScore: {:.3f}%".format(score * 100))
    print(image_names[image_id1])
    print(image_names[image_id2])

  from .autonotebook import tqdm as notebook_tqdm


Loading CLIP Model...


Downloading: 100%|██████████| 690/690 [00:00<00:00, 114kB/s]
Downloading: 100%|██████████| 4.03k/4.03k [00:00<00:00, 752kB/s]
Downloading: 100%|██████████| 525k/525k [00:00<00:00, 969kB/s]  
Downloading: 100%|██████████| 316/316 [00:00<00:00, 61.2kB/s]
Downloading: 100%|██████████| 605M/605M [00:34<00:00, 17.7MB/s] 
Downloading: 100%|██████████| 389/389 [00:00<00:00, 78.6kB/s]
Downloading: 100%|██████████| 604/604 [00:00<00:00, 125kB/s]
Downloading: 100%|██████████| 961k/961k [00:00<00:00, 1.47MB/s]
Downloading: 100%|██████████| 1.88k/1.88k [00:00<00:00, 405kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 30.8kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 27.9kB/s]


Images: 1494


Batches: 100%|██████████| 12/12 [01:40<00:00,  8.36s/it]


Finding duplicate images...

Score: 100.000%
./stimulus_set_final_all/train_34.jpg
./stimulus_set_final_all/train_68.jpg

Score: 100.000%
./stimulus_set_final_all/dis_hammer_2_6.jpg
./stimulus_set_final_all/dis_hammer_49.jpg

Score: 100.000%
./stimulus_set_final_all/dis_woman's face_74.jpg
./stimulus_set_final_all/dis_face_145.jpg

Score: 100.000%
./stimulus_set_final_all/bat_244.jpg
./stimulus_set_final_all/bat_9.jpg

Score: 100.000%
./stimulus_set_final_all/dis_tennis_player_105.jpg
./stimulus_set_final_all/dis_tennis player_117.jpg
Finding near duplicate images...

Score: 95.000%
./stimulus_set_final_all/bat_253.jpg
./stimulus_set_final_all/houseplant_32.jpg

Score: 95.000%
./stimulus_set_final_all/surfer_37.jpg
./stimulus_set_final_all/monkey_243.jpg

Score: 95.000%
./stimulus_set_final_all/rake_166.jpg
./stimulus_set_final_all/train_143.jpg

Score: 95.000%
./stimulus_set_final_all/face_147.jpg
./stimulus_set_final_all/wrench_18.jpg

Score: 95.000%
./stimulus_set_final_all/gymnast_