# Description

Finds similar images in a given folder and prints similar pairs.

There are three types of similarity:
  - duplicate (images that are exactly the same)
  - modification (images that differ in size, blur level, or noise filtering)
  - similar (images of the same scene taken from another angle)


In [0]:
import os
import glob
import numpy as np
import itertools

from PIL import Image
from PIL.ImageStat import Stat
from PIL import ImageChops as ops

from google.colab import drive, files
# Mount Google Drive in the Colab runtime so files under /content/drive can be read.
drive.mount("/content/drive")


def str2hex(chain):
  """Encode a string of binary digits as upper-case hex with the digit order reversed.

  Args:
    chain: text made of '0'/'1' characters, parsed as a base-2 integer.

  Returns:
    The hexadecimal digits of that integer, upper-cased and reversed
    (least significant digit first). Leading zero bits of ``chain`` do not
    produce digits because the value goes through ``int``.
  """
  value = int(chain, 2)
  digits = hex(value)[2:]    # drop the '0x' prefix
  return digits.upper()[::-1]


def average_hashing(img, size=32):
  """Compute an average-hash string for an image.

  The image is shrunk to a square thumbnail, converted to mode 'L'
  (grayscale), and each pixel becomes one bit: 1 if it is brighter than the
  thumbnail's mean value, otherwise 0. The bit string is then encoded by
  ``str2hex``.

  Args:
    img: image object providing ``resize`` and ``convert`` (a PIL image).
    size: requested number of hash bits. The thumbnail side is
      ``int(sqrt(size))``, so the default of 32 yields a 5x5 grid (25 bits).

  Returns:
    The hash as the string produced by ``str2hex``.
  """
  side = int(np.sqrt(size))    # side length of the square thumbnail
  thumbnail = img.resize((side, side), resample=Image.BICUBIC)
  gray = thumbnail.convert('L')

  pixels = np.array(gray)
  threshold = pixels.mean()
  bits = pixels.ravel() > threshold
  chain = ''.join('1' if bit else '0' for bit in bits)

  return str2hex(chain)

  
def hamming_distance(img0, img1):
  """Count the positions at which two hashes differ.

  Args:
    img0: first hash, either a string (e.g. a hex hash) or a NumPy array.
    img1: second hash, of the same kind as ``img0``.

  Returns:
    For two strings: the number of character positions that differ, plus the
    length difference when the strings are not equally long. The distance is
    therefore measured in characters (hex digits), not in bits.
    For any other inputs: ``np.sum(img0 != img1)``, i.e. the number of
    differing elements for arrays.
    In both cases the result is 0 exactly when the inputs are equal.
  """
  if isinstance(img0, str) and isinstance(img1, str):
    # ``str != str`` is a single bool, so np.sum() could only ever yield 0 or 1;
    # compare character by character to get a real distance.
    mismatches = sum(c0 != c1 for c0, c1 in zip(img0, img1))
    return mismatches + abs(len(img0) - len(img1))
  return np.sum(img0 != img1)


def main(path):
  """Print every pair of files matching ``path`` whose average hashes are identical.

  Args:
    path: glob pattern selecting the image files to compare.

  Each matching pair is printed as a two-element list of base file names,
  in the order produced by ``itertools.combinations`` over the glob results.
  """
  # Hash each file exactly once and close it right away: hashing inside the
  # pair loop would redo the resize for every pair, and keeping all images
  # open would hold one file handle per image for the whole run.
  hashes = {}
  for image_path in glob.glob(path):
    with Image.open(image_path) as img:
      hashes[image_path] = average_hashing(img)

  res = [
      [os.path.basename(image0), os.path.basename(image1)]
      for (image0, hex0), (image1, hex1) in itertools.combinations(hashes.items(), 2)
      if hamming_distance(hex0, hex1) == 0
  ]

  for im in res:
    print(im)
      
if __name__ == "__main__":
  # Glob pattern selecting every file of the development dataset on the mounted Drive.
  dataset_pattern = "/content/drive/My Drive/ML-Summer-School-Test/dev_dataset/*"
  main(dataset_pattern)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['1.jpg', '1_duplicate.jpg']
['11_modification.jpg', '11.jpg']
['11_modification.jpg', '11_duplicate.jpg']
['6.jpg', '6_similar.jpg']
['15_modification.jpg', '15.jpg']
['11.jpg', '11_duplicate.jpg']
