# Find identical images
*Martina Brachmann <martina.brachmann(at)ri(dot)se>*

In [1]:
!curl -s https://course.fast.ai/setup/colab | bash

Updating fastai...
Done.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/"
base_dir = root_dir + 'fastai-v3/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [0]:
'''
Compares two images in order to tell whether they are identical.
Returns True if they are identical otherwise returns false.

Copied from https://stackoverflow.com/a/34678958
'''

import cv2

def is_identical(img1, img2):
  if img1.shape == img2.shape:
    #print("The images have same size and channels")
    difference = cv2.subtract(img1, img2)
    b, g, r = cv2.split(difference)

    if cv2.countNonZero(b) == 0 and cv2.countNonZero(g) == 0 and cv2.countNonZero(r) == 0:
      #print("The images are completely Equal")
      return True
    else:
      #print("images are different")
      return False
  else:
    return False

In [6]:
'''
Get the path and the images. The images and their path is saved as a list. 
The list is sorted in case you want to only compare a range of images, e.g., 
you have to stop the process in the middle of execution so you do not have to 
go through all images again.
'''

from fastai.vision import *

path = Path(root_dir + 'data-sets/faces_fake-vs-real/fake')
print(path)
fnames = get_image_files(path)
print(len(fnames))
fnames.sort()
fnames[:5]

/content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake
1971


[PosixPath('/content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake/fake_01.jpeg'),
 PosixPath('/content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake/fake_02.jpeg'),
 PosixPath('/content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake/fake_03.jpeg'),
 PosixPath('/content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake/fake_04.jpeg'),
 PosixPath('/content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake/fake_05.jpeg')]

In [7]:
'''
In case you want to only to compare a range of images
'''

print(len(fnames))
fnames = fnames[1200:]
print(len(fnames))

1971
771


In [8]:
'''
I guess this is not really needed. But just in case, check if there are file 
extensions that we do not want to compare.
'''

extension = "jpeg"

for filename in fnames:
  if extension not in str(filename):
    fnames.remove(filename)
    print("Removed %s from list" % filename)
    
print(len(fnames))

771


In [0]:
import time

In [0]:
'''
Compare a specific image with the images in the folder.
'''

image_1 = path/'fake_619.jpeg'
print(image_1)
img_1 = cv2.imread(str(image_1))

if img_1 is not None:
  for i, image_2 in enumerate(fnames):
    #print("%s out of %s" % (i+1, len(fnames)))
    start = time.time()
    img_2 = cv2.imread(str(image_2))
    if img_2 is not None:
      ident = is_identical(img_1, img_2)
      if ident:
        print("Image 1: %s" % image_1)
        print("Image 2: %s" % image_2)
        print()
    else:
      print("Something wrong with file %s" % image_2)
  end = time.time()
  print("Time: %a" % (end-start))
else:    
  print("Something wrong with file %s" % image_1)

/content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake_619.jpeg
Image 1: /content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake_619.jpeg
Image 2: /content/gdrive/My Drive/data-sets/faces_fake-vs-real/fake_619.jpeg

Time: 0.039621829986572266


In [0]:
'''
Compare a list of images with the list of images of the folder. It may 
take a couple of hours to go through the list.
'''
s_start = time.time() 

img_list = [path/'real_928.jpeg', path/'real_929.jpeg', path/'real_930.jpeg', 
            path/'real_931.jpeg', path/'real_932.jpeg', path/'real_933.jpeg', path/'real_934.jpeg', path/'real_935.jpeg', path/'real_936.jpeg', path/'real_937.jpeg', path/'real_938.jpeg', 
            path/'real_939.jpeg', path/'real_940.jpeg', path/'real_941.jpeg', path/'real_942.jpeg', path/'real_943.jpeg', path/'real_944.jpeg', path/'real_945.jpeg', path/'real_946.jpeg', 
            path/'real_947.jpeg', path/'real_948.jpeg', path/'real_949.jpeg', path/'real_950.jpeg', path/'real_951.jpeg', path/'real_952.jpeg', path/'real_953.jpeg', path/'real_954.jpeg', 
            path/'real_955.jpeg', path/'real_956.jpeg', path/'real_957.jpeg', path/'real_958.jpeg', path/'real_959.jpeg', path/'real_960.jpeg', path/'real_961.jpeg', path/'real_962.jpeg', 
            path/'real_963.jpeg', path/'real_964.jpeg', path/'real_965.jpeg', path/'real_966.jpeg', path/'real_967.jpeg', path/'real_968.jpeg', path/'real_969.jpeg', path/'real_970.jpeg', 
            path/'real_971.jpeg', path/'real_972.jpeg', path/'real_973.jpeg', path/'real_974.jpeg', path/'real_975.jpeg', path/'real_976.jpeg', path/'real_977.jpeg', path/'real_978.jpeg', 
            path/'real_979.jpeg', path/'real_980.jpeg', path/'real_981.jpeg', path/'real_982.jpeg', path/'real_983.jpeg', path/'real_984.jpeg', path/'real_985.jpeg', path/'real_986.jpeg', 
            path/'real_987.jpeg', path/'real_988.jpeg', path/'real_989.jpeg', path/'real_990.jpeg', path/'real_991.jpeg', path/'real_992.jpeg', path/'real_993.jpeg', path/'real_994.jpeg', 
            path/'real_995.jpeg', path/'real_996.jpeg', path/'real_997.jpeg', path/'real_998.jpeg', path/'real_999.jpeg', path/'real_1000.jpeg']

for image_1 in img_list:
  print(image_1)
  start = time.time()
  img_1 = cv2.imread(str(image_1))
  if img_1 is not None:
    for image_2 in fnames:
      img_2 = cv2.imread(str(image_2))
      if img_2 is not None:
        ident = is_identical(img_1, img_2)
        if ident:
          print("Image 1: %s" % image_1)
          print("Image 2: %s" % image_2)
          print()
      else:
        print("Something wrong with file %s" % image_2)
  else:    
    print("Something wrong with file %s" % image_1)
  end = time.time()
  print("Time: %a" % (end-start))

s_finish = time.time()
print("Finished after %s minutes" % ((s_finish - s_start)/60))

In [11]:
'''
Compare all images with each other that are in the list of images. It may 
take a couple of hours to go through the list.
'''

for i, image_1 in enumerate(fnames):
  print("%s out of %s" % (i+1, len(fnames)))
  start = time.time()
  img_1 = cv2.imread(str(image_1))
  if img_1 is not None:
    for image_2 in fnames[i+1:]:
      img_2 = cv2.imread(str(image_2))
      if img_2 is not None:
        ident = is_identical(img_1, img_2)
        if ident:
          print("Image 1: %s" % image_1)
          print("Image 2: %s" % image_2)
          print()
      else:
        print("Something wrong with file %s" % image_2)
  else:    
    print("Something wrong with file %s" % image_1)
  end = time.time()
  print("Time: %a" % (end-start))

1 out of 771
Time: 339.54619669914246
2 out of 771
Time: 38.609251737594604
3 out of 771
Time: 38.555275201797485
4 out of 771
Time: 38.37760305404663
5 out of 771
Time: 37.95015096664429
6 out of 771
Time: 37.8862669467926
7 out of 771
Time: 38.38918113708496
8 out of 771
Time: 37.52989077568054
9 out of 771
Time: 37.781712770462036
10 out of 771
Time: 37.33173680305481
11 out of 771
Time: 37.59950065612793
12 out of 771
Time: 37.24482989311218
13 out of 771
Time: 37.22842788696289
14 out of 771
Time: 37.052730083465576
15 out of 771
Time: 37.38107752799988
16 out of 771
Time: 37.328657150268555
17 out of 771
Time: 37.36923146247864
18 out of 771
Time: 36.97727036476135
19 out of 771
Time: 37.067535638809204
20 out of 771
Time: 36.93754434585571
21 out of 771
Time: 36.702876567840576
22 out of 771
Time: 36.80514478683472
23 out of 771
Time: 36.6847927570343
24 out of 771
Time: 37.17071771621704
25 out of 771
Time: 36.39199376106262
26 out of 771
Time: 36.631850242614746
27 out of 771
