In [None]:
import itertools
from PIL import Image
import os

import imagehash
import pandas as pd

In [None]:
class Duplicates:
    """Class to find duplicate images by comparing average hashes.

    Intended call order is find_images(), then get_hashes(), then
    dataframe(); each step reads the attribute set by the previous one.

    Arguments
        directories - list of directories to find images in. (Must be
            full directory path).
        extensions - list of file extensions to include, e.g. '.jpg'.
            Matched case-insensitively against the end of each file
            name. (Does not support HEIC).
        hash_size - integer size of the average hash. A larger size
            picks up more detail in the image, reducing the chance that
            similar, but different, images share the same hash. It also
            lengthens the process time.

    Attributes
        images - list of image file paths; set by find_images().
        hashes - dictionary mapping each file path to its average hash;
            set by get_hashes().
    """
    def __init__(self, directories, extensions, hash_size):
        self.directories = directories
        self.extensions = extensions
        self.hash_size = hash_size

    def find_images(self):
        """Find images in all directories listed.

        Walks through each of the specified directories and stores, in
        self.images, the full path of every file whose name ends with
        one of the specified file extensions (ignoring case).
        """
        # str.endswith accepts a tuple, so one call tests every extension.
        # A suffix test (rather than a substring test) keeps files such as
        # 'photo.jpg.bak' or 'my.pngs.txt' out of the result.
        suffixes = tuple(ext.lower() for ext in self.extensions)
        images = []

        for directory in self.directories:
            for root, _dirs, files in os.walk(directory):
                images.extend(
                    os.path.join(root, file)
                    for file in files
                    if file.lower().endswith(suffixes)
                )

        self.images = images

    def avg_hash(self, file):
        """Calculate the average hash for a file using imagehash.

        Arguments
            file - path of the image file to hash.

        Returns
            Average hash for the file, computed with self.hash_size.
        """
        # Close the image as soon as it has been hashed; otherwise one
        # file handle stays open per image when hashing many files.
        with Image.open(file) as img:
            return imagehash.average_hash(img, hash_size=self.hash_size)

    def get_hashes(self):
        """Calculate average hashes for all images.

        Creates a dictionary, stored in self.hashes, with each filepath
        in self.images as a key and the file's average hash as the value.
        """
        self.hashes = {image: self.avg_hash(image) for image in self.images}

    def dataframe(self):
        """Create a dataframe to compare the similarity of image hashes.

        All pairwise combinations of the image files in self.images are
        put in a data frame with columns 'img_1' and 'img_2'. A third
        column, 'score', is the difference between the hashes for the
        two images.

        Returns
            Data frame sorted by 'score' in ascending order.
        """
        # Materialise the pairs once and reuse them for all three columns.
        pairs = list(itertools.combinations(self.images, 2))

        df = pd.DataFrame(
            {
                'img_1': [img_1 for img_1, _ in pairs],
                'img_2': [img_2 for _, img_2 in pairs],
            }
        )

        df['score'] = [self.hashes[img_1] - self.hashes[img_2]
                       for img_1, img_2 in pairs]

        return df.sort_values('score')

In [None]:
# Full paths of the directories to search for images.
directories = ['/Users/xx/Pictures', '/Users/xx/Documents']

# File extensions of the files to include in the search.
extensions = [
    '.gif',
    '.ico',
    '.jpeg',
    '.jpg',
    '.png',
    '.tiff',
]

In [None]:
# Run the pipeline in order: collect the image paths, hash each image
# (hash size 16), then build the pairwise score table.
dup = Duplicates(
    directories=directories,
    extensions=extensions,
    hash_size=16,
)
dup.find_images()
dup.get_hashes()
df = dup.dataframe()

In [None]:
# Show the pairwise comparison table. It is sorted by 'score' ascending,
# so the pairs whose hashes differ least come first.
df