In [288]:
import itertools
import re
import scipy.spatial
from hashlib import md5
import time
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# check & create folder


def check_folder(folder, nameFolder):
    if (os.path.exists(folder)):
        folder = folder
    else:
        os.mkdir(folder)
        print('mkdir folder '+nameFolder+' sucessful')


def load_image_folder(folder):
    image_list = []
    filenames_full = []
    filenames = []

    for filename in os.listdir(folder):
        # for image in images:
        filenames_full.append(folder+'/'+filename)
        filenames.append(filename)
        try:
            image = cv2.imread(folder+'/'+filename)
            assert image.shape[2] == 3

            image_list.append(image)
        except AssertionError as e:
            print(e)
    return image_list, filenames_full, filenames
# save file to folder


def save_file(folder_upload, folder_cleaned):
    count = 0
    for filename in os.listdir(folder_upload):
        if filename[0] != "." and "." in filename:
            img = cv2.imread(folder_upload+'/'+filename)
            if img is None:
                pass
            else:
                cv2.imwrite(os.path.join(
                    folder_cleaned, str(count) + '.jpg'), img)
                count += 1
    print(count)


# First turn the image into a gray scale image


def img_gray(image):
    image = cv2.imread(image)
    return np.average(image, weights=[0.299, 0.587, 0.114], axis=2)

# resize image and flatten


def resize(image, height=30, width=30):
    row_res = cv2.resize(image, (height, width),
                         interpolation=cv2.INTER_AREA).flatten()
    col_res = cv2.resize(image, (height, width),
                         interpolation=cv2.INTER_AREA).flatten('F')
    return row_res, col_res

# Calculate distance hamming


def hamming_distance(image, image2):
    score = scipy.spatial.distance.hamming(image, image2)
    return score

# gradient direction based on intensity


def intensity_diff(row_res, col_res):
    difference_row = np.diff(row_res)
    difference_col = np.diff(col_res)
    difference_row = difference_row > 0
    difference_col = difference_col > 0
    return np.vstack((difference_row, difference_col)).flatten()
    # return difference_row
    # return np.vstack((difference_row, difference_col)) #str method


def difference_score(image, height=30, width=30):
    gray = img_gray(image)
    row_res, col_res = resize(gray, height, width)
    difference = intensity_diff(row_res, col_res)

    return difference


def difference_score_dict(folder):
    _, filenames_full, filenames = load_image_folder(folder)
    ds_dict = {}
    for filename in filenames:
        ds = difference_score(folder+'/'+filename)
        if filename not in ds_dict:
            ds_dict[filename] = ds
    return ds_dict, filenames

# difference_score_dict(dir_upload)


def find_duplicates(folder):
    duplicates = []
    cleaned = []
    ds_dict, _ = difference_score_dict(folder)
    for k1, k2 in itertools.combinations(ds_dict, 2):
        x = hamming_distance(ds_dict[k1], ds_dict[k2])
        if hamming_distance(ds_dict[k1], ds_dict[k2]) < .10:
            duplicates.append((k1, k2))

    return duplicates

# path = f"{dir}/CleanedImages/"


def save_file_dup(folder_up, folder_del):
    duplicates = find_duplicates(folder_up)
    file_deleted = []
    print(len(duplicates))
    for filenames in duplicates:
        file_ori = filenames[0]
        img = cv2.imread(folder_up+'/'+file_ori)
        cv2.imwrite(os.path.join(folder_del+'/'+file_ori), img)
        file_deleted.append(file_ori)
    return file_deleted

# save_file_dup()


def save_file_ori(folder_up, folder_cle, folder_del):
    _, filenames = difference_score_dict(folder_up)
    file_deleted = save_file_dup(folder_up, folder_del)
    # print(file_deleted)
    deletedlist = []
    count = 0
    for filename in filenames:
        print(filename)
        if filename not in file_deleted:
            # print('ok')
            img = cv2.imread(folder_up+'/'+filename)
            # plt.imshow(img)
            # plt.show()
            cv2.imwrite(os.path.join(folder_cle+'/'+filename), img)
            count += 1
        else:
            print('Image duplicates')
    return count


In [290]:
# def show_duplicates():
#     duplicates = find_duplicates()
#     for file_names in duplicates:
#         try:

#             plt.subplot(121), plt.imshow(cv2.imread(file_names[0]))
#             plt.title('Duplicate'), plt.xticks([]), plt.yticks([])

#             plt.subplot(122), plt.imshow(cv2.imread(file_names[1]))
#             plt.title('Original'), plt.xticks([]), plt.yticks([])
#             plt.show()

#         except OSError as e:
#             continue
# show_duplicates()

In [301]:
dir = f"/home/minhquang9914/Documents/VCCorp/Quang/img_processing/remove_image/Deleting-Duplicate-Images-Tool"


def main(dir):
    try:
        folder_upload = 'UploadFolder'
        folder_cleaned = 'CleanedImages'
        folder_delete = 'DeletedPhoto'
        dir_upload = f"{dir}/{folder_upload}"
        dir_cleaned = f"{dir}/{folder_cleaned}"
        dir_delete = f"{dir}/{folder_delete}"
        check_folder(dir_upload, folder_upload)
        check_folder(dir_cleaned, folder_cleaned)
        check_folder(dir_delete, folder_delete)
        save_file_ori(dir_upload, dir_cleaned, dir_delete)
        return True
    except Exception as e:

        return False


main(dir)


39
7-phim-gan-mac-16-cuu-canh-man-anh-2016-0eab28.jpg
Image duplicates
7-phim-gan-mac-16-cuu-canh-man-anh-2016-0eab28 (copy).jpg
00-1-360x203 (copy).png
Image duplicates
146672980_472078380473464_7586937369686810207_o.jpg
Image duplicates
93ea7dec14aab6ed3c3ce23e3446d9a3.20 (copy).jpg
Image duplicates
146672980_472078380473464_7586937369686810207_o (1).jpg
Image duplicates
146672980_472078380473464_7586937369686810207_o (copy).jpg
Image duplicates
0-phim-18-75 (copy).jpg
Image duplicates
1-1620634106-373-width650height900 (copy).jpg
Image duplicates
0.jpg
Image duplicates
20210411-tang-thien-kim-22 (1) (copy).jpg
Image duplicates
1511864885-738-my-nhan-sieu-vong-1-chiem-tron-trai-tim-dan-ong-thai-lan-0-1511860244-width500height620.jpg
Image duplicates
20210425-le-bong-1.jpg
Image duplicates
11-1024x543-1.jpg
Image duplicates
20210821-han-kyung-9-617x771.jpg
Image duplicates
0 (copy).jpg
20210613-da-thao-10.jpg
Image duplicates
110043571_323842948650952_503861320204253260_n.webp
Image d

True