# Find duplicate images in the animal dataset

In [37]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.cluster import DBSCAN
from tensorflow.image import resize
from tensorflow.keras.preprocessing.image import load_img, img_to_array, save_img

We load the images.

In [2]:
metadata = pd.read_csv(f"{os.environ['SCRATCH']}/isic-archive/metadata.csv")
metadata = metadata.drop(metadata[metadata['isic_id'].isin(['ISIC_0060052', 'ISIC_0029842'])].index)
metadata = metadata.drop(metadata[pd.isnull(metadata['diagnosis'])].index)
metadata['isic_id'] += '.jpg'

  metadata = pd.read_csv(f"{os.environ['SCRATCH']}/isic-archive/metadata.csv")


In [3]:
generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
)

In [4]:
dataset = generator.flow_from_dataframe(
    metadata,
    directory=f"{os.environ['SCRATCH']}/isic-archive",
    x_col='isic_id',
    y_col='diagnosis',
    
    target_size=(299, 299), 
    class_mode='categorical', 
)

Found 42318 validated image filenames belonging to 22 classes.


We instantiate a model. 

In [5]:
model = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=(299, 299, 3),
    pooling='avg'
)

We make feature vectors.

In [6]:
vectors = model.predict(dataset)



We clusterize the vectors.

In [7]:
dbscan_model  = DBSCAN(eps=0.25, min_samples=2)

In [8]:
clusters = dbscan_model.fit_predict(vectors)

In [9]:
clusters

array([-1, -1, -1, ..., -1, -1, -1])

In [13]:
unique_clusters = np.unique(clusters)
unique_clusters

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
       155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
       168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 17

In [20]:
potential_duplicates = {
    unique_cluster: [
        filename 
        for filename, cluster in zip(dataset.filenames, clusters) 
            if cluster == unique_cluster
    ] 
    for unique_cluster in unique_clusters[1:]
}
potential_duplicates

{0: ['ISIC_5247508.jpg', 'ISIC_0009257.jpg'],
 1: ['ISIC_8414609.jpg', 'ISIC_0031872.jpg'],
 2: ['ISIC_9583969.jpg', 'ISIC_0024535.jpg'],
 3: ['ISIC_5714411.jpg', 'ISIC_0005867.jpg'],
 4: ['ISIC_3359880.jpg', 'ISIC_0000695.jpg'],
 5: ['ISIC_6089318.jpg', 'ISIC_0003387.jpg'],
 6: ['ISIC_2145640.jpg', 'ISIC_0034258.jpg'],
 7: ['ISIC_8006211.jpg', 'ISIC_0009663.jpg'],
 8: ['ISIC_6285376.jpg', 'ISIC_0004403.jpg'],
 9: ['ISIC_5258657.jpg', 'ISIC_0001790.jpg'],
 10: ['ISIC_3521744.jpg', 'ISIC_0025433.jpg'],
 11: ['ISIC_7050773.jpg', 'ISIC_0006152.jpg'],
 12: ['ISIC_3599125.jpg', 'ISIC_0062162.jpg'],
 13: ['ISIC_4973809.jpg', 'ISIC_3923443.jpg'],
 14: ['ISIC_9184306.jpg', 'ISIC_7189864.jpg'],
 15: ['ISIC_4887403.jpg', 'ISIC_0033117.jpg'],
 16: ['ISIC_0715702.jpg', 'ISIC_4624352.jpg'],
 17: ['ISIC_1362265.jpg', 'ISIC_0060999.jpg'],
 18: ['ISIC_7316871.jpg', 'ISIC_0060935.jpg'],
 19: ['ISIC_9802008.jpg', 'ISIC_0065351.jpg'],
 20: ['ISIC_6734679.jpg', 'ISIC_0053716.jpg'],
 21: ['ISIC_7665397.jpg

In [39]:
for cluster_no, images in potential_duplicates.items():
    ims = [resize(img_to_array(load_img(f"{os.environ['SCRATCH']}/isic-archive/{im}")), size=(1000, 1000)) for im in images]
    ims = np.concatenate(ims, axis=1)
    save_img(f"/net/people/plgamyshenin/masters-thesis---melanoma-analysis-with-fnn/isic/potential_duplicates/{cluster_no}.jpg", ims)