# Find duplicate images in the animal dataset

In [1]:
import os
import numpy as np
import tensorflow as tf

from sklearn.cluster import DBSCAN

We load the images.

In [2]:
# Image loader that multiplies every pixel value by 1/255 on the fly.
# NOTE(review): Keras documents inception_resnet_v2.preprocess_input (scaling
# to [-1, 1]) as the preprocessing for the InceptionResNetV2 model used below;
# confirm that feeding it [0, 1] inputs is intentional.
generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0 / 255)

In [3]:
# Read every image under $SCRATCH/data, resized to the 299x299 input size the
# model is built with.
# shuffle=False is essential here: flow_from_directory shuffles by default,
# which would make row i of the predicted features unrelated to
# dataset.filenames[i], so cluster labels could not be traced back to files.
dataset = generator.flow_from_directory(
    directory=f"{os.environ['SCRATCH']}/data",
    target_size=(299, 299),
    class_mode='categorical',
    shuffle=False,
)

Found 332 images belonging to 1 classes.


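We instantiate an ImageNet-pretrained InceptionResNetV2 model without its classification head, to use it as a feature extractor.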

In [4]:
# ImageNet-pretrained InceptionResNetV2 used purely as a feature extractor:
# the classification head is dropped (include_top=False) and global average
# pooling collapses the last convolutional output into one vector per image.
model = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(
    weights='imagenet',
    input_shape=(299, 299, 3),
    include_top=False,
    pooling='avg',
)

We compute a feature vector for each image.

In [5]:
# Run every batch yielded by the image iterator through the network and
# collect the resulting feature vectors.
vectors = model.predict(x=dataset)



We cluster the feature vectors with DBSCAN; images that end up in the same cluster are candidate duplicates.

In [6]:
# eps is the maximum distance at which two feature vectors count as
# neighbours; min_samples=2 (the point itself plus one neighbour) lets a
# single pair of near-identical images form a cluster.
dbscan_model = DBSCAN(min_samples=2, eps=0.25)

In [7]:
# Fit DBSCAN on the feature vectors and keep the per-sample cluster labels.
clusters = dbscan_model.fit(vectors).labels_

In [8]:
# One label per feature vector. -1 is DBSCAN's noise label: with min_samples=2
# it means no other vector lies within eps, i.e. no duplicate was detected.
clusters

array([-1, -1, -1,  0, -1, -1,  1, -1, -1, -1, -1, -1, -1,  2, -1, -1, -1,
       -1, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1,  5, -1, -1, -1,  0, -1, -1,  6, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  6,
       -1,  5, -1, -1, -1, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1,  5,  6, -1, -1, -1, -1, -1, -1, -1, -1,  7,  8, -1,  9, -1,
       -1, -1, -1, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,
       -1, -1, -1, -1, -1, -1, -1,  9, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1,  6, -1, -1, -1, -1, -1, -1, -1,
       -1,  8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        0, -1,  4,  4, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1, -1, -1,  1, -1

In [9]:
# Distinct labels found: -1 (noise) plus one id per group of near-duplicates.
np.unique(clusters)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9])