In [1]:
# Imports
import random, cv2, os, sys, shutil
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import keras
from IPython.display import display
from PIL import Image
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from glob import glob

Using TensorFlow backend.


In [2]:


class image_clustering:
    """Cluster the images of a folder with K-Means and copy them into per-cluster folders.

    Intended call order: ``load_images()`` -> ``get_new_imagevectors()`` -> ``clustering()``.

    Parameters
    ----------
    folder_path : str
        Folder whose entries (as returned by ``os.listdir``) are treated as image files.
    n_clusters : int
        Number of K-Means clusters; one ``cluster<i>`` sub-folder is created per cluster.
    max_examples : int or None
        Maximum number of images to use. ``None`` (or a value larger than the number of
        entries in ``folder_path``) means "use all of them".
    use_imagenets : str or False
        ``False`` clusters on raw pixels; otherwise the (case-insensitive) name of one of the
        Keras applications listed in ``_BACKBONES``, used as a feature extractor.
    use_pca : bool
        When a feature extractor is used, additionally run PCA on the extracted features.
    output_path : str
        Folder that receives the ``cluster<i>`` sub-folders. WARNING: it is deleted and
        re-created by the constructor.
    """

    # Supported feature extractors:
    #   option name -> (candidate module names under keras.applications, model class name).
    # MobileNetV2 lists two module names because the original code used ``mobilenetv2``;
    # NOTE(review): other Keras releases appear to expose it as ``mobilenet_v2`` - confirm.
    _BACKBONES = {
        "vgg16": (("vgg16",), "VGG16"),
        "vgg19": (("vgg19",), "VGG19"),
        "resnet50": (("resnet50",), "ResNet50"),
        "xception": (("xception",), "Xception"),
        "inceptionv3": (("inception_v3",), "InceptionV3"),
        "inceptionresnetv2": (("inception_resnet_v2",), "InceptionResNetV2"),
        "densenet": (("densenet",), "DenseNet201"),
        "mobilenetv2": (("mobilenetv2", "mobilenet_v2"), "MobileNetV2"),
    }

    def __init__(self, folder_path="data", n_clusters=10, max_examples=None, use_imagenets=False, use_pca=False, output_path="output"):
        """Pick a random subset of the images and (re)create the output folders."""
        paths = os.listdir(folder_path)
        if max_examples is None:
            self.max_examples = len(paths)
        else:
            # Clamp the request to the number of entries that actually exist.
            self.max_examples = max(0, min(max_examples, len(paths)))
        self.n_clusters = n_clusters
        self.folder_path = folder_path
        self.output_path = output_path
        random.shuffle(paths)
        self.image_paths = paths[:self.max_examples]
        self.use_imagenets = use_imagenets
        self.use_pca = use_pca
        del paths

        # Start from an empty output folder. A missing folder raises FileNotFoundError
        # (not FileExistsError), which is the only case that is safe to ignore here.
        try:
            shutil.rmtree(self.output_path)
        except FileNotFoundError:
            pass
        os.makedirs(self.output_path)
        for i in range(self.n_clusters):
            os.makedirs(os.path.join(self.output_path, "cluster" + str(i)))
        print("\n output folders created.")
        print("\n Object of class \"image_clustering\" has been initialized.")

    def load_images(self):
        """Load the selected images as flattened 224x224 RGB float vectors scaled to [0, 1].

        The result is stored in ``self.images`` with one row per image.

        Raises
        ------
        ValueError
            If OpenCV cannot decode one of the selected files.
        """
        pixel_rows = []
        for file in self.image_paths:
            full_path = os.path.join(self.folder_path, file)
            bgr = cv2.imread(full_path)
            if bgr is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise ValueError("Could not read \"" + full_path + "\" as an image.")
            pixel_rows.append(cv2.cvtColor(cv2.resize(bgr, (224, 224)), cv2.COLOR_BGR2RGB))
        self.images = np.float32(pixel_rows).reshape(len(pixel_rows), -1)
        self.images /= 255
        print("\n " + str(self.max_examples) + " images from the \"" + self.folder_path + "\" folder have been loaded in a random order.")

    def get_new_imagevectors(self):
        """Build the vectors that ``clustering()`` will run on and store them in ``self.images_new``.

        Without a feature extractor the raw pixel vectors from ``load_images()`` are used.
        Otherwise every selected image is passed through the chosen ImageNet-pretrained
        network (without its classification head) and the flattened activations are used,
        optionally followed by PCA.

        Raises
        ------
        ValueError
            If ``use_imagenets`` is not one of the supported names.
        ImportError
            If the installed Keras does not expose the requested application module.
        """
        if not self.use_imagenets:
            self.images_new = self.images
            return

        # Validate the option before touching Keras so a typo fails fast and clearly.
        backbone_key = str(self.use_imagenets).lower()
        if backbone_key not in self._BACKBONES:
            raise ValueError("Please use one of the following keras applications only [ \"vgg16\", \"vgg19\", \"resnet50\", \"xception\", \"inceptionv3\", \"inceptionresnetv2\", \"densenet\", \"mobilenetv2\" ] or False")

        module_names, class_name = self._BACKBONES[backbone_key]
        backbone_module = None
        for module_name in module_names:
            backbone_module = getattr(keras.applications, module_name, None)
            if backbone_module is not None:
                break
        if backbone_module is None:
            raise ImportError("keras.applications has none of the modules " + str(module_names))

        model1 = getattr(backbone_module, class_name)(include_top=False, weights="imagenet", input_shape=(224, 224, 3))
        # Each application module ships the preprocessing its weights were trained with;
        # using VGG16's for every network feeds wrongly scaled inputs to several of them.
        preprocess = backbone_module.preprocess_input
        print("Done making model...")

        feature_list = []
        for i, file in enumerate(self.image_paths):
            print("Processing image {}".format(i))
            img = image.load_img(os.path.join(self.folder_path, file), target_size=(224, 224))
            img_data = np.expand_dims(image.img_to_array(img), axis=0)  # add the batch axis
            img_data = preprocess(img_data)
            feature = model1.predict(img_data)
            feature_list.append(np.array(feature).flatten())
        print("Done feature extracting")

        images_temp = np.array(feature_list)
        if self.use_pca:
            # n_components=None keeps every component PCA can compute for this data.
            self.images_new = PCA(n_components=None, random_state=728).fit_transform(images_temp)
        else:
            self.images_new = images_temp

    def clustering(self):
        """Run K-Means on ``self.images_new`` and copy each image into its cluster folder."""
        # NOTE(review): `n_jobs` is not accepted by every scikit-learn release - confirm
        # against the installed version before upgrading.
        model = KMeans(n_clusters=self.n_clusters, n_jobs=-1, random_state=728)
        model.fit(self.images_new)
        predictions = model.predict(self.images_new)
        for i in range(self.max_examples):
            shutil.copy2(
                os.path.join(self.folder_path, self.image_paths[i]),
                os.path.join(self.output_path, "cluster" + str(predictions[i])),
            )
        print("\n Clustering complete! \n\n Clusters and the respective images are stored in the \"" + self.output_path + "\" folder.")



In [3]:
if __name__ == "__main__":
    !rm -rf output
    !mkdir -p output

    print("\n\n \t\t START\n\n")

    number_of_clusters = 10 # cluster names will be 0 to number_of_clusters-1

    data_path = "flickr/kenting/preprocess/" # path of the folder that contains the images to be considered for the clustering (The folder must contain only image files)

    max_examples = None # number of examples to use, if "None" all of the images will be taken into consideration for the clustering
    # If the value is greater than the number of images present  in the "data_path" folder, it will use all the images and change the value of this variable to the number of images available in the "data_path" folder. 

    use_imagenets = "ResNet50"
    # choose from: "Xception", "VGG16", "VGG19", "ResNet50", "InceptionV3", "InceptionResNetV2", "DenseNet", "MobileNetV2" and "False" -> Default is: False

    if use_imagenets == False:
        use_pca = False
    else:
        use_pca = True # Make it True if you want to use PCA for dimentionaity reduction -> Default is: False

    temp = image_clustering(data_path, number_of_clusters, max_examples, use_imagenets, use_pca)
    temp.load_images()
    temp.get_new_imagevectors()
    temp.clustering()

    print("\n\n\t\t END\n\n")



 		 START



 output folders created.

 Object of class "image_clustering" has been initialized.

 242 images from the flickr/kenting/preprocess//folder have been loaded in a random order.




Done making model...
Processing image 0
Processing image 1
Processing image 2
Processing image 3
Processing image 4
Processing image 5
Processing image 6
Processing image 7
Processing image 8
Processing image 9
Processing image 10
Processing image 11
Processing image 12
Processing image 13
Processing image 14
Processing image 15
Processing image 16
Processing image 17
Processing image 18
Processing image 19
Processing image 20
Processing image 21
Processing image 22
Processing image 23
Processing image 24
Processing image 25
Processing image 26
Processing image 27
Processing image 28
Processing image 29
Processing image 30
Processing image 31
Processing image 32
Processing image 33
Processing image 34
Processing image 35
Processing image 36
Processing image 37
Processing image 38
Processing image 39
Processing image 40
Processing image 41
Processing image 42
Processing image 43
Processing image 44
Processing image 45
Processing image 46
Processing image 47
Processing image 48
Processin

In [4]:
# for i in range(0, number_of_clusters):
#     print("Cluster {}".format(i))
#     file = glob("output/cluster{}/*".format(i)) 
#     for f in file:
#         img = Image.open(f)  # named `img`, not `image`: `image` is the keras.preprocessing module imported in cell 1
#         display(img)