In [None]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [None]:
path = "./data"

# this list holds the filenames of all .png images found directly inside `path`.
# os.scandir yields entries in arbitrary, OS-dependent order, so the names are
# sorted: flowers[0] and the row order of everything derived from this list
# then stay the same from run to run.
with os.scandir(path) as files:
    flowers = sorted(
        file.name
        for file in files
        # keep regular files only; a sub-directory whose name ends in .png
        # would otherwise be handed to the image loader later on
        if file.is_file() and file.name.endswith('.png')
    )

In [None]:
# number of .png filenames collected from the data directory
print(len(flowers))

In [None]:
# sanity check on one image: load the first file as a 224x224 array.
# The directory comes from `path` (the same variable `flowers` was listed from)
# so this cell cannot drift out of sync with the directory scan.
img = load_img(os.path.join(path, flowers[0]), target_size=(224,224))
# convert from 'PIL.Image.Image' to numpy array
img = np.array(img)

print(img.shape)

In [None]:
# load model
# NOTE(review): VGG16() is called with no arguments, so the installed Keras
# defaults decide the weights and input size — presumably ImageNet weights,
# which may be downloaded on first use; confirm.
model = VGG16()
# remove the output layer: the rebuilt model keeps the original inputs but ends
# at the second-to-last layer, so predict() returns that layer's activations
# (they are reshaped to 4096 columns further down in this notebook)
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

In [None]:
def extract_features(file, model):
    """Return the feature vector that `model` produces for a single image.

    Parameters
    ----------
    file : str or path-like
        Path of the image file to load.
    model : object with a Keras-style ``predict`` method
        Called with one batch of shape (1, 224, 224, 3).

    Returns
    -------
    The value returned by ``model.predict`` for the one-image batch.

    Raises
    ------
    ValueError
        If the loaded image cannot be reshaped to (1, 224, 224, 3).
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224,224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img)
    # reshape the data for the model: (num_of_samples, dim 1, dim 2, channels);
    # this also fails loudly if the image does not have exactly 3 channels
    reshaped_img = img.reshape(1,224,224,3)
    # prepare image for model
    imgx = preprocess_input(reshaped_img)
    # get the feature vector.
    # `use_multiprocessing` is deliberately not passed: it only applies to
    # generator/Sequence inputs and newer Keras versions reject it in predict().
    # verbose=0 keeps the notebook from printing one progress bar per image.
    features = model.predict(imgx, verbose=0)
    return features

In [None]:
# extract the feature vector of every image, keyed by its filename.
# Any error raised while loading or predicting propagates and stops the loop.
data = {}
for flower in flowers:
    # build the file path from `path`, the directory `flowers` was listed from
    data[flower] = extract_features(os.path.join(path, flower), model)

In [None]:
# a dict iterates its keys and its values in the same order, so row i of
# `feat` belongs to filenames[i]
filenames = np.array(list(data))

# stack the per-image feature vectors into one 2-D matrix with 4096 columns
feat = np.array(list(data.values())).reshape(-1, 4096)

feat.shape

In [None]:
# reduce the feature vectors to at most 800 principal components.
# PCA cannot return more components than min(n_samples, n_features), so the
# count is capped by the shape of `feat`; with 800 or more images this is
# exactly the original n_components=800.
pca = PCA(n_components=min(800, *feat.shape), random_state=22)
pca.fit(feat)
x = pca.transform(feat)

In [None]:
# cluster the PCA-reduced features into 4 groups.
# `n_jobs` is no longer passed: scikit-learn deprecated it for KMeans and later
# removed it, so passing it raises a TypeError on current releases.
# n_init=10 pins the number of restarts to the value that used to be the
# default, so the result does not depend on the installed version's default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=22)
kmeans.fit(x)

In [None]:
# number of cluster labels produced by the fit (one per row passed to kmeans.fit)
len(kmeans.labels_)

In [None]:
# bucket the filenames by the cluster label assigned to them:
# groups maps label -> list of filenames, in the order they appear in `filenames`
groups = {}
for file, cluster in zip(filenames, kmeans.labels_):
    # setdefault creates the empty list the first time a label is seen
    groups.setdefault(cluster, []).append(file)

In [None]:
# function that lets you view a cluster (based on identifier)
def view_cluster(cluster, max_images=5):
    """Plot a random sample of the images assigned to one cluster.

    Parameters
    ----------
    cluster : hashable
        Key into the module-level ``groups`` dict.
    max_images : int, optional
        Largest number of images to draw. Defaults to 5; larger clusters are
        clipped to this many after shuffling and a message is printed.

    Raises
    ------
    KeyError
        If ``cluster`` is not a key of ``groups``.
    """
    plt.figure(figsize = (50,50))
    # shuffle a copy of the cluster's filenames: shuffling groups[cluster]
    # itself would permanently reorder the shared list on every call
    files = list(groups[cluster])
    np.random.shuffle(files)
    # only allow up to max_images images to be shown at a time
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to {max_images}")
        files = files[:max_images]
    # 5 columns and at least 5 rows (the original 5x5 grid); rows are added
    # only when max_images asks for more than 25 images
    n_cols = 5
    n_rows = max(5, (len(files) + n_cols - 1) // n_cols)
    # plot each image in the cluster
    for index, file in enumerate(files):
        plt.subplot(n_rows, n_cols, index+1)
        # images live in `path`, the directory the filenames were listed from
        img = load_img(os.path.join(path, file))
        img = np.array(img)
        plt.imshow(img)
        plt.axis('off')

In [None]:
# "CLASS I" is the images whose k-means label is 0; the label number is just
# the integer stored in kmeans.labels_, not a meaningful ranking
print("CLASS I:")
view_cluster(0)

In [None]:
# "CLASS II" is the images whose k-means label is 1
print("CLASS II:")
view_cluster(1)

In [None]:
# "CLASS III" is the images whose k-means label is 2
print("CLASS III:")
view_cluster(2)

In [None]:
# "CLASS IV" is the images whose k-means label is 3
print("CLASS IV:")
view_cluster(3)