In [69]:
# for loading/processing the images  
from keras_preprocessing.image import load_img
from keras_preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle
import torch

# from IPython import embed

In [21]:
path = r"/Users/oliviafan/Downloads/Data/resized-D1"
# change the working directory to the path where the images are located
os.chdir(path)

# this list holds all the image filename
nail_beds = []

# creates a ScandirIterator aliased as files
with os.scandir(path) as files:
  # loops through each file in the directory
    for file in files:
        if file.name.endswith('.jpg'):
          # adds only the image files to the flowers list
            nail_beds.append(file.name)

In [22]:
# load the image as a 224x224 array
img = load_img(nail_beds[0], target_size=(224,224))
# convert from 'JPG.Image.Image' to numpy array
img = np.array(img)
print(img.shape)
(224, 224, 3)

(224, 224, 3)


(224, 224, 3)

In [23]:
reshaped_img = img.reshape(1,224,224,3)
print(reshaped_img.shape)
(1, 224, 224, 3)

(1, 224, 224, 3)


(1, 224, 224, 3)

In [24]:
x = preprocess_input(reshaped_img)

In [25]:
# load model
model = VGG16()
# remove the output layer
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [29]:
features = model.predict(reshaped_img)
print(features.shape)
(1,4096)

(1, 4096)


(1, 4096)

In [31]:
features = model.predict(reshaped_img)
print(features.shape)
(1,4096)

(1, 4096)


(1, 4096)

In [32]:

# load the model first and pass as an argument
model = VGG16()
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

def extract_features(file, model):
    # load the image as a 224x224 array
    img = load_img(file, target_size=(224,224))
    # convert from 'JPG.Image.Image' to numpy array
    img = np.array(img) 
    # reshape the data for the model reshape(num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1,224,224,3) 
    # prepare image for model
    imgx = preprocess_input(reshaped_img)
    # get the feature vector
    features = model.predict(imgx, use_multiprocessing=True)
    return features

In [44]:

data = {}
p = r"/Users/oliviafan/Downloads/Data/resized-D1"

# lop through each image in the dataset
for nail in nail_beds:
    # try to extract the features and update the dictionary
    try:
        feat = extract_features(nail,model)
        data[nail] = feat
    # if something fails, save the extracted features as a pickle file (optional)
    except:
        with open(p,'wb') as file:
            pickle.dump(data,file)
          

# get a list of the filenames
filenames = np.array(list(data.keys())) 

# get a list of just the features
feat = np.array(list(data.values()))





In [45]:
feat.shape

(93, 1, 4096)

In [46]:
# reshape so that there are 93 samples of 4096 vectors
feat = feat.reshape(-1,4096)

# # get the unique labels (from the flower_labels.csv)
# df = pd.read_csv('flower_labels.csv')
# label = df['label'].tolist()
# unique_labels = list(set(label))

In [48]:
feat.shape

(93, 4096)

In [59]:
pca = PCA(n_components=90, random_state=22)
pca.fit(feat)
x = pca.transform(feat)

In [62]:
print(f"Components before PCA: {feat.shape[1]}")
print(f"Components after PCA: {pca.n_components}")

Components before PCA: 4096
Components after PCA: 90


In [64]:
num = 5

kmeans = KMeans(n_clusters = num, n_jobs=-1, random_state=22)
kmeans.fit(x)



KMeans(n_clusters=5, n_jobs=-1, random_state=22)

In [65]:
kmeans.labels_

array([2, 1, 1, 4, 2, 4, 2, 2, 1, 2, 2, 2, 4, 1, 3, 3, 2, 0, 4, 4, 2, 2,
       2, 2, 4, 1, 1, 1, 4, 4, 4, 4, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2,
       1, 1, 4, 1, 2, 1, 4, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1,
       1, 4, 2, 1, 1, 4, 1, 1, 4, 2, 1, 1, 1, 0, 0, 1, 2, 2, 0, 1, 0, 4,
       4, 2, 2, 2, 1], dtype=int32)

In [72]:
# holds the cluster id and the images { id: [images] }
groups = {}
for file, cluster in zip(filenames,kmeans.labels_):
    if cluster not in groups.keys():
        groups[cluster] = []
        groups[cluster].append(file)
    else:
        groups[cluster].append(file)

pth = "/Users/oliviafan/Downloads"
torch.save(groups, '{}/result.pkl'.format(pth))