# The data
This implementation uses the ASL images that we collected. The dataset contains more than 500 images representing 25 different letters. 

## Loading the data
We want Python to work from the directory where the images are located. That way, instead of spelling out a full file path every time, we can refer to each image simply by its file name.

In [2]:
# load_img allows us to load an image from a file as a PIL object
from keras.preprocessing.image import load_img 
# img_to_array allows us to convert the PIL object into a NumPy array
from keras.preprocessing.image import img_to_array 
# preprocess_input prepares an image in the format the model requires. 
# You should load images with the Keras load_img function so that you guarantee the images you load are compatible with the preprocess_input function.
from keras.applications.vgg16 import preprocess_input 

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

path = r"C:\Users\matan\My PC (DESKTOP-RLTMVS3)\Desktop\פרויקט גמר\dataset\dark photos"
# Make the image directory the working directory so that every image can be
# opened later by its bare file name.
os.chdir(path)

# Collect the names of all entries directly inside the image directory whose
# name ends in '.jpg' (the suffix check is case-sensitive and sub-directories
# are not descended into).
with os.scandir(path) as entries:
    flowers = [entry.name for entry in entries if entry.name.endswith('.jpg')]

In [7]:
flowers

['1.jpg',
 '1217.jpg',
 '1220.jpg',
 '1221.jpg',
 '1237.jpg',
 '1238.jpg',
 '1239.jpg',
 '1247.jpg',
 '1248.jpg',
 '1250.jpg',
 '1251.jpg',
 '1257.jpg',
 '1261.jpg',
 '1277.jpg',
 '1279.jpg',
 '1290.jpg',
 '1297.jpg',
 '1318.jpg',
 '1330.jpg',
 '1337.jpg',
 '1338.jpg',
 '1341.jpg',
 '1349.jpg',
 '1350.jpg',
 '1377.jpg',
 '1378.jpg',
 '1379.jpg',
 '1397.jpg',
 '1399.jpg',
 '1400.jpg',
 '1409.jpg',
 '1418.jpg',
 '1440.jpg',
 '1441.jpg',
 '1448.jpg',
 '1449.jpg',
 '214.jpg',
 '222.jpg',
 '24.jpg',
 '259.jpg',
 '268.jpg',
 '287.jpg',
 '289.jpg',
 '297.jpg',
 '300.jpg',
 '301.jpg',
 '308.jpg',
 '309.jpg',
 '310.jpg',
 '317.jpg',
 '320.jpg',
 '327.jpg',
 '350.jpg',
 '366.jpg',
 '370.jpg',
 '377.jpg',
 '379.jpg',
 '380.jpg',
 '391.jpg',
 '394.jpg',
 '403.jpg',
 '404.jpg',
 '419.jpg',
 '422.jpg',
 '433.jpg',
 '437.jpg',
 '44.jpg',
 '441.jpg',
 '460.jpg',
 '462.jpg',
 '467.jpg',
 '473.jpg',
 'A1.jpg',
 'A13.jpg',
 'A133.jpg',
 'A134.jpg',
 'A135.jpg',
 'A14.jpg',
 'A15.jpg',
 'A2.jpg',
 'A25.jp

# The model
We use a pre-trained neural network to extract a feature vector from each image, and then cluster the images based on how similar their feature vectors are.

The pre-trained model used here is the VGG19 convolutional neural network (CNN), a widely used architecture for image recognition tasks. We use this model as a feature extractor only: we remove the final (prediction) layer so that the model outputs a feature vector instead of class predictions.

In [1]:
from keras.applications.vgg19 import VGG19
from keras.models import Model

# Instantiate VGG19 with its default arguments, then wrap it in a new Model
# that takes the same inputs but stops at the second-to-last layer, so that
# calling it yields that layer's output rather than the final layer's.
vgg19 = VGG19()
model = Model(inputs=vgg19.inputs, outputs=vgg19.layers[-2].output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels.h5


## Data Preprocessing
This is where we put the load_img() and preprocess_input() methods to use. When loading the images we are going to set the target size to (224, 224) because the VGG model expects the images it receives to be 224x224 NumPy arrays.

In [14]:
def extract_features(file, model):
    """Return the feature vector that `model` produces for one image file.

    Args:
        file: Path of the image to load (anything `load_img` accepts).
        model: Object whose `predict` method accepts a batch of shape
            (1, 224, 224, 3).

    Returns:
        Whatever `model.predict` returns for the single-image batch.
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224, 224))
    # convert from 'PIL.Image.Image' to a numpy array
    img = np.array(img)
    # add the leading batch axis: (num_of_samples, dim 1, dim 2, channels)
    batch = img.reshape(1, 224, 224, 3)
    # prepare the batch for the model
    batch = preprocess_input(batch)
    # NOTE: `use_multiprocessing` is deliberately not passed. Keras documents
    # it as relevant only for generator / Sequence inputs (this is a plain
    # array), and Keras 3 dropped the argument from predict(), so passing it
    # there raises TypeError.
    return model.predict(batch)

In [15]:
data = {}

# loop through each image in the dataset
for flower in flowers:
    # try to extract the features for this image
    try:
        feat = extract_features(flower, model)
    # Report which image failed and why, then carry on with the rest.
    # Only Exception is caught so that KeyboardInterrupt / SystemExit can
    # still stop a long-running extraction.
    except Exception as err:
        print(f"error extracting feature from the image {flower}: {err!r}")
    else:
        # store the features under the image's file name
        data[flower] = feat

# get an array of the filenames whose features were extracted successfully
filenames = np.array(list(data.keys()))

In [16]:
# Gather every stored feature array and flatten the collection into a single
# 2-D matrix with 4096 columns (the number of rows is inferred from the data).
feat = np.asarray(list(data.values())).reshape(-1, 4096)

# The candidate labels: the 26 lowercase letters 'a' through 'z'.
unique_labels = [chr(code) for code in range(97, 123)]

## Dimensionality Reduction
Simply put, if you are working with data and have a lot of variables to consider (in our case 4096), PCA allows you to reduce the number of variables while preserving as much information from the original set as possible.

The number of dimensions to reduce down to is up to you and I'm sure there's a method for finding the best number of components to use, but for this case, I just chose 100 as an arbitrary number.

In [17]:
# PCA for reducing the dimensions of our feature vector
from sklearn.decomposition import PCA

# Fit a 100-component PCA on the feature matrix (fixed seed so that repeated
# runs agree), then project that same matrix onto the fitted components.
pca = PCA(n_components=100, random_state=22).fit(feat)
x = pca.transform(feat)

## KMeans clustering
This algorithm will allow us to group our feature vectors into k clusters. Each cluster should contain images that are visually similar. In this case, we create one cluster per entry in `unique_labels` (the 26 lowercase letters a–z), so k = 26.

In [18]:
# clustering and dimension reduction
from sklearn.cluster import KMeans

# Cluster the reduced feature vectors, using one cluster per entry in
# unique_labels and a fixed seed so that repeated runs agree.
cluster_count = len(unique_labels)
kmeans = KMeans(random_state=22, n_clusters=cluster_count)
kmeans.fit(x)

KMeans(n_clusters=26, random_state=22)

In [19]:
# Map each cluster id to the list of image filenames assigned to it:
# { id: [images] }
groups = {}
for image_name, cluster_id in zip(filenames, kmeans.labels_):
    # setdefault creates the empty list the first time a cluster id is seen
    groups.setdefault(cluster_id, []).append(image_name)

In [50]:
# function that lets you view a cluster (based on identifier)
def view_cluster(cluster):
    """Plot the images that belong to one cluster, at most 30 of them.

    Args:
        cluster: Key into the module-level `groups` dict, whose value is the
            list of image filenames in that cluster.
    """
    # upper bound on how many images are drawn in one figure
    max_images = 30
    plt.figure(figsize=(25, 25))
    # gets the list of filenames for a cluster
    files = groups[cluster]
    # keep exactly the first `max_images` files, matching the printed message
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to {max_images}")
        files = files[:max_images]
    # plot each image in its own cell of a 10x10 grid
    for index, file in enumerate(files):
        plt.subplot(10, 10, index + 1)
        img = load_img(file)
        img = np.array(img)
        plt.imshow(img)
        plt.axis('off')

In [20]:
import shutil

parent_dir = r"C:\Users\matan\My PC (DESKTOP-RLTMVS3)\Desktop\פרויקט גמר\dataset\dark photos"

# Move every clustered image into a sub-folder of parent_dir named after its
# cluster id. Iterating over the keys that actually exist (in ascending order)
# avoids assuming the ids are exactly 0..len(groups)-1.
for i in sorted(groups):
    members = groups[i]
    # a dedicated name is used so the module-level `path` is not overwritten
    cluster_dir = os.path.join(parent_dir, str(i))
    # exist_ok lets this cell be re-run when the folder is already there
    os.makedirs(cluster_dir, exist_ok=True)
    for s in members:
        # file names are resolved against the current working directory;
        # anything that is not a file there (e.g. already moved) is skipped
        if os.path.isfile(s):
            shutil.move(s, cluster_dir)
    print(i, members)

0 ['N13.jpg', 'N14.jpg', 'N15.jpg', 'N16.jpg', 'N25.jpg', 'N26.jpg', 'N27.jpg', 'N28.jpg']
1 ['259.jpg', '268.jpg', '287.jpg', '289.jpg', '297.jpg', '300.jpg', '308.jpg', '309.jpg', '310.jpg', '317.jpg', '320.jpg', '327.jpg', '350.jpg', '366.jpg', '370.jpg', '377.jpg', '379.jpg', '380.jpg', '419.jpg', '437.jpg', '441.jpg', '460.jpg', '462.jpg']
2 ['J13.jpg', 'J14.jpg', 'J15.jpg', 'J25.jpg', 'J26.jpg', 'J27.jpg', 'J37.jpg', 'J38.jpg', 'J39.jpg', 'Y13.jpg', 'Y14.jpg']
3 ['1257.jpg', '1261.jpg', '1290.jpg', '1297.jpg', '1318.jpg', '1330.jpg', '1337.jpg', '1338.jpg', '1341.jpg', '1377.jpg', '1378.jpg', '1379.jpg', '1397.jpg', '1399.jpg', '1400.jpg', '1409.jpg', '1418.jpg', '1448.jpg', '1449.jpg']
4 ['T13.jpg', 'T14.jpg', 'T15.jpg', 'T16.jpg', 'T17.jpg', 'T25.jpg', 'T26.jpg', 'T27.jpg', 'T28.jpg', 'T29.jpg', 'T37.jpg', 'T38.jpg', 'T39.jpg', 'T40.jpg', 'T41.jpg']
5 ['del1310.jpg', 'del1313.jpg', 'del1528.jpg', 'del1603.jpg']
6 ['C1.jpg', 'C13.jpg', 'C14.jpg', 'C15.jpg', 'C2.jpg', 'C25.jpg', 