In [12]:
import glob
import os
import shutil
import multiprocessing
import cv2
import numpy as np
from collections import Counter
from tensorflow.keras.applications import VGG16, ResNet50, ResNet50V2
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

## Global

### Variables

In [13]:
# dataset_dir = "data/traffic-small"
dataset_dir = "data/traffic"
dataset = glob.glob(dataset_dir + "/*.jpg")

feature_saved_dir = "extracted_features"

pretrain_input_size = (64, 64)

n_clusters = 14

### Utils

In [3]:
def save_result_as_file(prediction, file_name="prediction.dat"):
    r""" Save the predicted result as a new file """
    file_content = "\n".join(list(map(str, prediction)))
    with open(file_name, "w") as fd:
        fd.write(file_content) 

## Analytics
Analyze the data distribution first, so we can decide how to resize the image

In [31]:
dim_to_key = lambda dim: "({},{})".format(dim[0], dim[1])

In [32]:
cnt = Counter()

images = []
for path in dataset:
    img = cv2.imread(path)
    key = dim_to_key(img.shape)
    cnt[key] += 1

In [33]:
cnt.most_common()

[('(61,35)', 477),
 ('(61,33)', 349),
 ('(44,24)', 304),
 ('(57,27)', 300),
 ('(65,33)', 295),
 ('(67,32)', 286),
 ('(62,29)', 268),
 ('(57,29)', 263),
 ('(56,33)', 262),
 ('(72,40)', 258),
 ('(62,39)', 257),
 ('(55,32)', 252),
 ('(54,23)', 251),
 ('(49,22)', 234),
 ('(60,36)', 227),
 ('(52,25)', 224),
 ('(52,27)', 220),
 ('(55,26)', 214),
 ('(57,28)', 213),
 ('(46,27)', 211),
 ('(81,42)', 210),
 ('(63,32)', 207),
 ('(53,32)', 205),
 ('(56,24)', 205),
 ('(71,38)', 204),
 ('(70,32)', 200),
 ('(46,24)', 198),
 ('(62,34)', 196),
 ('(59,34)', 195),
 ('(77,146)', 194),
 ('(63,34)', 192),
 ('(72,38)', 190),
 ('(97,38)', 186),
 ('(64,35)', 185),
 ('(59,32)', 185),
 ('(59,26)', 185),
 ('(53,24)', 184),
 ('(63,28)', 183),
 ('(54,24)', 181),
 ('(54,26)', 178),
 ('(47,21)', 175),
 ('(57,26)', 175),
 ('(50,24)', 174),
 ('(51,27)', 172),
 ('(99,44)', 171),
 ('(54,33)', 169),
 ('(57,31)', 169),
 ('(59,31)', 168),
 ('(64,33)', 167),
 ('(57,30)', 162),
 ('(48,21)', 162),
 ('(56,30)', 161),
 ('(44,26)'

## Resize

In [4]:
images = []
for path in dataset:
    img = cv2.imread(path)
    img = cv2.resize(img, pretrain_input_size)
    img = img / 255.0  # Normalization
    images.append(img)

In [5]:
train_input = np.array(images)

## Feature Extraction 

In [15]:
def init_feature_saved_dir():
    if not os.path.exists(feature_saved_dir) or \
        len(os.listdir(feature_saved_dir)) == 0:
        try:
            shutil.rmtree(feature_saved_dir)
        except:
            pass

        os.mkdir(feature_saved_dir)

In [16]:
supported_models = ["vgg", "reset50", "reset50v2"]

def save_batch_features(x, model_name="vgg", pooling="avg"):
    assert(model_name in supported_models)
    
    init_feature_saved_dir()
    cpus = multiprocessing.cpu_count()
    
    if model_name == "vgg":
        model = VGG16(weights='imagenet',
                      include_top=False,
                      input_shape=pretrain_input_size + (3,),
                      pooling=pooling)
        
    elif model_name == "reset50":
        model = ResNet50(weights='imagenet',
                         include_top=False,
                         input_shape=pretrain_input_size + (3,),
                         pooling=pooling)

    elif model_name == "reset50v2":
        model = ResNet50V2(weights='imagenet',
                           include_top=False,
                           input_shape=pretrain_input_size + (3,),
                           pooling=pooling)
    
    y = model.predict(x,
                      workers=max(1, cpus - 1),
                      use_multiprocessing=True)

    np.save("%s/features-%s" % (feature_saved_dir, model_name), y)
    
    return y

In [17]:
features_vgg = save_batch_features(train_input)
features_resnet50 = save_batch_features(train_input, "reset50")
features_resnet50v2 = save_batch_features(train_input, "reset50v2")

Downloading data from https://github.com/keras-team/keras-applications/releases/download/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Exception: URL fetch failure on https://github.com/keras-team/keras-applications/releases/download/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5: None -- [Errno 111] Connection refused

## Preprocess

### Load data (optional)

In [None]:
x = features_resnet50v2
# x = np.load(os.path.join(feature_saved_dir, "features-vgg.npy"))

### Flatten

In [None]:
x_flatten = np.array([np.ndarray.flatten(img) for img in x])

In [None]:
x_flatten.shape

### PCA

In [None]:
def get_pca_data(x_train):
    def percvar(v):
        r"""Transform eigen/singular values into percents.
        Return: vector of percents, prefix vector of percents
        """
        # sort values
        s = np.sort(np.abs(v))
        # reverse sorting order
        s = s[::-1]
        # normalize
        s = s/np.sum(s)
        return s, np.cumsum(s)

    def perck(s, p):
        return next(i + 1 for i, v in enumerate(s) if v >= p)

    X_std = StandardScaler().fit_transform(x_train)
    means = np.mean(X_std, axis=0)
    X_sm = X_std - means
    
    U,s,V = np.linalg.svd(X_sm)
    _, pv = percvar(s**2/(X_sm.shape[0]-1))

    percentage_explained = 95
    n_components = perck(pv, percentage_explained * 0.01)
    
    print("Original: %d. After PCA: %d" % (x_train.shape[-1], n_components))

    svd = TruncatedSVD(n_components=n_components)
    svd.fit(x_train)

    return svd.transform(x_train), svd

In [None]:
x_flatten, _ = get_pca_data(x_flatten)

## Clustering

In [9]:
model = KMeans(n_clusters=n_clusters)
# model2 = DBSCAN(n_clusters=n_clusters)

In [13]:
model.fit(x_flatten)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=14, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [16]:
labels = list(map(lambda x: x + 1, model.labels_.tolist())) # starts from 1

In [21]:
all(label <= 14 and label >= 1 for label in labels)

True

In [20]:
save_result_as_file(labels, "kmeans_res50_pca.dat")

In [18]:
labels

[6,
 9,
 5,
 12,
 1,
 13,
 13,
 8,
 8,
 3,
 13,
 5,
 14,
 4,
 2,
 2,
 4,
 2,
 14,
 9,
 4,
 14,
 5,
 3,
 11,
 12,
 10,
 3,
 3,
 7,
 4,
 2,
 4,
 10,
 12,
 11,
 5,
 3,
 10,
 4,
 2,
 14,
 2,
 5,
 9,
 6,
 6,
 12,
 13,
 5,
 1,
 8,
 4,
 9,
 2,
 4,
 13,
 4,
 13,
 5,
 6,
 2,
 1,
 8,
 9,
 3,
 13,
 9,
 9,
 8,
 2,
 12,
 7,
 2,
 1,
 10,
 10,
 4,
 5,
 5,
 1,
 14,
 13,
 2,
 7,
 4,
 4,
 7,
 5,
 3,
 5,
 2,
 7,
 12,
 9,
 13,
 11,
 9,
 5,
 3,
 6,
 4,
 6,
 2,
 14,
 7,
 11,
 2,
 1,
 14,
 4,
 11,
 14,
 12,
 7,
 5,
 1,
 13,
 6,
 13,
 13,
 5,
 2,
 7,
 2,
 4,
 13,
 3,
 7,
 7,
 9,
 9,
 4,
 5,
 9,
 14,
 9,
 9,
 4,
 4,
 2,
 11,
 9,
 12,
 6,
 7,
 3,
 9,
 14,
 1,
 10,
 11,
 5,
 10,
 14,
 7,
 6,
 9,
 13,
 13,
 10,
 6,
 13,
 6,
 14,
 12,
 13,
 7,
 1,
 11,
 9,
 11,
 5,
 7,
 13,
 6,
 7,
 14,
 5,
 7,
 2,
 5,
 12,
 12,
 5,
 11,
 7,
 3,
 11,
 9,
 13,
 14,
 5,
 5,
 5,
 8,
 9,
 5,
 2,
 10,
 8,
 3,
 7,
 5,
 1,
 2,
 8,
 2,
 2,
 3,
 9,
 6,
 7,
 14,
 6,
 4,
 3,
 2,
 8,
 6,
 7,
 6,
 13,
 6,
 2,
 9,
 2,
 13,
 7,
 6,
 3,
 6,
 4,
 