In [1]:
from sklearn.metrics.cluster import supervised
from scipy.optimize import linear_sum_assignment

def accuracy(labels_true, labels_pred):
    """Clustering accuracy under the best one-to-one class/cluster matching.

    Cluster ids are arbitrary, so each true class is paired with at most one
    predicted cluster such that the total number of agreeing samples is as
    large as possible; that total is then divided by the number of samples.

    Parameters
    ----------
    labels_true : int array with ground truth labels, shape = [n_samples]
    labels_pred : int array with estimated labels, shape = [n_samples]

    Returns
    -------
    Fraction of samples covered by the optimal class/cluster pairing.
    """
    # Both label vectors are passed through sklearn's check_clusterings first.
    labels_true, labels_pred = supervised.check_clusterings(labels_true, labels_pred)
    # counts[i, j] = number of samples in true class i and predicted cluster j
    # (one row per true class, one column per predicted cluster).
    counts = supervised.contingency_matrix(labels_true, labels_pred)
    # linear_sum_assignment minimises total cost, so negate to maximise overlap.
    class_idx, cluster_idx = linear_sum_assignment(-counts)
    n_matched = counts[class_idx, cluster_idx].sum()
    return n_matched / len(labels_true)

In [2]:
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Flatten, Activation, add
from keras.optimizers import SGD
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import initializers
from keras.engine import Layer, InputSpec
from keras import backend as K

from resnet_keras import resnet152_model

# Path to the ResNet weights file (obtained by pretraining on ImageNet)
weights_path = 'resnet_data/resnet152_weights_tf.h5'

# Build the 152-layer ResNet via the local resnet_keras module, passing it the weights path
model = resnet152_model(weights_path)
# SGD optimizer - not important since we aren't training this model;
# the model is only ever used for forward passes through predict().
sgd = SGD(lr=1e-2, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

# Name of the layer whose output is used as the image feature.
# NOTE(review): the original note said "one of the final ResBlocks", but the
# name suggests this is the average-pooling layer - confirm against resnet_keras.
layer_name = 'avg_pool'

# Sub-model that shares the ResNet's input and stops at `layer_name`, so that
# predict() returns that layer's activations instead of the network's final output.
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
import scipy.io

def get_labels_from_file(data_name, train=True):
    """Load the label vector stored in ``data/<data_name>-<split>.mat``.

    Parameters
    ----------
    data_name : str
        Dataset prefix of the .mat file, e.g. ``'digits'`` or ``'objects'``.
    train : bool, optional
        Which split to read, mirroring ``get_resnet_features_from_file``.
        ``True`` (the default, and the previously hard-coded behaviour) reads
        the variable ``labels_train`` from the ``-train`` file. ``False``
        reads ``labels_test`` from the ``-test`` file; this assumes the test
        file follows the same naming scheme and actually ships labels.

    Returns
    -------
    numpy.ndarray
        The labels with all singleton dimensions removed.

    Raises
    ------
    KeyError
        If the file contains no label variable for the requested split.
    """
    split = 'train' if train else 'test'
    # Spell out the extension, as the feature loader does, instead of relying
    # on loadmat appending '.mat' implicitly.
    path = 'data/' + data_name + '-' + split + '.mat'
    mat = scipy.io.loadmat(path)

    labels_name = 'labels_' + split
    if labels_name not in mat:
        raise KeyError("no variable '%s' in %s" % (labels_name, path))
    # loadmat returns 2-D arrays; squeeze to get a flat label vector.
    return np.squeeze(mat[labels_name])

def get_resnet_features_from_file(data_name, train=True):
    """Compute ResNet features for every image of one dataset split.

    Reads ``data/<data_name>-train.mat`` or ``data/<data_name>-test.mat``,
    converts the stored images to 224x224 three-channel arrays, feeds them to
    the module-level ``intermediate_layer_model`` and returns its output
    averaged over axes 1 and 2.

    Parameters
    ----------
    data_name : str
        Dataset prefix of the .mat file.
    train : bool, optional
        Read the train split if True (default), otherwise the test split.
    """
    if train:
        file_suffix, images_name = 'train.mat', 'images_train'
    else:
        file_suffix, images_name = 'test.mat', 'images_test'
    mat = scipy.io.loadmat('data/' + data_name + '-' + file_suffix)

    # presumably a 2-D array with one flattened 28x28 image per column - TODO confirm
    raw = mat[images_name]
    # Copy every value three times along a new last axis (the channel axis).
    three_channel = np.repeat(raw[:, :, np.newaxis], 3, axis=2)
    # Swap the first two axes, then regroup into (n_images, 28, 28, 3).
    x = np.transpose(three_channel, (1, 0, 2)).reshape((-1, 28, 28, 3))
    # No input normalisation is applied (scaling by 256 and mean subtraction
    # were tried and left disabled).

    # NOTE: imrotate / imresize are deprecated since SciPy 1.0.0 and scheduled
    # for removal in 1.2.0 (see the warning this notebook prints).
    resized = []
    for image in x:
        rotated = scipy.misc.imrotate(image, -90)
        resized.append(scipy.misc.imresize(rotated, size=(224, 224), interp='bilinear'))
    large_images = np.array(resized)

    intermediate_output = intermediate_layer_model.predict(large_images)
    # Average over spatial components of feature activations
    return np.mean(intermediate_output, axis=(1, 2))

In [4]:
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import sklearn.cluster

# Run Birch clustering on MNIST HOG features
# MNIST is much simpler than CIFAR-10, so HOG is sufficient

# Read the precomputed HOG descriptors of the digits test split.
mat = scipy.io.loadmat('data/digits-test.mat')
feat = mat['fea_hog_test']

# Filled only by the (currently disabled) threshold search below.
thresholds = []

# Disabled experiment: draw 100 thresholds uniformly from [0.69, 0.85), cluster
# with each one and record (threshold, accuracy) pairs in `thresholds`:
# for t in np.random.uniform(0.69,.85,[100]):
# feat is transposed before clustering - presumably it is stored one sample per column.
clusters = sklearn.cluster.Birch(n_clusters=5, threshold=0.7).fit_predict(np.transpose(feat))
# print(t, ' ', accuracy(get_labels_from_file('digits'), clusters))
# thresholds.append((t, accuracy(get_labels_from_file('digits'), clusters)))
# Write one cluster id per line; +1 shifts the ids so they start at 1 instead of 0.
np.savetxt('digits.csv', clusters + 1, fmt='%i', delimiter=',', newline='\n')

In [5]:
import sklearn.manifold
# Embed the digits HOG features into 3 dimensions with t-SNE before clustering.
# t-SNE draws on a random number generator, so random_state is pinned: without
# it the embedding (and every score derived from it) differs on each run and a
# fresh "Restart & Run All" cannot reproduce the recorded output.
# feat is transposed first - presumably it is stored one sample per column.
f = sklearn.manifold.TSNE(n_components=3, init='pca', random_state=0).fit_transform(np.transpose(feat))
print(f)

[[  8.948041   14.744697   -7.652679 ]
 [-22.075785   -0.9811244  -5.2749605]
 [ -1.0242753   7.7488575  14.418346 ]
 ...
 [  8.497024   10.989828    2.6422594]
 [-16.34447     2.581515   -9.9346075]
 [ 17.65837     0.9501796  -2.8437228]]


In [6]:
# Cluster the 3-D t-SNE embedding `f` of the digits features into 5 groups.
clusters = sklearn.cluster.Birch(n_clusters=5, threshold=0.7).fit_predict(f)
# NOTE(review): `f` is derived from data/digits-test.mat, whereas
# get_labels_from_file('digits') reads labels_train from the -train file, so
# this compares test-set clusters with train-set labels. The score is only
# meaningful if both refer to the same samples - confirm.
print(accuracy(get_labels_from_file('digits'), clusters))
# Overwrites the digits.csv written by the HOG-only cell; +1 makes the ids start at 1.
np.savetxt('digits.csv', clusters + 1, fmt='%i', delimiter=',', newline='\n')

0.2092


In [7]:
import scipy

# CIFAR-10 pipeline, step 1: describe every image of the 'objects' dataset by
# the activations of a ResNet pretrained on ImageNet. Clustering happens in
# the cells below.

filename = 'objects'

# train=False selects the test split (data/objects-test.mat).
resnet_features = get_resnet_features_from_file(filename, train=False)

import sklearn.decomposition 

`imrotate` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.rotate`` instead.
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.


In [8]:
# Reduce the ResNet features to 3 dimensions with t-SNE before clustering.
# random_state is pinned because t-SNE draws on a random number generator;
# unseeded, the embedding and the downstream scores change on every run.
# Alternative that was tried: a 10-component PCA
# reduced_features = sklearn.decomposition.PCA(10).fit_transform(resnet_features)

reduced_features = sklearn.manifold.TSNE(n_components=3, init='pca', random_state=0).fit_transform(resnet_features)

In [9]:
# Cluster the 3-D embedding of the ResNet features into 5 groups with Birch.
clusters = sklearn.cluster.Birch(n_clusters=5, threshold=0.65).fit_predict(reduced_features)

# Alternative clusterers that were tried (note the first one runs on the raw,
# unreduced ResNet features):
# clusters = sklearn.cluster.MiniBatchKMeans(n_clusters=5, batch_size=33).fit_predict(resnet_features)

# clusters = sklearn.cluster.KMeans(n_clusters=5).fit_predict(reduced_features)

# If testing, print accuracy
# NOTE(review): get_labels_from_file reads labels_train from the -train file;
# if the features were extracted from the test split (train=False), this
# compares test-set clusters with train-set labels - confirm the splits match.
print(accuracy(get_labels_from_file(filename), clusters))
# Write one cluster id per line to '<filename>.csv'; +1 makes the ids start at 1.
np.savetxt(filename + '.csv', clusters + 1, fmt='%i', delimiter=',', newline='\n')

0.2228


In [None]:
# Re-cluster the same embedding with a Birch threshold of 0.7 (the cell above
# uses 0.65). Running this replaces `clusters` for every later cell.
# NOTE(review): this cell has no execution count, so it may never have been
# run in the saved session - confirm whether it is still needed.
clusters = sklearn.cluster.Birch(n_clusters=5, threshold=0.7).fit_predict(reduced_features)

In [10]:
# TODO: after getting 0.8226 average accuracy: do a hyperparameter grid search

In [130]:

# Score whatever `clusters` currently holds against the train labels of `filename`.
# NOTE(review): this cell's execution count (130) is far out of sequence, so
# the recorded score depends on kernel state that the cells above may not
# reproduce - re-run from a fresh kernel to confirm it.
print(accuracy(get_labels_from_file(filename), clusters))

0.615
