In [84]:
# Known feature dumps: dataset name -> (path to features.csv, number of classes).
# Keeping path and class count together stops them drifting apart when the
# dataset is switched.
DATASET_CONFIGS = {
    'cifar10': ('../outputs/2023-04-11/10-50-20/features.csv', 10),
    'matek': ('../outputs/2023-04-09/12-50-29/features.csv', 15),
}

# Change this single value to evaluate a different dataset.
DATASET = 'cifar10'

FEATURES_PATH, NUM_CLASSES = DATASET_CONFIGS[DATASET]

# When True, features are standardised with StandardScaler before clustering.
STANDARD_SCALING = False

In [85]:
import re
import csv
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from scipy.optimize import linear_sum_assignment

In [86]:

def _hungarian_match(flat_preds, flat_targets, preds_k, targets_k):
    # Based on implementation from IIC
    num_samples = flat_targets.shape[0]

    assert (preds_k == targets_k)  # one to one
    num_k = preds_k
    num_correct = np.zeros((num_k, num_k))

    for c1 in range(num_k):
        for c2 in range(num_k):
            # elementwise, so each sample contributes once
            votes = int(((flat_preds == c1) * (flat_targets == c2)).sum())
            num_correct[c1, c2] = votes

    # num_correct is small
    match = linear_sum_assignment(num_samples - num_correct)
    match = np.array(list(zip(*match)))

    # return as list of tuples, out_c to gt_c
    res = []
    for out_c, gt_c in match:
        res.append((out_c, gt_c))

    return res

def accuracy(predictions, targets, num_classes=None):
    """Clustering accuracy under the best one-to-one cluster-to-class mapping.

    Cluster ids are matched to class ids with ``_hungarian_match``; a sample is
    correct when its cluster's matched class equals its target.

    Args:
        predictions: 1-D numpy array of predicted cluster ids.
        targets: 1-D numpy array of ground-truth class ids, aligned with
            ``predictions``.
        num_classes: number of clusters/classes to match. Defaults to the
            module-level ``NUM_CLASSES`` when omitted.

    Returns:
        Fraction of samples that are correct, as a float.

    Raises:
        ZeroDivisionError: if ``targets`` is empty.
    """
    if num_classes is None:
        num_classes = NUM_CLASSES

    match = _hungarian_match(predictions, targets, preds_k=num_classes, targets_k=num_classes)

    # Count hits per matched (cluster, class) pair. Samples whose prediction id
    # is not covered by any pair contribute nothing, instead of defaulting to
    # class 0 and possibly being scored as correct.
    num_hits = 0
    for pred_i, target_i in match:
        hits = (predictions == int(pred_i)) & (targets == int(target_i))
        num_hits += int(hits.sum())

    return num_hits / float(targets.size)

In [87]:
# Load feature vectors from the CSV: the 'name' column is the key and the
# 'feature' column holds the vector as text (presumably "[v0, v1, ...]" --
# the first and last characters are dropped before parsing).
features = {}
# newline='' lets the csv module do its own newline handling, as its docs require.
with open(FEATURES_PATH, 'r', newline='') as csv_file:
    for row in csv.DictReader(csv_file):
        # Split on bare commas: float() ignores surrounding whitespace, so both
        # "0.1, 0.2" and "0.1,0.2" parse.
        raw_values = row['feature'][1:-1].split(',')
        features[row['name']] = np.array([float(el) for el in raw_values])

print(f'features: {len(features)}\nfeature dim: {next(iter(features.values())).shape}')

features: 50000
feature dim: (512,)


In [88]:
def get_label(name):
    """Extract the class label embedded in a sample name.

    The label is the run of non-underscore characters immediately before a
    ``_<digits>`` part that is followed by a non-digit tail running to the end
    of the string (e.g. ``'airplane_0001.png'`` -> ``'airplane'``).

    Raises:
        AttributeError: if ``name`` does not match the pattern.
    """
    found = re.search(r'([^_]+)_[0-9]+[^0-9]+$', name)
    return found.group(1)

In [89]:
# Collect the distinct label names (np.unique returns them sorted) and give
# each one its position in that list as an integer index.
sample_labels = [get_label(sample_name) for sample_name in features]
labels = np.unique(sample_labels).tolist()
label_to_index = dict(zip(labels, range(len(labels))))
print(labels)
print(label_to_index)

['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
{'airplane': 0, 'automobile': 1, 'bird': 2, 'cat': 3, 'deer': 4, 'dog': 5, 'frog': 6, 'horse': 7, 'ship': 8, 'truck': 9}


In [90]:
# Stack the feature vectors into X and the matching integer labels into y;
# both follow the iteration order of the `features` dict.
X = np.array(list(features.values()))
y = np.array([label_to_index[get_label(sample_name)] for sample_name in features])

# Optional standardisation, controlled by STANDARD_SCALING.
if STANDARD_SCALING:
    X = StandardScaler().fit_transform(X)

print(X.shape)
print(y.shape)

(50000, 512)
(50000,)


In [91]:
# Cluster the features into NUM_CLASSES groups; random_state is fixed to 0.
kmeans = KMeans(n_clusters=NUM_CLASSES, n_init='auto', random_state=0)
kmeans.fit(X)
y_pred = kmeans.labels_

In [92]:
# Report clustering quality against the ground-truth labels, one metric at a time.
acc = accuracy(y_pred, y)
print(f'accuracy: {acc}')

nmi = metrics.normalized_mutual_info_score(y, y_pred)
print(f'normalized mutual information: {nmi}')

ari = metrics.adjusted_rand_score(y, y_pred)
print(f'adjusted rand score: {ari}')

accuracy: 0.51886
normalized mutual information: 0.46857903223959696
adjusted rand score: 0.32299032023469637
