In [13]:
import numpy as np
import pandas as pd

from tasksim import *

from graspologic.embed import AdjacencySpectralEmbed as ASE
from graspologic.cluster import GaussianCluster as GMM

from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score as ARI

import torchvision

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from joblib import Parallel, delayed
import pickle
import time
from tqdm import tqdm
import os

def evaluate_clusters(f, truth, preds, calculate_random=False, n_mc=500, acorn=None):
    """Score predicted labels against reference labels, optionally with a shuffle baseline.

    Parameters
    ----------
    f : callable
        Metric invoked as ``f(truth, preds)``.
    truth : array-like
        Reference labels; passed to ``f`` unchanged.
    preds : array-like
        Predicted labels.
    calculate_random : bool, default False
        When True, also report the mean of ``f`` over ``n_mc`` random
        permutations of ``preds``.
    n_mc : int, default 500
        Number of random permutations used for the baseline.
    acorn : int or None, default None
        Seed for the permutations. With ``None`` the permutations are drawn
        from NumPy's global random state. With a value, a private
        ``RandomState`` seeded with it is used, so the baseline is repeatable
        and the global random state is left untouched.

    Returns
    -------
    The value of ``f(truth, preds)`` when ``calculate_random`` is False,
    otherwise the tuple ``(f(truth, preds), mean_shuffled_score)``.
    """
    eval_pred = f(truth, preds)

    if not calculate_random:
        return eval_pred

    # The module-level functions share the global stream, so the unseeded path
    # consumes random numbers exactly as it did before `acorn` was honoured.
    rng = np.random if acorn is None else np.random.RandomState(acorn)

    eval_random = np.zeros(n_mc)
    for i in range(n_mc):
        # Sampling every element without replacement is a random permutation.
        shuffled_preds = rng.choice(preds, size=len(preds), replace=False)
        eval_random[i] = f(truth, shuffled_preds)

    return eval_pred, np.mean(eval_random)

In [2]:
#- Data processing 1

class Dataset:
    """Container for one split of the pickled CIFAR-100 embedding file.

    The pickle is indexed as ``payload[split][0]`` for the feature data and
    ``payload[split][1]`` for the labels, where ``split`` is 0 for the training
    split and 1 for the test split. The labels are passed through
    ``np.concatenate``, so they are presumably stored as a sequence of
    per-batch arrays -- confirm against the code that wrote the file.

    Parameters
    ----------
    file : str
        Path to the embedding pickle. The file name (the directory is ignored)
        must be ``cifar_100_Bit_m-r101x1_embd.p``.
    train : bool, default True
        Truthy selects the training split, falsy selects the test split.
    classes : list or None, default None
        Class names stored on the instance as given; ``None`` yields a fresh
        empty list.

    Attributes set: ``data``, ``targets``, ``classes``.

    Raises
    ------
    ValueError
        If ``file`` does not name the supported embedding pickle. Previously
        such a path produced an object without ``data``/``targets``.
    """

    # File name of the only pickle layout this loader understands.
    _KNOWN_FILE = 'cifar_100_Bit_m-r101x1_embd.p'

    def __init__(self, file='cifar_100_Bit_m-r101x1_embd.p', train=True, classes=None):
        # Compare the file name only, so the same pickle can live in any
        # directory (the default argument itself carries no directory part).
        if os.path.basename(file) != self._KNOWN_FILE:
            raise ValueError(
                "Unsupported embedding file {!r}; expected a path ending in {!r}".format(
                    file, self._KNOWN_FILE
                )
            )

        split = 0 if train else 1

        # NOTE(security): pickle.load can execute arbitrary code -- only point
        # this at files from a trusted source.
        # Load once and close the handle; both fields come from one payload.
        with open(file, 'rb') as handle:
            payload = pickle.load(handle)

        self.data = payload[split][0]
        self.targets = np.concatenate(payload[split][1])

        # Avoid sharing one default list object across instances.
        self.classes = [] if classes is None else classes

In [3]:
#- Data processing 2

# Location of the pickled embeddings, relative to the notebook's working directory.
file = '../../../data/cifar_100_Bit_m-r101x1_embd.p'

# torchvision's CIFAR-100 object supplies the `classes` list handed to both splits below.
cif100 = torchvision.datasets.CIFAR100(root='./data', train=True, download=True)

# Training and test splits are read from the same embedding file.
trainset = Dataset(file=file, train=True, classes=cif100.classes)
testset = Dataset(file=file, train=False, classes=cif100.classes)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Extracting ./data/cifar-100-python.tar.gz to ./data


In [4]:
#- Data processing 3

# Number of feature columns to keep.
data_dimension = 2048

# Project only when the data has more columns than requested. The projection is
# fitted on the training split and then applied to both splits, replacing
# their `data` attributes.
if trainset.data.shape[1] > data_dimension:
    pca = PCA(n_components=data_dimension).fit(trainset.data)
    trainset.data = pca.transform(trainset.data)
    testset.data = pca.transform(testset.data)

In [5]:
#- Data processing 4

# Coarse class name -> the five fine class names grouped under it.
# NOTE(review): 'small mammals' is the only key written with a space instead of
# an underscore; it is kept as-is because the key text is runtime data.
coarse_to_fine_map = {
    'aquatic_mammals': ['beaver', 'dolphin', 'otter', 'seal', 'whale'],
    'fish': ['aquarium_fish', 'flatfish', 'ray', 'shark', 'trout'],
    'flowers': ['orchid', 'poppy', 'rose', 'sunflower', 'tulip'],
    'food_containers': ['bottle', 'bowl', 'can', 'cup', 'plate'],
    'fruit_and_vegetables': ['apple', 'mushroom', 'orange', 'pear', 'sweet_pepper'],
    'household_electrical_devices': ['clock', 'keyboard', 'lamp', 'telephone', 'television'],
    'household_furniture': ['bed', 'chair', 'couch', 'table', 'wardrobe'],
    'insects': ['bee', 'beetle', 'butterfly', 'caterpillar', 'cockroach'],
    'large_carnivores': ['bear', 'leopard', 'lion', 'tiger', 'wolf'],
    'large_man-made_outdoor_things': ['bridge', 'castle', 'house', 'road', 'skyscraper'],
    'large_natural_outdoor_scenes': ['cloud', 'forest', 'mountain', 'plain', 'sea'],
    'large_omnivores_and_herbivores': ['camel', 'cattle', 'chimpanzee', 'elephant', 'kangaroo'],
    'medium-sized_mammals': ['fox', 'porcupine', 'possum', 'raccoon', 'skunk'],
    'non-insect_invertebrates': ['crab', 'lobster', 'snail', 'spider', 'worm'],
    'people': ['baby', 'boy', 'girl', 'man', 'woman'],
    'reptiles': ['crocodile', 'dinosaur', 'lizard', 'snake', 'turtle'],
    'small mammals': ['hamster', 'mouse', 'rabbit', 'shrew', 'squirrel'],
    'trees': ['maple_tree', 'oak_tree', 'palm_tree', 'pine_tree', 'willow_tree'],
    'vehicles_1': ['bicycle', 'bus', 'motorcycle', 'pickup_truck', 'train'],
    'vehicles_2': ['lawn_mower', 'rocket', 'streetcar', 'tank', 'tractor'],
}

# Coarse label -> coarse name, numbered by the key order of the table above.
coarse_number_to_coarse_name = dict(enumerate(coarse_to_fine_map))

def fine_to_coarse(coarse_to_fine):
    """Invert a coarse -> [fine, ...] mapping into a fine -> coarse dict.

    If a fine name is listed under several coarse keys, the coarse key visited
    last (in the mapping's iteration order) is the one kept.
    """
    return {
        fine_name: coarse_name
        for coarse_name in coarse_to_fine
        for fine_name in coarse_to_fine[coarse_name]
    }

# Fine class name -> coarse class name.
fine_to_coarse_map = fine_to_coarse(coarse_to_fine_map)

# Fine label <-> fine class name, following the order of trainset.classes.
fine_number_to_fine_name = dict(enumerate(trainset.classes))
fine_name_to_fine_number = {name: number for number, name in fine_number_to_fine_name.items()}

# Look up each fine label 0..99 once. The result is discarded, but a class name
# missing from either table raises KeyError here.
for i in range(100):
    fine_to_coarse_map[fine_number_to_fine_name[i]]

# Coarse class name -> coarse label, numbered in coarse_to_fine_map's key order.
coarse_name_to_coarse_number = {name: number for number, name in enumerate(coarse_to_fine_map)}

# Training split: coarse label per sample, then sample indices grouped by
# coarse label (20 groups) and by fine label (100 groups).
coarse_targets = np.array([
    coarse_name_to_coarse_number[fine_to_coarse_map[fine_number_to_fine_name[y]]]
    for y in trainset.targets
])
idx_by_coarse = np.array([np.where(coarse_targets == label)[0] for label in range(20)])
idx_by_fine = np.array([np.where(trainset.targets == label)[0] for label in range(100)])

# Test split: the same coarse-label bookkeeping.
test_coarse_targets = np.array([
    coarse_name_to_coarse_number[fine_to_coarse_map[fine_number_to_fine_name[y]]]
    for y in testset.targets
])
test_idx_by_coarse = np.array([np.where(test_coarse_targets == label)[0] for label in range(20)])

coarse_names = np.array(list(coarse_name_to_coarse_number))

# Fine label -> coarse label for the 100 fine classes.
fine_number_to_coarse_number = {
    fn: coarse_name_to_coarse_number[fine_to_coarse_map[fine_number_to_fine_name[fn]]]
    for fn in range(100)
}

# Fine labels grouped by coarse label, and the same labels flattened in that order.
_coarse_label_of_fine = np.array(list(fine_number_to_coarse_number.values()))
fine_by_coarse = [np.where(_coarse_label_of_fine == label)[0] for label in range(20)]
all_fine = np.concatenate(fine_by_coarse)

In [6]:
np.random.seed(2)

n_props = 0.1

# The three *_kwargs dicts below are handed to generate_hierarchy in the
# cluster-generation cell.
generate_dist_matrix_kwargs = dict(
    metric='tasksim',
    metric_kwargs=dict(
        n_neg_classes=20,
        task_similarity_kwargs=dict(
            transformer_kwargsx=dict(max_depth=4),
            transformer_kwargsz=dict(max_depth=4),
        ),
    ),
    function_tuples=None,
    n_cores=30,
    acorn=None,
)

process_dist_matrix_kwargs = dict(
    make_symmetric=True,
    scale=True,
    aug_diag=True,
)

# Embedding and clustering classes, each paired with its constructor settings.
embedding = ASE
embedding_kwargs = dict(n_components=16)
cluster = GMM
cluster_kwargs = dict(max_components=30)

cluster_dists_kwargs = dict(
    embedding=embedding,
    embedding_kwargs=embedding_kwargs,
    cluster=cluster,
    cluster_kwargs=cluster_kwargs,
)

In [41]:
# Number of Monte Carlo repetitions and the per-repetition split seeds,
# all derived from one master seed.
n_mc=10
master_seed = 42
np.random.seed(master_seed)
seeds = np.random.randint(10000, size=n_mc)

# Switches for the two cluster-generation branches. With both False the loop
# only draws the splits; the evaluation cell reads the pickles written by an
# earlier run with the switches on.
generate_tasksim=False
generate_condmean=False

if generate_tasksim:
    tasksim_clusters = []

if generate_condmean:
    condmean_clusters = []


#- Generate clusters
for iteration in tqdm(range(n_mc)):
    start = time.time()
    seed =  seeds[iteration]
    # Keep 10% of the training data for this repetition.
    X_train, _, y_train, _ = train_test_split(trainset.data, trainset.targets, test_size=0.9, random_state=seed)

    if generate_tasksim:
        temp_tasksim = generate_hierarchy(X_train, y_train,
                                     generate_dist_matrix_kwargs, process_dist_matrix_kwargs, cluster_dists_kwargs)
        tasksim_clusters.append(temp_tasksim)

        # Checkpoint after every repetition; `with` closes (and flushes) the
        # handle instead of leaving it to the garbage collector.
        with open('cifar_tasksim_clusters.pkl', 'wb') as checkpoint:
            pickle.dump(tasksim_clusters, checkpoint)


    if generate_condmean:
        # NOTE(review): this rebinds the notebook-level name `pca` and
        # overwrites X_train with its projection.
        pca = PCA(n_components=data_dimension)
        pca.fit(X_train)
        X_train = pca.transform(X_train)
        unique_y = np.unique(y_train)

        # One mean feature vector per class, then cluster those class means.
        conditional_means = np.array([np.mean(X_train[np.where(y_train == c)[0]], axis=0) for c in unique_y])
        gmm = GMM(min_components=10, max_components=30, reg_covar=1e-4)
        temp_condmean = gmm.fit_predict(conditional_means)
        condmean_clusters.append(temp_condmean)

        with open('cifar_condmean_clusters.pkl', 'wb') as checkpoint:
            pickle.dump(condmean_clusters, checkpoint)

100%|██████████| 10/10 [00:01<00:00,  5.88it/s]


In [42]:
f=ARI
# Coarse label of each fine class, in fine-label order: the reference partition.
truth=np.array(list(fine_number_to_coarse_number.values()))


def _load_clusters(path):
    """Unpickle the saved list of per-repetition cluster labels, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code -- only read
    # files that this notebook wrote.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _score_runs(clusterings):
    """Return a (2, n_runs) array: row 0 is the metric per run, row 1 its shuffled baseline."""
    # Size by the number of runs actually loaded, not by `n_mc` from another
    # cell, so a pickle with a different run count cannot overflow the array
    # or leave zero-filled columns in the averages.
    evals = np.zeros((2, len(clusterings)))
    for i, clust_ in enumerate(clusterings):
        evals[:, i] = evaluate_clusters(f, truth, clust_, calculate_random=True, n_mc=1000)
    return evals


def _report(label, evals):
    """Print the per-row mean and standard error (population std / sqrt(n_runs))."""
    print(label)
    print("means", np.mean(evals, axis=-1))
    print("std errs", np.std(evals, axis=-1) / np.sqrt(evals.shape[1]))


tasksim_clusters = _load_clusters('cifar_tasksim_clusters.pkl')
tasksim_evals = _score_runs(tasksim_clusters)

condmean_clusters = _load_clusters('cifar_condmean_clusters.pkl')
condmean_evals = _score_runs(condmean_clusters)

_report("tasksim", tasksim_evals)
print()
_report("condmeans", condmean_evals)

tasksim
means [3.28826143e-01 5.51670376e-06]
std errs [0.01501901 0.00014459]

condmeans
means [2.95185455e-01 3.41157506e-05]
std errs [0.00767691 0.00011096]
