In [1]:
import numpy as np
import pandas as pd

from tasksim import *

from graspologic.embed import AdjacencySpectralEmbed as ASE
from graspologic.cluster import GaussianCluster as GMM

from sklearn.metrics import normalized_mutual_info_score as NMI
from sklearn.metrics import adjusted_rand_score as ARI

from proglearn import LifelongClassificationForest as l2f
import torchvision

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from joblib import Parallel, delayed
import pickle
import time
from tqdm import tqdm
import os

In [2]:
class Dataset:
    def __init__(self, file='cifar_100_Bit_m-r101x1_embd.p', train=True, classes=[]):
        if train:
            if file == 'data/cifar_100_Bit_m-r101x1_embd.p':
                self.data = pickle.load(open(file, 'rb'))[0][0]
                self.targets = np.concatenate(pickle.load(open(file, 'rb'))[0][1])
        else:
            if file == 'data/cifar_100_Bit_m-r101x1_embd.p':
                self.data = pickle.load(open(file, 'rb'))[1][0]
                self.targets = np.concatenate(pickle.load(open(file, 'rb'))[1][1])
        
        self.classes = classes

In [3]:
file = 'data/cifar_100_Bit_m-r101x1_embd.p'

cif100 = torchvision.datasets.CIFAR100(root='./data', train=True, download=True)
trainset = Dataset(file, classes=cif100.classes)
testset = Dataset(file, train=False, classes=cif100.classes)

Files already downloaded and verified


In [4]:
data_dimension=2048

if data_dimension < trainset.data.shape[1]:
    pca = PCA(n_components=data_dimension)
    pca.fit(trainset.data)
    trainset.data = pca.transform(trainset.data)
    testset.data = pca.transform(testset.data)

In [5]:
def evaluate_clusters(f, truth, preds, calculate_random=False, n_mc=500, acorn=None):
    eval_pred = f(truth, preds)
    
    if not calculate_random:
        return eval_pred
    
    eval_random = np.zeros(n_mc)
    for i in range(n_mc):
        shuffled_preds = np.random.choice(preds, size=len(preds), replace=False)
        eval_random[i] = f(truth, shuffled_preds)
        
    return eval_pred, np.mean(eval_random)

def evaluate_accuracy(data, labels, truth, preds, n_trees_coarse=25, n_trees_fine=10, train_flat=True,
                     test_data=None, test_labels=None, max_depth=5,
                     acorn=None):
    classes = np.unique(labels)
    
    forests_dict = {
            'coarse_truth': None, 
            'fine_truth': {c: None for c in np.unique(truth)},
            'coarse_preds': None,
            'fine_preds': {c: None for c in np.unique(preds)}, 
            'flat': None
    }
    
    # Coarse forest
    coarse_forest_truth = l2f(default_n_estimators=n_trees_coarse,
                        default_max_depth=max_depth)
    
    coarse_forest_truth.add_task(data, truth[labels])
    forests_dict['coarse_truth'] = coarse_forest_truth
    
    coarse_forest_preds = l2f(default_n_estimators=n_trees_coarse,
                        default_max_depth=max_depth)
    
    coarse_forest_preds.add_task(data, preds[labels])
    forests_dict['coarse_preds'] = coarse_forest_preds
    
    
    # Flat forest
    n_trees_flat = n_trees_coarse + len(np.unique(truth))*n_trees_fine
    
    if train_flat:
        flat_forest_truth = l2f(default_n_estimators=n_trees_flat,
                            default_max_depth=max_depth)
        flat_forest_truth.add_task(data, labels)
        forests_dict['flat'] = flat_forest_truth
        
    # Fine forest
    for j, parent_class in enumerate(np.unique(truth)):
        temp_fine_indices = np.where(truth[labels] == parent_class)[0]
        
        
        fine_forest_truth = l2f(default_n_estimators=n_trees_fine, 
                               default_max_depth=max_depth
                              )
        fine_forest_truth.add_task(data[temp_fine_indices], labels[temp_fine_indices])
        forests_dict['fine_truth'][j] = fine_forest_truth
        
    for j, parent_class in enumerate(np.unique(preds)):
        temp_fine_indices = np.where(preds[labels] == parent_class)[0]
        
        fine_forest_preds = l2f(default_n_estimators=n_trees_fine, 
                               default_max_depth=max_depth
                              )
        fine_forest_preds.add_task(data[temp_fine_indices], labels[temp_fine_indices])
        forests_dict['fine_preds'][j] = fine_forest_preds
        
        
    # Now, calculate accuracies
    accuracies = np.zeros(3)
    
    if test_data is None:
        raise ValueError
        
    n_test, d_test = test_data.shape
                
    hierarchical_posteriors_truth = np.zeros((n_test, len(classes)))
    hierarchical_posteriors_preds = np.zeros((n_test, len(classes)))
    
    coarse_posteriors_truth = forests_dict['coarse_truth'].predict_proba(test_data, 0)
    coarse_posteriors_preds = forests_dict['coarse_preds'].predict_proba(test_data, 0)
        
    # Hierarchical posteriors & prediction
    for j, parent_class in enumerate(np.unique(truth)):
        temp_fine_label_indices = np.where(truth == parent_class)[0]
        
        temp_fine_posteriors = forests_dict['fine_truth'][j].predict_proba(test_data, 0)
        hierarchical_posteriors_truth[:, temp_fine_label_indices] = np.multiply(coarse_posteriors_truth[:, j],
                                                                     temp_fine_posteriors.T
                                                                    ).T
        
    for j, parent_class in enumerate(np.unique(preds)):
        temp_fine_label_indices = np.where(preds == parent_class)[0]

        
        temp_fine_posteriors = forests_dict['fine_preds'][j].predict_proba(test_data, 0)
        hierarchical_posteriors_preds[:, temp_fine_label_indices] = np.multiply(coarse_posteriors_preds[:, j],
                                                                     temp_fine_posteriors.T
                                                                    ).T
        
    yhat_hc = np.argmax(hierarchical_posteriors_truth, axis=1)
    accuracies[0] = np.mean(yhat_hc == np.array(test_labels))
    
    yhat_hc = np.argmax(hierarchical_posteriors_preds, axis=1)
    accuracies[1] = np.mean(yhat_hc == np.array(test_labels))
    
    
    # Flat posteriors & prediction
    if train_flat:
        flat_posteriors = forests_dict['flat'].predict_proba(test_data, 0)
        yhat_flat = np.argmax(flat_posteriors, axis=1)
        accuracies[2] = np.mean(yhat_flat == np.array(test_labels))
    
    return accuracies[:, np.newaxis].T

In [12]:
def evaluate_accuracy(X, y, coarse_labels, n_trees_coarse=25, n_trees_fine=10,
                      X_test=None, y_test=None, 
                      max_depth=5,
                      acorn=None):
    
    coarse_classes = np.unique(coarse_labels)
    n_coarse = len(coarse_classes)
    n_fine = len(coarse_labels)
    
    
    # Coarse forest
    coarse_forest = l2f(default_n_estimators=n_trees_coarse,
                        default_max_depth=max_depth)
    
    coarse_forest.add_task(X, coarse_labels[y])
        
    # Fine forest
    if n_trees_fine is not None:
        fine_forests = {}
        for j, coarse_class in enumerate(coarse_classes):
            temp_fine_indices = np.where(coarse_labels[y] == coarse_class)[0]

            fine_forests[coarse_class] = l2f(default_n_estimators=n_trees_fine, 
                                   default_max_depth=max_depth
                                  )
            fine_forests[coarse_class].add_task(X[temp_fine_indices], y[temp_fine_indices])
            
    n_test, d_test = X_test.shape
                
    posteriors = np.zeros((n_test, n_fine))
    
    coarse_posteriors = coarse_forest.predict_proba(X_test, 0)
        
    # Hierarchical posteriors & prediction
    
    if n_trees_fine is not None:
        for j, coarse_class in enumerate(coarse_classes):
            temp_fine_label_indices = np.where(coarse_labels == coarse_class)[0]
            
            temp_fine_posteriors = fine_forests[coarse_class].predict_proba(X_test, 0)
            
            posteriors[:, temp_fine_label_indices] = np.multiply(coarse_posteriors[:, j],
                                                                         temp_fine_posteriors.T
                                                                        ).T
    else:
        posteriors = coarse_posteriors

    predictions = np.argmax(posteriors, axis=1)
    
    return np.mean(predictions == np.array(y_test))

In [13]:
def generate_results(X, y, X_test, y_test,
                    acc_kwargs,
                    eval_kwargs=None):
    
    acc_kwargs['test_data'] = X_test
    acc_kwargs['test_labels'] = y_test
    
    del X_test
    del y_test
    
    accs = evaluate_accuracy(X, y, **acc_kwargs)
    
    if eval_kwargs is not None:
        evals = evaluate_clusters(**eval_kwargs)
        return np.array(evals)[:, np.newaxis].T, accs
    
    return accs

In [14]:
def stratified_sample(idx_by_label, p=0.1, replace=False):
    return np.concatenate([np.random.choice(ibl, size=int(max([1, p * len(ibl)])), replace=replace) for ibl in idx_by_label])

In [15]:
coarse_to_fine_map = {
'aquatic_mammals': ['beaver', 'dolphin', 'otter', 'seal', 'whale'],
'fish': ['aquarium_fish', 'flatfish', 'ray', 'shark', 'trout'],
'flowers': ['orchid', 'poppy', 'rose', 'sunflower', 'tulip'],
'food_containers': ['bottle', 'bowl', 'can', 'cup', 'plate'],
'fruit_and_vegetables': ['apple', 'mushroom', 'orange', 'pear', 'sweet_pepper'],
'household_electrical_devices': ['clock', 'keyboard', 'lamp', 'telephone', 'television'],
'household_furniture': ['bed', 'chair', 'couch', 'table', 'wardrobe'],
'insects': ['bee', 'beetle', 'butterfly', 'caterpillar', 'cockroach'],
'large_carnivores': ['bear', 'leopard', 'lion', 'tiger', 'wolf'],
'large_man-made_outdoor_things': ['bridge', 'castle', 'house', 'road', 'skyscraper'],
'large_natural_outdoor_scenes': ['cloud', 'forest', 'mountain', 'plain', 'sea'],
'large_omnivores_and_herbivores': ['camel', 'cattle', 'chimpanzee', 'elephant', 'kangaroo'],
'medium-sized_mammals': ['fox', 'porcupine', 'possum', 'raccoon', 'skunk'],
'non-insect_invertebrates': ['crab', 'lobster', 'snail', 'spider', 'worm'],
'people': ['baby', 'boy', 'girl', 'man', 'woman'],
'reptiles': ['crocodile', 'dinosaur', 'lizard', 'snake', 'turtle'],
'small mammals': ['hamster', 'mouse', 'rabbit', 'shrew', 'squirrel'],
'trees': ['maple_tree', 'oak_tree', 'palm_tree', 'pine_tree', 'willow_tree'],
'vehicles_1': ['bicycle', 'bus', 'motorcycle', 'pickup_truck', 'train'],
'vehicles_2': ['lawn_mower', 'rocket', 'streetcar', 'tank', 'tractor']
}

coarse_number_to_coarse_name = {i: name for i, name in enumerate(coarse_to_fine_map)}

def fine_to_coarse(coarse_to_fine):
    fine_to_coarse_map = {}
    for key in coarse_to_fine:
        fines = coarse_to_fine[key]
        for f in fines:
            fine_to_coarse_map[f] = key
            
    return fine_to_coarse_map

fine_to_coarse_map = fine_to_coarse(coarse_to_fine_map)

fine_number_to_fine_name = {i: name for i, name in enumerate(trainset.classes)}
fine_name_to_fine_number = {name: i for i, name in fine_number_to_fine_name.items()}

for i in range(100):
    fine_to_coarse_map[fine_number_to_fine_name[i]]
    
coarse_name_to_coarse_number = {name: i for i, name in enumerate(coarse_to_fine_map)}

coarse_targets = np.array([coarse_name_to_coarse_number[fine_to_coarse_map[fine_number_to_fine_name[y]]] for y in trainset.targets])
idx_by_coarse = np.array([np.where(coarse_targets == y)[0] for y in range(20)])
idx_by_fine = np.array([np.where(trainset.targets == y)[0] for y in range(100)])


test_coarse_targets = np.array([coarse_name_to_coarse_number[fine_to_coarse_map[fine_number_to_fine_name[y]]] for y in testset.targets])
test_idx_by_coarse = np.array([np.where(test_coarse_targets == y)[0] for y in range(20)])


coarse_names = np.array(list(coarse_name_to_coarse_number.keys()))

fine_number_to_coarse_number = {fn: coarse_name_to_coarse_number[
                                        fine_to_coarse_map[
                                            fine_number_to_fine_name[fn]
                                        ]
                                    ] for fn in range(100)}


fine_by_coarse = [np.where(np.array(list(fine_number_to_coarse_number.values())) == i)[0] for i in range(20)]
all_fine = np.concatenate(fine_by_coarse)

In [18]:
n_mc=10
n_cores=10

clusters = [all_fine for i in range(n_mc)]
np.random.seed(2)

accuracy_tuples = []
n_props = [0.1]

n_trees_coarse=20*50 + 500
n_trees_fine=None
max_depth=10

for i, prop in enumerate(n_props):
    for j in range(n_mc):
        inds = stratified_sample(idx_by_fine, p=prop, replace=False)
        accuracy_tuples.append((inds,
                                  clusters[i*len(n_props) + j], 
                                  n_trees_coarse, 
                                  n_trees_fine, 
                                  max_depth
                              ))
        

condensed_func_accuracy = lambda x: evaluate_accuracy(trainset.data[x[0]], trainset.targets[x[0]],
                                                     *x[1:-1],
                                                      testset.data, testset.targets,
                                                     x[-1])



accuracies = np.array(Parallel(n_jobs=n_cores)(delayed(condensed_func_accuracy)(tuple_) for tuple_ in accuracy_tuples))

In [19]:
np.mean(accuracies), np.std(accuracies, ddof=1)

(0.01003, 0.0038404426828166564)