In [61]:
import numpy as np
import pandas as pd

from tasksim import *

from graspologic.embed import AdjacencySpectralEmbed as ASE
from graspologic.cluster import GaussianCluster as GMM
from graspologic.cluster import AutoGMMCluster as GMM


from sklearn.metrics import normalized_mutual_info_score as NMI

import torchvision

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from joblib import Parallel, delayed
import pickle
import time
from tqdm import tqdm
import os

In [2]:
class Dataset:
    def __init__(self, file='cifar_100_Bit_m-r101x1_embd.p', train=True, classes=[]):
        if train:
            if file == 'data/cifar_100_Bit_m-r101x1_embd.p':
                self.data = pickle.load(open(file, 'rb'))[0][0]
                self.targets = np.concatenate(pickle.load(open(file, 'rb'))[0][1])
        else:
            if file == 'data/cifar_100_Bit_m-r101x1_embd.p':
                self.data = pickle.load(open(file, 'rb'))[1][0]
                self.targets = np.concatenate(pickle.load(open(file, 'rb'))[1][1])
        
        self.classes = classes

In [3]:
data = pickle.load(open('train_frontal_Bit_m-r101x1_with_labels.p', 'rb'))

In [76]:
X_all = np.array([vec for vec in data['vector']])

category_indices = np.array([6,8,10,11,12,13,14,15,16,17,18])
conditions = np.array(list(data.iloc[0, category_indices].keys()))

competition_conditions = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
competition_conditions_indices = [np.where(conditions == c)[0][0] for c in conditions]


labels = np.arange(len(category_indices))

n, d = X_all.shape
y_all = np.zeros(n)

fly_list = []
for i in range(n):
    temp_conditions = data.iloc[i, category_indices]
    positive_conditions = np.where(temp_conditions.values.astype(int) > 0)[0]
    
    if len(positive_conditions) > 1:
        temp_competition_condition_indices = []
        for pc in positive_conditions:
            if pc in competition_conditions_indices:
                temp_competition_condition_indices.append(pc)
        if len(temp_competition_condition_indices) == 1:
            y_all[i] = temp_competition_condition_indices[0]
            fly_list.append(i)
    elif len(positive_conditions) == 1:
        y_all[i] = positive_conditions[0]
        fly_list.append(i)
        
fly_list = np.array(fly_list)
X = X_all[fly_list]
y = y_all[fly_list]

In [77]:
idx_by_label = [np.where(y == c)[0] for c in labels]
print("total:", len(y))

for i, ibl in enumerate(idx_by_label):
    print(conditions[i], len(ibl))

total: 89700
No Finding 16974
Cardiomegaly 5507
Lung Lesion 2442
Edema 13744
Consolidation 3294
Pneumonia 1583
Atelectasis 7502
Pneumothorax 7598
Pleural Effusion 27149
Pleural Other 922
Fracture 2985


In [78]:
def stratified_sample(idx_by_label, p=0.1, replace=False):
    return np.concatenate([np.random.choice(ibl, size=int(max([1, p * len(ibl)])), replace=replace) for ibl in idx_by_label])

In [None]:
np.random.seed(2)

n_props = [0.1, 0.95]

generate_dist_matrix_kwargs = {'metric':'tasksim', 
                               'metric_kwargs':{'n_neg_classes': 5, 
                                                'task_similarity_kwargs': {'transformer_kwargsx': 
                                                                               {'max_depth':2},
                                                                          'transformer_kwargsz':
                                                                              {'max_depth':2}}}, 
                               'function_tuples':None, 
                               'n_cores':1, 
                               'acorn':None
                              }

# generate_dist_matrix_kwargs = {'metric':'mmd', 
#                                'metric_kwargs':{'gamma': 'median'},
#                                'function_tuples':None,
#                                'n_cores':30, 
#                                'acorn':None
#                               }

process_dist_matrix_kwargs = {'make_symmetric': True,
                              'scale':True,
                             'aug_diag':True,
#                               'negate': True
                             }

embedding=ASE
embedding_kwargs={'n_components':4}
cluster=GMM
cluster_kwargs = {'max_components': 11, 
#                   'selection_criteria': 'aic', 
#                   'max_iter':10
                 }

cluster_dists_kwargs = {'embedding':embedding, 
                        'embedding_kwargs':embedding_kwargs, 
                        'cluster':cluster, 
                        'cluster_kwargs':cluster_kwargs
                       }

In [116]:
np.random.seed(2)
cluster_tuples = []
n_mc = 10
n_cores=30

for i, prop in enumerate(n_props):
    for j in range(n_mc):
        inds = stratified_sample(idx_by_label, p=prop, replace=False)
        cluster_tuples.append((inds,
                                  generate_dist_matrix_kwargs, 
                                  process_dist_matrix_kwargs, 
                                  cluster_dists_kwargs, 
                                  max([1, int(n_cores / n_mc)])
                              ))

use_stored_clusters = False

if use_stored_clusters:
    clusters = pickle.load(open('chexpert_all_data_clusters.pkl', 'rb'))
else:
    condensed_func_clusters = lambda x: generate_hierarchy(X[x[0]], y[x[0]],
                                                          *x[1:])
    start_time = time.time()
    clusters = Parallel(n_jobs=n_cores)(delayed(condensed_func_clusters)(tuple_) for tuple_ in cluster_tuples)
    clocked = time.time() - start_time
    print(clocked)
    pickle.dump(clusters, open('chexpert_all_data_clusters.pkl', 'wb'))

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [125]:
from sklearn.metrics import adjusted_rand_score as ARI
f=ARI
evals = np.zeros((len(n_props), 2, n_mc))

for i, clust_ in enumerate(clusters):
    n_mc_idx = i % n_mc
    n_prop_idx = np.math.floor(i / n_mc)
    evals[n_prop_idx, :, n_mc_idx] = evaluate_clusters(f, truth, clust_, calculate_random=True, n_mc=1000)

In [130]:
np.mean(evals, axis=-1), np.std(evals, ddof=1, axis=-1) / np.sqrt(n_mc)

(array([[0.09261581, 0.00111152],
        [0.14893617, 0.00058511]]),
 array([[4.20394116e-02, 1.50569182e-03],
        [9.25185854e-18, 1.34968564e-03]]))