In [167]:
import numpy as np
import matplotlib.pyplot as plt

from hierarchical import HierarchicalForest
from sklearn.ensemble import RandomForestClassifier as RF

from tqdm import tqdm

from joblib import Parallel, delayed

In [278]:
def generate_hierarchical_gaussian_data(n_clusts=2, n_dists_per_clust=2, d=2, clust_cov=1, n_train=25, n_test=100, dist_cov=0.5, p_class_1=None, acorn=None):
    if acorn is not None:
        np.random.seed(acorn)
    
    all_means = [
        np.array([1,1]),
        np.array([-1,-1]),
        np.array([1, -1]),
        np.array([-1, 1])
    ]
    
    clust_means = all_means[:n_clusts]
    clust_cov = clust_cov * np.eye(d)
    
    dist_means = np.vstack([np.random.multivariate_normal(clust_means[i], clust_cov, size=n_dists_per_clust) for i in range(n_clusts)])
        
    dist_cov = dist_cov * np.eye(d)
    
    n_classes = n_clusts * n_dists_per_clust
        
    if p_class_1 is None:
        p_class_1 = 1 / n_classes
        p_rest = 1 / n_classes
    else:
        p_rest = (1 - p_class_1) / (n_classes - 1)
                
    p_vector = np.array([p_class_1] + (n_classes - 1) * [p_rest])
    
    n_train_vector = np.random.multinomial(n_train * n_classes, p_vector)
    n_test_vector = np.random.multinomial(n_test * n_classes, p_vector)
        

    
    train_data = np.vstack([
        np.random.multivariate_normal(dist_means[i], dist_cov, size=n_train_vector[i]) for i in range(n_classes)
    ])
    y_train = np.concatenate([i * np.ones(n_train_vector[i]) for i in range(n_dists_per_clust * n_clusts)])
    
    test_data = np.vstack([
        np.random.multivariate_normal(dist_means[i], dist_cov, size=n_test_vector[i]) for i in range(n_classes)
    ])
    y_test = np.concatenate([i * np.ones(n_test_vector[i]) for i in range(n_dists_per_clust * n_clusts)])
            
    return train_data, y_train, test_data, y_test

In [295]:
def label_noise_exp(label_noise=0, n_train=25, n_test=100, n_clusts=4, n_dists_per_clust=8, p_class_1=None):
    
    accuracies = np.zeros(2)
    x_train, y_train, x_test, y_test = generate_hierarchical_gaussian_data(n_clusts=n_clusts, n_dists_per_clust=n_dists_per_clust, d=2, clust_cov=1,
                                                                          n_train=n_train, n_test=100, dist_cov=0.5, p_class_1=p_class_1)
    
    idx_to_change = np.random.binomial(1, p=label_noise, size=n_train * n_clusts * n_dists_per_clust).astype(bool)
    idx_to_change = np.array([i for i, bool_ in enumerate(idx_to_change) if bool_]).astype(int)
    
        
    perm_ = np.random.choice(len(idx_to_change), len(idx_to_change), replace=False)

    
    
        
    other_classes = [np.concatenate((np.arange(0, i), np.arange(i + 1, n_clusts * n_dists_per_clust))) for i in range(n_clusts * n_dists_per_clust)]
#     new_labels = np.array([np.random.choice(other_classes[int(y_train[i])], size=1)[0] for i, bool_ in enumerate(idx_to_change) if bool_])
    
    y_train[idx_to_change] = y_train[idx_to_change[perm_]]
#     y_train[idx_to_change] = new_labels

        
    fine_to_coarse = np.concatenate([i * np.ones(n_dists_per_clust) for i in range(n_clusts)])
    hf = HierarchicalForest(fine_to_coarse=fine_to_coarse, n_estimators_coarse=25, n_estimators_fine=50, max_depth=None)
    hf.fit(x_train, y_train)
    accuracies[0] = np.mean(hf.predict(x_test) == y_test)
    
    rf = RF(n_estimators=225)
    rf.fit(x_train, y_train)
    accuracies[1] = np.mean(rf.predict(x_test)  == y_test)
    
    return accuracies

In [297]:
# np.random.seed(1)

n_clusts=4
n_dists_per_clust = 4

n_train=10
n_test=100

p_class_1_list = np.arange(2, n_clusts * n_dists_per_clust-5, 1) / (2 * n_clusts * n_dists_per_clust)
# p_class_1_list = [None]
label_noise_list = [0, 0.1, 0.2, 0.3, 0.5, 0.8]
print(p_class_1_list)

accuracies = []

n_cores=30
n_mc=30
for i, p in enumerate(tqdm(label_noise_list)):
    accuracies.append([])
    for j, p_class in enumerate(p_class_1_list):
        f = lambda x: label_noise_exp(p, n_train, n_test, n_clusts, n_dists_per_clust, p_class_1=p_class)
        accuracies[-1].append(Parallel(n_jobs=n_cores)(delayed(f)(_) for _ in range(n_mc)))
    
accuracies = np.array(accuracies)

[0.0625  0.09375 0.125   0.15625 0.1875  0.21875 0.25    0.28125 0.3125 ]


  0%|                                                                                             | 0/6 [00:00<?, ?it/s]2021-11-12 05:48:43.932310: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-12 05:48:43.932377: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-11-12 05:48:43.933759: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-12 05:48:43.933798: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-11-12 05:48:43.979456: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not lo

  0%|                                                                                             | 0/6 [00:16<?, ?it/s]


IndexError: index 15 is out of bounds for axis 0 with size 15

In [None]:
import seaborn as sns

accuracies = np.array(accuracies)

mean_le = np.mean(accuracies[:,:, :, 0] / accuracies[:,:, :, 1], axis=2)
std_le = np.std(accuracies[:,:, :, 0] / accuracies[:,:, :, 1], axis=2) / np.sqrt(n_mc)

colors = sns.color_palette("Set1", n_colors=len(p_class_1_list))


fig, ax = plt.subplots(1,1)

for i, p in enumerate(p_class_1_list):
    ax.errorbar(label_noise_list[:len(accuracies)], mean_le[:,i], yerr=std_le[:,i], color=colors[i], label='%1.2f'%(p))

ax.set_title('Label noise in Hierarchical Gaussian settings')
ax.set_ylabel('learning effiency (acc)')

ax.set_xticks(label_noise_list)
ax.set_yticks([1, 1.02, 1.04, 1.06, 1.08])
ax.legend()
ax.set_xlabel('label noise')

In [196]:
label_noise_list[:len(accuracies)]
mean_le[:,i]
std_le[:,i]
colors[i]

IndexError: index 1 is out of bounds for axis 1 with size 1

In [214]:
mean_le = np.mean(accuracies[:,:, :, 0] / accuracies[:,:, :, 1], axis=2)
mean_le.shape

(3, 10)

In [213]:
accuracies.shape

(3, 10, 10, 2)