In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from uncertainty_forest.uncertainty_forest import UncertaintyForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from scipy.stats import entropy, norm
from scipy.integrate import quad

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import tree

# Taken from Richard's "Reprod Figure 2"

In [3]:
from sklearn.ensemble.forest import _generate_unsampled_indices
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import math

def cef_estimate(X, y, n_estimators = 200, max_samples = .32, bootstrap = True, depth = 30, min_samples_leaf = 1, max_features = 1.):
    model = BaggingClassifier(DecisionTreeClassifier(max_depth = depth, min_samples_leaf = min_samples_leaf, max_features = math.ceil(int(math.sqrt(X.shape[1])))), 
                              n_estimators = n_estimators, 
                              max_samples= max_samples, 
                              bootstrap = bootstrap)
    model.fit(X, y)
    class_counts = np.zeros((X.shape[0], model.n_classes_))
    for idx, tree in enumerate(model): # RONAK EDIT
        # get out of bag indicies
        
        # RONAK EDIT STARTS HERE ################ In newer sklearn, generate unsampled takes a positional argument.
        #unsampled_indices = _generate_unsampled_indices(tree.random_state, len(X))
        sampled_indices = model.estimators_samples_[idx]
        unsampled_indices = np.delete(np.arange(0, len(X)), sampled_indices)
        
        # RONAK EDIT ENDS HERE ##################
        
        total_unsampled = len(unsampled_indices)
        np.random.shuffle(unsampled_indices)
        prob_indices, eval_indices = unsampled_indices[:total_unsampled//2], unsampled_indices[total_unsampled//2:]
        # get all node counts
        node_counts = tree.tree_.n_node_samples
        # get probs for eval samples
        posterior_class_counts = np.zeros((len(node_counts), model.n_classes_))
        for prob_index in prob_indices:
            posterior_class_counts[tree.apply(X[prob_index].reshape(1, -1)).item(), y[prob_index]] += 1
        row_sums = posterior_class_counts.sum(axis=1)
        row_sums[row_sums == 0] = 1
        class_probs = (posterior_class_counts/row_sums[:, None])
        
        where_0 = np.argwhere(class_probs == 0)
        for elem in where_0:
            class_probs[elem[0], elem[1]] = 1/(2*row_sums[elem[0], None])
        where_1 = np.argwhere(class_probs == 1)
        for elem in where_1:
            class_probs[elem[0], elem[1]] = 1 - 1/(2*row_sums[elem[0], None])
        
        class_probs.tolist()
        partition_counts = np.asarray([node_counts[x] for x in tree.apply(X[eval_indices])])
        # get probability for out of bag samples
        eval_class_probs = [class_probs[x] for x in tree.apply(X[eval_indices])]
        eval_class_probs = np.array(eval_class_probs)
        # find total elements for out of bag samples
        elems = np.multiply(eval_class_probs, partition_counts[:, np.newaxis])
        # store counts for each x (repeat fhis for each tree)
        class_counts[eval_indices] += elems
    # calculate p(y|X = x) for all x's
    probs = class_counts/class_counts.sum(axis = 1, keepdims = True)
    entropies = -np.sum(np.log(probs)*probs, axis = 1)
    # convert nan to 0
    entropies = np.nan_to_num(entropies)
    return np.mean(entropies)

np.warnings.filterwarnings('ignore')



In [4]:
def CART_estimate(X, y, n_trees = 300, bootstrap = True, depth = 30):
    model = RandomForestClassifier(bootstrap = bootstrap, n_estimators =n_trees, max_depth = depth, max_features = math.ceil(int(math.sqrt(X.shape[1]))))
    model.fit(X, y)
    class_counts = np.zeros((X.shape[0], model.n_classes_))
    for tree_in_forest in model:
        # get number of training elements in each partition
        node_counts = tree_in_forest.tree_.n_node_samples
        # get counts for all x (x.length array)
        partition_counts = np.asarray([node_counts[x] for x in tree_in_forest.apply(X)])
        # get class probability for all x (x.length, n_classes)
        class_probs = tree_in_forest.predict_proba(X)
        # get elements by performing row wise multiplication
        elems = np.multiply(class_probs, partition_counts[:, np.newaxis])
        # update counts for that tree
        class_counts += elems
    probs = class_counts/class_counts.sum(axis=1, keepdims=True)
    entropies = -np.sum(np.log(probs)*probs, axis = 1)
    # convert nan to 0
    entropies = np.nan_to_num(entropies)
    return np.mean(entropies)

## Simulate Data and Conditional Entropy Prediction

In [5]:
def generate_data(n, d, mu = 1):
    n_1 = np.random.binomial(n, .5) # number of class 1
    mean = np.zeros(d)
    mean[0] = mu
    X_1 = np.random.multivariate_normal(mean, np.eye(d), n_1)
    
    X = np.concatenate((X_1, np.random.multivariate_normal(-mean, np.eye(d), n - n_1)))
    y = np.concatenate((np.repeat(1, n_1), np.repeat(0, n - n_1)))
  
    return X, y

# def split_train_eval(X, y, frac_eval):
    
#     if frac_eval == 0:
#         return X, y, [], []
    
#     n = len(y)
#     n_eval = int(np.floor(frac_eval*n))
#     eval_indices = np.random.choice(np.arange(n), size = n_eval, replace = False)
#     X_eval = X[eval_indices, :]
#     y_eval = y[eval_indices]
#     X = np.delete(X, eval_indices, axis = 0)
#     y = np.delete(y, eval_indices, axis = 0)
    
#     return X, y, X_eval, y_eval

In [6]:
# def conditional_entropy_distribution(n, d, frac_eval, algos, num_trials, mu, parallel = False):
#     # For each trial, generate data and compute conditional entropy for each algorithm.
#     def worker(t):
#         # X, y, X_eval = generate_data(n, d, frac_eval, mu = mu)
#         X, y = generate_data(n, d, 0, mu = mu)
#         ret = np.zeros(len(algos))
#         for i in range(len(algos)):
#             obj = algos[i]['instance']
#             if algos[i]['label'] == 'UF':
#                 n_estimators = obj['n_estimators']
#                 ret[i] = cef_estimate(X, y, n_estimators = n_estimators, min_samples_leaf = int(np.log(len(X))))
#             else:
#                 X, y, X_eval, y_eval = split_train_eval(X, y, frac_eval)
#                 obj.fit(X, y)
#                 p = obj.predict_proba(X_eval)
#                 ret[i] = np.mean(entropy(p.T, base = np.exp(1)))
#         return ret
    
#     if parallel:
#         predicted_cond_entropy = np.array(Parallel(n_jobs=-2)(delayed(worker)(t) for t in range(num_trials)))
#     else:
#         predicted_cond_entropy = np.zeros((num_trials, len(algos)))
#         for t in tqdm(range(num_trials)):
#             predicted_cond_entropy[t, :] = worker(t)
            
#     return predicted_cond_entropy

## Compute True Conditional Entropy

In [7]:
def true_cond_entropy(mu, base = np.exp(1)):
    def func(x):
        p = 0.5 * norm.pdf(x, mu, 1) + 0.5 * norm.pdf(x, -mu, 1)
        return -p * np.log(p) / np.log(base)
    
    H_X = quad(func, -20, 20)
    H_XY = 0.5*(1.0 + np.log(2 * np.pi)) / np.log(base)
    H_Y = np.log(2.0) / np.log(base)
    # I_XY = H_X - H_XY = H_Y - H_YX
    return H_Y - H_X[0] + H_XY

## Conditional Entropy versus Sample Size

In [8]:
# def conditional_entropy_by_n(sample_sizes, d, frac_eval, algos, num_trials, mu, parallel = False):
#     # Repeat for all 'n', and save output in the 'algos' array.
#     cond_entropy_range = np.zeros((len(sample_sizes), num_trials, len(algos)))
#     for i in range(len(sample_sizes)):
#         cond_entropy_range[i, :, :] = conditional_entropy_distribution(sample_sizes[i], 
#                                                                        d, 
#                                                                        frac_eval, 
#                                                                        algos, 
#                                                                        num_trials, 
#                                                                        mu = mu,
#                                                                        parallel = parallel)
        
#     for j in range(len(algos)):
#         algos[j]['cond_entropy_by_n_d_%d' % d] = cond_entropy_range[:, :, j]
        
#     with open('algos_fig2.pkl', 'wb') as f:
#         pickle.dump(algos, f)
#     with open('sample_sizes_d_%d.pkl' % d, 'wb') as f:
#         pickle.dump(sample_sizes, f)
        
#     return algos

In [9]:
def plot_cond_entropy_by_n(ax, num_plotted_trials, d, mu):
        
    sample_sizes = pickle.load(open('sample_sizes_d_%d.pkl' % d, 'rb'))
    uf = pickle.load(open('uf_by_n_d_%d.pkl' % d, 'rb'))
    cart = pickle.load(open('cart_by_n_d_%d.pkl' % d, 'rb'))
    irf = pickle.load(open('irf_by_n_d_%d.pkl' % d, 'rb'))
    uf2 = pickle.load(open('uf2_by_n_d_%d.pkl' % d, 'rb'))
    uf2 = pickle.load(open('hon_by_n_d_%d.pkl' % d, 'rb'))
    results = [cart, irf, uf, uf2, hon]
    
    for j, algo in enumerate(algos):
        # Plot the mean over trials as a solid line.
        ax.plot(sample_sizes,
                np.mean(results[j], axis = 1).flatten(), 
                label = algo['label'], 
                linewidth = 4, 
                color = algo['color'])
        # Use transparent lines to show other trials.
        for t in range(num_plotted_trials):
            ax.plot(sample_sizes, 
                    results[j][:, t].flatten(),  
                    linewidth = 2, 
                    color = algo['color'],
                    alpha = 0.15)
    
    truth = true_cond_entropy(mu)
    ax.axhline(y = truth, linestyle = '-', color = "black", label = "Truth")
            
    ax.set_xlabel("Sample Size")
    ax.set_ylabel("Estimated Conditional Entropy")
    ax.set_title("Effect Size = %.1f, d = %d" % (mu, d))
    ax.set_ylim(ymin = -0.05, ymax = 1.05)

## Conditional Entropy Estimates versus Effect Size

In [10]:
# def conditional_entropy_by_mu(mus, n, d, frac_eval, algos, num_trials, parallel = False):
#     # Repeat for all 'mu', and save output in the 'algos' array.
#     cond_entropy_range = np.zeros((len(mus), num_trials, len(algos)))
#     for i in range(len(mus)):
#         cond_entropy_range[i, :, :] = conditional_entropy_distribution(n, 
#                                                                        d, 
#                                                                        frac_eval, 
#                                                                        algos, 
#                                                                        num_trials, 
#                                                                        mu = mus[i],
#                                                                        parallel = parallel)      
#     for j in range(len(algos)):
#         algos[j]['cond_entropy_by_mu_d_%d' % d] = cond_entropy_range[:, :, j]
        
#     with open('algos_fig2.pkl', 'wb') as f:
#         pickle.dump(algos, f)
#     with open('mus_fig2.pkl', 'wb') as f:
#         pickle.dump(mus, f)
        
#     return algos

In [11]:
def plot_cond_entropy_by_mu(ax, d, n):
    
    mus = pickle.load(open('mus_d_%d.pkl' % d, 'rb'))
    uf = pickle.load(open('uf_by_mu_d_%d.pkl' % d, 'rb'))
    cart = pickle.load(open('cart_by_mu_d_%d.pkl' % d, 'rb'))
    irf = pickle.load(open('irf_by_mu_d_%d.pkl' % d, 'rb'))
    uf2 = pickle.load(open('uf2_by_mu_d_%d.pkl' % d, 'rb'))
    hon = pickle.load(open('hon_by_mu_d_%d.pkl' % d, 'rb'))
    results = [cart, irf, uf, uf2, hon]
        
    for j, algo in enumerate(algos):
        # Plot the mean over trials as a solid line.
        ax.plot(mus, 
                np.mean(results[j], axis = 1).flatten(), 
                label = algo['label'], 
                linewidth = 4, 
                color = algo['color'])
    
    truth = [true_cond_entropy(mu) for mu in mus]
    ax.plot(mus, truth, label = 'Truth', linewidth = 4, color = 'black')

    ax.set_ylim(ymin = -.05)
    ax.set_title("n = %d, d = %d" % (n, d))
    ax.set_xlabel("Effect Size")
    ax.set_ylabel("Estimated Conditional Entropy")

In [12]:
def plot_fig2(num_plotted_trials, d1, d2, n1, n2, effect_size):
    sns.set(font_scale = 3)
    sns.set_style("ticks")
    plt.rcParams["font.family"] = "sans-serif"
    plt.rcParams['figure.figsize'] = [30, 20]
    fig, axes = plt.subplots(2, 2)
    
    plot_cond_entropy_by_n(axes[0, 0], num_plotted_trials, d1, effect_size)
    plot_cond_entropy_by_n(axes[0, 1], num_plotted_trials, d2, effect_size)
                                                  
    plot_cond_entropy_by_mu(axes[1, 0], d1, n1)
    plot_cond_entropy_by_mu(axes[1, 1], d2, n2)
    
    axes[0,0].legend(loc = "upper left")
    
    plt.tight_layout()
    plt.savefig("fig2.pdf")
    plt.show()

In [13]:
# def get_cond_entropy_vs_n(mean, d, num_trials, sample_sizes, algos):
    
# #     uf = np.zeros((len(sample_sizes), num_trials))
# #     cart = np.zeros((len(sample_sizes), num_trials))
# #     irf = np.zeros((len(sample_sizes), num_trials))
# #     uf2 = np.zeros((len(sample_sizes), num_trials))
#     hon = np.zeros((len(sample_sizes), num_trials))
    
#     def worker(t):
#         # X, y = get_multivariate_sample(elem, d, mean)
#         X, y = generate_data(elem, d, mu = mean)
        
# #         uf_out = cef_estimate(np.array(X), y, 300, .32, depth = 30)
# #         cart_out = CART_estimate(X, y)
        
# #         irf_obj = CalibratedClassifierCV(base_estimator=RandomForestClassifier(n_estimators = 300), 
# #                                      method='isotonic', 
# #                                      cv = 5)
# #         X, y, X_eval, y_eval = split_train_eval(X, y, frac_eval)
# #         irf_obj.fit(X, y)
# #         p = irf_obj.predict_proba(X_eval)
# #         irf_out = np.mean(entropy(p.T, base = np.exp(1)))
# #         uf2_out = UncertaintyForest(n_estimators = 300, frac_struct = 0.33).fit(X, y).estimate_cond_entropy()
#         hon_out = UncertaintyForest(n_estimators = 300, frac_struct = 0.33, finite_correction = False).fit(X, y).estimate_cond_entropy()
        
# #         return (uf_out, cart_out, irf_out)
#         return hon_out
    
#     for i, elem in enumerate(sample_sizes):
#         output = np.array(Parallel(n_jobs=-2)(delayed(worker)(t) for t in range(num_trials)))
# #         uf[i, :] = output[:, 0]
# #         cart[i, :] = output[:, 1]
# #         irf[i, :] = output[:, 2]
# #         uf2[i, :] = output
#         hon[i, :] = output
        
# #     pickle.dump(sample_sizes, open('sample_sizes_d_%d.pkl' % d, 'wb'))
# #     pickle.dump(uf, open('uf_by_n_d_%d.pkl' % d, 'wb'))
# #     pickle.dump(cart, open('cart_by_n_d_%d.pkl' % d, 'wb'))
# #     pickle.dump(irf, open('irf_by_n_d_%d.pkl' % d, 'wb'))
# #     pickle.dump(uf2, open('uf2_by_n_d_%d.pkl' % d, 'wb'))
#     pickle.dump(hon, open('hon_by_n_d_%d.pkl' % d, 'wb'))

#     return hon

In [14]:
def get_cond_entropy_vs_n(mean, d, num_trials, sample_sizes, algos):
    
    # labels = ["CART", "IRF", "UF1", "UF2", "UF3", "UF4", "UF5"]
    
    def worker(t):
        X, y = generate_data(elem, d, mu = mean)
        
        ret = []
        for algo in algos:
            ret.append(estimate_cef(X, y, algo['label']))

        return tuple(ret)
    
    output = [np.zeros((len(sample_sizes), num_trials))]*len(algos)
    for i, elem in enumerate(sample_sizes):
        results = np.array(Parallel(n_jobs=-2)(delayed(worker)(t) for t in range(num_trials)))
        for j in range(len(algos)):
            output[j][i, :] = results[:, j]
        
    pickle.dump(sample_sizes, open('sample_sizes_d_%d.pkl' % d, 'wb'))
    for j, algo in enumerate(algos):
        pickle.dump(output[j], open('%s_by_n_d_%d.pkl' % (algo['label'], d), 'wb'))
        
    return output

In [15]:
def estimate_cef(X, y, label):
    if label == "CART":
        return CART_estimate(X, y)
    elif label == "IRF":
        frac_eval = 0.3
        irf = CalibratedClassifierCV(base_estimator=RandomForestClassifier(n_estimators = 300), 
                                     method='isotonic', 
                                     cv = 5)
        # X_train, y_train, X_eval, y_eval = split_train_eval(X, y, frac_eval)
        X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=frac_eval)
        irf.fit(X_train, y_train)
        p = irf.predict_proba(X_eval)
        return np.mean(entropy(p.T, base = np.exp(1)))
    elif label == "UF1":
        return cef_estimate(np.array(X), y, 300, .32, depth = 30)
    elif label == "UF2":
        uf = UncertaintyForest(n_estimators = 300, frac_struct = 0.33, frac_est = 0.33)
        return uf.fit(X,y).estimate_cond_entropy()
    elif label == "UF3":
        uf = UncertaintyForest(n_estimators = 300, frac_struct = 0.33, frac_est = 0.33, finite_correction = False)
        return uf.fit(X,y).estimate_cond_entropy()
    elif label == "UF4":
        uf = UncertaintyForest(n_estimators = 300, frac_struct = 0.50, frac_est = 0.25)
        return uf.fit(X,y).estimate_cond_entropy()
    elif label == "UF5":
        uf = UncertaintyForest(n_estimators = 300, frac_struct = 0.25, frac_est = 0.50)
        return uf.fit(X,y).estimate_cond_entropy()
    else:
        raise ValueError("Unrecognized Label!")

In [16]:
# def get_cond_entropy_vs_mu(n, d, num_trials, mus):
    
# #     uf = np.zeros((len(mus), num_trials))
# #     cart = np.zeros((len(mus), num_trials))
# #     irf = np.zeros((len(mus), num_trials))
# #     uf2 = np.zeros((len(mus), num_trials))
#     hon = np.zeros((len(mus), num_trials))
    
#     def worker(t):
#         # X, y = get_multivariate_sample(elem, d, mean)
#         X, y = generate_data(n, d, mu = elem)
        
# #         uf_out = cef_estimate(np.array(X), y, 300, .32, depth = 30)
# #         cart_out = CART_estimate(X, y)
        
# #         irf_obj = CalibratedClassifierCV(base_estimator=RandomForestClassifier(n_estimators = 300), 
# #                                      method='isotonic', 
# #                                      cv = 5)
# #         X, y, X_eval, y_eval = split_train_eval(X, y, frac_eval)
# #         irf_obj.fit(X, y)
# #         p = irf_obj.predict_proba(X_eval)
# #         irf_out = np.mean(entropy(p.T, base = np.exp(1)))
# #         uf2_out = UncertaintyForest(n_estimators = 300, frac_struct = 0.33).fit(X, y).estimate_cond_entropy()
#         hon_out = UncertaintyForest(n_estimators = 300, frac_struct = 0.33).fit(X, y).estimate_cond_entropy()
        
#         # return (uf_out, cart_out, irf_out)
#         return hon_out
    
#     for i, elem in enumerate(mus):
#         output = np.array(Parallel(n_jobs=-2)(delayed(worker)(t) for t in range(num_trials)))
# #         uf[i, :] = output[:, 0]
# #         cart[i, :] = output[:, 1]
# #         irf[i, :] = output[:, 2]
# #         uf2[i, :] = output
#         hon[i, :] = output
        
#     pickle.dump(mus, open('mus_d_%d.pkl' % d, 'wb'))
# #     pickle.dump(uf, open('uf_by_mu_d_%d.pkl' % d, 'wb'))
# #     pickle.dump(cart, open('cart_by_mu_d_%d.pkl' % d, 'wb'))
# #     pickle.dump(irf, open('irf_by_mu_d_%d.pkl' % d, 'wb'))
# #     pickle.dump(uf2, open('uf2_by_mu_d_%d.pkl' % d, 'wb'))
#     pickle.dump(hon, open('hon_by_mu_d_%d.pkl' % d, 'wb'))

#     # return uf, cart, irf
#     return hon

In [17]:
def get_cond_entropy_vs_mu(n, d, num_trials, mus, algos):
    
    # labels = ["CART", "IRF", "UF1", "UF2", "UF3", "UF4", "UF5"]
    
    def worker(t):
        X, y = generate_data(n, d, mu = elem)
        
        ret = []
        for algo in algos:
            ret.append(estimate_cef(X, y, algo['label']))

        return tuple(ret)
    
    output = [np.zeros((len(mus), num_trials))]*len(algos)
    for i, elem in enumerate(mus):
        results = np.array(Parallel(n_jobs=-2)(delayed(worker)(t) for t in range(num_trials)))
        for j in range(len(algos)):
            output[j][i, :] = results[:, j]
        
    pickle.dump(mus, open('mus.pkl', 'wb'))
    for j, algo in enumerate(algos):
        pickle.dump(output[j], open('%s_by_n_d_%d.pkl' % (algo['label'], d), 'wb'))
        
    return output

## Run Experiments and Plot

In [18]:
# Data.
mus = [i * 0.5 for i in range(1, 11)]
frac_eval = 0.3
effect_size = 1
d1 = 1
d2 = 40
n1 = 5000
n2 = 10000
num_trials = 20
num_plotted_trials = 10
sample_sizes_d1 = range(400, 2501, 300)
sample_sizes_d2 = range(400, 10001, 960)

# Fake params.
# mus = [i * 0.5 for i in range(1, 4)]
# # frac_eval = 0.3
# # n_estimators = 200
# effect_size = 1
# d1 = 1
# d2 = 3
# n1 = 100
# n2 = 110
# num_trials = 3
# num_plotted_trials = 3
# sample_sizes_d1 = range(100, 120, 10)
# sample_sizes_d2 = range(100, 130, 10)

# Algorithms.
algos = [
    {
        'label': 'CART',
        'title': 'CART Forest',
        'color': "#1b9e77",
    },
    {
        'label': 'IRF',
        'title': 'Isotonic Reg. Forest',
        'color': "#fdae61",
    },
    {
        'label': 'UF1',
        'title': 'Uncertainty Forest 1',
        'color': "#F41711",
    },
    {
        'label': 'UF2',
        'title': 'Uncertainty Forest 2',
        'color': "purple",
    },
    {
        'label': 'UF3',
        'title': 'Uncertainty Forest 3',
        'color': "blue",
    },
    {
        'label': 'UF4',
        'title': 'Uncertainty Forest 4',
        'color': "m",
    },
    {
        'label': 'UF5',
        'title': 'Uncertainty Forest 5',
        'color': "g",
    },
]

### Estimated H(Y | X) versus n, d = 1

In [19]:
get_cond_entropy_vs_n(effect_size, d1, num_trials, sample_sizes_d1, algos)

[array([[0.3491779 , 0.38506953, 0.30000501, 0.38381699, 0.38230503,
         0.36147004, 0.36758465, 0.34971459, 0.37221792, 0.37309355,
         0.37327357, 0.35533922, 0.35871351, 0.367598  , 0.39763752,
         0.36637928, 0.35822566, 0.3462868 , 0.32731299, 0.36820729],
        [0.41201099, 0.36793538, 0.36812162, 0.35342113, 0.30434775,
         0.32623956, 0.33060861, 0.35958452, 0.38555986, 0.3664273 ,
         0.38940899, 0.37780015, 0.39705662, 0.35866431, 0.37269361,
         0.36723586, 0.3739822 , 0.37143836, 0.35981061, 0.35008241],
        [0.33453992, 0.36882288, 0.34313401, 0.33316869, 0.35548837,
         0.34672736, 0.3464147 , 0.34314415, 0.34817361, 0.34843293,
         0.33238282, 0.35864325, 0.34840192, 0.36142798, 0.34155358,
         0.35722846, 0.37703521, 0.39043607, 0.34924401, 0.34905135],
        [0.39214282, 0.35492558, 0.32521157, 0.37371477, 0.33869102,
         0.37888871, 0.34449873, 0.33022598, 0.35226251, 0.3594689 ,
         0.3688859 , 0.36314613

### Estimated H(Y | X) versus mu, d = 1

In [20]:
# Estimate conditional entropy vs mu.
# algos = conditional_entropy_by_mu(mus, n1, d1, frac_eval, algos, num_trials_mu, parallel = parallel)
# print(algos[0]['cond_entropy_by_mu_d_%d' % d1])
get_cond_entropy_vs_mu(n1, d1, num_trials, mus, algos)

[array([[0.54147832, 0.55868474, 0.55630742, 0.55064156, 0.54503008,
         0.55762157, 0.55304849, 0.54734196, 0.54623512, 0.54869378,
         0.55468892, 0.54751063, 0.55277168, 0.55112305, 0.55024496,
         0.552435  , 0.54555116, 0.54145526, 0.55694413, 0.55299388],
        [0.36336058, 0.33753803, 0.33360835, 0.34257121, 0.34285132,
         0.34404045, 0.35283756, 0.33696941, 0.34808397, 0.35833637,
         0.34632148, 0.35618004, 0.35945681, 0.34738858, 0.34851086,
         0.35374762, 0.33563571, 0.34137232, 0.341735  , 0.36050724],
        [0.15873125, 0.17567927, 0.15802742, 0.17116975, 0.15707784,
         0.17207461, 0.16776755, 0.1659406 , 0.17150868, 0.16896625,
         0.17711803, 0.17192915, 0.14823465, 0.1627457 , 0.16855257,
         0.17278386, 0.16117671, 0.16167538, 0.15796256, 0.16086786],
        [0.0674698 , 0.05962632, 0.07896242, 0.06308129, 0.06688143,
         0.06629395, 0.07178658, 0.06843846, 0.06077751, 0.06328867,
         0.06720056, 0.06048721

### Estimated H(Y | X) versus n, d = 40

In [21]:
# Estimate conditional entropy vs n.
# algos =conditional_entropy_by_n(sample_sizes_d2, d2, frac_eval, algos, num_trials_n, effect_size, parallel = parallel)
# print(algos[0]['cond_entropy_by_n_d_%d' % d2])

get_cond_entropy_vs_n(effect_size, d2, num_trials, sample_sizes_d2, algos)

[array([[0.53177803, 0.53503069, 0.53122401, 0.54485593, 0.53312405,
         0.52097932, 0.54202658, 0.54749744, 0.50635424, 0.56080055,
         0.55902808, 0.51014334, 0.53616981, 0.54765936, 0.54158789,
         0.51774187, 0.51935436, 0.52764309, 0.53779934, 0.53981637],
        [0.49732579, 0.48641505, 0.4854396 , 0.49429404, 0.49700857,
         0.50150678, 0.49917073, 0.49490087, 0.46993082, 0.48505424,
         0.50019698, 0.50147035, 0.49370764, 0.4964123 , 0.48847044,
         0.48508512, 0.50337075, 0.50452199, 0.49534575, 0.48248422],
        [0.46893194, 0.484089  , 0.4747261 , 0.46709117, 0.47635153,
         0.4510255 , 0.47039364, 0.47176456, 0.48685442, 0.46659167,
         0.47200395, 0.46161198, 0.46050637, 0.46736888, 0.4766554 ,
         0.47001596, 0.4856485 , 0.48082006, 0.48522724, 0.47323196],
        [0.4616977 , 0.44869172, 0.43383897, 0.45753062, 0.4724017 ,
         0.44860356, 0.45511176, 0.45983079, 0.46204019, 0.45577613,
         0.46743023, 0.46722562

### Estimated H(Y | X) versus mu, d = 40

In [22]:
# Estimate conditional entropy vs mu.
# algos = conditional_entropy_by_mu(mus, n2, d2, frac_eval, algos, num_trials_mu, parallel = parallel)
# print(algos[0]['cond_entropy_by_mu_d_%d' % d2])
get_cond_entropy_vs_mu(n2, d2, num_trials, mus, algos)

[array([[0.5861043 , 0.5830266 , 0.59118666, 0.5899025 , 0.59078848,
         0.59143381, 0.59019654, 0.58738898, 0.58367325, 0.58730753,
         0.5922321 , 0.59023201, 0.59319068, 0.58802836, 0.58423987,
         0.58628369, 0.58643552, 0.58789271, 0.58275628, 0.58183596],
        [0.43552204, 0.42543289, 0.43490346, 0.44100142, 0.43416848,
         0.42401088, 0.4295988 , 0.4281135 , 0.42102601, 0.4269695 ,
         0.43678295, 0.42205862, 0.43394467, 0.42373304, 0.43300683,
         0.42928602, 0.43769233, 0.42907815, 0.43995858, 0.43103792],
        [0.27346269, 0.27736988, 0.27377752, 0.26768251, 0.27165246,
         0.27015234, 0.27330863, 0.27373957, 0.26956672, 0.27824987,
         0.27382277, 0.27289173, 0.27182822, 0.27173165, 0.27336843,
         0.28724562, 0.27571039, 0.2767927 , 0.26800237, 0.27072854],
        [0.16347248, 0.1719471 , 0.17826224, 0.17030457, 0.16289176,
         0.17006878, 0.17938893, 0.16718416, 0.17766886, 0.16650545,
         0.17717703, 0.17092271

In [23]:
# # Correct log base.
# old_base = 2.0
# new_base = np.exp(1)
# correction = np.log(old_base) / np.log(new_base)

# for d in [d1, d2]:
#     for label in ['uf2']:
#         by_n = pickle.load(open('%s_by_n_d_%d.pkl' % (label, d), 'rb')) * correction
#         pickle.dump(by_n, open('%s_by_n_d_%d.pkl' % (label, d), 'wb'))
#         by_mu = pickle.load(open('%s_by_mu_d_%d.pkl' % (label, d), 'rb')) * correction
#         pickle.dump(by_mu, open('%s_by_mu_d_%d.pkl' % (label, d), 'wb'))

In [24]:
# plot_fig2(num_plotted_trials, d1, d2, n1, n2, effect_size)