In [20]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble.forest import _generate_unsampled_indices
from sklearn.ensemble.forest import _generate_sample_indices

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

from tqdm import tqdm_notebook as tqdm

from joblib import Parallel, delayed

In [21]:
def generate_2d_rotation(theta=0, acorn=None):
    """
    Build the 2x2 rotation matrix for an angle.

    Input
    theta: float; rotation angle in radians
    acorn: int or None; when given, numpy's global random state is seeded with it

    Return
    R: numpy array of shape (2, 2) laid out as [[cos, sin], [-sin, cos]]
    """
    # The matrix itself is deterministic; seeding only resets numpy's global
    # generator as a side effect for whatever the caller samples next.
    if acorn is not None:
        np.random.seed(acorn)

    cos_t, sin_t = np.cos(theta), np.sin(theta)
    return np.array([[cos_t, sin_t], [-sin_t, cos_t]])

def generate_parity(n, d=2, angle_params=None, acorn=None):
    """
    Sample uniformly from [-2, 2]^d and label each point by the parity of its
    number of positive coordinates.

    Input
    n: int; number of samples requested
    d: int; dimension of the samples
    angle_params: float or None; rotation angle in radians (only used when
        d == 2). None draws the angle uniformly from [0, 2*pi).
    acorn: int or None; when given, numpy's global random state is seeded with it

    Return
    X: numpy array with at most n rows and d columns
    Y: numpy int array; 1 where the count of positive coordinates is even, else 0

    Notes
    Labels are computed before the rotation is applied. For d == 2 the rotated
    points are kept only if they fall inside the open square (-1, 1)^2, so
    fewer than n rows can be returned if too few candidates survive.
    """
    if acorn is not None:
        np.random.seed(acorn)

    # Oversample by a factor of 10 so the rejection step below has enough
    # candidates to choose from.
    X = np.random.uniform(-2, 2, size=(10*n, d))
    Y = (np.sum(X > 0, axis=1) % 2 == 0).astype(int)

    if d == 2:
        if angle_params is None:
            angle_params = np.random.uniform(0, 2*np.pi)
        R = generate_2d_rotation(angle_params)
        X = X @ R

        # Keep only the rotated points that land inside the unit square.
        inside = (np.abs(X[:, 0]) < 1) & (np.abs(X[:, 1]) < 1)
        X, Y = X[inside], Y[inside]

    # Truncate in every dimension: without this, d != 2 would hand back all
    # 10*n oversampled candidates instead of the n that were asked for.
    return X[:n], Y[:n].astype(int)

def generate_gaussian_parity(n, mean=np.array([-1, -1]), cov_scale=1, angle_params=None, k=1, acorn=None):
    """
    Sample a 2-d checkerboard ("parity") classification problem.

    The plane is tiled with a 2**k by 2**k grid of cells of side 1/2**(k-1).
    The n samples are split across the 4**k cells by an equal-probability
    multinomial draw, and each cell's samples are shifted to that cell.

    Input
    n: int; total number of samples
    mean: length-2 array-like; centre of the Gaussian for cell (0, 0). The
        exact value [-1, -1] acts as a sentinel and is replaced by
        -1 + 1/2**k in both coordinates.
    cov_scale: float; each Gaussian has covariance cov_scale * I. A value of 0
        switches to uniform sampling on [-1, -1 + 1/2**(k-1)) per coordinate.
    angle_params: float or None; rotation angle in radians applied to all
        samples. None draws the angle uniformly from [0, 2*pi).
    k: int; controls the grid resolution (see above)
    acorn: int or None; when given, numpy's global random state is seeded with it

    Return
    X: numpy array of shape (n, 2); the rotated samples
    Y: numpy int array of shape (n,); 0 where the cell's row and column index
        have the same parity, 1 otherwise

    Raises
    ValueError: if mean does not have exactly two entries
    """
    if acorn is not None:
        np.random.seed(acorn)

    mean = np.asarray(mean)
    d = len(mean)

    # Fail fast: only the 2-d case is implemented, so reject other dimensions
    # before any sampling work (and before mean[1] is touched).
    if d != 2:
        raise ValueError('d=%i not implemented!'%(d))

    if mean[0] == -1 and mean[1] == -1:
        mean = mean + 1 / 2**k

    cells_per_side = 2**k
    cell_width = 1/2**(k-1)

    # Number of samples per cell, plus the running offsets into X / Y.
    mnt = np.random.multinomial(n, 1/(4**k) * np.ones(4**k))
    cumsum = np.concatenate(([0], np.cumsum(mnt)))

    Y = np.zeros(n)
    X = np.zeros((n, d))

    for i in range(cells_per_side):
        for j in range(cells_per_side):
            cell = i*cells_per_side + j

            if cov_scale == 0:
                temp = np.random.uniform(-1, -1 + cell_width, size=(mnt[cell], d))
            else:
                temp = np.random.multivariate_normal(mean, cov_scale * np.eye(d),
                                                     size=mnt[cell])

            # Shift the draw from cell (0, 0) to cell (i, j).
            temp[:, 0] += i*cell_width
            temp[:, 1] += j*cell_width

            rows = slice(cumsum[cell], cumsum[cell + 1])
            X[rows] = temp
            Y[rows] = 0 if i % 2 == j % 2 else 1

    # The angle is drawn after the samples so the order of random draws is
    # unchanged from earlier versions of this function.
    if angle_params is None:
        angle_params = np.random.uniform(0, 2*np.pi)

    R = generate_2d_rotation(angle_params)
    X = X @ R

    return X, Y.astype(int)

def get_colors(colors, inds):
    """Return a list holding colors[i] for every i in inds, in order."""
    picked = []
    for idx in inds:
        picked.append(colors[idx])
    return picked

In [49]:
def estimate_posteriors(tree, X, y):
    """
    Estimate the class posterior in every leaf of a fitted decision tree.

    Input
    tree: a fit DecisionTreeClassifier
    X: numpy array of samples to drop down the tree
    y: integer class labels for X. They are used directly as column indices,
        so they are expected to lie in 0..K-1 where K is the number of
        distinct values in y.

    Return
    posterior_per_leaf: list of lists; row r is the finite-sample-corrected
        class distribution of leaf unique_leaf_nodes[r]
    tree: the tree that was passed in
    unique_leaf_nodes: numpy array of leaf node ids, as returned by get_leaves

    Usage
    predict(..)
    """
    labels = np.asarray(y)
    size = len(np.unique(labels))

    unique_leaf_nodes = get_leaves(tree)
    class_counts_per_leaf = np.zeros((len(unique_leaf_nodes), size))

    if len(labels) > 0:
        # Drop all estimation examples down the tree in one call instead of
        # one tree.apply() call per sample.
        row_of_leaf = {
            node: row for row, node in enumerate(unique_leaf_nodes.tolist())
        }
        leaf_rows = np.array(
            [row_of_leaf[node] for node in tree.apply(X[:len(labels)]).tolist()],
            dtype=int,
        )
        # np.add.at accumulates correctly when the same (leaf, class) pair
        # occurs more than once; plain fancy-index += would count it once.
        np.add.at(class_counts_per_leaf, (leaf_rows, labels), 1)

    # Number of data points in each leaf.
    n_per_leaf = class_counts_per_leaf.sum(axis=1)
    n_per_leaf[n_per_leaf == 0] = 1  # Avoid divide by zero.

    # Posterior probability distribution in each leaf (one row per leaf,
    # one column per class), then pulled away from exact 0 / 1.
    posterior_per_leaf = class_counts_per_leaf / n_per_leaf[:, np.newaxis]
    posterior_per_leaf = finite_sample_correction(
        posterior_per_leaf, n_per_leaf
    )

    return posterior_per_leaf.tolist(), tree, unique_leaf_nodes


def predict(posterior_info, X):
    """
    Predict class labels from per-leaf posteriors.

    Input
    posterior_info: sequence whose first three entries are
        (posterior_per_leaf, tree, unique_leaf_nodes), i.e. the layout
        returned by estimate_posteriors
    X: samples to classify; passed to tree.apply

    Return
    numpy array with, for every sample, the index of the largest posterior
    in the leaf the sample falls into
    """
    leaf_posteriors = posterior_info[0]
    fitted_tree = posterior_info[1]
    leaf_ids = posterior_info[2]

    # Look up, for each evaluation sample, the posterior row of its leaf.
    sample_posteriors = []
    for node in fitted_tree.apply(X):
        position = np.where(leaf_ids == node)[0][0]
        sample_posteriors.append(leaf_posteriors[position])

    return np.argmax(np.array(sample_posteriors), axis=1)


def get_leaves(estimator):
    """
    Internal function to get leaf node ids of estimator.

    Input
    estimator: a fit DecisionTreeClassifier

    Return
    leaf_ids: numpy array; an array of leaf node ids, in the order the
        stack-based traversal reaches them (right subtree before left)

    Usage
    estimate_posteriors(..)
    """

    # adapted from https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
    children_left = estimator.tree_.children_left
    children_right = estimator.tree_.children_right

    leaf_ids = []
    stack = [0]  # start at the root node
    while stack:
        node_id = stack.pop()
        left, right = children_left[node_id], children_right[node_id]

        # A node whose two children differ is a test (split) node; a node
        # whose children are equal is treated as a leaf.
        if left != right:
            stack.append(left)
            stack.append(right)
        else:
            leaf_ids.append(node_id)

    return np.array(leaf_ids)
    
    
def finite_sample_correction(class_probs, row_sums):
    """
    An internal function for finite sample correction of posterior estimation.

    Entries that are exactly 0 become 1 / (2 * n) and entries that are exactly
    1 become 1 - 1 / (2 * n), where n is the count of the entry's row.

    Input
    class_probs: 2-d numpy array; array of posteriors to correct (modified
        in place)
    row_sums: numpy array; array of partition counts, one per row of
        class_probs

    Output
    class_probs: numpy array; finite sample corrected posteriors (the same
        object that was passed in)

    Usage
    estimate_posteriors(..)

    """
    row_sums = np.asarray(row_sums)

    # Index row_sums with plain row indices so every assigned value is a
    # scalar per entry, not a one-element array squeezed into a scalar slot.
    zero_rows, zero_cols = np.nonzero(class_probs == 0)
    class_probs[zero_rows, zero_cols] = 1 / (2 * row_sums[zero_rows])

    # Located after the zeros were replaced, matching the original order.
    one_rows, one_cols = np.nonzero(class_probs == 1)
    class_probs[one_rows, one_cols] = 1 - 1 / (2 * row_sums[one_rows])

    return class_probs

In [113]:
# Smoke test: draw 200 samples with the generator's default settings
# (no seed is passed, so the data differ from run to run).
X, y = generate_gaussian_parity(200)

# Fit a depth-2 tree and estimate its per-leaf posteriors on the same data.
tree = DecisionTreeClassifier(max_depth=2).fit(X, y)
eta = estimate_posteriors(tree, X, y)
# Element-wise agreement between the posterior-based predictions and the
# labels: one boolean per sample (its mean would be the training accuracy).
predict(eta, X) == y

array([False, False,  True, False,  True, False,  True, False,  True,
       False, False,  True,  True,  True, False,  True,  True, False,
       False, False, False, False, False,  True,  True, False, False,
       False, False, False, False, False, False, False, False,  True,
       False,  True, False,  True, False, False, False, False, False,
       False,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,

In [339]:
def task_similarity_estimation_experiment(nx, funcx, paramsx, classifierx, classifier_paramsx,
                                          nz, funcz, paramsz, classifierz, classifier_paramsz,
                                          acorn=None):
    """
    Run one replicate of the task-similarity experiment.

    A source sample (X, y) and a target sample (Z, w) are generated and one
    classifier is fit per task. Per-leaf posteriors are then estimated from
    the source data for both fitted trees, and the two resulting predictions
    on X are compared.

    Input
    nx: int; number of source samples, passed as first argument to funcx
    funcx: callable; source data generator, called as funcx(nx, *paramsx)
        and expected to return (X, y)
    paramsx: tuple; extra positional arguments for funcx
    classifierx: callable; builds the source classifier from
        **classifier_paramsx. The result must offer fit() and whatever
        estimate_posteriors needs of a fitted tree.
    classifier_paramsx: dict; keyword arguments for classifierx
    nz, funcz, paramsz, classifierz, classifier_paramsz: the same for the
        target task
    acorn: int or None; when given, numpy's global random state is seeded with it

    Return
    float; fraction of source samples on which the two predictions agree
    """
    # Seed only when a seed was actually supplied; calling np.random.seed(None)
    # would instead reseed from fresh entropy and discard any seed the caller
    # had set globally.
    if acorn is not None:
        np.random.seed(acorn)

    # Source task
    X, y = funcx(nx, *paramsx)

    # Target task
    Z, w = funcz(nz, *paramsz)

    task1_tree = classifierx(**classifier_paramsx)
    task1_tree.fit(X, y)
    task1_posteriors = estimate_posteriors(task1_tree, X, y)
    yhat1 = predict(task1_posteriors, X)

    task2_tree = classifierz(**classifier_paramsz)
    task2_tree.fit(Z, w)
    # NOTE(review): the target tree's posteriors are estimated from the
    # *source* data (X, y), not (Z, w). This looks intentional (reuse the
    # target task's partition to vote on the source task) - confirm.
    task2_posteriors = estimate_posteriors(task2_tree, X, y)
    yhat2 = predict(task2_posteriors, X)

    return np.mean(yhat1 == yhat2)

In [340]:
# Catalogue of tasks used in the experiments. Every task uses the same
# generator and classifier; they differ only in the (cov_scale, angle, k)
# part of the generator's positional parameters.
dists = {
    task_name: {
        'funcx': generate_gaussian_parity,
        'paramsx': (np.array([-1, -1]), cov_scale, angle, k),
        'classifierx': DecisionTreeClassifier,
        'classifier_paramsx': {},
    }
    for task_name, cov_scale, angle, k in (
        ('xor', 0.1, 0, 1),
        ('n-xor', 0.1, np.pi/2, 1),
        ('r-xor', 0.1, np.pi/4, 1),
        ('f-xor', 0.01, 0, 2),
    )
}

In [346]:
# Monte-Carlo estimate of the pairwise task similarity for every ordered pair
# of tasks in `dists`, at each sample size in `ns`.
# NOTE(review): the replicates run through joblib workers; if those are
# separate processes, the seed set here may not govern the draws made inside
# them - confirm if exact reproducibility is required.
np.random.seed(1)

ns = np.array([10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000])

# Indexed as [source task, target task, sample size].
means = np.zeros((len(dists), len(dists), len(ns)))
std_errors = np.zeros((len(dists), len(dists), len(ns)))

n_mc=100
for i, n in enumerate(ns):
    n_samples = int(n)  # plain int; used for both the source and target sample size

    for j, key1 in enumerate(dists):
        for k, key2 in enumerate(dists):
            source, target = dists[key1], dists[key2]

            # One call per Monte-Carlo replicate; each returns the fraction of
            # source samples on which the two predictions agree.
            temp_errors = np.array(
                Parallel(n_jobs=-2)(
                    delayed(task_similarity_estimation_experiment)(
                        n_samples,
                        source['funcx'],
                        source['paramsx'],
                        source['classifierx'],
                        source['classifier_paramsx'],
                        n_samples,
                        target['funcx'],
                        target['paramsx'],
                        target['classifierx'],
                        target['classifier_paramsx'],
                    )
                    for _ in range(n_mc)
                )
            )

            means[j,k,i] = np.mean(temp_errors)
            std_errors[j,k,i] = np.std(temp_errors) / np.sqrt(n_mc)

KeyboardInterrupt: 

In [320]:
# Single replicate of the similarity experiment with hand-picked settings.
# The params tuples are passed positionally after the sample size, i.e. as
# (mean, cov_scale, angle_params, k) of generate_gaussian_parity.

# Source task: k=1, unrotated.
nx=1000
funcx=generate_gaussian_parity
paramsx= (np.array([-1, -1]), 0.05, 0, 1)
classifierx=DecisionTreeClassifier
classifier_paramsx={}

# Target task: same generator with k=2 (a finer grid), unrotated.
nz=1000
funcz=generate_gaussian_parity 
paramsz=(np.array([-1, -1]), 0.05, 0, 2)
classifierz=DecisionTreeClassifier
classifier_paramsz={}
                       
# No seed is passed, so the returned agreement varies from run to run.
task_similarity_estimation_experiment(nx, funcx, paramsx, classifierx, classifier_paramsx, 
                                      nz, funcz, paramsz, classifierz, classifier_paramsz)

0.95

In [335]:
# Same experiment with the roles swapped: the z-settings now describe the
# source task and the x-settings the target task. The function is not
# symmetric in its two tasks, so the result need not match the previous call.
task_similarity_estimation_experiment(nz, funcz, paramsz, classifierz, classifier_paramsz, 
                                      nx, funcx, paramsx, classifierx, classifier_paramsx)

0.585