In [13]:
from algs_lib import *

In [14]:
def hybrid_noise_test(train_x, train_y, mechanism, subsample_rate, num_classes,
    eta, regularize=None, rebalance = False, num_trees=None, tree_depth = None, max_mi = 0.5, num_dims = None,
    record_ys = False, fname = None, num_trials = 1024):
    """Estimate per-coordinate noise scales from how a mechanism's output varies across random splits.

    Each pass shuffles the row indices, splits them at ``subsample_rate`` and
    runs ``mechanism`` once on each side of the split. Element ``[1]`` of the
    value the mechanism returns is recorded coordinate by coordinate. After all
    runs, the variance of every coordinate is turned into a noise scale::

        noise[i] = sqrt(var_i) * sum_j sqrt(var_j) / (2 * max_mi)

    Parameters
    ----------
    train_x, train_y :
        Training inputs and labels. Both are indexed with a list of shuffled
        row indices, so they must support that (assumes NumPy arrays; confirm
        against the caller).
    mechanism :
        Either ``run_kmeans`` or a callable whose ``__name__`` is
        ``'fit_forest'`` or ``'fit_gbdt'``.
    subsample_rate :
        Number of rows placed in the first half of each split (used directly
        as a slice bound). Must be at least ``num_classes``.
    num_classes :
        Number of classes; when ``None`` it is taken as the number of distinct
        values in ``train_y``. Forwarded to ``run_kmeans``.
    eta, num_dims, record_ys, fname :
        Accepted for interface compatibility; not read by this function.
    regularize :
        Forwarded as the last positional argument to the tree-based mechanisms.
    rebalance :
        Forwarded to ``run_kmeans``.
    num_trees, tree_depth :
        Required (non-``None``) for the tree-based mechanisms.
    max_mi :
        Divisor in the noise formula above.
    num_trials :
        Number of mechanism runs. Runs are counted individually but always
        performed in pairs (both halves of a split), so an odd value results
        in one extra run.

    Returns
    -------
    (est_x, noise) :
        ``est_x`` maps the run number to the row indices that run was given;
        ``noise`` maps each output coordinate to its noise scale.

    Raises
    ------
    AssertionError
        If ``subsample_rate < num_classes``, or a tree-based mechanism is used
        without ``num_trees`` / ``tree_depth``.
    ValueError
        If ``mechanism`` is not one of the supported mechanisms.
    """
    if num_classes is None:
        num_classes = len(set(train_y))

    assert subsample_rate >= num_classes, "subsample_rate must be at least num_classes"

    # Resolve the dispatch once, before any work is done, so that a bad
    # configuration fails immediately instead of surfacing later as an
    # undefined (or stale) `output` inside the loop.
    mechanism_name = getattr(mechanism, '__name__', None)
    if mechanism_name in ('fit_forest', 'fit_gbdt'):
        assert num_trees is not None, "num_trees is required for tree-based mechanisms"
        assert tree_depth is not None, "tree_depth is required for tree-based mechanisms"
        tree_based = True
    elif mechanism == run_kmeans:
        tree_based = False
    else:
        raise ValueError(
            f"unsupported mechanism {mechanism!r}: expected run_kmeans or a "
            "callable named 'fit_forest' or 'fit_gbdt'"
        )

    # Fixed seed handed to the mechanism on every run. Note that `shuffle`
    # below is called without a seed argument, so the splits themselves are
    # not pinned by this value.
    seed = 743895091 # randomly generated seed for reproducibility

    est_y = {}  # output coordinate -> values observed across runs
    est_x = {}  # run number -> row indices used for that run
    curr_trial = 0

    while curr_trial < num_trials:
        shuffled_inds = shuffle(list(range(len(train_x))))
        x, y = train_x[shuffled_inds], train_y[shuffled_inds]
        halves = [
            (x[:subsample_rate], y[:subsample_rate], shuffled_inds[:subsample_rate]),
            (x[subsample_rate:], y[subsample_rate:], shuffled_inds[subsample_rate:]),
        ]

        for half_x, half_y, half_inds in halves:
            est_x[curr_trial] = half_inds

            if tree_based:
                output = mechanism(half_x, half_y, num_trees, tree_depth, seed, regularize)[1]
            else:
                output = mechanism(half_x, half_y, num_classes, seed, rebalance=rebalance)[1]

            for ind in range(len(output)):
                est_y.setdefault(ind, []).append(output[ind])

            curr_trial += 1

    # Per-coordinate standard deviation of the recorded outputs.
    fin_std = {ind: np.var(values) ** 0.5 for ind, values in est_y.items()}
    total_std = sum(fin_std.values())

    noise = {ind: 1./(2*max_mi) * std * total_std for ind, std in fin_std.items()}
    return est_x, noise


In [15]:
# gen_iris is not defined in the visible cells (presumably provided by the
# algs_lib star import). Its six return values are unpacked here; it is called
# with normalization switched on and norm_kind='power' — what 'power' selects
# is decided inside gen_iris, confirm there.
train_x, train_y, test_x, test_y, num_classes, train_len = gen_iris(normalize=True, norm_kind='power')

In [16]:
# Despite the name this is a row count, not a fraction: half of train_len,
# truncated to an int. hybrid_noise_test uses it as the slice bound that
# separates the two halves of each shuffled split.
subsample_rate = int(0.5*train_len)

In [17]:
# k-means, 100 repetitions, each noise estimate built from only 2 mechanism
# runs (num_trials=2 means one shuffle, both halves). all_noise collects the
# Euclidean norm of each resulting noise vector.
# eta is a required argument of hybrid_noise_test but is not read by it.
# The loop variable `_` is rebound to the returned index map on every pass.
all_noise = []
for _ in range(100):
    _, noise = hybrid_noise_test(train_x, train_y, run_kmeans, subsample_rate, eta=1e-6,
            num_classes = num_classes, max_mi=0.5, rebalance=False, num_trials = 2)
    all_noise.append(np.linalg.norm(list(noise.values())))

In [18]:
# Smallest and largest noise-vector norm currently held in all_noise.
min(all_noise), max(all_noise)

(0.0030000016862906445, 0.8953363659314739)

In [19]:
# Same k-means experiment as with num_trials=2, but each of the 100 noise
# estimates now uses 128 mechanism runs (64 shuffles, both halves each).
# print(trial) emits one progress line per repetition.
# eta is a required argument of hybrid_noise_test but is not read by it.
all_noise = []
for trial in range(100):
    print(trial)
    _, noise = hybrid_noise_test(train_x, train_y, run_kmeans, subsample_rate, eta=1e-6,
            num_classes = num_classes, max_mi=0.5, rebalance=False, num_trials = 128)
    all_noise.append(np.linalg.norm(list(noise.values())))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [30]:
# NOTE(review): this cell's execution count (30) is out of sequence with its
# neighbours, and its recorded output is identical to the one recorded for the
# 128-run fit_forest experiment further down. It therefore appears to show
# all_noise from that later loop rather than from the k-means loop above —
# re-run the notebook top to bottom to confirm the k-means range.
min(all_noise), max(all_noise)

(0.3916633607536901, 0.9770152339608571)

In [21]:
# NOTE(review): ad-hoc arithmetic; the origin of 0.13 and 0.48 is not shown in
# the visible cells and the result is not assigned to anything.
0.13/0.48

0.27083333333333337

In [22]:
# Settings for the fit_forest experiments below; passed to hybrid_noise_test
# as num_trees, tree_depth and regularize respectively.
num_trees = 1 
tree_depth = 3
# Forwarded unchanged to the mechanism as its last positional argument; the
# meaning of the three entries is defined by the mechanism, not visible here.
reg = (None, 0.0, 1.0)

In [23]:
# fit_forest, 100 repetitions, each noise estimate built from only 2 mechanism
# runs (num_trials=2 means one shuffle, both halves). all_noise is reset and
# refilled with the Euclidean norm of each noise vector.
# eta is a required argument of hybrid_noise_test but is not read by it.
all_noise = []
for _ in range(100):
    _, noise = hybrid_noise_test(train_x, train_y, fit_forest, subsample_rate, eta=None,
            num_classes = num_classes, max_mi=0.5, num_trials = 2, regularize=reg,
            num_trees = num_trees, tree_depth=tree_depth)
    all_noise.append(np.linalg.norm(list(noise.values())))

In [24]:
# Displays whatever `noise` currently holds — if the cells ran in order, the
# per-coordinate noise dict from the final repetition of the preceding loop.
noise

{0: 0.049617584070285776,
 1: 0.17458989254317545,
 2: 0.0,
 3: 0.004117725584330527,
 4: 0.0,
 5: 0.029945074500887625,
 6: 0.0}

In [25]:
# Smallest and largest noise-vector norm currently held in all_noise.
min(all_noise), max(all_noise)

(0.0819168649232067, 2.912128541338001)

In [26]:
# Same fit_forest experiment as with num_trials=2, but each of the 100 noise
# estimates now uses 128 mechanism runs (64 shuffles, both halves each).
# print(trial) emits one progress line per repetition.
# eta is a required argument of hybrid_noise_test but is not read by it.
all_noise = []
for trial in range(100):
    print(trial)
    _, noise = hybrid_noise_test(train_x, train_y, fit_forest, subsample_rate, eta=None,
            num_classes = num_classes, max_mi=0.5, num_trials = 128, regularize=reg,
            num_trees = num_trees, tree_depth=tree_depth)
    all_noise.append(np.linalg.norm(list(noise.values())))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [27]:
# Smallest and largest noise-vector norm currently held in all_noise.
min(all_noise), max(all_noise)

(0.3916633607536901, 0.9770152339608571)

In [28]:
# One more 128-run fit_forest estimate, kept under its own name so it is not
# overwritten by the loops that reuse `noise`. No scaling is applied in this
# cell; the value is exactly what hybrid_noise_test returns.
_, scaled_noise = hybrid_noise_test(train_x, train_y, fit_forest, subsample_rate, eta=None,
            num_classes = num_classes, max_mi=0.5, num_trials = 128, regularize=reg,
            num_trees = num_trees, tree_depth=tree_depth)