In [55]:
from algs_lib import *

In [56]:
def clip_to_threshold(vec, c):
    curr_norm = np.linalg.norm(vec)
    if curr_norm <= c:
        return vec
    clip_ratio = c / curr_norm
    return [vec[i]*clip_ratio for i in range(len(vec))]

In [57]:
def add_noise(scale):
    return np.random.laplace(scale)
# global sensitivity is C/n i think?
# so scale should be (C/n) / \epsilon per elem?

In [58]:
def calc_posterior(mi, prior=0.5, prec = 100000):
    test_vals = [x / prec for x in range(1, prec)]
    max_t = None
    for t in test_vals:
        if t*np.log(t/prior)+(1-t)*np.log((1-t)/(1-prior)) <= mi:
            if  max_t is None or t > max_t:
                max_t = t
    return max_t

def dp_epsilon_to_posterior_success(epsilon):
    return 1 - 1./(1+np.exp(epsilon))

def dp_ps_to_epsilon(ps):
    return np.log(ps / (1-ps))

In [150]:
def hybrid_noise_auto(train_x, train_y, subsample_rate, num_classes,
    eta, regularize=None, num_trees=None, tree_depth = None, max_mi = 0.5, num_dims = None):

    sec_v = max_mi / 2
    sec_beta = max_mi - sec_v
    r = calc_r(train_x)
    gamma = 0.01
    avg_dist = 0.
    curr_est = None
    converged = False
    curr_trial = 0

    if num_classes is None:
        num_classes = len(set(train_y))

    assert subsample_rate >= num_classes

    est_y = {}
    prev_ests = None
    # 10*c*v
    seed = np.random.randint(1, 100000)

    s1 = None # only relevant for PCA
    
    while not converged:
        shuffled_x, shuffled_y = shuffle(train_x, train_y)
        
        shuffled_x, shuffled_y = get_samples_safe(shuffled_x, shuffled_y, num_classes, subsample_rate)
        
        output = np.average(shuffled_x, axis=0)

        for ind in range(len(output)):
            if ind not in est_y:
                est_y[ind] = []
            est_y[ind].append(output[ind])

        if curr_trial % 10 == 0:        
            if prev_ests is None:
                prev_ests = {}
                for ind in est_y:
                    prev_ests[ind] = np.var(est_y[ind])
            else:
                converged = True
                for ind in est_y:
                    if abs(np.var(est_y[ind]) - prev_ests[ind]) > eta:
                        converged = False
                if not converged:
                    for ind in est_y:
                        prev_ests[ind] = np.var(est_y[ind])
        curr_trial += 1
    fin_var = {ind: np.var(est_y[ind]) for ind in est_y}

    noise = {}
    sqrt_total_var = sum([fin_var[x]**0.5 for x in fin_var])
    for ind in fin_var:
        noise[ind] = 1./(2*max_mi) * fin_var[ind]**0.5 * sqrt_total_var
    return noise

In [137]:
train_x, train_y, test_x, test_y, num_classes, train_len = gen_iris()

norms = [np.linalg.norm(x) for x in train_x]
# print(max(norms))

In [138]:
true_mean = np.average(train_x, axis=0)

mi_range = [1.0, 0.25, 0.0625, 0.015625]
posterior_success_rates = [calc_posterior(mi) for mi in mi_range]
epsilon_vals = [dp_ps_to_epsilon(ps) for ps in posterior_success_rates]

print(epsilon_vals)
print([x for x in posterior_success_rates])

[11.51291546492478, 1.6426117097961406, 0.7304317044395013, 0.3563228120191924]
[0.99999, 0.83789, 0.6749, 0.58815]


In [144]:
# DP MEAN
dp_dists = {}
num_trials = 1000

for eps in epsilon_vals:
    avg_dist_dp = {}
    for i in range(1, 220):
        clip_budget = i / 20. # 0.05 
        clipped_train_x = [clip_to_threshold(train_x[i], clip_budget) for i in range(len(train_x))]
        released_mean = np.average(clipped_train_x, axis=0)
        clip_dist = np.linalg.norm(released_mean - true_mean)
        dist = 0.
        for _ in range(num_trials):
            released_mean = np.average(clipped_train_x, axis=0)
            for ind in range(len(released_mean)):
                sensitivity = clip_budget / train_len 
                released_mean[ind] += add_noise(sensitivity / eps)
            dist += np.linalg.norm(released_mean - true_mean)
        dist /= num_trials
        avg_dist_dp[i] = (clip_dist, dist)
    dp_key = min(avg_dist_dp.items(), key=lambda x: x[1][1])[0]
    dp_dists[eps] = avg_dist_dp[dp_key]


In [145]:
dp_dists

{11.51291546492478: (0.005589453936607491, 2.414644707168465),
 1.6426117097961406: (0.08284865477957871, 2.404570812928289),
 0.7304317044395013: (0.0, 2.4319473328763332),
 0.3563228120191924: (0.3379551420939025, 2.4432985634448583)}

In [151]:
# PAC MEAN
subsample_rate = int(0.5*train_len)

noise = hybrid_noise_auto(train_x, train_y, subsample_rate, num_classes, 1e-6)

pac_dists = {}
num_trials = 1000

for mi in mi_range:
    scaled_noise = {k: noise[k] * (0.5 / mi) for k in noise}
    iso_noise = max(scaled_noise.values())
    iso_scaled = {k: iso_noise for k in noise}
    avg_dist_pac = 0
    avg_iso_dist_pac = 0
    subsampled_dist = 0
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        subsampled_dist += np.linalg.norm(released_mean - true_mean)
        for ind in range(len(released_mean)):
            c = np.random.normal(0, scale=scaled_noise[ind])
            released_mean[ind] += c
        avg_dist_pac += np.linalg.norm(released_mean - true_mean)
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        for ind in range(len(released_mean)):
            c = np.random.normal(0, scale=iso_scaled[ind])
            released_mean[ind] += c
        avg_iso_dist_pac += np.linalg.norm(released_mean - true_mean)
    avg_iso_dist_pac /= num_trials
    avg_dist_pac /= num_trials
    subsampled_dist /= num_trials
    pac_dists[mi] = (subsampled_dist, avg_dist_pac, avg_iso_dist_pac)

In [152]:
noise

{0: 0.028578130875505172,
 1: 0.016058658559470425,
 2: 0.06149385186092042,
 3: 0.027026669637757513}

In [28]:
3.69/2.17

1.7004608294930876

In [13]:
train_x, train_y, test_x, test_y, num_classes, train_len = gen_bean(normalize=True)

In [14]:
true_mean = np.average(train_x, axis=0)

norms = [np.linalg.norm(x) for x in train_x]
print(max(norms))

3.033332638023896


In [15]:
true_mean

array([0.13949812, 0.22676071, 0.24652181, 0.23625196, 0.39830925,
       0.76938283, 0.1365661 , 0.22534371, 0.62571808, 0.89973101,
       0.76502333, 0.45814466, 0.49316895, 0.36997741, 0.41194519,
       0.9095871 ])

In [16]:
# DP MEAN
dp_dists = {}
num_trials = 1000

for eps in epsilon_vals:
    avg_dist_dp = {}
    for i in range(1, 62):
        clip_budget = i / 20.
        clipped_train_x = [clip_to_threshold(train_x[i], clip_budget) for i in range(len(train_x))]
        released_mean = np.average(clipped_train_x, axis=0)
        clip_dist = np.linalg.norm(released_mean - true_mean)
        dist = 0.
        for _ in range(num_trials):
            released_mean = np.average(clipped_train_x, axis=0)
            for ind in range(len(released_mean)):
                sensitivity = clip_budget / train_len 
                released_mean[ind] += add_noise(sensitivity / eps)
            dist += np.linalg.norm(released_mean - true_mean)
        dist /= num_trials
        avg_dist_dp[i] = (clip_dist, dist)
    dp_key = min(avg_dist_dp.items(), key=lambda x: x[1][1])[0]
    dp_dists[eps] = avg_dist_dp[dp_key]


In [17]:
dp_dists

{11.51291546492478: (0.00011472786248996413, 5.397195637416383),
 1.6426117097961406: (0.022664005320329528, 5.396575923514626),
 0.7304317044395013: (0.40049136963183046, 5.364796508769275),
 0.3563228120191924: (0.44875470877922397, 5.358119756917312)}

In [18]:
# PAC MEAN
subsample_rate = int(0.5*train_len)

noise = hybrid_noise_auto(train_x, train_y, subsample_rate, num_classes, 1e-6)

pac_dists = {}
num_trials = 1000

for mi in mi_range:
    scaled_noise = {k: noise[k] * (0.5 / mi) for k in noise}
    iso_noise = max(scaled_noise.values())
    iso_scaled = {k: iso_noise for k in noise}
    avg_dist_pac = 0
    avg_iso_dist_pac = 0
    subsampled_dist = 0
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        subsampled_dist += np.linalg.norm(released_mean - true_mean)
        for ind in range(len(released_mean)):
            c = np.random.normal(0, scale=scaled_noise[ind])
            released_mean[ind] += c
        avg_dist_pac += np.linalg.norm(released_mean - true_mean)
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        for ind in range(len(released_mean)):
            c = np.random.normal(0, scale=iso_scaled[ind])
            released_mean[ind] += c
        avg_iso_dist_pac += np.linalg.norm(released_mean - true_mean)
    avg_iso_dist_pac /= num_trials
    avg_dist_pac /= num_trials
    subsampled_dist /= num_trials
    pac_dists[mi] = (subsampled_dist, avg_dist_pac, avg_iso_dist_pac)

In [19]:
pac_dists

{1.0: (0.005402634291839429, 0.005402545268518531, 0.0053194949322241175),
 0.25: (0.005319329253503482, 0.0053230113150411155, 0.005361969629488425),
 0.0625: (0.005355829056043277, 0.005413356185585519, 0.005501557336572924),
 0.015625: (0.005408692491632391, 0.006300209931213997, 0.006706026642238748)}

In [20]:
# RICE

In [21]:
train_x, train_y, test_x, test_y, num_classes, train_len = gen_rice(normalize=True)

In [22]:
true_mean = np.average(train_x, axis=0)

norms = [np.linalg.norm(x) for x in train_x]
print(max(norms))

2.3296929410457268


In [23]:
true_mean

array([0.44882084, 0.50127565, 0.46280648, 0.55699467, 0.64165913,
       0.45820917, 0.454011  ])

In [24]:
# DP MEAN
dp_dists = {}
num_trials = 1000

for eps in epsilon_vals:
    avg_dist_dp = {}
    for i in range(1, 48):
        clip_budget = i / 20.
        clipped_train_x = [clip_to_threshold(train_x[i], clip_budget) for i in range(len(train_x))]
        released_mean = np.average(clipped_train_x, axis=0)
        clip_dist = np.linalg.norm(released_mean - true_mean)
        dist = 0.
        for _ in range(num_trials):
            released_mean = np.average(clipped_train_x, axis=0)
            for ind in range(len(released_mean)):
                sensitivity = clip_budget / train_len 
                released_mean[ind] += add_noise(sensitivity / eps)
            dist += np.linalg.norm(released_mean - true_mean)
        dist /= num_trials
        avg_dist_dp[i] = (clip_dist, dist)
    dp_key = min(avg_dist_dp.items(), key=lambda x: x[1][1])[0]
    dp_dists[eps] = avg_dist_dp[dp_key]


In [25]:
dp_dists

{11.51291546492478: (0.3776770756195068, 3.3823852954087963),
 1.6426117097961406: (0.2194404456981254, 3.3524872064037132),
 0.7304317044395013: (0.29280740534432015, 3.3836859246706905),
 0.3563228120191924: (0.11230215784580518, 3.3981163471357414)}

In [26]:
# PAC MEAN
subsample_rate = int(0.5*train_len)

noise = hybrid_noise_auto(train_x, train_y, subsample_rate, num_classes, 1e-6)

pac_dists = {}
num_trials = 1000

for mi in mi_range:
    scaled_noise = {k: noise[k] * (0.5 / mi) for k in noise}
    iso_noise = max(scaled_noise.values())
    iso_scaled = {k: iso_noise for k in noise}
    avg_dist_pac = 0
    avg_iso_dist_pac = 0
    subsampled_dist = 0
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        subsampled_dist += np.linalg.norm(released_mean - true_mean)
        for ind in range(len(released_mean)):
            c = np.random.normal(0, scale=scaled_noise[ind])
            released_mean[ind] += c
        avg_dist_pac += np.linalg.norm(released_mean - true_mean)
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        for ind in range(len(released_mean)):
            c = np.random.normal(0, scale=iso_scaled[ind])
            released_mean[ind] += c
        avg_iso_dist_pac += np.linalg.norm(released_mean - true_mean)
    avg_iso_dist_pac /= num_trials
    avg_dist_pac /= num_trials
    subsampled_dist /= num_trials
    pac_dists[mi] = (subsampled_dist, avg_dist_pac, avg_iso_dist_pac)

In [27]:
pac_dists

{1.0: (0.007543825045711737, 0.007543184221990362, 0.007572223452566617),
 0.25: (0.00763998912500602, 0.007645582684431556, 0.007631803548210275),
 0.0625: (0.00746459681578852, 0.007598925465874449, 0.007985333967967552),
 0.015625: (0.007352115992608959, 0.009083210939745826, 0.010733538914307353)}