In [1]:
from algs_lib import *


In [2]:
def clip_to_threshold(vec, c):
    """Rescale ``vec`` so that its Euclidean (L2) norm does not exceed ``c``.

    Args:
        vec: 1-D sequence or array of numbers.
        c: Maximum allowed L2 norm.

    Returns:
        A ``numpy.ndarray``. When the norm of ``vec`` is already ``<= c`` the
        values are returned unchanged (an ndarray input is returned as-is, not
        copied); otherwise the result is ``vec * (c / norm)``.
    """
    arr = np.asarray(vec)
    curr_norm = np.linalg.norm(arr)
    if curr_norm <= c:
        return arr
    # Vectorised rescale. The previous element-by-element list build made the
    # return type depend on whether clipping happened (list vs. input object).
    return arr * (c / curr_norm)

In [3]:
def add_noise(scale):
    """Draw a single zero-centred Laplace sample with the given ``scale``."""
    sample = np.random.laplace(loc=0, scale=scale)
    return sample
# Open question: is the global sensitivity of the clipped mean C/n?
# If so, the per-element Laplace scale would be (C/n) / \epsilon.

In [4]:
def calc_posterior(mi, prior=0.5, prec = 100000):
    """Largest grid value ``t`` whose Bernoulli KL divergence from ``prior`` is ``<= mi``.

    The grid is ``{1/prec, 2/prec, ..., (prec-1)/prec}`` and the quantity
    compared against ``mi`` is
    ``t*log(t/prior) + (1-t)*log((1-t)/(1-prior))``.

    Args:
        mi: Upper bound on the divergence.
        prior: Reference probability; must lie strictly between 0 and 1.
        prec: Grid resolution (number of subdivisions of the unit interval).

    Returns:
        The largest qualifying grid value as a Python ``float``, or ``None``
        when no grid value qualifies (including when the grid is empty).

    Raises:
        ValueError: if ``prior`` is not strictly between 0 and 1, because the
            divergence is undefined there.
    """
    if not 0 < prior < 1:
        raise ValueError("prior must lie strictly between 0 and 1")

    # Evaluate the whole grid at once instead of a Python loop with two scalar
    # np.log calls per grid point.
    grid = np.arange(1, prec) / prec
    divergence = grid * np.log(grid / prior) + (1 - grid) * np.log((1 - grid) / (1 - prior))
    feasible = np.flatnonzero(divergence <= mi)
    if feasible.size == 0:
        return None
    # The grid is ascending, so the last feasible index is the maximum t.
    return float(grid[feasible[-1]])

def dp_epsilon_to_posterior_success(epsilon):
    """Map ``epsilon`` to ``1 - 1 / (1 + exp(epsilon))``."""
    denominator = 1 + np.exp(epsilon)
    return 1 - 1. / denominator

def dp_ps_to_epsilon(ps):
    """Map a success probability ``ps`` to its log-odds ``log(ps / (1 - ps))``."""
    odds = ps / (1 - ps)
    return np.log(odds)

In [5]:
# Sanity check: calc_posterior with mi = 1 and the default prior / precision.
calc_posterior(1.)

0.99999

In [6]:
def hybrid_noise_mean(train_x, train_y, subsample_rate, num_classes,
    eta, regularize=None, num_trees=None, tree_depth = None, max_mi = 0.5, num_dims = None):
    """Estimate a per-coordinate noise level for releasing a subsampled mean.

    Each trial shuffles the data, draws a subsample through
    ``get_samples_safe`` and records the coordinate-wise mean of that
    subsample. Every 10 trials the per-coordinate variance of all recorded
    means is compared with the value from the previous check; sampling stops
    once no coordinate's variance changed by more than ``eta``.

    Args:
        train_x: Samples; assumed to be 2-D (rows = samples) since the
            column-wise average is iterated over -- TODO confirm.
        train_y: Labels aligned with ``train_x``.
        subsample_rate: Forwarded to ``get_samples_safe``; must be
            ``>= num_classes``.
        num_classes: Number of classes; inferred from ``train_y`` when ``None``.
        eta: Convergence tolerance on the change of each coordinate's variance.
        regularize, num_trees, tree_depth, num_dims: Unused here; kept so the
            signature stays unchanged for existing callers.
        max_mi: Divisor in the noise formula (the name suggests a
            mutual-information budget).

    Returns:
        dict mapping coordinate index ``i`` to
        ``sqrt(var_i) * sum_j sqrt(var_j) / (2 * max_mi)``.

    Raises:
        AssertionError: if ``subsample_rate < num_classes``.
    """
    if num_classes is None:
        num_classes = len(set(train_y))

    assert subsample_rate >= num_classes

    # Unused locals from the earlier version were dropped, including a
    # calc_r(train_x) call and an np.random.randint draw whose results were
    # never read.
    means = []        # one subsample mean (1-D array) per trial
    prev_var = None   # per-coordinate variance at the previous check
    converged = False
    curr_trial = 0

    while not converged:
        shuffled_x, shuffled_y = shuffle(train_x, train_y)
        shuffled_x, shuffled_y = get_samples_safe(shuffled_x, shuffled_y, num_classes, subsample_rate)
        means.append(np.average(shuffled_x, axis=0))

        if curr_trial % 10 == 0:
            if curr_trial % 100 == 0:
                print(f'curr trial is {curr_trial}')
            curr_var = np.var(means, axis=0)
            # The first check only records a baseline. Later checks converge
            # when no coordinate moved by more than eta.
            converged = prev_var is not None and not np.any(np.abs(curr_var - prev_var) > eta)
            prev_var = curr_var
        curr_trial += 1

    fin_std = np.sqrt(np.var(means, axis=0))
    sqrt_total_var = fin_std.sum()
    return {ind: 1./(2*max_mi) * std * sqrt_total_var for ind, std in enumerate(fin_std)}


def hybrid_noise_auto_ind(train_x, train_y, subsample_rate, num_classes,
    eta, regularize=None, num_trees=None, tree_depth = None, max_mi = 0.5, num_dims = None):
    """Per-record variant of the noise estimate, maximised over all records.

    For every record ``rec`` in ``train_x``: the record is removed, subsamples
    of the remaining data are drawn through ``get_samples_safe``, and for each
    subsample the squared per-coordinate difference between the subsample mean
    and the mean after overwriting the subsample's first row with the held-out
    record is stored. Sampling for a record stops when, at a check made every
    10 trials, no coordinate's variance of those squared differences changed
    by more than ``eta``. The per-coordinate noise
    ``sqrt(var_i) * sum_j sqrt(var_j) / (2 * max_mi)`` is then computed and the
    maximum over all records is kept for each coordinate.

    Fixes relative to the earlier version:
      * convergence state (trial counter, samples, previous variances) is now
        reset for every record; previously it carried over, so the sampling
        loop only ever ran for the first record;
      * the feature loops no longer reuse the record loop's variable, which
        used to make later trials read ``train_x[<last feature index>]``.

    Args:
        train_x: Samples; assumed 2-D (rows = samples) -- TODO confirm.
        train_y: Labels aligned with ``train_x``.
        subsample_rate: Forwarded to ``get_samples_safe``; must be
            ``>= num_classes``.
        num_classes: Number of classes; inferred from ``train_y`` when ``None``.
        eta: Convergence tolerance on the change of each coordinate's variance.
        regularize, num_trees, tree_depth, num_dims: Unused; kept for
            signature compatibility.
        max_mi: Divisor in the noise formula.

    Returns:
        dict mapping coordinate index to the largest noise value seen across
        all held-out records (empty when ``train_x`` is empty).

    Raises:
        AssertionError: if ``subsample_rate < num_classes``.
    """
    if num_classes is None:
        num_classes = len(set(train_y))

    assert subsample_rate >= num_classes

    max_noises = {}

    for rec in range(len(train_x)):
        print(f"ind = {rec}")
        removed_train_x = np.delete(train_x, rec, 0)
        removed_train_y = np.delete(train_y, rec, 0)

        # Fresh convergence state for this record.
        sq_diffs = []
        prev_var = None
        converged = False
        curr_trial = 0

        while not converged:
            shuffled_x, shuffled_y = shuffle(removed_train_x, removed_train_y)
            shuffled_x, shuffled_y = get_samples_safe(shuffled_x, shuffled_y, num_classes, subsample_rate)

            # Same subsample with its first row replaced by the held-out record.
            added_x = copy.deepcopy(shuffled_x)
            added_x[0] = train_x[rec]

            output_orig = np.average(shuffled_x, axis=0)
            output_new = np.average(added_x, axis=0)
            sq_diffs.append((output_orig - output_new)**2)

            if curr_trial % 10 == 0:
                curr_var = np.var(sq_diffs, axis=0)
                # First check records a baseline; later checks compare to it.
                converged = prev_var is not None and not np.any(np.abs(curr_var - prev_var) > eta)
                prev_var = curr_var
            curr_trial += 1

        fin_std = np.sqrt(np.var(sq_diffs, axis=0))
        sqrt_total_var = fin_std.sum()
        for dim, std in enumerate(fin_std):
            noise = 1./(2*max_mi) * std * sqrt_total_var
            if dim not in max_noises or max_noises[dim] < noise:
                max_noises[dim] = noise
    return max_noises

In [140]:
# Scratch check of np.delete along axis 0: drop row 0 from a 5-row slice.
# A separate name is used so the full train_x is not truncated for any cell
# that runs afterwards.
demo_x = train_x[:5]
print(demo_x)
print(np.delete(demo_x, 0, 0))

[[0.41666667 0.33333333 0.69491525 0.95833333]
 [0.47222222 0.08333333 0.50847458 0.375     ]
 [0.33333333 0.91666667 0.06779661 0.04166667]
 [0.83333333 0.375      0.89830508 0.70833333]
 [0.19444444 0.58333333 0.08474576 0.04166667]]
[[0.47222222 0.08333333 0.50847458 0.375     ]
 [0.33333333 0.91666667 0.06779661 0.04166667]
 [0.83333333 0.375      0.89830508 0.70833333]
 [0.19444444 0.58333333 0.08474576 0.04166667]]


In [142]:
train_x, train_y, test_x, test_y, num_classes, train_len = gen_iris(normalize=True)

# subsample_rate was not defined by any cell above this one, so the cell only
# worked with leftover kernel state. Half of the training set is the value the
# PAC MEAN cells of this notebook use -- NOTE(review): confirm that is the
# value this result was produced with.
subsample_rate = int(0.5*train_len)

# Per-row L2 norms; the maximum is printed below.
norms = [np.linalg.norm(x) for x in train_x]
orig_noise = hybrid_noise_mean(train_x, train_y, subsample_rate, num_classes, 1e-6)

print(max(norms))
print(orig_noise)

1.6853970684247657
{0: 0.002289270346102352, 1: 0.0018726066562047249, 2: 0.0030400308880141748, 3: 0.0032905573978312327}


In [11]:
# Column-wise mean of the full training set, used as the reference point.
true_mean = np.average(train_x, axis=0)

# Budgets to evaluate, the matching calc_posterior values, and the epsilon
# obtained from each of those through dp_ps_to_epsilon.
mi_range = [0.25, 1/16., 0.015625]
posterior_success_rates = list(map(calc_posterior, mi_range))
epsilon_vals = list(map(dp_ps_to_epsilon, posterior_success_rates))

print(epsilon_vals)
print(posterior_success_rates)

[1.6426117097961406, 0.7304317044395013, 0.3563228120191924]
[0.83789, 0.6749, 0.58815]


In [144]:
# DP MEAN
# For every epsilon, sweep the clipping budget C = i/10 (i = 1..16), release the
# clipped mean with per-coordinate Laplace noise, and keep the budget with the
# lowest average distance to true_mean.
dp_dists = {}
num_trials = 1000

# Clipping and the clipped mean depend only on the budget, so they are computed
# once per budget instead of once per (epsilon, budget, trial).
clip_budgets = {i: i / 10 for i in range(1, 17)}
clipped_means = {}
for i, clip_budget in clip_budgets.items():
    clipped_train_x = [clip_to_threshold(row, clip_budget) for row in train_x]
    clipped_means[i] = np.average(clipped_train_x, axis=0)

for eps in epsilon_vals:
    avg_dist_dp = {}
    for i, clip_budget in clip_budgets.items():
        clipped_mean = clipped_means[i]
        # Error introduced by clipping alone, before any noise is added.
        clip_dist = np.linalg.norm(clipped_mean - true_mean)
        # NOTE(review): the sensitivity is taken as C/n and applied to each
        # coordinate separately; the comment next to add_noise marks this as
        # an open question.
        sensitivity = clip_budget / train_len
        dist = 0.
        for _ in range(num_trials):
            released_mean = clipped_mean.copy()
            for ind in range(len(released_mean)):
                released_mean[ind] += add_noise(sensitivity/eps)
            dist += np.linalg.norm(released_mean - true_mean)
        dist /= num_trials
        avg_dist_dp[i] = (clip_dist, dist)
    # Budget index with the smallest average noisy distance.
    dp_key = min(avg_dist_dp.items(), key=lambda x: x[1][1])[0]
    print(dp_key)
    dp_dists[eps] = avg_dist_dp[dp_key]


15
14
12


In [145]:
dp_dists

{1.6426117097961406: (0.003760138063449314, 0.023091294506178354),
 0.7304317044395013: (0.01164606776365974, 0.04880799508990495),
 0.3563228120191924: (0.05011494917674307, 0.0956881524433286)}

In [146]:
from scipy import stats

In [168]:
# PAC MEAN
train_x, train_y, test_x, test_y, num_classes, train_len = gen_iris(normalize=True)
true_mean = np.average(train_x, axis=0)


norms = [np.linalg.norm(x) for x in train_x]

subsample_rate = int(0.5*train_len)

# Calibrate the noise once, before its first use. This call used to sit inside
# the loop below, after `noise` had already been read, so the first pass
# relied on a leftover kernel variable and every pass used the previous pass's
# calibration.
# NOTE(review): hybrid_noise_auto is not defined in this notebook (presumably
# it comes from algs_lib); hybrid_noise_mean accepts the same arguments --
# confirm which one is intended.
noise = hybrid_noise_auto(train_x, train_y, subsample_rate, num_classes, 1e-6)

pac_dists = {}
num_trials = 1000
for mi in mi_range:
    # Rescale the calibrated noise by 0.5 / mi for this budget.
    scaled_noise = {k: noise[k] * (0.5 / mi) for k in noise}
    # Isotropic alternative: every coordinate gets the largest scaled value.
    iso_noise = max(scaled_noise.values())
    iso_scaled = {k: iso_noise for k in noise}
    avg_dist_pac = 0
    avg_iso_dist_pac = 0
    subsampled_dist = 0
    # Pass 1: subsampling error alone, and with per-coordinate Gaussian noise.
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        subsampled_dist += np.linalg.norm(released_mean - true_mean)
        for ind in range(len(released_mean)):
            c = np.random.normal(0, scale=scaled_noise[ind])
            released_mean[ind] += c
        avg_dist_pac += np.linalg.norm(released_mean - true_mean)
    subsampled_dist /= num_trials
    print(f'subsampled_dist = {subsampled_dist}')
    # Pass 2: same experiment with the isotropic noise level.
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        for ind in range(len(released_mean)):
            c = np.random.normal(0, scale=iso_scaled[ind])
            released_mean[ind] += c
        avg_iso_dist_pac += np.linalg.norm(released_mean - true_mean)

    avg_iso_dist_pac /= num_trials
    avg_dist_pac /= num_trials
    print(avg_dist_pac)
    print('-----')
    pac_dists[mi] = (subsampled_dist, avg_dist_pac, avg_iso_dist_pac)
print(pac_dists)

subsampled_dist = 0.04369795223162045
0.044892926450963024
-----
subsampled_dist = 0.043053740909081324
0.05633762085921197
-----
subsampled_dist = 0.043495379785967965
0.15442332129575034
-----
{0.25: (0.04369795223162045, 0.044892926450963024, 0.04663600714844551), 0.0625: (0.043053740909081324, 0.05633762085921197, 0.06318522972963278), 0.015625: (0.043495379785967965, 0.15442332129575034, 0.19137743177062086)}


In [169]:
pac_dists

{0.25: (0.04369795223162045, 0.044892926450963024, 0.04663600714844551),
 0.0625: (0.043053740909081324, 0.05633762085921197, 0.06318522972963278),
 0.015625: (0.043495379785967965, 0.15442332129575034, 0.19137743177062086)}

In [7]:
# Load the bean dataset through gen_bean with normalize=True.
# NOTE(review): gen_bean is not defined in this notebook; presumably it comes from algs_lib.
train_x, train_y, test_x, test_y, num_classes, train_len = gen_bean(normalize=True)

In [8]:
# Column-wise mean of the full training set, used as the reference point.
true_mean = np.average(train_x, axis=0)

# Largest per-row L2 norm in the training set.
norms = list(map(np.linalg.norm, train_x))
print(max(norms))

3.0829053788327507


In [9]:
true_mean

array([0.14015323, 0.22728516, 0.24730455, 0.23645427, 0.39973723,
       0.76977418, 0.13719586, 0.22584614, 0.62437111, 0.90016561,
       0.7647052 , 0.45709353, 0.49354762, 0.36943584, 0.41102562,
       0.91006917])

In [12]:
# DP MEAN
# For every epsilon, sweep the clipping budget C = i/10 (i = 1..30), release the
# clipped mean with per-coordinate Laplace noise, and keep the budget with the
# lowest average distance to true_mean.
dp_dists = {}
num_trials = 1000

# Clipping and the clipped mean depend only on the budget, so they are computed
# once per budget instead of once per (epsilon, budget, trial).
clip_budgets = {i: i/10 for i in range(1, 31)}
clipped_means = {}
for i, clip_budget in clip_budgets.items():
    clipped_train_x = [clip_to_threshold(row, clip_budget) for row in train_x]
    clipped_means[i] = np.average(clipped_train_x, axis=0)

for eps in epsilon_vals:
    avg_dist_dp = {}
    for i, clip_budget in clip_budgets.items():
        clipped_mean = clipped_means[i]
        # Error introduced by clipping alone, before any noise is added.
        clip_dist = np.linalg.norm(clipped_mean - true_mean)
        # NOTE(review): the sensitivity is taken as C/n and applied to each
        # coordinate separately; the comment next to add_noise marks this as
        # an open question.
        sensitivity = clip_budget / train_len
        dist = 0.
        for _ in range(num_trials):
            released_mean = clipped_mean.copy()
            for ind in range(len(released_mean)):
                released_mean[ind] += add_noise(sensitivity / eps)
            dist += np.linalg.norm(released_mean - true_mean)
        dist /= num_trials
        avg_dist_dp[i] = (clip_dist, dist)
    # Budget index with the smallest average noisy distance.
    dp_key = min(avg_dist_dp.items(), key=lambda x: x[1][1])[0]
    dp_dists[eps] = avg_dist_dp[dp_key]


KeyboardInterrupt: 

In [153]:
dp_dists

{1.6426117097961406: (6.585062837225667e-05, 0.0010065313905155784),
 0.7304317044395013: (0.0002614561788240281, 0.0021905352978012884),
 0.3563228120191924: (0.000942316538515316, 0.004439134273849168)}

In [171]:
# The (clip_dist, dist) pairs in dp_dists, in insertion order.
list(dp_dists.values())

[(0.0019812757614805806, 0.002553987238147606),
 (0.0019812757614805806, 0.00399900130072505),
 (0.0019812757614805806, 0.007624172088701473)]

In [14]:
# PAC MEAN
# Measures the distance between a subsampled mean and true_mean three ways:
# without noise, with per-coordinate Gaussian noise, and with isotropic noise.

train_x, train_y, test_x, test_y, num_classes, train_len = gen_bean(normalize=True)
true_mean = np.average(train_x, axis=0)


subsample_rate = int(0.5*train_len)

# Calibrated with hybrid_noise_mean's default max_mi of 0.5.
noise = hybrid_noise_mean(train_x, train_y, subsample_rate, num_classes, 1e-6)
print(sum(noise.values()))
pac_dists = {}
num_trials = 1000

for mi in mi_range:
    # Rescale the calibrated noise by 0.5 / mi for this budget.
    scaled_noise = {k: level * (0.5 / mi) for k, level in noise.items()}
    # Isotropic alternative: every coordinate gets the largest scaled value.
    iso_noise = max(scaled_noise.values())
    iso_scaled = dict.fromkeys(noise, iso_noise)
    avg_dist_pac = 0
    avg_iso_dist_pac = 0
    subsampled_dist = 0
    # Pass 1: subsampling error alone, then with per-coordinate noise added.
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        subsampled_dist += np.linalg.norm(released_mean - true_mean)
        for ind, _coord in enumerate(released_mean):
            c = np.random.normal(0, scale=scaled_noise[ind])
            released_mean[ind] += c
        avg_dist_pac += np.linalg.norm(released_mean - true_mean)
    # Pass 2: fresh subsamples with the isotropic noise level.
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        for ind, _coord in enumerate(released_mean):
            c = np.random.normal(0, scale=iso_scaled[ind])
            released_mean[ind] += c
        avg_iso_dist_pac += np.linalg.norm(released_mean - true_mean)
    # Turn the accumulated sums into per-trial averages.
    avg_iso_dist_pac /= num_trials
    avg_dist_pac /= num_trials
    subsampled_dist /= num_trials
    pac_dists[mi] = (subsampled_dist, avg_dist_pac, avg_iso_dist_pac)

curr trial is 0
0.0005653131897607153


In [174]:
pac_dists

{0.25: (0.005492895735124468, 0.005504368899810956, 0.005600894228118885),
 0.0625: (0.00539451708927943, 0.005548361536118546, 0.005655970716286265),
 0.015625: (0.005417806532302793, 0.007328477886119223, 0.00892565593821365)}

In [157]:
# # PAC MEAN (disabled cell: per-record variant calibrated with hybrid_noise_auto_ind)
# subsample_rate = int(0.5*train_len)

# noise = hybrid_noise_auto_ind(train_x, train_y, subsample_rate, num_classes, 1e-6)

# pac_dists = {}
# num_trials = 1000

# for mi in mi_range:
#     scaled_noise = {k: noise[k] * (0.5 / mi) for k in noise}
#     iso_noise = max(scaled_noise.values())
#     iso_scaled = {k: iso_noise for k in noise}
#     avg_dist_pac = 0
#     avg_iso_dist_pac = 0
#     subsampled_dist = 0
#     for _ in range(num_trials):
#         shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
#         shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
#         released_mean = np.average(shuffled_x1, axis=0)
#         subsampled_dist += np.linalg.norm(released_mean - true_mean)
#         for ind in range(len(released_mean)):
#             c = np.random.normal(0, scale=scaled_noise[ind])
#             released_mean[ind] += c
#         avg_dist_pac += np.linalg.norm(released_mean - true_mean)
#     for _ in range(num_trials):
#         shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
#         shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
#         released_mean = np.average(shuffled_x1, axis=0)
#         for ind in range(len(released_mean)):
#             c = np.random.normal(0, scale=iso_scaled[ind])
#             released_mean[ind] += c
#         avg_iso_dist_pac += np.linalg.norm(released_mean - true_mean)
#     avg_iso_dist_pac /= num_trials
#     avg_dist_pac /= num_trials
#     subsampled_dist /= num_trials
#     pac_dists[mi] = (subsampled_dist, avg_dist_pac, avg_iso_dist_pac)

In [175]:
pac_dists

{0.25: (0.005492895735124468, 0.005504368899810956, 0.005600894228118885),
 0.0625: (0.00539451708927943, 0.005548361536118546, 0.005655970716286265),
 0.015625: (0.005417806532302793, 0.007328477886119223, 0.00892565593821365)}

In [176]:
# RICE

In [177]:
# Load the rice dataset through gen_rice with normalize=True.
# NOTE(review): gen_rice is not defined in this notebook; presumably it comes from algs_lib.
train_x, train_y, test_x, test_y, num_classes, train_len = gen_rice(normalize=True)

In [178]:
# Column-wise mean of the full training set, used as the reference point.
true_mean = np.average(train_x, axis=0)

# Largest per-row L2 norm in the training set.
norms = list(map(np.linalg.norm, train_x))
print(max(norms))

2.399557199470374


In [179]:
true_mean

array([0.45223512, 0.50394861, 0.46521187, 0.5597932 , 0.64152809,
       0.46161908, 0.45263566])

In [180]:
# DP MEAN
# For every epsilon, try the clipping budgets C = 1 and C = 2, release the
# clipped mean with per-coordinate Laplace noise, and keep the budget with the
# lowest average distance to true_mean.
dp_dists = {}
num_trials = 1000

# Clipping and the clipped mean depend only on the budget, so they are computed
# once per budget instead of once per (epsilon, budget, trial).
clip_budgets = {i: i for i in range(1, 3)}
clipped_means = {}
for i, clip_budget in clip_budgets.items():
    clipped_train_x = [clip_to_threshold(row, clip_budget) for row in train_x]
    clipped_means[i] = np.average(clipped_train_x, axis=0)

for eps in epsilon_vals:
    avg_dist_dp = {}
    for i, clip_budget in clip_budgets.items():
        clipped_mean = clipped_means[i]
        # Error introduced by clipping alone, before any noise is added.
        clip_dist = np.linalg.norm(clipped_mean - true_mean)
        # NOTE(review): the sensitivity is taken as C/n and applied to each
        # coordinate separately; the comment next to add_noise marks this as
        # an open question.
        sensitivity = clip_budget / train_len
        dist = 0.
        for _ in range(num_trials):
            released_mean = clipped_mean.copy()
            for ind in range(len(released_mean)):
                released_mean[ind] += add_noise(sensitivity / eps)
            dist += np.linalg.norm(released_mean - true_mean)
        dist /= num_trials
        avg_dist_dp[i] = (clip_dist, dist)
    # Budget index with the smallest average noisy distance.
    dp_key = min(avg_dist_dp.items(), key=lambda x: x[1][1])[0]
    dp_dists[eps] = avg_dist_dp[dp_key]


In [181]:
dp_dists

{1.6426117097961406: (0.0016312321401344787, 0.002292359980503194),
 0.7304317044395013: (0.0016312321401344787, 0.0038929025067448103),
 0.3563228120191924: (0.0016312321401344787, 0.007434481194003592)}

In [182]:
# PAC MEAN
# Measures the distance between a subsampled mean and true_mean three ways:
# without noise, with per-coordinate Gaussian noise, and with isotropic noise.

train_x, train_y, test_x, test_y, num_classes, train_len = gen_rice(normalize=True)
true_mean = np.average(train_x, axis=0)
subsample_rate = int(0.5*train_len)


# NOTE(review): hybrid_noise_auto is not defined in this notebook; presumably
# it comes from algs_lib.
noise = hybrid_noise_auto(train_x, train_y, subsample_rate, num_classes, 1e-6)

pac_dists = {}
num_trials = 1000

for mi in mi_range:
    # Rescale the calibrated noise by 0.5 / mi for this budget.
    scaled_noise = {k: level * (0.5 / mi) for k, level in noise.items()}
    # Isotropic alternative: every coordinate gets the largest scaled value.
    iso_noise = max(scaled_noise.values())
    iso_scaled = dict.fromkeys(noise, iso_noise)
    avg_dist_pac = 0
    avg_iso_dist_pac = 0
    subsampled_dist = 0
    # Pass 1: subsampling error alone, then with per-coordinate noise added.
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        subsampled_dist += np.linalg.norm(released_mean - true_mean)
        for ind, _coord in enumerate(released_mean):
            c = np.random.normal(0, scale=scaled_noise[ind])
            released_mean[ind] += c
        avg_dist_pac += np.linalg.norm(released_mean - true_mean)
    # Pass 2: fresh subsamples with the isotropic noise level.
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        for ind, _coord in enumerate(released_mean):
            c = np.random.normal(0, scale=iso_scaled[ind])
            released_mean[ind] += c
        avg_iso_dist_pac += np.linalg.norm(released_mean - true_mean)
    # Turn the accumulated sums into per-trial averages.
    avg_iso_dist_pac /= num_trials
    avg_dist_pac /= num_trials
    subsampled_dist /= num_trials
    pac_dists[mi] = (subsampled_dist, avg_dist_pac, avg_iso_dist_pac)

In [183]:
pac_dists

{0.25: (0.007619588243238379, 0.007626673663862336, 0.007452167877120045),
 0.0625: (0.007601176162287781, 0.007683088037882145, 0.00797367638558806),
 0.015625: (0.0075220431904354824, 0.008809859813561569, 0.010569859683508558)}

In [184]:
# CIFAR-10

In [212]:
def gen_cifar10(normalize=False):
    """Load CIFAR-10 from the pickled files under ``cifar-10-batches-py/``.

    The five ``data_batch_*`` files form the training split and ``test_batch``
    forms the test split.

    Fixes relative to the earlier version:
      * the test split is now read from ``test_batch``; previously the last
        training batch was unpickled a second time and appended, so the
        "test" data duplicated training data;
      * ``normalize=False`` now works (the features are converted to an array
        before the 2-D slicing).

    Args:
        normalize: When true, min-max scale every feature column. The scaler
            is fitted on training and test rows together, as before.
            NOTE(review): that lets test statistics influence the training
            features -- confirm this is intended.

    Returns:
        ``(train_x, train_y, test_x, test_y, num_classes, train_len)`` where
        ``num_classes`` is 10 and ``train_len`` is the number of training rows.
    """
    # NOTE(review): `unpickle` is defined outside this cell; if it wraps
    # pickle.load, only feed it trusted files.
    fnames = ['cifar-10-batches-py/data_batch_{}'.format(i) for i in range(1, 6)]
    all_x = []
    all_y = []
    for f in fnames:
        data = unpickle(f)
        all_x.extend(data[b'data'])
        all_y.extend(data[b'labels'])
    # Everything appended so far is training data; remember where it ends.
    num_train = len(all_x)

    test_data = unpickle('cifar-10-batches-py/test_batch')
    all_x.extend(test_data[b'data'])
    all_y.extend(test_data[b'labels'])

    all_x = np.array(all_x)
    if normalize:
        min_max_scaler = preprocessing.MinMaxScaler()
        all_x = min_max_scaler.fit_transform(all_x)

    train_x = np.array(all_x[:num_train, :])
    test_x = np.array(all_x[num_train:, :])

    train_y = np.array(all_y[:num_train])
    test_y = np.array(all_y[num_train:])

    num_classes = 10
    train_len = train_x.shape[0]
    return train_x, train_y, test_x, test_y, num_classes, train_len

In [213]:
# PAC MEAN
# Load normalized CIFAR-10, compute the reference mean, and calibrate noise
# for a subsample of half the training set with tolerance 1e-3.

train_x, train_y, test_x, test_y, num_classes, train_len = gen_cifar10(normalize=True)
true_mean = np.average(train_x, axis=0)
subsample_rate = int(0.5*train_len)


# NOTE(review): hybrid_noise_auto is not defined in this notebook; presumably
# it comes from algs_lib.
noise = hybrid_noise_auto(train_x, train_y, subsample_rate, num_classes, 1e-3)

curr trial is 0


In [215]:
noise

{0: 0.004188356421119193,
 1: 0.0043902587621660666,
 2: 0.004218196109612588,
 3: 0.00391120058172516,
 4: 0.0037325421246164835,
 5: 0.0036269196160360773,
 6: 0.0036615215194997905,
 7: 0.0036336067956982505,
 8: 0.003895481563404101,
 9: 0.004264602520775066,
 10: 0.0048089583283891756,
 11: 0.0049650615989707955,
 12: 0.0047066052069609,
 13: 0.0043873121728876694,
 14: 0.00401422022060526,
 15: 0.0035170051855170563,
 16: 0.002991171607874488,
 17: 0.003159592404017243,
 18: 0.003657950493853478,
 19: 0.0037027455291419025,
 20: 0.003694316073226612,
 21: 0.003955270198976654,
 22: 0.004203521653369557,
 23: 0.003711044939899409,
 24: 0.003442565571382252,
 25: 0.003524050975364102,
 26: 0.0033617636616469545,
 27: 0.0035357050628646984,
 28: 0.003310676656335339,
 29: 0.0032482624245607316,
 30: 0.003437511218460746,
 31: 0.003254317548694189,
 32: 0.00414210927920843,
 33: 0.004262415461265851,
 34: 0.004328415411320247,
 35: 0.004014372894272318,
 36: 0.003624240403452663,
 37

In [218]:
# Largest per-row L2 norm in the training set.
norms = list(map(np.linalg.norm, train_x))
print(max(norms))

54.91567867836909


In [222]:
# PAC MEAN
# Same calibration as for the raw data, but on rows clipped to L2 norm 3.

train_x, train_y, test_x, test_y, num_classes, train_len = gen_cifar10(normalize=True)
true_mean = np.average(train_x, axis=0)
subsample_rate = int(0.5*train_len)

clipped_train_x = np.array([clip_to_threshold(row, 3) for row in train_x])
# Check that clipping took effect: the largest row norm should be about 3.
norms = list(map(np.linalg.norm, clipped_train_x))
print(max(norms))
# NOTE(review): hybrid_noise_auto is not defined in this notebook; presumably
# it comes from algs_lib.
clipped_noise = hybrid_noise_auto(clipped_train_x, train_y, subsample_rate, num_classes, 1e-3)

3.000000000000067
curr trial is 0


In [223]:
sum(clipped_noise.values())

0.0801084004786155

In [None]:
# Measures the distance between a subsampled mean and true_mean three ways:
# without noise, with per-coordinate Gaussian noise, and with isotropic noise.
pac_dists = {}
num_trials = 1000

# NOTE(review): this uses `noise` (calibrated on the unclipped data) and the
# unclipped train_x, not `clipped_noise` / `clipped_train_x` from the clipped
# calibration -- confirm which pairing is intended.
num_coords = len(true_mean)
for mi in mi_range:
    # Rescale the calibrated noise by 0.5 / mi for this budget.
    scaled_noise = {k: noise[k] * (0.5 / mi) for k in noise}
    # Isotropic alternative: every coordinate gets the largest scaled value.
    iso_noise = max(scaled_noise.values())
    iso_scaled = {k: iso_noise for k in noise}
    # Per-coordinate scales as a vector so each trial draws all of its noise
    # in one call instead of one Python-level call per coordinate.
    scale_vec = np.array([scaled_noise[ind] for ind in range(num_coords)])
    avg_dist_pac = 0
    avg_iso_dist_pac = 0
    subsampled_dist = 0
    # Pass 1: subsampling error alone, then with per-coordinate noise added.
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        subsampled_dist += np.linalg.norm(released_mean - true_mean)
        released_mean = released_mean + np.random.normal(0, scale=scale_vec)
        avg_dist_pac += np.linalg.norm(released_mean - true_mean)
    # Pass 2: fresh subsamples with the isotropic noise level.
    for _ in range(num_trials):
        shuffled_x1, shuffled_y1 = shuffle(train_x, train_y)
        shuffled_x1, shuffled_y1 = get_samples_safe(shuffled_x1, shuffled_y1, num_classes, subsample_rate)
        released_mean = np.average(shuffled_x1, axis=0)
        released_mean = released_mean + np.random.normal(0, scale=iso_noise, size=num_coords)
        avg_iso_dist_pac += np.linalg.norm(released_mean - true_mean)
    # Turn the accumulated sums into per-trial averages.
    avg_iso_dist_pac /= num_trials
    avg_dist_pac /= num_trials
    subsampled_dist /= num_trials
    pac_dists[mi] = (subsampled_dist, avg_dist_pac, avg_iso_dist_pac)

In [224]:
# Scratch arithmetic. NOTE(review): what 29 and 3700 stand for is not stated
# anywhere in this notebook -- confirm or remove.
29/3700

0.007837837837837838