<a href="https://colab.research.google.com/github/miaojingang/private_ratio/blob/main/private_ratio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook has the simulation used in paper

> Jingang Miao, Yiming Paul Li. (2021). Privacy Preserving Inference on the Ratio of Two Gaussians Using (Weighted) Sums. [arXiv:2110.15449](https://arxiv.org/abs/2110.15449).

In [7]:
#@title functions {form-width:"20%"}
import logging
import numpy as np
import pandas as pd
import functools
import warnings
warnings.filterwarnings('ignore')


def from_moments(m_s, m_y, v_s, v_y, v_ys, log_scale=False):
    """Gets mean and variance from moments of the means."""
    if log_scale:
        return np.log(m_s / m_y), (
            1. / m_s ** 2 * v_s
            -2 * v_ys / m_y / m_s
            + v_y / m_y ** 2
        )
    else:
        return m_s / m_y, (
            1. / m_y ** 2 * v_s
            - 2 * m_s / m_y **3 * v_ys
            + m_s ** 2 / m_y** 4 * v_y
        )

def moments_from_sums(s, s2, y, y2, ys, w, w2):
    """Gets moments of the means from sums."""
    effective_n = w ** 2 / w2
    return (
        s / w, y / w,  # m_s, m_y
        (s2 / w - (s / w) ** 2) / effective_n,  # v_s
        (y2 / w - (y / w) ** 2) / effective_n,  # v_y
        (ys / w - y * s / w ** 2) / effective_n  # v_ys
    )

def from_sums(s, s2, y, y2, ys, w, w2, log_scale=False):
    """Gets mean and variance from sums."""
    moments = moments_from_sums(s, s2, y, y2, ys, w, w2)
    return from_moments(*moments, log_scale=log_scale)

def get_ci_maybe_bound_by_0(m, v, bound_by_0=False):
    half = 1.96 * np.sqrt(v)
    if bound_by_0:
        return np.maximum(m - half, 0), m + half
    return m - half, m + half

def ci_length(ci):
    return ci[1] - ci[0]

def is_covered(ci, truth):
    return ci[0] < truth < ci[1]

def interval_score(ci, truth, alpha=0.05):
    # section 6.2 of https://sites.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf
    l, u = ci
    x = truth
    return (
        u - l  # length of ci
        + 2.0 / alpha * ((l - x) * (x < l) + (x - u) * (x > u))  # penalty for missing truth
    )

def get_noise_scale(Delta, epsilon, delta, mechanism):
    if mechanism == "gaussian":
        return Delta * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    elif mechanism == "laplace":
        return Delta / epsilon

def one_run_dp_sums(
    seed=0, n=1_000, u_w=1, B=200,
    epsilon=2, delta=1e-6, true=1.1, log_scale=False, mechanism="gaussian"):
    # seed=0; n=1_000; u_w=1; B=200; epsilon=2; delta=1e-6; true=1.1; log_scale=False
    np.random.seed(seed)
    s = np.random.beta(2, 2, size=n)
    y = np.random.binomial(n=1, p=s/true, size=n)
    l_w = 1 / u_w
    w = np.random.exponential(1., size=n).clip(l_w, u_w)

    if log_scale:
        true = np.log(true)
        maybe_log = np.log
        get_ci = functools.partial(get_ci_maybe_bound_by_0, bound_by_0=False)
    else:
        maybe_log = lambda x: x
        # ratio is non-negative
        get_ci = functools.partial(get_ci_maybe_bound_by_0, bound_by_0=True)

    if mechanism == "gaussian":
        gen_noise = np.random.normal
    elif mechanism == "laplace":
        gen_noise = np.random.laplace
    else:
        raise ValueError("Only gaussian and laplace mechanisms are supported")
 
    # public version
    sum_s = s @ w
    sum_s2 = (s ** 2) @ w
    sum_y = sum_y2 = y @ w
    sum_ys = (y * s) @ w
    sum_w = w.sum()
    sum_w2 = (w ** 2).sum()
    m_p, v_p = from_sums(
        sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2, log_scale=log_scale
    )
    ci_p = get_ci(m_p, v_p)
    length_p = ci_length(ci_p)
    covered_p = is_covered(ci_p, true)
    interval_score_p = interval_score(ci_p, true)

    # DP: add normal noise
    if u_w == 1.0:
        get_scale = functools.partial(
            get_noise_scale, epsilon=epsilon / 5,
            delta = delta / 5, mechanism=mechanism)
    else:
        get_scale = functools.partial(
            get_noise_scale, epsilon=epsilon / 6,
            delta = delta / 6, mechanism=mechanism)
    
    sum_s += gen_noise(0, get_scale(u_w))
    sum_s2 += gen_noise(0, get_scale(u_w))
    sum_y += gen_noise(0, get_scale(u_w))
    sum_y2 = sum_y
    sum_ys += gen_noise(0, get_scale(u_w))
    sum_w += gen_noise(0, get_scale(u_w))
    if u_w == 1.0:
        sum_w2 = sum_w
    else:
        sum_w2 += gen_noise(0, get_scale(u_w ** 2))
    # no correction
    m, v_nc = from_sums(
        sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2, log_scale=log_scale
    )
    bias = m - true
    effective_n = w.sum() ** 2 / (w ** 2).sum()
    ci_nc = get_ci(m, v_nc)
    length_nc = ci_length(ci_nc)
    covered_nc = is_covered(ci_nc, true)
    interval_score_nc = interval_score(ci_nc, true)

    # monte carlo correction
    v_extra = ((maybe_log(
        (sum_s + gen_noise(0, get_scale(u_w), size=B)) /
        (sum_y + gen_noise(0, get_scale(u_w), size=B))
        ) - m
    ) ** 2).mean()
    
    ci_m = get_ci(m, v_nc + v_extra)
    length_m = ci_length(ci_m)
    covered_m = is_covered(ci_m, true)
    interval_score_m = interval_score(ci_m, true)

    # analytical correction
    m_s, m_y, v_s, v_y, v_sy = moments_from_sums(sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2)
    _, v_a = from_moments(
        m_s * sum_w, m_y * sum_w,
        v_s * sum_w ** 2 + get_scale(u_w) ** 2,
        v_y * sum_w ** 2 + get_scale(u_w) ** 2,
        v_sy * sum_w ** 2, log_scale=log_scale
    )
    ci_a = get_ci(m, v_a)
    length_a = ci_length(ci_a)
    covered_a = is_covered(ci_a, true)
    interval_score_a = interval_score(ci_a, true)

    return (
        n, epsilon, delta,
        u_w, effective_n,
        length_p, covered_p, interval_score_p,
        length_nc, covered_nc, interval_score_nc,
        length_m, covered_m, interval_score_m,
        length_a, covered_a, interval_score_a,
        bias
    )

def sim(
    show_styled_result=True,
    show_latex_result=True,
    **kwargs):
    res = (
        pd.DataFrame.from_records(
            (one_run_dp_sums(seed, n=n, u_w=u_w, epsilon=epsilon, **kwargs)
            for seed in range(1_000)
            for u_w in [1, 3]
            for n in [5_000,10_000]
            for epsilon in [0.2, 0.5, 1.]
            ),
            columns = [
                "n", "epsilon", "delta", "u_w", "effective_n",
                "length_p", "coverage_p", "interval_score_p",
                "length_nc", "coverage_nc", "interval_score_nc",
                "length_m", "coverage_m", "interval_score_m",
                "length_a", "coverage_a", "interval_score_a",
                "bias"]
            )
        .groupby(["n", "u_w", "epsilon", "delta", ], as_index=False).mean()
    )
    if show_styled_result:
        display(
            res.style.hide_index()
            .format({
                "n": "{:,}",
                "epsilon": "{:.1f}", "effective_n": "{:,.0f}",
                "length_p": "{:.3f}", "coverage_p": "{:.3f}", "interval_score_p": "{:.3f}",
                "length_nc": "{:.3f}", "coverage_nc": "{:.3f}", "interval_score_nc": "{:.3f}",
                "length_m": "{:.3f}", "coverage_m": "{:.3f}", "interval_score_m": "{:.3f}",
                "length_a": "{:.3f}", "coverage_a": "{:.3f}", "interval_score_a": "{:.3f}",
                "bias": "{:.3f}"
            })
            .bar(subset=["coverage_p", "coverage_nc", "coverage_m", "coverage_a"],
                 color="lightgreen", vmin=0)
            .set_table_attributes("class='dataframe'")
        )
    if show_latex_result:
        cols = [
            "u_w", "effective_n",
            "n", "epsilon",
            "length_p", "coverage_p", "interval_score_p",
            "length_nc", "coverage_nc", "interval_score_nc",
            "length_m", "coverage_m", "interval_score_m",
            "length_a", "coverage_a", "interval_score_a",
        ]
        print(res[cols].round(3).to_latex())

## Gaussian mechanism

In [8]:
#@title ratio scale {form-width:"20%"}
sim(delta=1e-6, B=200, mechanism="gaussian")

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
5000,1,0.2,1e-06,5000,0.061,0.947,0.074,0.061,0.267,1.893,0.37,0.951,0.443,0.367,0.952,0.435,0.004
5000,1,0.5,1e-06,5000,0.061,0.947,0.074,0.061,0.575,0.459,0.156,0.949,0.188,0.156,0.949,0.186,0.001
5000,1,1.0,1e-06,5000,0.061,0.947,0.074,0.061,0.784,0.155,0.094,0.947,0.111,0.094,0.947,0.111,0.0
5000,3,0.2,1e-06,3080,0.078,0.948,0.093,0.09,0.085,9.158,4.119,0.947,4.34,1.482,0.941,1.744,0.06
5000,3,0.5,1e-06,3080,0.078,0.948,0.093,0.077,0.243,2.903,0.547,0.949,0.646,0.534,0.947,0.632,0.008
5000,3,1.0,1e-06,3080,0.078,0.948,0.093,0.078,0.439,1.044,0.273,0.951,0.331,0.272,0.95,0.326,0.002
10000,1,0.2,1e-06,10000,0.043,0.955,0.049,0.043,0.317,0.917,0.184,0.924,0.236,0.184,0.928,0.235,-0.001
10000,1,0.5,1e-06,10000,0.043,0.955,0.049,0.043,0.665,0.196,0.083,0.934,0.104,0.084,0.935,0.104,-0.001
10000,1,1.0,1e-06,10000,0.043,0.955,0.049,0.043,0.874,0.075,0.056,0.951,0.066,0.056,0.947,0.066,-0.001
10000,3,0.2,1e-06,6160,0.055,0.95,0.067,0.055,0.102,4.788,0.697,0.942,0.849,0.667,0.932,0.832,0.011


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &     5000.000 &   5000 &      0.2 &     0.061 &       0.947 &             0.074 &      0.061 &        0.267 &              1.893 &     0.370 &       0.951 &             0.443 &     0.367 &       0.952 &             0.435 \\
1  &    1 &     5000.000 &   5000 &      0.5 &     0.061 &       0.947 &             0.074 &      0.061 &        0.575 &              0.459 &     0.156 &       0.949 &             0.188 &     0.156 &       0.949 &             0.186 \\
2  &    1 &     5000.000 &   5000 &      1.0 &     0.061 &       0.947 &             0.074 &      0.061 &        0.784 &              0.155 &     0.094 &       0.947 &             0.111 &     0.094 &       0.947 &           

In [9]:
#@title log_scale results {form-width: "20%"}
sim(log_scale=True, delta=1e-6, B=200, mechanism="gaussian")

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
5000,1,0.2,1e-06,5000,0.055,0.948,0.067,0.055,0.267,1.719,0.332,0.952,0.398,0.332,0.957,0.392,0.0
5000,1,0.5,1e-06,5000,0.055,0.948,0.067,0.055,0.572,0.418,0.142,0.948,0.171,0.142,0.952,0.168,0.0
5000,1,1.0,1e-06,5000,0.055,0.948,0.067,0.055,0.787,0.141,0.085,0.946,0.101,0.086,0.949,0.1,0.0
5000,3,0.2,1e-06,3080,0.071,0.947,0.085,0.079,0.087,8.097,1.329,0.921,1.36,1.255,0.975,1.331,0.005
5000,3,0.5,1e-06,3080,0.071,0.947,0.085,0.07,0.242,2.633,0.484,0.956,0.569,0.481,0.963,0.561,0.0
5000,3,1.0,1e-06,3080,0.071,0.947,0.085,0.07,0.437,0.951,0.247,0.952,0.297,0.247,0.955,0.293,-0.0
10000,1,0.2,1e-06,10000,0.039,0.954,0.045,0.039,0.319,0.833,0.167,0.926,0.215,0.167,0.928,0.213,-0.002
10000,1,0.5,1e-06,10000,0.039,0.954,0.045,0.039,0.669,0.178,0.076,0.934,0.095,0.076,0.931,0.094,-0.001
10000,1,1.0,1e-06,10000,0.039,0.954,0.045,0.039,0.875,0.068,0.051,0.952,0.06,0.051,0.95,0.06,-0.001
10000,3,0.2,1e-06,6160,0.05,0.951,0.06,0.05,0.1,4.321,0.607,0.938,0.734,0.598,0.937,0.735,-0.004


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &     5000.000 &   5000 &      0.2 &     0.055 &       0.948 &             0.067 &      0.055 &        0.267 &              1.719 &     0.332 &       0.952 &             0.398 &     0.332 &       0.957 &             0.392 \\
1  &    1 &     5000.000 &   5000 &      0.5 &     0.055 &       0.948 &             0.067 &      0.055 &        0.572 &              0.418 &     0.142 &       0.948 &             0.171 &     0.142 &       0.952 &             0.168 \\
2  &    1 &     5000.000 &   5000 &      1.0 &     0.055 &       0.948 &             0.067 &      0.055 &        0.787 &              0.141 &     0.085 &       0.946 &             0.101 &     0.086 &       0.949 &           

## Laplace mechanism

In [10]:
#@title ratio scale {form-width:"20%"}
sim(delta=1e-6, B=200, mechanism="laplace")

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
5000,1,0.2,1e-06,5000,0.061,0.947,0.074,0.061,0.745,0.2,0.109,0.962,0.129,0.089,0.899,0.138,0.0
5000,1,0.5,1e-06,5000,0.061,0.947,0.074,0.061,0.913,0.086,0.071,0.954,0.082,0.066,0.935,0.083,0.0
5000,1,1.0,1e-06,5000,0.061,0.947,0.074,0.061,0.936,0.075,0.064,0.949,0.075,0.062,0.944,0.075,0.0
5000,3,0.2,1e-06,3080,0.078,0.948,0.093,0.077,0.404,1.393,0.344,0.958,0.41,0.245,0.868,0.482,0.002
5000,3,0.5,1e-06,3080,0.078,0.948,0.093,0.078,0.723,0.307,0.152,0.956,0.184,0.121,0.899,0.197,0.0
5000,3,1.0,1e-06,3080,0.078,0.948,0.093,0.078,0.885,0.135,0.101,0.952,0.121,0.09,0.925,0.124,-0.0
10000,1,0.2,1e-06,10000,0.043,0.955,0.049,0.043,0.831,0.096,0.063,0.947,0.074,0.054,0.898,0.079,-0.001
10000,1,0.5,1e-06,10000,0.043,0.955,0.049,0.043,0.936,0.054,0.047,0.955,0.054,0.045,0.946,0.054,-0.001
10000,1,1.0,1e-06,10000,0.043,0.955,0.049,0.043,0.955,0.05,0.044,0.964,0.051,0.044,0.959,0.05,-0.001
10000,3,0.2,1e-06,6160,0.055,0.95,0.067,0.055,0.513,0.597,0.173,0.946,0.216,0.128,0.85,0.256,0.0


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &     5000.000 &   5000 &      0.2 &     0.061 &       0.947 &             0.074 &      0.061 &        0.745 &              0.200 &     0.109 &       0.962 &             0.129 &     0.089 &       0.899 &             0.138 \\
1  &    1 &     5000.000 &   5000 &      0.5 &     0.061 &       0.947 &             0.074 &      0.061 &        0.913 &              0.086 &     0.071 &       0.954 &             0.082 &     0.066 &       0.935 &             0.083 \\
2  &    1 &     5000.000 &   5000 &      1.0 &     0.061 &       0.947 &             0.074 &      0.061 &        0.936 &              0.075 &     0.064 &       0.949 &             0.075 &     0.062 &       0.944 &           

In [11]:
#@title log_scale results {form-width: "20%"}
sim(log_scale=True, delta=1e-6, B=200, mechanism="laplace")

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
5000,1,0.2,1e-06,5000,0.055,0.948,0.067,0.055,0.748,0.182,0.099,0.96,0.117,0.08,0.896,0.125,-0.0
5000,1,0.5,1e-06,5000,0.055,0.948,0.067,0.055,0.913,0.078,0.064,0.954,0.075,0.06,0.938,0.075,-0.0
5000,1,1.0,1e-06,5000,0.055,0.948,0.067,0.055,0.936,0.068,0.058,0.95,0.068,0.057,0.944,0.068,-0.0
5000,3,0.2,1e-06,3080,0.071,0.947,0.085,0.07,0.406,1.261,0.308,0.959,0.373,0.222,0.866,0.436,-0.001
5000,3,0.5,1e-06,3080,0.071,0.947,0.085,0.071,0.721,0.279,0.138,0.958,0.167,0.11,0.9,0.179,-0.001
5000,3,1.0,1e-06,3080,0.071,0.947,0.085,0.071,0.883,0.123,0.092,0.952,0.11,0.082,0.924,0.112,-0.0
10000,1,0.2,1e-06,10000,0.039,0.954,0.045,0.039,0.83,0.088,0.057,0.948,0.067,0.049,0.898,0.072,-0.001
10000,1,0.5,1e-06,10000,0.039,0.954,0.045,0.039,0.936,0.049,0.043,0.955,0.049,0.041,0.945,0.049,-0.001
10000,1,1.0,1e-06,10000,0.039,0.954,0.045,0.039,0.957,0.046,0.04,0.961,0.046,0.04,0.959,0.046,-0.001
10000,3,0.2,1e-06,6160,0.05,0.951,0.06,0.05,0.514,0.543,0.157,0.946,0.195,0.116,0.853,0.232,-0.001


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &     5000.000 &   5000 &      0.2 &     0.055 &       0.948 &             0.067 &      0.055 &        0.748 &              0.182 &     0.099 &       0.960 &             0.117 &     0.080 &       0.896 &             0.125 \\
1  &    1 &     5000.000 &   5000 &      0.5 &     0.055 &       0.948 &             0.067 &      0.055 &        0.913 &              0.078 &     0.064 &       0.954 &             0.075 &     0.060 &       0.938 &             0.075 \\
2  &    1 &     5000.000 &   5000 &      1.0 &     0.055 &       0.948 &             0.067 &      0.055 &        0.936 &              0.068 &     0.058 &       0.950 &             0.068 &     0.057 &       0.944 &           

In [12]:
print(f"""
For the same epsilon=0.2 and sensitivity=1, the noise scale
for gaussian: {get_noise_scale(Delta=1, epsilon=0.2, delta=1e-6, mechanism="gaussian"):.3f}
for laplace: {get_noise_scale(Delta=1, epsilon=0.2, delta=1e-6, mechanism="laplace"):.3f}
""")


For the same epsilon=0.2 and sensitivity=1, the noise scale
for gaussian: 26.494
for laplace: 5.000

