<a href="https://colab.research.google.com/github/miaojingang/private_ratio/blob/main/simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook has the simulation used in paper

> Jingang Miao, Yiming Paul Li. (2021). Privacy Preserving Inference on the Ratio of Two Gaussians Using (Weighted) Sums. [arXiv:2110.15449](https://arxiv.org/abs/2110.15449).

In [17]:
#@title functions {form-width:"20%"}
import logging
import numpy as np
import pandas as pd
import functools
import warnings
warnings.filterwarnings('ignore')


def from_moments(m_s, m_y, v_s, v_y, v_ys, log_scale=False):
    """Gets mean and variance from moments of the means."""
    if log_scale:
        return np.log(m_s / m_y), (
            1. / m_s ** 2 * v_ys
            -2 * v_ys / m_y / m_s
            + v_y / m_y ** 2
        )
    else:
        return m_s / m_y, (
            1. / m_y ** 2 * v_s
            - 2 * m_s / m_y **3 * v_ys
            + m_s ** 2 / m_y** 4 * v_y
        )

def moments_from_sums(s, s2, y, y2, ys, w, w2):
    """Gets moments of the means from sums."""
    effective_n = w ** 2 / w2
    return (
        s / w, y / w,  # m_s, m_y
        (s2 / w - (s / w) ** 2) / effective_n,  # v_s
        (y2 / w - (y / w) ** 2) / effective_n,  # v_y
        (ys / w - y * s / w ** 2) / effective_n  # v_ys
    )

def from_sums(s, s2, y, y2, ys, w, w2, log_scale=False):
    """Gets mean and variance from sums."""
    moments = moments_from_sums(s, s2, y, y2, ys, w, w2)
    return from_moments(*moments, log_scale=log_scale)

def get_ci_maybe_bound_by_0(m, v, bound_by_0=False):
    half = 1.96 * np.sqrt(v)
    if bound_by_0:
        return np.maximum(m - half, 0), m + half
    return m - half, m + half

def ci_length(ci):
    return ci[1] - ci[0]

def is_covered(ci, truth):
    return ci[0] < truth < ci[1]

def interval_score(ci, truth, alpha=0.05):
    # section 6.2 of https://sites.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf
    l, u = ci
    x = truth
    return (
        u - l  # length of ci
        + 2.0 / alpha * ((l - x) * (x < l) + (x - u) * (x > u))  # penalty for missing truth
    )

def get_noise_scale(Delta, epsilon, delta, mechanism):
    if mechanism == "gaussian":
        return Delta * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    elif mechanism == "laplace":
        return Delta / epsilon

def one_run_dp_sums(
    seed=0, n=1_000, u_w=1, B=200,
    epsilon=2, delta=1e-6, true=1.1, log_scale=False, mechanism="gaussian"):
    # seed=0; n=1_000; u_w=1; B=200; epsilon=2; delta=1e-6; true=1.1; log_scale=False
    np.random.seed(seed)
    s = np.random.beta(2, 2, size=n)
    y = np.random.binomial(n=1, p=s/true, size=n)
    l_w = 1 / u_w
    w = np.random.exponential(1., size=n).clip(l_w, u_w)

    if log_scale:
        true = np.log(true)
        maybe_log = np.log
        get_ci = functools.partial(get_ci_maybe_bound_by_0, bound_by_0=False)
    else:
        maybe_log = lambda x: x
        # ratio is non-negative
        get_ci = functools.partial(get_ci_maybe_bound_by_0, bound_by_0=True)

    if mechanism == "gaussian":
        gen_noise = np.random.normal
    elif mechanism == "laplace":
        gen_noise = np.random.laplace
    else:
        raise ValueError("Only gaussian and laplace mechanisms are supported")
 
    # public version
    sum_s = s @ w
    sum_s2 = (s ** 2) @ w
    sum_y = sum_y2 = y @ w
    sum_ys = (y * s) @ w
    sum_w = w.sum()
    sum_w2 = (w ** 2).sum()
    m_p, v_p = from_sums(
        sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2, log_scale=log_scale
    )
    ci_p = get_ci(m_p, v_p)
    length_p = ci_length(ci_p)
    covered_p = is_covered(ci_p, true)
    interval_score_p = interval_score(ci_p, true)

    # DP: add normal noise
    if u_w == 1.0:
        get_scale = functools.partial(
            get_noise_scale, epsilon=epsilon / 5,
            delta = delta / 5, mechanism=mechanism)
    else:
        get_scale = functools.partial(
            get_noise_scale, epsilon=epsilon / 6,
            delta = delta / 6, mechanism=mechanism)
    
    sum_s += gen_noise(0, get_scale(u_w))
    sum_s2 += gen_noise(0, get_scale(u_w))
    sum_y += gen_noise(0, get_scale(u_w))
    sum_y2 = sum_y
    sum_ys += gen_noise(0, get_scale(u_w))
    sum_w += gen_noise(0, get_scale(u_w))
    if u_w == 1.0:
        sum_w2 = sum_w
    else:
        sum_w2 += gen_noise(0, get_scale(u_w ** 2))
    # no correction
    m, v_nc = from_sums(
        sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2, log_scale=log_scale
    )
    bias = m - true
    effective_n = w.sum() ** 2 / (w ** 2).sum()
    ci_nc = get_ci(m, v_nc)
    length_nc = ci_length(ci_nc)
    covered_nc = is_covered(ci_nc, true)
    interval_score_nc = interval_score(ci_nc, true)

    # monte carlo correction
    v_extra = ((maybe_log(
        (sum_s + gen_noise(0, get_scale(u_w), size=B)) /
        (sum_y + gen_noise(0, get_scale(u_w), size=B))
        ) - m
    ) ** 2).mean()
    
    ci_m = get_ci(m, v_nc + v_extra)
    length_m = ci_length(ci_m)
    covered_m = is_covered(ci_m, true)
    interval_score_m = interval_score(ci_m, true)

    # analytical correction
    m_s, m_y, v_s, v_y, v_sy = moments_from_sums(sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2)
    _, v_a = from_moments(
        m_s * sum_w, m_y * sum_w,
        v_s * sum_w ** 2 + get_scale(u_w) ** 2,
        v_y * sum_w ** 2 + get_scale(u_w) ** 2,
        v_sy * sum_w ** 2, log_scale=log_scale
    )
    ci_a = get_ci(m, v_a)
    length_a = ci_length(ci_a)
    covered_a = is_covered(ci_a, true)
    interval_score_a = interval_score(ci_a, true)

    return (
        n, epsilon, delta,
        u_w, effective_n,
        length_p, covered_p, interval_score_p,
        length_nc, covered_nc, interval_score_nc,
        length_m, covered_m, interval_score_m,
        length_a, covered_a, interval_score_a,
        bias
    )

def sim(
    show_styled_result=True,
    show_latex_result=True,
    **kwargs):
    res = (
        pd.DataFrame.from_records(
            (one_run_dp_sums(seed, n=n, u_w=u_w, epsilon=epsilon, **kwargs)
            for seed in range(1_000)
            for u_w in [1, 3]
            for n in [20_000, 50_000]
            for epsilon in [0.2, 0.5, 1.]
            ),
            columns = [
                "n", "epsilon", "delta", "u_w", "effective_n",
                "length_p", "coverage_p", "interval_score_p",
                "length_nc", "coverage_nc", "interval_score_nc",
                "length_m", "coverage_m", "interval_score_m",
                "length_a", "coverage_a", "interval_score_a",
                "bias"]
            )
        .groupby(["n", "u_w", "epsilon", "delta", ], as_index=False).mean()
    )
    if show_styled_result:
        display(
            res.style.hide_index()
            .format({
                "n": "{:,}",
                "epsilon": "{:.1f}", "effective_n": "{:,.0f}",
                "length_p": "{:.3f}", "coverage_p": "{:.3f}", "interval_score_p": "{:.3f}",
                "length_nc": "{:.3f}", "coverage_nc": "{:.3f}", "interval_score_nc": "{:.3f}",
                "length_m": "{:.3f}", "coverage_m": "{:.3f}", "interval_score_m": "{:.3f}",
                "length_a": "{:.3f}", "coverage_a": "{:.3f}", "interval_score_a": "{:.3f}",
                "bias": "{:.3f}"
            })
            .bar(subset=["coverage_p", "coverage_nc", "coverage_m", "coverage_a"],
                 color="lightgreen", vmin=0)
            .set_table_attributes("class='dataframe'")
        )
    if show_latex_result:
        cols = [
            "u_w", "effective_n",
            "n", "epsilon",
            "length_p", "coverage_p", "interval_score_p",
            "length_nc", "coverage_nc", "interval_score_nc",
            "length_m", "coverage_m", "interval_score_m",
            "length_a", "coverage_a", "interval_score_a",
        ]
        print(res[cols].round(3).to_latex())

## Gaussian mechanism

In [18]:
#@title ratio scale {form-width:"20%"}
sim(delta=1e-6, B=200, mechanism="gaussian")

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
20000,1,0.2,1e-06,20000,0.03,0.948,0.036,0.031,0.51,0.329,0.095,0.948,0.11,0.095,0.952,0.109,0.001
20000,1,0.5,1e-06,20000,0.03,0.948,0.036,0.03,0.791,0.074,0.047,0.952,0.054,0.047,0.953,0.054,-0.0
20000,1,1.0,1e-06,20000,0.03,0.948,0.036,0.03,0.902,0.042,0.035,0.95,0.04,0.035,0.951,0.04,-0.0
20000,3,0.2,1e-06,12323,0.039,0.958,0.045,0.039,0.199,1.95,0.332,0.944,0.386,0.33,0.945,0.385,0.006
20000,3,0.5,1e-06,12323,0.039,0.958,0.045,0.039,0.45,0.514,0.136,0.942,0.159,0.136,0.943,0.158,0.002
20000,3,1.0,1e-06,12323,0.039,0.958,0.045,0.039,0.692,0.156,0.076,0.948,0.086,0.076,0.951,0.085,0.0
50000,1,0.2,1e-06,50000,0.019,0.948,0.024,0.019,0.661,0.098,0.041,0.947,0.048,0.041,0.95,0.048,0.0
50000,1,0.5,1e-06,50000,0.019,0.948,0.024,0.019,0.881,0.032,0.024,0.947,0.028,0.024,0.951,0.028,0.0
50000,1,1.0,1e-06,50000,0.019,0.948,0.024,0.019,0.931,0.025,0.021,0.944,0.025,0.021,0.944,0.025,0.0
50000,3,0.2,1e-06,30800,0.025,0.957,0.029,0.025,0.266,0.686,0.132,0.954,0.157,0.133,0.955,0.157,0.002


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &    20000.000 &  20000 &      0.2 &     0.030 &       0.948 &             0.036 &      0.031 &        0.510 &              0.329 &     0.095 &       0.948 &             0.110 &     0.095 &       0.952 &             0.109 \\
1  &    1 &    20000.000 &  20000 &      0.5 &     0.030 &       0.948 &             0.036 &      0.030 &        0.791 &              0.074 &     0.047 &       0.952 &             0.054 &     0.047 &       0.953 &             0.054 \\
2  &    1 &    20000.000 &  20000 &      1.0 &     0.030 &       0.948 &             0.036 &      0.030 &        0.902 &              0.042 &     0.035 &       0.950 &             0.040 &     0.035 &       0.951 &           

In [19]:
#@title log_scale results {form-width: "20%"}
sim(log_scale=True, delta=1e-6, B=200, mechanism="gaussian")

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
20000,1,0.2,1e-06,20000,0.027,0.945,0.033,0.027,0.506,0.301,0.086,0.946,0.1,0.066,0.863,0.116,0.0
20000,1,0.5,1e-06,20000,0.027,0.945,0.033,0.027,0.79,0.068,0.043,0.951,0.049,0.037,0.912,0.051,-0.0
20000,1,1.0,1e-06,20000,0.027,0.945,0.033,0.027,0.898,0.039,0.032,0.949,0.037,0.03,0.929,0.037,-0.0
20000,3,0.2,1e-06,12323,0.035,0.955,0.041,0.035,0.199,1.765,0.299,0.944,0.354,0.222,0.848,0.419,0.003
20000,3,0.5,1e-06,12323,0.035,0.955,0.041,0.035,0.445,0.469,0.123,0.943,0.145,0.094,0.862,0.166,0.001
20000,3,1.0,1e-06,12323,0.035,0.955,0.041,0.035,0.687,0.143,0.069,0.947,0.078,0.056,0.889,0.084,0.0
50000,1,0.2,1e-06,50000,0.017,0.946,0.021,0.017,0.658,0.09,0.037,0.946,0.044,0.03,0.889,0.048,0.0
50000,1,0.5,1e-06,50000,0.017,0.946,0.021,0.017,0.877,0.029,0.022,0.947,0.026,0.02,0.919,0.026,0.0
50000,1,1.0,1e-06,50000,0.017,0.946,0.021,0.017,0.93,0.023,0.019,0.944,0.023,0.018,0.938,0.023,0.0
50000,3,0.2,1e-06,30800,0.022,0.954,0.027,0.022,0.265,0.625,0.12,0.949,0.143,0.09,0.869,0.16,0.001


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &    20000.000 &  20000 &      0.2 &     0.027 &       0.945 &             0.033 &      0.027 &        0.506 &              0.301 &     0.086 &       0.946 &             0.100 &     0.066 &       0.863 &             0.116 \\
1  &    1 &    20000.000 &  20000 &      0.5 &     0.027 &       0.945 &             0.033 &      0.027 &        0.790 &              0.068 &     0.043 &       0.951 &             0.049 &     0.037 &       0.912 &             0.051 \\
2  &    1 &    20000.000 &  20000 &      1.0 &     0.027 &       0.945 &             0.033 &      0.027 &        0.898 &              0.039 &     0.032 &       0.949 &             0.037 &     0.030 &       0.929 &           

## Laplace mechanism

In [21]:
#@title ratio scale {form-width:"20%"}
sim(delta=1e-6, B=200, mechanism="laplace")

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
20000,1,0.2,1e-06,20000,0.03,0.948,0.036,0.03,0.887,0.051,0.038,0.95,0.046,0.034,0.917,0.048,-0.0
20000,1,0.5,1e-06,20000,0.03,0.948,0.036,0.03,0.932,0.039,0.032,0.944,0.038,0.031,0.94,0.038,-0.0
20000,1,1.0,1e-06,20000,0.03,0.948,0.036,0.03,0.94,0.037,0.031,0.942,0.037,0.031,0.942,0.037,-0.0
20000,3,0.2,1e-06,12323,0.039,0.958,0.045,0.039,0.638,0.238,0.09,0.939,0.118,0.07,0.88,0.132,-0.0
20000,3,0.5,1e-06,12323,0.039,0.958,0.045,0.039,0.865,0.072,0.051,0.95,0.062,0.045,0.912,0.064,-0.0
20000,3,1.0,1e-06,12323,0.039,0.958,0.045,0.039,0.932,0.051,0.042,0.949,0.051,0.041,0.942,0.051,-0.0
50000,1,0.2,1e-06,50000,0.019,0.948,0.024,0.019,0.928,0.026,0.021,0.952,0.026,0.02,0.943,0.026,0.0
50000,1,0.5,1e-06,50000,0.019,0.948,0.024,0.019,0.946,0.024,0.02,0.953,0.024,0.019,0.952,0.024,0.0
50000,1,1.0,1e-06,50000,0.019,0.948,0.024,0.019,0.952,0.024,0.019,0.953,0.024,0.019,0.952,0.024,0.0
50000,3,0.2,1e-06,30800,0.025,0.957,0.029,0.025,0.77,0.074,0.041,0.946,0.05,0.034,0.896,0.053,0.0


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &    20000.000 &  20000 &      0.2 &     0.030 &       0.948 &             0.036 &      0.030 &        0.887 &              0.051 &     0.038 &       0.950 &             0.046 &     0.034 &       0.917 &             0.048 \\
1  &    1 &    20000.000 &  20000 &      0.5 &     0.030 &       0.948 &             0.036 &      0.030 &        0.932 &              0.039 &     0.032 &       0.944 &             0.038 &     0.031 &       0.940 &             0.038 \\
2  &    1 &    20000.000 &  20000 &      1.0 &     0.030 &       0.948 &             0.036 &      0.030 &        0.940 &              0.037 &     0.031 &       0.942 &             0.037 &     0.031 &       0.942 &           

In [20]:
#@title log_scale results {form-width: "20%"}
sim(log_scale=True, delta=1e-6, B=200, mechanism="laplace")

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
20000,1,0.2,1e-06,20000,0.027,0.945,0.033,0.027,0.882,0.047,0.034,0.946,0.042,0.029,0.905,0.045,-0.0
20000,1,0.5,1e-06,20000,0.027,0.945,0.033,0.027,0.93,0.035,0.029,0.944,0.035,0.028,0.934,0.035,-0.0
20000,1,1.0,1e-06,20000,0.027,0.945,0.033,0.027,0.935,0.033,0.028,0.942,0.033,0.028,0.936,0.033,-0.0
20000,3,0.2,1e-06,12323,0.035,0.955,0.041,0.035,0.634,0.219,0.082,0.939,0.108,0.052,0.814,0.143,-0.001
20000,3,0.5,1e-06,12323,0.035,0.955,0.041,0.035,0.859,0.066,0.046,0.948,0.057,0.038,0.891,0.061,-0.0
20000,3,1.0,1e-06,12323,0.035,0.955,0.041,0.035,0.93,0.047,0.038,0.947,0.046,0.036,0.935,0.047,-0.0
50000,1,0.2,1e-06,50000,0.017,0.946,0.021,0.017,0.927,0.024,0.019,0.951,0.023,0.018,0.933,0.024,0.0
50000,1,0.5,1e-06,50000,0.017,0.946,0.021,0.017,0.944,0.022,0.018,0.949,0.022,0.017,0.946,0.022,0.0
50000,1,1.0,1e-06,50000,0.017,0.946,0.021,0.017,0.95,0.021,0.017,0.951,0.021,0.017,0.95,0.021,0.0
50000,3,0.2,1e-06,30800,0.022,0.954,0.027,0.022,0.764,0.068,0.037,0.944,0.045,0.027,0.849,0.054,0.0


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &    20000.000 &  20000 &      0.2 &     0.027 &       0.945 &             0.033 &      0.027 &        0.882 &              0.047 &     0.034 &       0.946 &             0.042 &     0.029 &       0.905 &             0.045 \\
1  &    1 &    20000.000 &  20000 &      0.5 &     0.027 &       0.945 &             0.033 &      0.027 &        0.930 &              0.035 &     0.029 &       0.944 &             0.035 &     0.028 &       0.934 &             0.035 \\
2  &    1 &    20000.000 &  20000 &      1.0 &     0.027 &       0.945 &             0.033 &      0.027 &        0.935 &              0.033 &     0.028 &       0.942 &             0.033 &     0.028 &       0.936 &           

In [23]:
print(f"""
For the same epsilon=0.2 and sensitivity=1, the noise scale
for gaussian: {get_noise_scale(Delta=1, epsilon=0.2, delta=1e-6, mechanism="gaussian"):.3f}
for laplace: {get_noise_scale(Delta=1, epsilon=0.2, delta=1e-6, mechanism="laplace"):.3f}
""")


For the same epsilon-0.2 and sensitivity=1, the noise scale
for gaussian: 26.494
for laplace: 5.000

