<a href="https://colab.research.google.com/github/miaojingang/private_ratio/blob/main/simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the simulation used in the paper

> Jingang Miao, Yiming Paul Li. (2021). Privacy Preserving Inference on the Ratio of Two Gaussians Using Sums.

In [None]:
#@title functions {form-width:"20%"}
import logging
import numpy as np
import pandas as pd
import functools
import warnings
warnings.filterwarnings('ignore')


def from_moments(m_s, m_y, v_s, v_y, v_ys, log_scale=False):
    """Turn moments of two means into a ratio estimate and its variance.

    Args:
        m_s: mean of the numerator variable.
        m_y: mean of the denominator variable.
        v_s: variance of the numerator mean.
        v_y: variance of the denominator mean.
        v_ys: covariance between the two means.
        log_scale: when True, return the estimate and variance of
            log(m_s / m_y) instead of m_s / m_y.

    Returns:
        A (point_estimate, variance) tuple, where the variance is the
        first-order (delta-method) approximation.
    """
    ratio = m_s / m_y
    if not log_scale:
        # Equation (1)
        variance = (
            1. / m_y ** 2 * v_s
            - 2 * m_s / m_y ** 3 * v_ys
            + m_s ** 2 / m_y ** 4 * v_y
        )
        return ratio, variance

    # Equation after (1)
    log_variance = (
        1. / m_s ** 2 * v_s
        - 2 * v_ys / m_y / m_s
        + v_y / m_y ** 2
    )
    return np.log(ratio), log_variance

def moments_from_sums(s, s2, y, y2, ys, w, w2):
    """Derive the moments of the two weighted means from (weighted) sums.

    Args:
        s, s2: sums of the numerator variable and of its square.
        y, y2: sums of the denominator variable and of its square.
        ys: sum of the product of the two variables.
        w, w2: sums of the weights and of the squared weights.

    Returns:
        A tuple (m_s, m_y, v_s, v_y, v_ys): the two means, the variances
        of the means, and the covariance between the means.
    """
    # The variances are scaled by w ** 2 / w2 rather than by a raw count.
    n_eff = w ** 2 / w2

    # Equations (2) to (4)
    mean_s = s / w
    mean_y = y / w
    var_s = (s2 / w - mean_s ** 2) / n_eff
    var_y = (y2 / w - mean_y ** 2) / n_eff
    cov_ys = (ys / w - y * s / w ** 2) / n_eff
    return mean_s, mean_y, var_s, var_y, cov_ys

def from_sums(s, s2, y, y2, ys, w, w2, log_scale=False):
    """Compute the ratio estimate and its variance directly from sums.

    Chains moments_from_sums (sums -> moments of the means) with
    from_moments (moments -> point estimate and variance).

    Returns:
        A (point_estimate, variance) tuple; on the log scale when
        log_scale is True.
    """
    m_s, m_y, v_s, v_y, v_ys = moments_from_sums(s, s2, y, y2, ys, w, w2)
    return from_moments(m_s, m_y, v_s, v_y, v_ys, log_scale=log_scale)

def get_ci_maybe_bound_by_0(m, v, bound_by_0=False):
    """Build a normal-approximation interval m +/- 1.96 * sqrt(v).

    Args:
        m: point estimate.
        v: variance of the point estimate.
        bound_by_0: when True, the lower end is truncated at 0.

    Returns:
        A (lower, upper) tuple.
    """
    margin = 1.96 * np.sqrt(v)
    lower, upper = m - margin, m + margin
    if bound_by_0:
        lower = np.maximum(lower, 0)
    return lower, upper

def ci_length(ci):
    """Return the width of an interval given as (lower, upper)."""
    lower = ci[0]
    upper = ci[1]
    return upper - lower

def is_covered(ci, truth):
    """Tell whether truth lies strictly inside the interval (lower, upper)."""
    lower = ci[0]
    upper = ci[1]
    return lower < truth and truth < upper

def interval_score(ci, truth, alpha=0.05):
    """Score an interval: its width plus a penalty when it misses truth.

    See section 6.2 of
    https://sites.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf

    Args:
        ci: (lower, upper) interval.
        truth: the value the interval is meant to cover.
        alpha: nominal miscoverage level; the miss distance is weighted
            by 2 / alpha.

    Returns:
        The interval score (smaller is better).
    """
    lower, upper = ci
    width = upper - lower
    # Each term is non-zero only when truth falls outside on that side.
    miss_below = (lower - truth) * (truth < lower)
    miss_above = (truth - upper) * (truth > upper)
    return width + 2.0 / alpha * (miss_below + miss_above)

def get_noise_scale(Delta, epsilon, delta, mechanism):
    """Return the noise scale for a differentially private sum.

    Args:
        Delta: sensitivity of the quantity being released.
        epsilon: privacy parameter epsilon.
        delta: privacy parameter delta (ignored by the Laplace mechanism).
        mechanism: "gaussian" or "laplace".

    Returns:
        For "gaussian", Delta * sqrt(2 * ln(1.25 / delta)) / epsilon;
        for "laplace", Delta / epsilon.

    Raises:
        ValueError: if mechanism is neither "gaussian" nor "laplace".
    """
    if mechanism == "gaussian":
        return Delta * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    if mechanism == "laplace":
        # does not depend on delta
        return Delta / epsilon
    # Fail loudly instead of silently returning None, which would only
    # surface later as an obscure TypeError in the caller.
    raise ValueError("Only gaussian and laplace mechanisms are supported")

def get_variance_from_scale(scale, mechanism):
    """Convert a noise scale into the variance of the noise.

    Args:
        scale: the scale parameter of the noise distribution.
        mechanism: "gaussian" (variance scale ** 2) or "laplace"
            (variance 2 * scale ** 2).

    Returns:
        The variance of the noise.

    Raises:
        ValueError: if mechanism is neither "gaussian" nor "laplace".
    """
    if mechanism == "gaussian":
        return scale * scale
    if mechanism == "laplace":
        return 2.0 * scale * scale
    # Fail loudly instead of silently returning None.
    raise ValueError("Only gaussian and laplace mechanisms are supported")

def one_run_dp_sums(
    seed=0, n=1_000, u_w=1, B=200,
    epsilon=2, delta=1e-6, true=1.1, log_scale=False, mechanism="gaussian"):
    """Simulates one dataset and evaluates four intervals for the ratio.

    Draws s ~ Beta(2, 2), y ~ Bernoulli(s / true) and weights from
    Exponential(1) clipped to [1 / u_w, u_w], forms the weighted sums, and
    builds intervals (via get_ci_maybe_bound_by_0) for the ratio
    sum_s / sum_y, or its log when log_scale is True:

      * "_p":  from the exact (un-noised) sums;
      * "_nc": from the noised sums with no variance correction;
      * "_m":  noised sums, variance inflated by a Monte Carlo estimate;
      * "_a":  noised sums, variance inflated analytically.

    NOTE: results depend on the exact order of the np.random draws below.

    Args:
        seed: offset added to 12345 to seed np.random.
        n: number of simulated records.
        u_w: upper clipping bound for the weights (lower bound is 1 / u_w).
        B: number of Monte Carlo noise draws for the "_m" correction.
        epsilon: total privacy budget; divided evenly across the noised sums.
        delta: total delta; divided evenly across the noised sums.
        true: true ratio used to generate y and to score the intervals.
        log_scale: build the intervals for log(ratio) instead of the ratio.
        mechanism: "gaussian" or "laplace".

    Returns:
        Tuple (n, epsilon, delta, u_w, effective_n,
        length_p, covered_p, interval_score_p,
        length_nc, covered_nc, interval_score_nc,
        length_m, covered_m, interval_score_m,
        length_a, covered_a, interval_score_a, bias).

    Raises:
        ValueError: if mechanism is neither "gaussian" nor "laplace".
    """
    np.random.seed(seed + 12345)
    s = np.random.beta(2, 2, size=n)
    y = np.random.binomial(n=1, p=s/true, size=n)
    l_w = 1 / u_w
    # With u_w == 1 the clip range collapses to [1, 1], i.e. unit weights.
    w = np.random.exponential(1., size=n).clip(l_w, u_w)

    if log_scale:
        # From here on `true` is the target on the log scale.
        true = np.log(true)
        maybe_log = np.log
        get_ci = functools.partial(get_ci_maybe_bound_by_0, bound_by_0=False)
    else:
        maybe_log = lambda x: x
        # ratio is non-negative
        get_ci = functools.partial(get_ci_maybe_bound_by_0, bound_by_0=True)

    if mechanism == "gaussian":
        gen_noise = np.random.normal
    elif mechanism == "laplace":
        gen_noise = np.random.laplace
    else:
        raise ValueError("Only gaussian and laplace mechanisms are supported")

    # public version
    sum_s = s @ w
    sum_s2 = (s ** 2) @ w
    # y is binary, so y ** 2 == y and both sums coincide.
    sum_y = sum_y2 = y @ w
    sum_ys = (y * s) @ w
    sum_w = w.sum()
    sum_w2 = (w ** 2).sum()
    m_p, v_p = from_sums(
        sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2, log_scale=log_scale
    )
    ci_p = get_ci(m_p, v_p)
    length_p = ci_length(ci_p)
    covered_p = is_covered(ci_p, true)
    interval_score_p = interval_score(ci_p, true)

    # DP: add  noise
    # The budget is split evenly over the sums that receive noise: five
    # (s, s2, y, ys, w) with unit weights, six (plus w2) otherwise.
    if u_w == 1.0:
        get_scale = functools.partial(
            get_noise_scale, epsilon=epsilon / 5,
            delta = delta / 5, mechanism=mechanism)
    else:
        get_scale = functools.partial(
            get_noise_scale, epsilon=epsilon / 6,
            delta = delta / 6, mechanism=mechanism)
    get_variance = functools.partial(
        get_variance_from_scale, mechanism=mechanism)

    # u_w is passed as the sensitivity Delta: s lies in [0, 1], y in {0, 1}
    # and each weight is at most u_w.
    sum_s += gen_noise(0, get_scale(u_w))
    sum_s2 += gen_noise(0, get_scale(u_w))
    sum_y += gen_noise(0, get_scale(u_w))
    # Reuse the noised sum_y rather than noising y2 separately.
    sum_y2 = sum_y
    sum_ys += gen_noise(0, get_scale(u_w))
    sum_w += gen_noise(0, get_scale(u_w))
    if u_w == 1.0:
        # Unit weights: w ** 2 == w, so the noised sum_w is reused.
        sum_w2 = sum_w
    else:
        # Squared weights are at most u_w ** 2.
        sum_w2 += gen_noise(0, get_scale(u_w ** 2))
    # no correction
    m, v_nc = from_sums(
        sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2, log_scale=log_scale
    )
    # Error of the noised point estimate (on the log scale when log_scale).
    bias = m - true
    # Computed from the raw weights, not from the noised sums.
    effective_n = w.sum() ** 2 / (w ** 2).sum()
    ci_nc = get_ci(m, v_nc)
    length_nc = ci_length(ci_nc)
    covered_nc = is_covered(ci_nc, true)
    interval_score_nc = interval_score(ci_nc, true)

    # monte carlo correction
    # Add B fresh noise draws to the already-noised sums and take the mean
    # squared deviation of the resulting (log-)ratio from m.
    v_extra = ((maybe_log(
        (sum_s + gen_noise(0, get_scale(u_w), size=B)) /
        (sum_y + gen_noise(0, get_scale(u_w), size=B))
        ) - m
    ) ** 2).mean()

    ci_m = get_ci(m, v_nc + v_extra)
    length_m = ci_length(ci_m)
    covered_m = is_covered(ci_m, true)
    interval_score_m = interval_score(ci_m, true)

    # analytical correction
    m_s, m_y, v_s, v_y, v_ys = moments_from_sums(sum_s, sum_s2, sum_y, sum_y2,
        sum_ys, sum_w, sum_w2)
    # switch from the moments of means to the moments of sums,
    # which doesn't change the ratio estimate and its standard error
    # The noise variance is added to the two variances only; the
    # covariance term gets no extra contribution.
    _, v_a = from_moments(
        m_s * sum_w,
        m_y * sum_w,
        v_s * sum_w ** 2 + get_variance(get_scale(u_w)),
        v_y * sum_w ** 2 + get_variance(get_scale(u_w)),
        v_ys * sum_w ** 2, log_scale=log_scale
    )
    ci_a = get_ci(m, v_a)
    length_a = ci_length(ci_a)
    covered_a = is_covered(ci_a, true)
    interval_score_a = interval_score(ci_a, true)

    return (
        n, epsilon, delta,
        u_w, effective_n,
        length_p, covered_p, interval_score_p,
        length_nc, covered_nc, interval_score_nc,
        length_m, covered_m, interval_score_m,
        length_a, covered_a, interval_score_a,
        bias
    )

def sim(
    show_styled_result=True,
    show_latex_result=True,
    **kwargs):
    """Runs the full simulation grid and reports the averaged metrics.

    Calls one_run_dp_sums for 1,000 seeds crossed with u_w in {1, 3},
    n in {5,000, 10,000} and epsilon in {0.2, 0.5, 1, 4}, then averages
    every metric within each (n, u_w, epsilon, delta) cell.

    Args:
        show_styled_result: when True, display the averaged table as a
            styled DataFrame (uses `display`, which this file does not
            import -- presumably the IPython/Colab builtin).
        show_latex_result: when True, print the table as LaTeX.
        **kwargs: forwarded to one_run_dp_sums (e.g. delta, B, log_scale,
            mechanism). Must not contain seed, n, u_w or epsilon, which
            are set by the grid.

    Returns:
        None; results are displayed and/or printed.
    """
    res = (
        pd.DataFrame.from_records(
            (one_run_dp_sums(seed, n=n, u_w=u_w, epsilon=epsilon, **kwargs)
            for seed in range(1_000)
            for u_w in [1, 3]
            for n in [5_000,10_000]
            for epsilon in [0.2, 0.5, 1., 4.0]
            ),
            columns = [
                "n", "epsilon", "delta", "u_w", "effective_n",
                "length_p", "coverage_p", "interval_score_p",
                "length_nc", "coverage_nc", "interval_score_nc",
                "length_m", "coverage_m", "interval_score_m",
                "length_a", "coverage_a", "interval_score_a",
                "bias"]
            )
        .groupby(["n", "u_w", "epsilon", "delta", ], as_index=False).mean()
    )
    if show_styled_result:
        styler = res.style
        # Styler.hide_index() was deprecated in pandas 1.4 and removed in
        # pandas 2.0; Styler.hide(axis="index") is its replacement. Fall
        # back to the old method only on pandas versions that lack hide().
        if hasattr(styler, "hide"):
            styler = styler.hide(axis="index")
        else:
            styler = styler.hide_index()
        display(
            styler
            .format({
                "n": "{:,}",
                "epsilon": "{:.1f}", "effective_n": "{:,.0f}",
                "length_p": "{:.3f}", "coverage_p": "{:.3f}", "interval_score_p": "{:.3f}",
                "length_nc": "{:.3f}", "coverage_nc": "{:.3f}", "interval_score_nc": "{:.3f}",
                "length_m": "{:.3f}", "coverage_m": "{:.3f}", "interval_score_m": "{:.3f}",
                "length_a": "{:.3f}", "coverage_a": "{:.3f}", "interval_score_a": "{:.3f}",
                "bias": "{:.3f}"
            })
            .bar(subset=["coverage_p", "coverage_nc", "coverage_m", "coverage_a"],
                 color="lightgreen", vmin=0)
            .set_table_attributes("class='dataframe'")
        )
    if show_latex_result:
        cols = [
            "u_w", "effective_n",
            "n", "epsilon",
            "length_p", "coverage_p", "interval_score_p",
            "length_nc", "coverage_nc", "interval_score_nc",
            "length_m", "coverage_m", "interval_score_m",
            "length_a", "coverage_a", "interval_score_a",
        ]
        print(res[cols].round(3).to_latex())

## Gaussian mechanism

In [None]:
#@title ratio scale {form-width:"20%"}
# Gaussian mechanism; log_scale keeps its default, so the intervals are
# built on the ratio scale.
sim(mechanism="gaussian", delta=1e-6, B=200)

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
5000,1,0.2,1e-06,5000,0.061,0.951,0.073,0.06,0.231,2.074,0.37,0.945,0.452,0.367,0.943,0.451,0.003
5000,1,0.5,1e-06,5000,0.061,0.951,0.073,0.061,0.538,0.489,0.156,0.952,0.191,0.156,0.946,0.192,0.0
5000,1,1.0,1e-06,5000,0.061,0.951,0.073,0.061,0.782,0.158,0.094,0.948,0.115,0.094,0.95,0.116,0.0
5000,1,4.0,1e-06,5000,0.061,0.951,0.073,0.061,0.935,0.077,0.064,0.943,0.076,0.064,0.942,0.076,0.0
5000,3,0.2,1e-06,3082,0.078,0.949,0.093,0.09,0.076,10.296,8.783,0.949,9.04,1.532,0.939,1.852,0.071
5000,3,0.5,1e-06,3082,0.078,0.949,0.093,0.076,0.205,3.19,0.549,0.946,0.659,0.535,0.94,0.652,0.009
5000,3,1.0,1e-06,3082,0.078,0.949,0.093,0.077,0.398,1.127,0.274,0.941,0.332,0.272,0.94,0.333,0.002
5000,3,4.0,1e-06,3082,0.078,0.949,0.093,0.078,0.867,0.139,0.101,0.951,0.121,0.101,0.949,0.121,0.001
10000,1,0.2,1e-06,10000,0.043,0.949,0.05,0.043,0.354,0.826,0.185,0.956,0.217,0.185,0.954,0.215,-0.001
10000,1,0.5,1e-06,10000,0.043,0.949,0.05,0.043,0.699,0.178,0.084,0.952,0.096,0.084,0.946,0.095,-0.001


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &     5000.000 &   5000 &      0.2 &     0.061 &       0.951 &             0.073 &      0.060 &        0.231 &              2.074 &     0.370 &       0.945 &             0.452 &     0.367 &       0.943 &             0.451 \\
1  &    1 &     5000.000 &   5000 &      0.5 &     0.061 &       0.951 &             0.073 &      0.061 &        0.538 &              0.489 &     0.156 &       0.952 &             0.191 &     0.156 &       0.946 &             0.192 \\
2  &    1 &     5000.000 &   5000 &      1.0 &     0.061 &       0.951 &             0.073 &      0.061 &        0.782 &              0.158 &     0.094 &       0.948 &             0.115 &     0.094 &       0.950 &           

In [None]:
#@title log_scale results {form-width: "20%"}
# Gaussian mechanism with the intervals built on the log-ratio scale.
sim(mechanism="gaussian", log_scale=True, delta=1e-6, B=200)

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
5000,1,0.2,1e-06,5000,0.055,0.953,0.066,0.055,0.232,1.882,0.333,0.944,0.405,0.332,0.941,0.406,-0.001
5000,1,0.5,1e-06,5000,0.055,0.953,0.066,0.055,0.535,0.445,0.142,0.95,0.173,0.142,0.95,0.174,-0.0
5000,1,1.0,1e-06,5000,0.055,0.953,0.066,0.055,0.783,0.143,0.086,0.95,0.105,0.086,0.952,0.105,-0.0
5000,1,4.0,1e-06,5000,0.055,0.953,0.066,0.055,0.937,0.07,0.058,0.944,0.069,0.058,0.944,0.069,0.0
5000,3,0.2,1e-06,3082,0.071,0.948,0.085,0.077,0.075,8.864,1.34,0.915,1.367,1.268,0.975,1.349,0.002
5000,3,0.5,1e-06,3082,0.071,0.948,0.085,0.069,0.206,2.884,0.486,0.943,0.579,0.482,0.942,0.582,-0.0
5000,3,1.0,1e-06,3082,0.071,0.948,0.085,0.07,0.395,1.023,0.247,0.943,0.3,0.247,0.943,0.301,0.0
5000,3,4.0,1e-06,3082,0.071,0.948,0.085,0.071,0.864,0.126,0.092,0.952,0.11,0.092,0.951,0.111,0.001
10000,1,0.2,1e-06,10000,0.039,0.948,0.046,0.039,0.355,0.752,0.168,0.951,0.196,0.168,0.956,0.194,-0.002
10000,1,0.5,1e-06,10000,0.039,0.948,0.046,0.039,0.701,0.161,0.076,0.947,0.087,0.076,0.949,0.086,-0.001


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &     5000.000 &   5000 &      0.2 &     0.055 &       0.953 &             0.066 &      0.055 &        0.232 &              1.882 &     0.333 &       0.944 &             0.405 &     0.332 &       0.941 &             0.406 \\
1  &    1 &     5000.000 &   5000 &      0.5 &     0.055 &       0.953 &             0.066 &      0.055 &        0.535 &              0.445 &     0.142 &       0.950 &             0.173 &     0.142 &       0.950 &             0.174 \\
2  &    1 &     5000.000 &   5000 &      1.0 &     0.055 &       0.953 &             0.066 &      0.055 &        0.783 &              0.143 &     0.086 &       0.950 &             0.105 &     0.086 &       0.952 &           

## Laplace mechanism

In [None]:
#@title ratio scale {form-width:"20%"}
# Laplace mechanism; log_scale keeps its default, so the intervals are
# built on the ratio scale. delta is still passed but the Laplace noise
# scale does not depend on it.
sim(mechanism="laplace", delta=1e-6, B=200)

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
5000,1,0.2,1e-06,5000,0.061,0.951,0.073,0.061,0.73,0.238,0.109,0.937,0.149,0.109,0.94,0.147,0.0
5000,1,0.5,1e-06,5000,0.061,0.951,0.073,0.061,0.896,0.094,0.071,0.948,0.09,0.071,0.946,0.089,0.0
5000,1,1.0,1e-06,5000,0.061,0.951,0.073,0.061,0.936,0.078,0.064,0.947,0.077,0.064,0.947,0.077,0.0
5000,1,4.0,1e-06,5000,0.061,0.951,0.073,0.061,0.949,0.073,0.061,0.95,0.073,0.061,0.95,0.073,0.0
5000,3,0.2,1e-06,3082,0.078,0.949,0.093,0.077,0.416,1.505,0.344,0.936,0.473,0.339,0.938,0.468,0.004
5000,3,0.5,1e-06,3082,0.078,0.949,0.093,0.078,0.699,0.362,0.152,0.942,0.203,0.152,0.941,0.202,0.002
5000,3,1.0,1e-06,3082,0.078,0.949,0.093,0.078,0.853,0.15,0.102,0.938,0.127,0.102,0.939,0.127,0.001
5000,3,4.0,1e-06,3082,0.078,0.949,0.093,0.078,0.944,0.096,0.08,0.948,0.096,0.08,0.949,0.096,0.001
10000,1,0.2,1e-06,10000,0.043,0.949,0.05,0.043,0.829,0.098,0.063,0.951,0.077,0.063,0.947,0.078,0.0
10000,1,0.5,1e-06,10000,0.043,0.949,0.05,0.043,0.934,0.056,0.047,0.955,0.055,0.047,0.955,0.055,0.0


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &     5000.000 &   5000 &      0.2 &     0.061 &       0.951 &             0.073 &      0.061 &        0.730 &              0.238 &     0.109 &       0.937 &             0.149 &     0.109 &       0.940 &             0.147 \\
1  &    1 &     5000.000 &   5000 &      0.5 &     0.061 &       0.951 &             0.073 &      0.061 &        0.896 &              0.094 &     0.071 &       0.948 &             0.090 &     0.071 &       0.946 &             0.089 \\
2  &    1 &     5000.000 &   5000 &      1.0 &     0.061 &       0.951 &             0.073 &      0.061 &        0.936 &              0.078 &     0.064 &       0.947 &             0.077 &     0.064 &       0.947 &           

In [None]:
#@title log_scale results {form-width: "20%"}
# Laplace mechanism with the intervals built on the log-ratio scale.
sim(mechanism="laplace", log_scale=True, delta=1e-6, B=200)

n,u_w,epsilon,delta,effective_n,length_p,coverage_p,interval_score_p,length_nc,coverage_nc,interval_score_nc,length_m,coverage_m,interval_score_m,length_a,coverage_a,interval_score_a,bias
5000,1,0.2,1e-06,5000,0.055,0.953,0.066,0.055,0.73,0.217,0.099,0.938,0.135,0.099,0.943,0.134,0.0
5000,1,0.5,1e-06,5000,0.055,0.953,0.066,0.055,0.902,0.086,0.064,0.946,0.081,0.065,0.946,0.081,0.0
5000,1,1.0,1e-06,5000,0.055,0.953,0.066,0.055,0.936,0.071,0.058,0.95,0.07,0.058,0.949,0.07,0.0
5000,1,4.0,1e-06,5000,0.055,0.953,0.066,0.055,0.948,0.067,0.056,0.951,0.067,0.056,0.95,0.067,0.0
5000,3,0.2,1e-06,3082,0.071,0.948,0.085,0.07,0.412,1.365,0.308,0.933,0.432,0.307,0.937,0.426,0.001
5000,3,0.5,1e-06,3082,0.071,0.948,0.085,0.071,0.696,0.329,0.138,0.941,0.184,0.138,0.944,0.183,0.001
5000,3,1.0,1e-06,3082,0.071,0.948,0.085,0.071,0.854,0.136,0.092,0.938,0.116,0.092,0.938,0.116,0.001
5000,3,4.0,1e-06,3082,0.071,0.948,0.085,0.071,0.94,0.087,0.072,0.947,0.087,0.072,0.947,0.087,0.001
10000,1,0.2,1e-06,10000,0.039,0.948,0.046,0.039,0.83,0.089,0.057,0.947,0.07,0.057,0.946,0.071,0.0
10000,1,0.5,1e-06,10000,0.039,0.948,0.046,0.039,0.933,0.051,0.043,0.953,0.05,0.043,0.953,0.05,-0.0


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  u\_w &  effective\_n &      n &  epsilon &  length\_p &  coverage\_p &  interval\_score\_p &  length\_nc &  coverage\_nc &  interval\_score\_nc &  length\_m &  coverage\_m &  interval\_score\_m &  length\_a &  coverage\_a &  interval\_score\_a \\
\midrule
0  &    1 &     5000.000 &   5000 &      0.2 &     0.055 &       0.953 &             0.066 &      0.055 &        0.730 &              0.217 &     0.099 &       0.938 &             0.135 &     0.099 &       0.943 &             0.134 \\
1  &    1 &     5000.000 &   5000 &      0.5 &     0.055 &       0.953 &             0.066 &      0.055 &        0.902 &              0.086 &     0.064 &       0.946 &             0.081 &     0.065 &       0.946 &             0.081 \\
2  &    1 &     5000.000 &   5000 &      1.0 &     0.055 &       0.953 &             0.066 &      0.055 &        0.936 &              0.071 &     0.058 &       0.950 &             0.070 &     0.058 &       0.949 &           

In [None]:
# Compare the noise variance each mechanism needs for the same budget.
# The scale -> variance conversion goes through get_variance_from_scale so
# it stays consistent with what the simulation itself uses.
_variances = {
    mechanism: get_variance_from_scale(
        get_noise_scale(Delta=1, epsilon=0.2, delta=1e-6, mechanism=mechanism),
        mechanism)
    for mechanism in ("gaussian", "laplace")
}
print(f"""
For the same epsilon=0.2 and sensitivity=1, the variance of noise
for gaussian: {_variances["gaussian"]:.3f}
for laplace: {_variances["laplace"]:.3f}
""")


For the same epsilon=0.2 and sensitivity=1, the variance of noise
for gaussian: 701.933
for laplace: 50.000

