In [3]:
import numpy as np
from itertools import product

In [None]:
# Default problem size (number of input bits) and bit-flip noise probability.
# Later cells assign their own values to both names.
k, p = 5, 0.1
def maj(b):
    """Majority vote over a bitstring.

    Returns 1 when strictly more than half of the entries of ``b`` are 1 and
    0 otherwise (so an exact tie on an even-length input yields 0).

    The threshold is taken from ``len(b)`` instead of a notebook-global
    variable, so the result does not depend on which cell last assigned the
    global problem size.
    """
    return 1 if np.sum(b) > len(b) / 2 else 0

def parity(b):
    """Return the parity of ``b``: the sum of its entries reduced modulo 2."""
    ones = np.sum(b)
    return np.mod(ones, 2)

def bool_func_from_signature(n, signature):
    """
    Build a boolean function on n bits from its signature.

    A signature of a boolean function is a length 2^n array that assigns
    a 0 or 1 to each input bitstring. The array is shape (2, )^n and the
    value at each position is 0 or 1.

    ``signature`` may be given either already shaped (2, )^n or as a flat
    length-2^n sequence; a flat sequence is read in row-major order, i.e.
    entry j belongs to the bitstring that spells j in binary (first bit most
    significant), which is the order ``itertools.product([0, 1], repeat=n)``
    produces.

    Returns a callable ``f(b)`` that maps a length-n bit sequence to the
    Python int 0 or 1 stored at that position.

    Raises ValueError if the signature does not contain exactly 2^n entries,
    or (from the returned callable) if ``b`` is not n bits of 0/1.
    """
    table = np.asarray(signature)
    if table.size != 2 ** n:
        raise ValueError(
            f"signature must have 2**{n} = {2 ** n} entries, got {table.size}"
        )
    # One axis of length 2 per input bit, so a bit tuple indexes it directly.
    table = table.reshape((2,) * n)

    def func(b):
        bits = tuple(int(v) for v in b)
        if len(bits) != n or any(v not in (0, 1) for v in bits):
            raise ValueError(f"expected {n} bits with values 0/1, got {b!r}")
        return int(table[bits])

    return func

### Find a counterexample

We want an example of data and noise such that MLD (for the noisy data) evaluated on noiseless data is _worse_ than MLD (for noiseless data) evaluated on noiseless data.

Suppose $f^*(z^{n-1}) = \operatorname*{arg\,max}_{x'} p_{X|Z^{n-1}}(x'|z^{n-1})$ is our MLD for noisy data and $g^*(x^{n-1}) = \operatorname*{arg\,max}_{x'} p_{X|X^{n-1}}(x'|x^{n-1})$ is our MLD for noiseless data. We build $f^*$ analytically, then compare it to $g^*$.

Our example will be the $k=3$ majority function (in which case $g^*$ is just majority).

In [5]:
import itertools


In [20]:
def s(f, x):
    """Compute the sensitivity of f at x.

    The sensitivity is the number of positions i such that flipping bit i of
    ``x`` (replacing it by ``1 - x[i]``) changes the value of ``f``.

    Parameters:
        f: callable taking a bit sequence and returning a comparable value.
        x: sequence of 0/1 values. Lists and numpy arrays are copied with
           their own ``copy()`` (so ``f`` sees the same container type);
           anything without ``copy()`` -- e.g. the tuples produced by
           ``itertools.product`` -- is copied into a list.

    ``x`` itself is never modified.
    """
    # f(x) does not depend on the loop variable, so evaluate it once.
    base_value = f(x)
    flips_that_matter = 0
    for i in range(len(x)):
        neighbour = x.copy() if hasattr(x, "copy") else list(x)
        neighbour[i] = 1 - neighbour[i]
        if f(neighbour) != base_value:
            flips_that_matter += 1
    return flips_that_matter

def average_sensitivity(f, X_arr):
    """Average the sensitivity of f over the points in X_arr.

    Parameters:
        f: callable on bit sequences, passed through to ``s``.
        X_arr: non-empty collection of input points (each a bit sequence).

    Returns the mean of ``s(f, x)`` over all ``x`` in ``X_arr``.

    Raises ValueError if ``X_arr`` is empty, since the mean is undefined.
    """
    num_points = len(X_arr)
    if num_points == 0:
        raise ValueError("average_sensitivity needs at least one input point")
    return sum(s(f, x) for x in X_arr) / num_points

In [71]:
# Sweep over every symmetric boolean function on n bits and compare the
# function f with fN*, the maximum-likelihood decoder of f(x) given a noisy
# observation z = x XOR e (each bit flipped independently with probability p).
n = 5
k = n  # number of observed (noisy) bits; kept equal to n in this cell
p = 0.2  # per-bit flip probability

# All 2^n inputs in binary-counting order (first bit most significant).
X_arr = np.array(list(itertools.product([0, 1], repeat=n)))


p_x = 1 / (2 ** k) # uniform distribution over x

# The probability of an error pattern depends only on the pattern, so compute
# it once. Pattern number e_idx is the bit tuple that spells e_idx in binary,
# hence its number of flips is the popcount of e_idx.
num_inputs = 2 ** k
noise_prob = []
for e_idx in range(num_inputs):
    flips = bin(e_idx).count("1")
    noise_prob.append(p ** flips * (1 - p) ** (k - flips))

# A symmetric function is described by its value on each Hamming weight
# 0..n, so H lists all 2^(n+1) such signatures.
H = np.array(list(itertools.product([0, 1], repeat=n+1)))
print("boolean signature | imbal? | Pr(fN* = f) | Sensitivity(f) | Sensitivity(fN*) | S(f) - S(fN*)")
print("--------------------------------------------------------------------------")
for signature in H:
    # Named so that it does not shadow the builtin hash(); bound as a default
    # argument so func keeps this iteration's table.
    weight_to_value = dict(zip(range(n + 1), signature))
    func = lambda b, table=weight_to_value: table[sum(b)]

    # noisy_lookup[v, z] accumulates P(z | x) over every x with f(x) = v.
    # Under the uniform prior on x this is P(f(X) = v | Z = z), so each
    # column sums to 1.
    noisy_lookup = np.zeros((2, 2**n))
    # true_lookup[v, x] is 1 exactly when f(x) = v (one-hot per column).
    true_lookup = np.zeros((2, 2**n))
    for x_idx, x in enumerate(product([0, 1], repeat=k)):
        func_value = func(x)
        true_lookup[func_value, x_idx] = 1
        for e_idx in range(num_inputs):
            # Inputs are enumerated in binary-counting order, so the column
            # of z = x XOR e is simply the XOR of the two column numbers.
            noisy_lookup[func_value, x_idx ^ e_idx] += noise_prob[e_idx]

    # the function is balanced if the sums of the two rows of true_lookup are equal
    imbal = abs(true_lookup[0,:].sum() - true_lookup[1,:].sum())  / 2 ** n

    # Decode by argmax once and derive both the one-hot decision table and
    # fN* from it. (Rounding the posterior is not an argmax: a value of
    # exactly 0.5 rounds to 0 in both rows, which would disagree with fN*.)
    decoded = np.argmax(noisy_lookup, axis=0)
    noisy_mle = np.zeros_like(noisy_lookup)
    noisy_mle[decoded, np.arange(2 ** n)] = 1
    out = np.multiply(noisy_mle, true_lookup) / 2 ** n # "inner product" of the functions
    diff = out.sum()

    fnstar_dct = {tuple(x): decoded[x_idx] for x_idx, x in enumerate(X_arr)}
    def fnstar(x):
        return fnstar_dct[tuple(x)]

    sensitivity_f = average_sensitivity(func, X_arr)
    sensitivity_fnstar = average_sensitivity(fnstar, X_arr)
    sensitivity_diff = sensitivity_f - sensitivity_fnstar
    print(f"{signature}     | {imbal:0.4f} |   {diff:1.4f}    |    {sensitivity_f:1.4f}      |    {sensitivity_fnstar:1.4f}     |  {sensitivity_diff:1.4f} ")
    if sensitivity_fnstar == 0:
        # Zero sensitivity means fN* is constant, so each row of the decision
        # table must be entirely 0 or entirely 1.
        topr = noisy_mle[0, :].sum()
        botr = noisy_mle[1, :].sum()
        assert (np.allclose(topr, 0) or np.allclose(topr, 1 << n))
        assert (np.allclose(botr, 0) or np.allclose(botr, 1 << n))

    


boolean signature | imbal? | Pr(fN* = f) | Sensitivity(f) | Sensitivity(fN*) | S(f) - S(fN*)
--------------------------------------------------------------------------
[0 0 0 0 0 0]     | 1.0000 |   1.0000    |    0.0000      |    0.0000     |  0.0000 
[0 0 0 0 0 1]     | 0.9375 |   0.9688    |    0.3125      |    0.0000     |  0.3125 
[0 0 0 0 1 0]     | 0.6875 |   0.8438    |    1.5625      |    0.0000     |  1.5625 
[0 0 0 0 1 1]     | 0.6250 |   0.8438    |    1.2500      |    0.3125     |  0.9375 
[0 0 0 1 0 0]     | 0.3750 |   0.6875    |    3.1250      |    0.0000     |  3.1250 
[0 0 0 1 0 1]     | 0.3125 |   0.6875    |    3.4375      |    0.3125     |  3.1250 
[0 0 0 1 1 0]     | 0.0625 |   0.9688    |    2.1875      |    1.8750     |  0.3125 
[0 0 0 1 1 1]     | 0.0000 |   1.0000    |    1.8750      |    1.8750     |  0.0000 
[0 0 1 0 0 0]     | 0.3750 |   0.6875    |    3.1250      |    0.0000     |  3.1250 
[0 0 1 0 0 1]     | 0.3125 |   0.6562    |    3.4375      |    0.00

In [24]:
# Decoded output for the first column of the posterior table left by the sweep cell.
noisy_lookup[:, 0].argmax()

np.int64(1)

In [5]:
# Show the decision table left in the kernel by the last iteration of the sweep cell.
# NOTE(review): the saved output below has 8 columns, which matches n = 3 rather
# than the n = 5 assigned in the sweep cell -- presumably stale; re-run to confirm.
print(noisy_mle)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]


In [None]:
# We will build the conditional p_{X|Z^{n-1}}(x|z^{n-1}) one at a time for each x and z^{n-1}
k = 5
p = 0.44

num_inputs = 2 ** k
p_x = 1 / num_inputs  # uniform prior over x; loop-invariant, so set once

# Probability of each error pattern. Pattern number e_idx is the bit tuple
# that spells e_idx in binary, so its number of flips is the popcount.
noise_prob = []
for e_idx in range(num_inputs):
    flips = bin(e_idx).count("1")
    noise_prob.append(p ** flips * (1 - p) ** (k - flips))

# noisy_lookup[v, z] accumulates P(z | x) over every x with func(x) = v;
# true_lookup[v, x] holds p_x where func(x) = v and 0 elsewhere.
noisy_lookup = np.zeros((2, 2**k))
true_lookup = np.zeros((2, 2**k))
func = parity
# simulate a noisy dataset essentially
for x_idx, x in enumerate(product([0,1], repeat=k)):
    func_value = func(x)
    true_lookup[func_value, x_idx] = p_x
    for e_idx in range(num_inputs):
        # Inputs are enumerated in binary-counting order, so the column of
        # z = x XOR e is the XOR of the two column numbers.
        noisy_lookup[func_value, x_idx ^ e_idx] += noise_prob[e_idx]


# Decode with an explicit argmax. Rounding the posterior is not an argmax:
# an exact 0.5 tie rounds to 0 in both rows and would count as always wrong.
decoded = np.argmax(noisy_lookup, axis=0)
noisy_decision = np.zeros_like(noisy_lookup)
noisy_decision[decoded, np.arange(num_inputs)] = 1
out = np.multiply(noisy_decision, true_lookup)
print(out)
print(out.sum())

[[0.01562539 0.         0.         0.01562539 0.         0.01562539
  0.01562539 0.         0.         0.01562539 0.01562539 0.
  0.01562539 0.         0.         0.01562539 0.         0.01562539
  0.01562539 0.         0.01562539 0.         0.         0.01562539
  0.01562539 0.         0.         0.01562539 0.         0.01562539
  0.01562539 0.        ]
 [0.         0.01562539 0.01562539 0.         0.01562539 0.
  0.         0.01562539 0.01562539 0.         0.         0.01562539
  0.         0.01562539 0.01562539 0.         0.01562539 0.
  0.         0.01562539 0.         0.01562539 0.01562539 0.
  0.         0.01562539 0.01562539 0.         0.01562539 0.
  0.         0.01562539]]
0.5000124416
[[0.03125 0.      0.      0.03125 0.      0.03125 0.03125 0.      0.
  0.03125 0.03125 0.      0.03125 0.      0.      0.03125 0.      0.03125
  0.03125 0.      0.03125 0.      0.      0.03125 0.03125 0.      0.
  0.03125 0.      0.03125 0.03125 0.     ]
 [0.      0.03125 0.03125 0.      0.03125

[[0.03125 0.      0.      0.03125 0.      0.03125 0.03125 0.      0.
  0.03125 0.03125 0.      0.03125 0.      0.      0.03125 0.      0.03125
  0.03125 0.      0.03125 0.      0.      0.03125 0.03125 0.      0.
  0.03125 0.      0.03125 0.03125 0.     ]
 [0.      0.03125 0.03125 0.      0.03125 0.      0.      0.03125 0.03125
  0.      0.      0.03125 0.      0.03125 0.03125 0.      0.03125 0.
  0.      0.03125 0.      0.03125 0.03125 0.      0.      0.03125 0.03125
  0.      0.03125 0.      0.      0.03125]]
1.0
