In [2]:
import numpy as np
import os
from mldec import bit_tools
from mldec.pipelines.dataloader import save_numpy_as_dict

In [3]:
# DATASET PARAMETERS
experiment_name = 'repetition_code_v1'

In [4]:
# Make the repetition code dataset where the first half and last half of bits have different error rates

# tl;dr: make sure the number of bits is less than twice the number printed below.

# goodness check: for the v1 noise model, as long as the number of bitflips
# is less than this limit, then the weight-ordering of bitstrings matches 
# the likelihood ordering. In other words, if you generate all strings with num_bitflips
# k > lim, at least one weight-k bitstring will have lower prob. than some length-(k+1) bitstring
# ASSUMING k < n/2. The violation occurs when all k bitflips happen on the last n/2 bits, and
# k+1 bitflips happen in the first n/2 bits. if k >= n/2, then we are fine.

def bitflips_upper_limit_v1(p1, p2):
    return np.log2( (1-p1)/p1) / (np.log2(p1/p2) *(1-p2)/(1-p1) ) 
p1 = 0.1
p2 = 0.07
print(bitflips_upper_limit_v1(p1, p2))

5.961580428619441


In [None]:
def repetition_pcm(n):
    out = []
    for i in range(n-1):
        out.append([0]*i + [1, 1] + [0]*(n-i-2))
    return np.array(out, dtype=np.uint8)


def sample_bitstring_v1(n, p1, p2, n_data):
    """Sample bitstrings from the biased bitflip model v1.
    
    Args:
        n: number of bits
        p1, p2: bitflip probabilities for the first and second half of the bits
        n_data: number of samples to generate
    
    Returns:
        (n_data, n) array of bitstrings
    """
    assert n % 2 == 0
    bitstrings = np.random.rand(n_data, n) < np.concatenate([p1*np.ones(n//2), p2*np.ones(n//2)])
    return bitstrings

def bitstring_prob_v1(s, n, p1, p2, permute=None):
    """Create a noise model where the first n//2 bits have prob. p1 of flipping, the last n//2 have prob. p2.
    
    ARgs:
        s: (n_data, n) array of bitstrings.
    Warning: if the difference in bias is too much, the weight-ordering of bitstrings 
     is no longer the same as likelihood ordering. make sure to check the bitflips_upper_limit_v1
    """
    if permute is not None:
        s = s[:,permute]
    p_first = np.prod(p1*s[:,:n//2] +(1-p1)*(1-s[:,:n//2]), axis=1)
    p_second = np.prod(p2*s[:,n//2:] + (1-p2)*(1-s[:,n//2:]), axis=1)
    return np.multiply(p_first, p_second)

def calculator(p1, p2):
    num = p1 ** 2 * (1-p2)**2
    denom = p2 ** 2 * (1-p1)**2
    return num / denom

calculator(0.1, 0.07)

In [14]:
# simulation parameters
n = 8
p1 = 0.1
p2 = 0.07
p1txt = str(p1).replace('.', '')
p2txt = str(p2).replace('.', '')
n_train_vals = np.logspace(10, 17, 9, base=2).astype(int)
n_train_vals = [10000]

In [None]:

for n_train in n_train_vals:
    dirname = f'n{n}_N{n_train}_p1{p1txt}_p2{p2txt}'
    path = os.path.join(experiment_name, dirname)
    # Check if the data directory exists, if not create it
    if not os.path.exists(path):
        os.makedirs(path)

    Y_train = sample_bitstring_v1(n, p1, p2, n_train).astype(int)
    X_train = (Y_train @ repetition_pcm(n).T % 2).astype(int)
    np.save(f"{path}/X_train.npy", X_train)
    np.save(f"{path}/Y_train.npy", Y_train)

    Y_test = bit_tools.binarr(n)
    X_test = (Y_test @ repetition_pcm(n).T % 2).astype(int)
    weights_test = bitstring_prob_v1(Y_test, n, p1, p2)
    np.save(f"{path}/X_test.npy", X_test)
    np.save(f"{path}/Y_test.npy", Y_test)
    np.save(f"{path}/weights.npy", weights_test)

    # train_path = f"{path}/train.pkl"
    # val_path = f"{path}/test.pkl"
    # save_numpy_as_dict(X_train, train_path)
    # save_numpy_as_dict(Y_train, val_path)


### Grokking dataset

This dataset will contain only good examples, and about 90% of the necessary good examples. Since we plan to train until it learns optimal decoding, we won't weight the data and just provide a list of bitstrings

In [15]:
# The dataset has all bitstrings of <weight 4, and only ~half of the "good" bitstrings that have exactly weight 4
#  (where good bitstrings are themselves the bitstrings that are more likely than their logical negation)
wt_4_bitstrings = []
Y_train = []
for b in bit_tools.binarr(8):
    """For n = 8"""
    tot_count = 8
    if sum(b) < 4:
        Y_train.append(b)
    elif sum(b) > 4:
        break
    elif sum(b) == 4:
        if sum(b[:4]) == 3:
            wt_4_bitstrings.append(b)
        elif sum(b[:4]) == 4:
            Y_train.append(b)
# shuffle the weight 4 bitstrings
np.random.shuffle(wt_4_bitstrings)
# select just 8 of them to add to Y_train
Y_train += wt_4_bitstrings[:8]
Y_train = np.array(Y_train)
X_train = (Y_train @ repetition_pcm(8).T % 2).astype(int)
N = len(Y_train)
dirname = f'grok_n{n}_N{N}_p1{p1txt}_p2{p2txt}'
path = os.path.join(experiment_name, dirname)
if not os.path.exists(path):
    os.makedirs(path)
np.save(f"{path}/X_train.npy", X_train)
np.save(f"{path}/Y_train.npy", Y_train)

Y_test = bit_tools.binarr(n)
X_test = (Y_test @ repetition_pcm(n).T % 2).astype(int)
weights_test = bitstring_prob_v1(Y_test, n, p1, p2)
np.save(f"{path}/X_test.npy", X_test)
np.save(f"{path}/Y_test.npy", Y_test)
np.save(f"{path}/weights.npy", weights_test)