### Make datasets to analyze

Each dataset should have a training set `train.pkl` and a validation set `val.pkl`. If there is noise involved, there should additionally be `noiseless_train.pkl` and `noiseless_val.pkl`, holding the noise-free versions of the same examples.

This notebook serves as a manifest for reproducing precisely the datasets in this directory. Data access notes:
 - The data directory name is how you will access this data using `dataloader.py` 

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import os 
from mindreadingautobots.sequence_generators import make_datasets, data_io

In [5]:
# Generate datasets at several bitflip-noise levels.
#
# For every (generator, p_bitflip) pair one directory is created, named
#   <gen_name>_nbits<n_bits>_n<n_train>_bf<percent>_seed<seed>
# containing:
#   train.pkl / val.pkl                      -- the observed sequences Z
#   noiseless_train.pkl / noiseless_val.pkl  -- the clean sequences X
#                                               (written only when p_bitflip != 0)
n_val = 10000 # number of validation examples
seed = 1234
n_train = 5000
n_bits = 10 # number of TOTAL bits
# variables
p_bitflips = [0, 0.05, 0.1]

generators = {
    # "parity_4lookback": make_datasets.parity_4lookback,
    # "not_majority_4lookback": make_datasets.not_majority_4lookback,
    "sparse_parity_k4": make_datasets.sparity_k4,
}

for p_bitflip in p_bitflips:
    # Use round(), not int(): products like 0.29 * 100 evaluate to
    # 28.999999999999996 in floating point, and int() would truncate that to
    # 28 and mislabel the directory. For 0, 0.05 and 0.1 the result is unchanged.
    p100 = round(p_bitflip * 100)
    suffix = f"_nbits{n_bits}_n{n_train}_bf{p100}_seed{seed}"

    for gen_name, generator in generators.items():
        dirname = gen_name + suffix
        print(f"Generating {dirname} with p_bitflip={p_bitflip}")
        # A single call produces train + val examples; they are split below.
        if gen_name == "sparse_parity_k4":
            # The sparse-parity generator additionally returns `idx`; it is
            # printed so it can be recorded alongside the dataset.
            X, Z, idx = generator(n_train + n_val, n_bits, p_bitflip, seed)
            print("idx for sparse parity: save these:", idx)
        else:
            X, Z = generator(n_train + n_val, n_bits, p_bitflip, seed)

        # Without noise, the saved data is exactly the clean data.
        if p_bitflip == 0:
            Z = X
        Z_train = Z[:n_train]
        Z_val = Z[n_train:]

        # Create the data directory if needed (no error when it already exists).
        os.makedirs(dirname, exist_ok=True)

        train_path = f"{dirname}/train.pkl"
        val_path = f"{dirname}/val.pkl"
        data_io.save_numpy_as_dict(Z_train, train_path)
        data_io.save_numpy_as_dict(Z_val, val_path)

        # Keep the clean counterpart of the same examples when noise was applied.
        if p_bitflip != 0:
            X_train = X[:n_train]
            X_val = X[n_train:]
            noiseless_train_path = f"{dirname}/noiseless_train.pkl"
            noiseless_val_path = f"{dirname}/noiseless_val.pkl"
            data_io.save_numpy_as_dict(X_train, noiseless_train_path)
            data_io.save_numpy_as_dict(X_val, noiseless_val_path)


Generating sparse_parity_k4_nbits10_n5000_bf0_seed1234 with p_bitflip=0
idx for sparse parity: save these: [8 2 1 0]
Generating sparse_parity_k4_nbits10_n5000_bf5_seed1234 with p_bitflip=0.05
idx for sparse parity: save these: [8 2 1 0]
Generating sparse_parity_k4_nbits10_n5000_bf10_seed1234 with p_bitflip=0.1
idx for sparse parity: save these: [8 2 1 0]


In [31]:
# Generate datasets at several nondeterminism levels.
#
# For every (generator, nondeterm) pair one directory is created, named
#   <gen_name>_n<n_train>_nondeterm<percent>_seed<seed>
# containing train.pkl and val.pkl.
#
# NOTE(review): the output saved with this cell lists `_n100_` directories,
# i.e. it was produced with n_train = 100; re-run the cell to refresh it.
n_val = 10000 # number of validation examples
seed = 1234
n_train = 1000
n_bits = 40 # number of TOTAL bits
# variables
nondeterms = [0.05, 0.1]

generators = {
    "parity_4lookback": make_datasets.parity_4lookback_nondeterministic,
    "not_majority_4lookback": make_datasets.not_majority_4lookback_nondeterministic,
}

for nondeterm in nondeterms:
    # Use round(), not int(): products like 0.29 * 100 evaluate to
    # 28.999999999999996 in floating point, and int() would truncate that to
    # 28 and mislabel the directory. For 0.05 and 0.1 the result is unchanged.
    p100 = round(nondeterm * 100)
    suffix = f"_n{n_train}_nondeterm{p100}_seed{seed}"

    for gen_name, generator in generators.items():
        dirname = gen_name + suffix
        print(f"Generating {dirname} with nondeterm={nondeterm}")
        # A single call produces train + val examples; the generator's second
        # return value is not used here.
        X, _ = generator(n_train + n_val, n_bits, nondeterm, seed)
        X_train = X[:n_train]
        X_val = X[n_train:]

        # Create the data directory if needed (no error when it already exists).
        os.makedirs(dirname, exist_ok=True)

        train_path = f"{dirname}/train.pkl"
        val_path = f"{dirname}/val.pkl"
        data_io.save_numpy_as_dict(X_train, train_path)
        data_io.save_numpy_as_dict(X_val, val_path)

Generating parity_4lookback_n100_nondeterm5_seed1234 with nondeterm=0.05
Generating not_majority_4lookback_n100_nondeterm5_seed1234 with nondeterm=0.05
Generating parity_4lookback_n100_nondeterm10_seed1234 with nondeterm=0.1
Generating not_majority_4lookback_n100_nondeterm10_seed1234 with nondeterm=0.1


In [12]:
# Sanity-check saved datasets: for each directory, report the number of
# training examples and the length of the first example.
# (first written to inspect "./sparity40_1k/train.pkl")
# dir_names = ['./sparity40_1k', 'sparity40_5k', 'sparity40_25h', 'sparse_n_parity4a', 'sparse_parity4a']
dir_names = ['sparse_parity4a']

for dir_name in dir_names:
    print(dir_name)

    train_path = f'./{dir_name}/train.pkl'
    # Only referenced by the disabled validation check at the end of the loop.
    test_path = f'./{dir_name}/val.pkl'

    # pickle.load can execute arbitrary code; only open files generated locally.
    with open(train_path, 'rb') as f:
        train = pickle.load(f)

    # The examples are read from the 'line' entry of the pickled dict.
    xvals = train.get('line')
    n_data = len(xvals)
    n_bits = len(xvals[0])
    print(f"train: {n_data} {n_bits}")

    # Disabled: the same report for the validation split.
    # with open(test_path, 'rb') as f:
    #     test = pickle.load(f)
    # xvals = test.get('line')
    # n_data = len(xvals)
    # n_bits = len(xvals[0])
    # print("val:", n_data, n_bits)
    # print()

sparse_parity4a
train: 30000 40


In [21]:
	# test_path = os.path.join(data_path, config.dataset, 'test.tsv')
# Count the characters in one example bitstring (evaluates to 40).
sample_bits = '0010011001001111011011111011110111100010'
len(sample_bits)

40

In [None]:
# Build a vocabulary object from the sparity40_1k training file and display
# its `w2id` attribute (presumably the word-to-id mapping -- TODO confirm).
# NOTE(review): `Voc` is not imported in any cell visible in this notebook, so
# on a fresh kernel this cell raises NameError; import it from its defining
# module before running.
voc= Voc()
train_path = './sparity40_1k/train.pkl'
# The return value is kept in `z` but not used afterwards in this cell.
z = voc.create_vocab_dict(None, path= train_path, debug = None)
voc.w2id 
