### Make datasets to analyze

Each dataset should have a training set `train.pkl` and a validation set `val.pkl`. If there is noise involved, there should additionally be noiseless copies `noiseless_train.pkl` and `noiseless_val.pkl`.

This notebook serves as a manifest for reproducing precisely the datasets in this directory. Data access notes:
 - The data directory name is how you will access this data using `dataloader.py` 

In [1]:
# autoreload magic
%load_ext autoreload
%autoreload 2

In [4]:
import pickle
import os 
from mindreadingautobots.sequence_generators import make_datasets, data_io
import numpy as np

### Hamilton path datasets

In [5]:
# Sanity check on a small noiseless Hamilton dataset (k=6, m=4, 30 bits).
out = make_datasets.k_choose_m_hamilton_forecast_dataset(
    k=6, m=4, n_data=300, n_bits=30, p_bitflip=0, seed=1237
)
X, Z, idx = out

# Print every row whose first 6 bits match those of the first row, so the
# remaining bits of those rows can be compared by eye.
prefix = X[0, :6]
same_prefix = np.all(X[:, :6] == prefix, axis=1)
for x in X[same_prefix]:
    print(x)

# Third element returned by the generator (printed as an index array).
print(idx)

[0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0]
[0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0]
[0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0]
[0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0]
[0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0]
[0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0]
[0 1 2 5]


In [63]:
# Settings for the bitflip datasets generated in this cell.
# n_bits, n_train, the bitflip percentage and seed all appear in the output directory name.
n_val = 10000 # number of validation examples
seed = 1234 # passed unchanged to every generator call
n_train = 2000 # number of training examples (the first n_train rows generated)
n_bits = 16 # number of TOTAL bits
# Bitflip probabilities: one dataset directory is written per entry.
# Previously used values:
# p_bitflips = [0, 0.05, 0.1]
p_bitflips = [0.2]

def hamilton_6_choose_4(n_data, n_bits, p_bitflip, seed):
    """Call the k-choose-m Hamilton forecast generator with k=6 and m=4 fixed.

    Adapter that exposes the (n_data, n_bits, p_bitflip, seed) call shape used
    for the entries of `generators`. All four arguments are forwarded as-is and
    the generator's return value is passed back unchanged.
    """
    fixed = dict(k=6, m=4)
    return make_datasets.k_choose_m_hamilton_forecast_dataset(
        n_data=n_data,
        n_bits=n_bits,
        p_bitflip=p_bitflip,
        seed=seed,
        **fixed,
    )

# Maps a dataset name (used as the prefix of the output directory name) to the
# function that generates it. Each entry is called below as
# generator(n_data, n_bits, p_bitflip, seed). Commented-out entries are
# generators that are not run by this cell.
generators = {
    # "parity_4lookback": make_datasets.parity_4lookback,
    # "not_majority_4lookback": make_datasets.not_majority_4lookback,
    # "sparse_parity_k4": make_datasets.sparity_k4,
    "hamilton_6_choose_4": hamilton_6_choose_4
}

# Write one dataset directory per (bitflip probability, generator) pair.
for p_bitflip in p_bitflips:
    # Percent label used in the directory name. Round before converting:
    # products such as 0.29 * 100 evaluate to 28.999999999999996, which a bare
    # int() would truncate to 28 and mislabel the dataset.
    p100 = int(round(p_bitflip * 100))
    suffix = f"_nbits{n_bits}_n{n_train}_bf{p100}_seed{seed}"

    for gen_name, generator in generators.items():
        dirname = gen_name + suffix
        print(f"Generating {dirname} with p_bitflip={p_bitflip}")
        # Generators listed here return a third value (idx) besides X and Z.
        # If your dataset has a hidden subset, update this list:
        if gen_name in ["sparse_parity_k4", "hamilton_6_choose_4"]:
            X, Z, idx = generator(n_train + n_val, n_bits, p_bitflip, seed)
            # idx is only printed, not written to disk.
            print("idx for sparse parity: save these:", idx)
        else:
            X, Z = generator(n_train + n_val, n_bits, p_bitflip, seed)

        # With no bitflips, X itself is written as the train/val data.
        if p_bitflip == 0:
            Z = X
        # The first n_train rows form the training set, the rest validation.
        Z_train = Z[:n_train]
        Z_val = Z[n_train:]

        # Create the data directory if needed; exist_ok avoids the separate
        # exists-then-create check.
        os.makedirs(dirname, exist_ok=True)

        train_path = f"{dirname}/train.pkl"
        val_path = f"{dirname}/val.pkl"
        data_io.save_numpy_as_dict(Z_train, train_path)
        data_io.save_numpy_as_dict(Z_val, val_path)

        # Only when bitflips are applied is X also saved, as the noiseless
        # counterpart of the train/val files.
        if p_bitflip != 0:
            X_train = X[:n_train]
            X_val = X[n_train:]
            noiseless_train_path = f"{dirname}/noiseless_train.pkl"
            noiseless_val_path = f"{dirname}/noiseless_val.pkl"
            data_io.save_numpy_as_dict(X_train, noiseless_train_path)
            data_io.save_numpy_as_dict(X_val, noiseless_val_path)


Generating hamilton_6_choose_4_nbits16_n2000_bf20_seed1234 with p_bitflip=0.2
idx for sparse parity: save these: [1 2 3 5]


### k-lookback datasets

In [17]:
# Settings for the datasets generated in this cell.
# n_bits, n_train, the nondeterm percentage and seed all appear in the output directory name.
n_val = 10000 # number of validation examples
seed = 1234 
n_train = 5000 # number of training examples
n_bits = 10 # number of TOTAL bits (including final bit)

# Create a different dataset for every 'nondeterministic' value in this list
# Note that _sometimes_ this means bitflip rate, but not always
# (each value is passed as the third positional argument of the generator)
nondeterms = [0.0, 0.1, 0.2]

# We will create datasets for every entry in this dictionary of data generators 
# Data generating functions must all have signature (n_data, n_bits, p_bitflip, seed)
# and, as called below, must return three values (X, Z, idx).
# The key is used as the prefix of the output directory name.
generators = {
    # "parity_4lookback": make_datasets.parity_4lookback_nondeterministic,
    # "not_majority_4lookback": make_datasets.not_majority_4lookback_nondeterministic,
    "sparse_parity_k4": make_datasets.sparity_k4,
}

# Write one dataset directory per (nondeterm value, generator) pair.
for nondeterm in nondeterms:
    # Percent label used in the directory name. Round before converting:
    # products such as 0.29 * 100 evaluate to 28.999999999999996, which a bare
    # int() would truncate to 28 and mislabel the dataset.
    p100 = int(round(nondeterm * 100))
    suffix = f"_nbits{n_bits}_n{n_train}_bf{p100}_seed{seed}"

    for gen_name, generator in generators.items():
        dirname = gen_name + suffix
        print(f"Generating {dirname} with nondeterm={nondeterm}")
        # idx is returned by the generator but is neither printed nor saved here.
        X, Z, idx = generator(n_train + n_val, n_bits, nondeterm, seed)

        # With nondeterm == 0, X itself is written as the train/val data.
        if nondeterm == 0:
            Z = X
        # The first n_train rows form the training set, the rest validation.
        Z_train = Z[:n_train]
        Z_val = Z[n_train:]

        # Create the data directory if needed; exist_ok avoids the separate
        # exists-then-create check.
        os.makedirs(dirname, exist_ok=True)

        train_path = f"{dirname}/train.pkl"
        val_path = f"{dirname}/val.pkl"
        data_io.save_numpy_as_dict(Z_train, train_path)
        data_io.save_numpy_as_dict(Z_val, val_path)

        # whether or not there are bitflips, we will save a 'noiseless'
        # version of the data just for consistency
        X_train = X[:n_train]
        X_val = X[n_train:]
        noiseless_train_path = f"{dirname}/noiseless_train.pkl"
        noiseless_val_path = f"{dirname}/noiseless_val.pkl"
        data_io.save_numpy_as_dict(X_train, noiseless_train_path)
        data_io.save_numpy_as_dict(X_val, noiseless_val_path)


Generating sparse_parity_k4_nbits10_n5000_bf0_seed1234 with nondeterm=0.0
Generating sparse_parity_k4_nbits10_n5000_bf10_seed1234 with nondeterm=0.1
Generating sparse_parity_k4_nbits10_n5000_bf20_seed1234 with nondeterm=0.2
