In [9]:
import numpy as np
import torch
import os
import pickle

In [10]:
# Cipher alphabet: the 26 lowercase ASCII letters 'a'..'z', in order.
ALPHABET = [chr(i) for i in range(ord('a'), ord('z') + 1)]
# The two separator characters used alongside the letters.
SEP_BAR, SEP_Q = '|', '?'
batch_size = 64      # rows per batch
block_size = 2048    # tokens per row
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
# Follow the detected backend rather than hard-coding 'cuda', so tensors can
# still be moved with `.to(device)` on a machine without a GPU.
device = device_type

In [29]:
# poor man's data loader: everything for a dataset lives under data/<dataset>/
dataset = 'openwebtext'
data_dir = os.path.join('data', dataset)

# meta.pkl carries the vocabulary: `stoi` (character -> token id),
# `itos` (token id -> character) and the vocabulary size.
# NOTE: pickle.load executes arbitrary code; only point this at a trusted file.
meta_path = os.path.join(data_dir, 'meta.pkl')
with open(meta_path, 'rb') as meta_file:
    meta = pickle.load(meta_file)

stoi = meta['stoi']
itos = meta['itos']
vocab_size = meta['vocab_size']
print(f"Using vocab size of {vocab_size} (a-z + separators)")

# ---------------- helper: random mono-alphabetic key --------
# Token ids of the letters 'a'..'z', in alphabet order.
alpha_ids = np.array([stoi[letter] for letter in ALPHABET], dtype=np.uint8)


def random_key():
    """Sample a random mono-alphabetic substitution over the letter token ids.

    A fresh permutation of the 26 letters is drawn from numpy's global RNG on
    every call.

    Returns:
        tuple: ``(enc, dec)``, two dicts keyed by token id. ``enc`` maps a
        plaintext letter id to its cipher letter id; ``dec`` is the inverse
        mapping (cipher id -> plaintext id).
    """
    shuffled_ids = alpha_ids[np.random.permutation(26)]
    enc = dict(zip(alpha_ids, shuffled_ids))                 # plain -> cipher
    dec = {cipher: plain for plain, cipher in enc.items()}   # cipher -> plain
    return enc, dec

def get_batch(split):
    """Build one batch of substitution-cipher prompts from a token file.

    Every row consists of ``block_size // 2`` (cipher, plain) token pairs.  The
    plaintext comes from a random window of ``<data_dir>/<split>.bin`` and is
    enciphered with a key drawn freshly for that row via ``random_key()``.  All
    pairs show their plaintext token except the last one, whose plaintext slot
    holds the ``'?'`` token instead.

    Args:
        split: Base name of the token file inside ``data_dir`` (the function
            reads ``f'{split}.bin'`` as raw ``uint8`` token ids).

    Returns:
        tuple: ``(X, Y)``, two ``torch.long`` tensors of shape
        ``(batch_size, block_size)`` moved to ``device``.  ``X`` is the prompt.
        ``Y`` is ``-1`` at every even (cipher) position and the plaintext token
        id at every odd position, including the final query slot.

    Raises:
        AssertionError: if ``block_size`` is not a positive even number.
        ValueError: if the token file is too short to sample a window from.
        KeyError: if the sampled window contains a token that is not one of
            the letter ids covered by the key.

    NOTE(review): for the revealed pairs the target equals the input token at
    the same position; whether that is intended depends on how the consuming
    model aligns its predictions with ``Y`` -- confirm.  The ``-1`` entries are
    presumably an ignore index for the loss -- confirm against the model.
    """
    # The memmap is opened per call and only the sampled windows are copied out.
    tokens = np.memmap(os.path.join(data_dir, f'{split}.bin'),
                       dtype=np.uint8, mode='r')

    # One (cipher, plain) pair occupies two tokens, so the pair count follows
    # from block_size instead of being fixed at 1024.
    assert block_size >= 2 and block_size % 2 == 0, "block_size must be 2*k_pairs"
    k_pairs = block_size // 2

    max_start = len(tokens) - k_pairs - 1       # exclusive bound for the window start
    if max_start <= 0:
        raise ValueError(
            f"{split}.bin has {len(tokens)} tokens; more than {k_pairs + 1} are "
            f"needed to sample {k_pairs} pairs"
        )

    query_id = stoi['?']                        # loop-invariant lookup
    prompts = np.empty((batch_size, block_size), dtype=np.int64)
    targets = np.full((batch_size, block_size), -1, dtype=np.int64)

    for b in range(batch_size):
        # ----- 1. grab k_pairs plaintext tokens from the corpus -------------
        start = np.random.randint(0, max_start)
        plain = np.array(tokens[start:start + k_pairs])

        # ----- 2. fresh random key for this sample --------------------------
        enc, _ = random_key()
        cipher = [enc[p] for p in plain]

        # ----- 3. interleave: cipher on even slots, plaintext on odd slots --
        prompts[b, 0::2] = cipher
        prompts[b, 1::2] = plain
        prompts[b, -1] = query_id               # hide the answer of the last pair
        targets[b, 1::2] = plain                # even slots keep the -1 fill

    X = torch.from_numpy(prompts)
    Y = torch.from_numpy(targets)

    if device_type == 'cuda':
        # Pinned host memory allows the host-to-device copy to be non-blocking.
        X, Y = X.pin_memory().to(device, non_blocking=True), \
               Y.pin_memory().to(device, non_blocking=True)
    else:
        X, Y = X.to(device), Y.to(device)
    return X, Y

Using vocab size of 28 (a-z + separators)


In [30]:
# Draw one batch from the 'train' split: x holds the prompts, y the targets.
x, y = get_batch('train')

In [34]:
# Decode the odd positions of every prompt row (the plaintext slots) and print
# each row followed by a blank line.  The strided slice is converted to Python
# ints with a single `.tolist()` call, so decoding no longer performs one
# `.item()` read (a device-to-host sync on CUDA) per token.
for row in x[:, 1::2].tolist():
    print(''.join(itos[tok] for tok in row), end='')
    print("\n")

afterjoiningthemandhasattractedinterestfrommanyclubsincludingtottenhamhotspursandnewcastleunitedlatestnewsfromswitzerlandisthattottenhamhavegazumpednewcastleforthesignatureofthehardworkingcenterbackandmustbereadytodashoutamouthwateringmfortheyearoldreportssuggesttheswisssidehasslappedapricearoundmillionontheformermedeamascplayerthereportsfurtherrevealthatdespiteyoungboysunwillingnesstotransfertheplayerstheycantresistthecashiftottenhamarereadytopaythemtottenhamonlyrecentlybroketheirtransferrecordtosigndavinsonsanchezfromajaxbuttheyarelikelytobreakthebankforkasimssignatureastheyareconsideringdefensiveoptionswithtobyalderweireldandjanvertongthennotgettinganyyoungerkasimtheyoungerbrotherofkumasiasantekotokodefenderahmedadamshasbeenveryimpressivesincejoiningyoungboysonloanfromrealmallorcainhewaslaterhandedapermanentyeardealatyoungboysafterhittingtopformandprovingapricelessassetforadolfhutterssidebysheikhtophicsienudesheikhontwitterwelcomethissiteisdedicatedtothelegendaryworksofsuraktheficti