In [4]:
# reload magic
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Import deterministic.py using local file path
import sys
sys.path.append('../sequence_generators')
import deterministic

sys.path.append('../ucan')
import ucan

#### UCAN dataset

The UCAN dataset has inputs $(Y \oplus \Gamma, \Delta)$, targets $Y$, and 'hidden variable' $\Gamma$, all of which are length-n bitstrings. Note that I'm changing my notation from whats in the writeup to match input/target sequence labels better.

This has to be done in a few steps:
1. Generate a dataset for $Y$, array $(n_{data}, n)$
2. Generate a matched dataset $(\Gamma, \Delta)$, array $(N, n, 2)$ for whatever version of UCAN
3. Compose and discard, i.e. $X = Y \oplus \Gamma$, data = $[Z, \Delta]$ array $(N, 2n)$

In [6]:
# Get a deterministic set of sequences and batch them

import numpy as np

import ucan

# Get a deterministic set of sequences
n = 8
n_data = 100

gen = deterministic.SequenceGen(lookback=4, seed=228, number_of_generating_methods=1)
Y, _ = gen.deterministically_generate_sequences(length=n, num_seq=n_data, save=False)
Y = np.array(Y, dtype=np.int32)

# Generate our UCAN. For the first experiment, gamma=delta (so p_diff = 0)
p_diff = 0
p0_delta = 0.5 # if this is too hard, change to 1 (i.e. ignore delta)
out = ucan.bitwise_ucan_v1(n, n_data, p0_delta, p_diff, seed=228)
gammas = out[:,:,0]
deltas = out[:,:,1]

# Generate the noise and concatenate the data
Z = Y ^ gammas
X = np.concatenate((Z, deltas), axis=1) # (n_data, 2n)


In [7]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
from torch.nn import functional as F

# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')


In [8]:
# Encoding the entire text into integers
data = torch.tensor(X, dtype=torch.int) # (n_data, 2n)
targets = torch.tensor(Y, dtype=torch.int) # (n_data, n)

n_train = int(len(data) * 0.9)
train_data = data[:n_train]
val_data = data[n_train:]
train_targets = targets[:n_train]
val_targets = targets[n_train:]

In [9]:
torch.manual_seed(1337) # to get what andrej karpathy got

block_size = 8 # this is the context window size. 
batch_size = 4

def get_batch(split):
    """Generate a small batch of data with inputs x, targets y.
    
    Note that there's no interesting block structure going on here,
    since we're learning a map from x \in \{0,1\}^{2n} -> y \in \{0,1\}^n.

    Outputs are shaped (batch_size, 2n) and (batch_size, n) respectively.
    This corresponds to `batch_first` in the torch transformer
    """
    data = train_data if split == 'train' else val_data
    targets = train_targets if split == 'train' else val_targets

    ix = torch.randint(0, len(data), (batch_size,)) # indices for batch sample
    x = torch.stack([data[i] for i in ix]) # `block_size` many data points
    y = torch.stack([targets[i] for i in ix]) # target for each input (the Y string)
    # CUDA has a problem with short int i guess. see: https://stackoverflow.com/questions/69742930/runtimeerror-nll-loss-forward-reduce-cuda-kernel-2d-index-not-implemented-for
    y = y.type(torch.LongTensor)
    x, y = x.to(DEVICE), y.to(DEVICE)
    return x, y


### Architecture notes:

 - Do we want/need a causal mask in our decoder??? This is not an autoregressive task
 - Reminder to modify the mask for the encoder to reflect the position-wise dependence of $\Delta$, $Z$
 - I do not have a tokenizer nor plans for one???


 #### Data notes:

  - I am avoiding using <EOS> and <BOS> partly because I can get away with bool-type data right now. I don't know how smart that actually is, since these values get cast regardless...

In [10]:
# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """from https://pytorch.org/tutorials/beginner/translation_transformer.html
    
        Args:
            vocab_size: (int) number of tokens in alphabet
            emb_size: (int) model dimension
    """
    def __init__(self, vocab_size, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        """
        Input:
            tokens: (batch_size, m) tensor of bits or token indices (m=n or 2n)
        Returns:
            Tensor: (batch_size, n, emb_size), final dimension indexes the embedding vector
        """
        # Okay, so we have to cast our bits to float64 to embed...
        # FIXME: use lower precision?
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


In [11]:
class PositionalEncoding(nn.Module):
    """from https://pytorch.org/tutorials/beginner/translation_transformer.html
    
    Args:
        emb_size: dimension of the embedding, i.e. d_model. MUST BE EVEN
        dropout: dropout rate
    """
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        # this just rearranges the equation from Vaswani et al. (2017)
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        """
        Input:
            token_embedding: (batch_size, n, emb_size)
        Returns:
            Tensor: (batch_size, n, emb_size), with positional encoding
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])


Notes from nn.Transformer https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html

 - If a boolean tensor is provided for any of the [src/tgt/memory]_mask arguments, positions with a True value are not allowed to participate in the attention, which is the opposite of the definition for attn_mask in torch.nn.functional.scaled_dot_product_attention().
 - src, tgt, memory mask are masks applied to the x input seq, the y target seq, and the last layer of encoder seq resp.

In [12]:

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """from https://pytorch.org/tutorials/beginner/translation_transformer.html"""
    def __init__(self, num_encoder_layers, num_decoder_layers, emb_size, nhead, src_vocab_size, tgt_vocab_size,
                 dim_feedforward=512, dropout=0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout,
                                       batch_first=True) # (batch, seq_len, d_model)
        
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)
        # Final layer for output decoder
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

    def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask=None, tgt_padding_mask=None, memory_key_padding_mask=None):
        """
        Let S be the source seq length, T the target seq length, N the batch size, E the embedding dimension.

        Args:
            src: input token embeddings. Shape: (N,S,E) (since Transformer.batch_first=True)
            trg: target token embeddings. Shape: (N,T,E) 
            src_mask: Encoder self-attention mask. Shape is (S,S) or (N⋅num_heads,S,S)
            tgt_mask: Decoder self-attention mask. Shape is (T,T) or (N⋅num_heads,T,T)
            src_padding_mask: This removes padding for ragged seqences, specified per example
            tgt_padding_mask: See above 
            memory_key_padding_mask: See above
        
        Returns:
            Tensor: (N, T, num_tokens) logits for the target sequence
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        logits = self.generator(outs)

        # Compute loss
        # Forward is only called during training/validation, so this is fine
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        # I'm sketchy about this. also C = vocab_size now
        targets = trg.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    def encode(self, src, src_mask):
        src_pos_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_pos_emb, src_mask)

    def decode(self, tgt, memory, tgt_mask):
        tgt_pos_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_pos_emb, memory, tgt_mask)

Things I'm confused about:
 - at training time, we have targets, so we can embed (positional and vector) the targets to feed into the decoder. At evaluation time, we start with a source vector, encode into memory, then autoregressively build the target I guess?

In [18]:
num_encoder_layers = 2
num_decoder_layers = 2
emb_size = 4
nhead = 4
src_vocab_size = 2
tgt_vocab_size = 2
dim_feedforward = 64
dropout = 0.1

# hyperparameters
eval_iters = 200
LEARNING_RATE = 0.0001
# BATCH_SIZE = 32

# Train loop
eval_interval = 20
max_iters = 20

model = Seq2SeqTransformer(
    num_encoder_layers, 
    num_decoder_layers, 
    emb_size, 
    nhead, 
    src_vocab_size, 
    tgt_vocab_size, 
    dim_feedforward, 
    dropout
).to(DEVICE)


In [19]:
@torch.no_grad()
def estimate_loss():
    # Average the loss over many batches. Hardcoded cross_entropy loss
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y, None, None)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [20]:

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)


for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb, None, None)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 0.8997, val loss 0.8882


In [24]:

# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    # SHIT. Since we're doing this autoregressively, we need a start sequence as 
    # the seed for generating our output...
    ys = torch.ones(1, 1).type(torch.long).to(DEVICE)
    for i in range(max_len):
        memory = memory.to(DEVICE)
        # tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, None)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
    return ys


# actual function to translate input sentence into target language
# inp
def translate(model: torch.nn.Module, src):
    """
    Args:
        src: tensor. single input bitstring of length 2n. Shape (2n,) 
    """
    model.eval()
    num_tokens = src.shape[0]
    n = num_tokens // 2
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=n).flatten()

    return tgt_tokens

In [25]:
translate(model, val_data[0])

RuntimeError: shape '[16, 4, 1]' is invalid for input of size 1024

In [None]:
# TODO: Try to use DataLoader, Dataset, etc. for batching

def train_step(model, optimizer):
    losses = 0

    for _ in range(n_steps):
        # These aren't technically epochs i guess.
        xb, yb = get_batch('train')

        logits = model(xb, yb, None, None) # No masks for now
        optimizer.zero_grad(set_to_none=True) #?

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), yb.reshape(-1))
        loss.backward()
        
        optimizer.step()
        losses += loss.item() / BATCH_SIZE # check this

    return losses


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [61]:
emb_size = 10
x, y = get_batch('train')
embedding = TokenEmbedding(2, emb_size)
xx = embedding.forward(x)
print(xx.shape, "embedding")
positional = PositionalEncoding(emb_size, 0.1)
xxx = positional.forward(xx)
print(xxx.shape, "positional")

xxxx = model.forward(x, y, None, None)
print(xxxx.shape, "model forward")


torch.Size([4, 16, 10]) embedding
torch.Size([4, 16, 10]) positional
torch.Size([4, 8, 2]) model forward


### TODO

Re-adapt this for prediction/evaluation, and probably incorporate as a function into the transformer model

In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
# inp
def translate(model: torch.nn.Module, src):
    """
    Args:
        single
    """
    model.eval()
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))


In [None]:

def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

---

#### Scratchwork

In [None]:

from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

SRC_LANGUAGE = 'fuck'
TGT_LANGUAGE = 'this'

# Place-holders
token_transform = {}
vocab_transform = {}
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)
    

val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)


# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor

# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [107]:
from torch.utils.data import DataLoader
from torchtext.datasets import multi30k, Multi30k

# Fixme
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)


def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))

def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

ModuleNotFoundError: No module named 'torchtext'