In [2]:
# reload magic
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Import deterministic.py using local file path
import sys
sys.path.append('../sequence_generators')
import deterministic

sys.path.append('../ucan')
import ucan
import ucan_transformer

#### UCAN dataset

The UCAN dataset has inputs $(Y \oplus \Gamma, \Delta)$, targets $Y$, and 'hidden variable' $\Gamma$, all of which are length-n bitstrings. Note that I'm changing my notation from whats in the writeup to match input/target sequence labels better.

This has to be done in a few steps:
1. Generate a dataset for $Y$, array $(n_{data}, n)$
2. Generate a matched dataset $(\Gamma, \Delta)$, array $(N, n, 2)$ for whatever version of UCAN
3. Compose and discard, i.e. $X = Y \oplus \Gamma$, data = $[Z, \Delta]$ array $(N, 2n)$

In [4]:
# Get a deterministic set of sequences and batch them

import numpy as np

import ucan

# Get a deterministic set of sequences of length n
# with the sos and eos tokens, this becomes length 2+n
n = 8
n_data = 100

gen = deterministic.SequenceGen(lookback=4, seed=228, number_of_generating_methods=1)
Y, _ = gen.deterministically_generate_sequences(length=n, num_seq=n_data, save=False)
Y = np.array(Y, dtype=np.int32)

# Generate our UCAN. For the first experiment, gamma=delta (so p_diff = 0)
p_diff = 0
p0_delta = 0.5 # if this is too hard, change to 1 (i.e. ignore delta)
out = ucan.bitwise_ucan_v1(n, n_data, p0_delta, p_diff, seed=228)
gammas = out[:,:,0]
deltas = out[:,:,1]

# Generate the noise and concatenate the data
Z = Y ^ gammas
X = np.concatenate((Z, deltas), axis=1) # (n_data, 2n)


In [5]:
import torch
from torch.nn import functional as F

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# DEVICE = torch.device('cpu')


In [6]:
# Encoding the entire text into integers
data = torch.tensor(X, dtype=torch.int) # (n_data, 2n)
targets = torch.tensor(Y, dtype=torch.int) # (n_data, n)

# SOS is needed to get reasonable marginals for first token / seed generator
# I'm not sure if EOS is necessary for anything
SOS_TOKEN = 2
EOS_TOKEN = 3
data = torch.cat((torch.ones(data.size(0), 1, dtype=torch.int) * SOS_TOKEN, data, torch.ones(data.size(0), 1, dtype=torch.int) * EOS_TOKEN), dim=1)
targets = torch.cat((torch.ones(targets.size(0), 1, dtype=torch.int) * SOS_TOKEN, targets, torch.ones(targets.size(0), 1, dtype=torch.int) * EOS_TOKEN), dim=1)

n_train = int(len(data) * 0.9)
train_data = data[:n_train]
val_data = data[n_train:]
train_targets = targets[:n_train]
val_targets = targets[n_train:]

In [7]:
torch.manual_seed(1337) # to get what andrej karpathy got

block_size = 8 # this is the context window size. 
batch_size = 4

def get_batch(split):
    """Generate a small batch of data with inputs x, targets y.
    
    Note that there's no interesting block structure going on here,
    since we're learning a map from x \in \{0,1\}^{2n} -> y \in \{0,1\}^n.

    Outputs are shaped (batch_size, 2n) and (batch_size, n) respectively.
    This corresponds to `batch_first` in the torch transformer
    """
    data = train_data if split == 'train' else val_data
    targets = train_targets if split == 'train' else val_targets

    ix = torch.randint(0, len(data), (batch_size,)) # indices for batch sample
    x = torch.stack([data[i] for i in ix]) # `block_size` many data points
    y = torch.stack([targets[i] for i in ix]) # target for each input (the Y string)
    # CUDA has a problem with short int. see: https://stackoverflow.com/questions/69742930/runtimeerror-nll-loss-forward-reduce-cuda-kernel-2d-index-not-implemented-for
    y = y.type(torch.LongTensor)
    x, y = x.to(DEVICE), y.to(DEVICE)
    return x, y


### Architecture notes:

 - Do we want/need a causal mask in our decoder??? This is not an autoregressive task
 - Reminder to modify the mask for the encoder to reflect the position-wise dependence of $\Delta$, $Z$
 - I do not have a tokenizer nor plans for one???


 #### Data notes:

  - I am avoiding using <EOS> and <BOS> partly because I can get away with bool-type data right now. I don't know how smart that actually is, since these values get cast regardless...

  Notes from nn.Transformer https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html

 - If a boolean tensor is provided for any of the [src/tgt/memory]_mask arguments, positions with a True value are not allowed to participate in the attention, which is the opposite of the definition for attn_mask in torch.nn.functional.scaled_dot_product_attention().
 - src, tgt, memory mask are masks applied to the x input seq, the y target seq, and the last layer of encoder seq resp.

Things I'm confused about:
 - at training time, we have targets, so we can embed (positional and vector) the targets to feed into the decoder. At evaluation time, we start with a source vector, encode into memory, then autoregressively build the target I guess?

In [8]:
num_encoder_layers = 2
num_decoder_layers = 2
emb_size = 12
nhead = 4
src_vocab_size = 4
tgt_vocab_size = 4
dim_feedforward = 64
dropout = 0.1

# hyperparameters
eval_iters = 200
LEARNING_RATE = 0.0001
# BATCH_SIZE = 32

# Train loop
eval_interval = 50
max_iters = 1000

model = ucan_transformer.Seq2SeqTransformer(
    num_encoder_layers, 
    num_decoder_layers, 
    emb_size, 
    nhead, 
    src_vocab_size, 
    tgt_vocab_size, 
    dim_feedforward, 
    dropout
).to(DEVICE)
# `forward` signature: (src, trg, src_mask, tgt_mask, **kwargs)


In [9]:
@torch.no_grad()
def estimate_loss():
    # Average the loss over many batches. Hardcoded cross_entropy loss
    # Needs to be in same namespace as model and get_batch
    out = {}
    model.eval()
    tgt_mask = ucan_transformer.generate_square_subsequent_mask(n + 2, device=DEVICE)
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y, None, tgt_mask)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [10]:

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)


for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb, None, ucan_transformer.generate_square_subsequent_mask(n + 2, device=DEVICE))
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


step 0: train loss 1.7526, val loss 1.7280
step 50: train loss 1.1336, val loss 1.1391
step 100: train loss 0.8750, val loss 0.8705
step 150: train loss 0.7000, val loss 0.6967
step 200: train loss 0.5872, val loss 0.5837
step 250: train loss 0.5056, val loss 0.5015
step 300: train loss 0.4382, val loss 0.4353
step 350: train loss 0.3866, val loss 0.3832
step 400: train loss 0.3427, val loss 0.3398
step 450: train loss 0.3027, val loss 0.3010
step 500: train loss 0.2734, val loss 0.2723
step 550: train loss 0.2454, val loss 0.2439
step 600: train loss 0.2232, val loss 0.2229
step 650: train loss 0.2052, val loss 0.2050
step 700: train loss 0.1889, val loss 0.1891
step 750: train loss 0.1764, val loss 0.1766
step 800: train loss 0.1647, val loss 0.1646
step 850: train loss 0.1544, val loss 0.1547
step 900: train loss 0.1441, val loss 0.1443
step 950: train loss 0.1357, val loss 0.1361


In [None]:

# # function to generate output sequence using greedy algorithm
# def greedy_decode(model, src, src_mask, max_len):
#     """First attempt at non-autoregressive decoding."""
    
#     ys = torch.cat((src[:9], torch.tensor([EOS_TOKEN]).to(DEVICE)), dim=0).unsqueeze(0)
#     src = src.to(DEVICE)
#     src_mask = None  
    
#     # For a single example evaluation, we need to add a dummy batch dimension (1, *) with unsqueeze(0)
#     memory = model.encode(src.unsqueeze(0), src_mask)    
#     memory = memory.to(DEVICE)

#     out = model.decode(tgt=ys, memory=memory, tgt_mask=None) # (1, tgt_seq_len, emb_dim)
#     prob = model.generator(out) # (1, tgt_seq_len, num_tokens)
#     _, pred = torch.max(prob, dim=2)

#     return pred.squeeze(0)


In [20]:
xv, yv = get_batch('val')
src = xv[0]
truth = yv[0]

src = src.to(DEVICE)
src_mask = None
max_len = 10

# For a single example evaluation, we need to add a dummy batch dimension (1, *) with unsqueeze(0)
memory = model.encode(src.unsqueeze(0), src_mask)
# The [1, 1] shape starts us off with a dummy batch dimension 
ys = torch.ones(1, 1).fill_(1).type(torch.long).to(DEVICE)

# FIXME: encode acts identically on every token in every position.
print(memory)

for i in range(max_len - 1): # -1 since we start with SOS
    memory = memory.to(DEVICE)
    tgt_mask = (ucan_transformer.generate_square_subsequent_mask(ys.size(1), device=DEVICE).type(torch.bool)).to(DEVICE)
    out = model.decode(tgt=ys, memory=memory, tgt_mask=tgt_mask) # (1, tgt_seq_len, emb_dim)

    # FIXME: decode acts identically on every token in every position.
    prob = model.generator(out[:, -1])
    print(out, "out")
    print(ys, "ys")
    print(prob, "prob")
    _, next_word = torch.max(prob, dim=1)
    next_word = next_word.item()
    ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    print()


tensor([[[-0.1236,  1.2823,  0.6321,  1.4439,  0.9602,  0.9766, -0.4973,
          -0.9076, -0.5309, -0.2988, -1.6624, -1.3608],
         [-0.0534,  0.7805, -0.3174,  0.3710,  1.1181,  1.4887,  0.4814,
          -0.9515, -0.0999, -2.0325, -1.4425,  0.6218],
         [-0.0534,  0.7805, -0.3174,  0.3710,  1.1181,  1.4887,  0.4814,
          -0.9515, -0.0999, -2.0325, -1.4425,  0.6218],
         [-0.0534,  0.7805, -0.3174,  0.3710,  1.1181,  1.4887,  0.4814,
          -0.9515, -0.0999, -2.0325, -1.4425,  0.6218],
         [-0.0534,  0.7805, -0.3174,  0.3710,  1.1181,  1.4887,  0.4814,
          -0.9515, -0.0999, -2.0325, -1.4425,  0.6218],
         [-0.6038,  0.5767,  0.8809, -0.9446, -0.2169,  1.5485, -0.2948,
          -1.3890,  1.9180, -0.8868, -0.8581,  0.2718],
         [-0.6038,  0.5767,  0.8809, -0.9446, -0.2169,  1.5485, -0.2948,
          -1.3890,  1.9180, -0.8868, -0.8581,  0.2718],
         [-0.0534,  0.7805, -0.3174,  0.3710,  1.1181,  1.4887,  0.4814,
          -0.9515, -0.09

In [11]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len):
    """Standard autoregressive decoder output scheme.

    Current issue: my train batching didn't have time-sliced data, so I think the 
    model has no idea what to do with a length-1 <SOS> sequence as input.
    """
    src = src.to(DEVICE)
    # src_mask = src_mask.to(DEVICE)
    src_mask = None  # FIXME
    
    # For a single example evaluation, we need to add a dummy batch dimension (1, *) with unsqueeze(0)
    memory = model.encode(src.unsqueeze(0), src_mask)
    # The [1, 1] shape starts us off with a dummy batch dimension 
    ys = torch.ones(1, 1).fill_(SOS_TOKEN).type(torch.long).to(DEVICE)

    # FIXME: should I enforce the length? Or should I enforce the length+1, 
    # and then checksum for an EOS? Or should I allow variable length :(
    for i in range(max_len - 1): # -1 since we start with SOS
        memory = memory.to(DEVICE)
        tgt_mask = (ucan_transformer.generate_square_subsequent_mask(ys.size(1), device=DEVICE).type(torch.bool)).to(DEVICE)
        out = model.decode(tgt=ys, memory=memory, tgt_mask=tgt_mask) # (1, tgt_seq_len, emb_dim)
        prob = model.generator(out[:, -1])
        print(out, "out")
        print(ys, "ys")
        print(prob, "prob")
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
        print()
    return ys

# actual function to translate input sentence into target language
# inp
def translate(model: torch.nn.Module, src):
    """
    Args:
        src: tensor. single input bitstring of length 2n + 2. Shape (2n + 2,) 
    """
    model.eval()
    seq_len = src.shape[0] - 2
    out_len = seq_len // 2 + 2 # 2:1 UCAN conversion, plus EOS/SOS
    # src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    src_mask = None
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=out_len).flatten()

    return tgt_tokens


xv, yv = get_batch('val')

for xvi, yvi in zip(xv, yv):
    print("xv_i", xvi)
    pred = translate(model, xvi)
    print("yv_i", yvi)
    print("prediction", pred)
    print()

xv_i tensor([2, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 3], device='cuda:0',
       dtype=torch.int32)
tensor([[[ 0.7594, -0.8275, -1.9494,  0.1014,  0.9055, -0.2788, -1.9472,
           0.5500,  1.7124,  0.2659,  0.7651, -0.0168]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>) out
tensor([[2]], device='cuda:0') ys
tensor([[-1.0226, -0.8345,  1.9569, -0.7579]], device='cuda:0',
       grad_fn=<AddmmBackward0>) prob

tensor([[[ 0.7594, -0.8275, -1.9494,  0.1014,  0.9055, -0.2788, -1.9472,
           0.5500,  1.7124,  0.2659,  0.7651, -0.0168],
         [ 0.7594, -0.8275, -1.9494,  0.1014,  0.9055, -0.2788, -1.9472,
           0.5500,  1.7124,  0.2659,  0.7651, -0.0168]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>) out
tensor([[2, 2]], device='cuda:0') ys
tensor([[-1.0226, -0.8345,  1.9569, -0.7579]], device='cuda:0',
       grad_fn=<AddmmBackward0>) prob

tensor([[[ 0.7594, -0.8275, -1.9494,  0.1014,  0.9055, -0.2788, -1.9472,
           0.5500,  

In [46]:
ys = torch.ones(1, 1).fill_(SOS_TOKEN).type(torch.long).to(DEVICE)
print(ys.shape)

torch.Size([1, 1])


In [None]:
# TODO: Try to use DataLoader, Dataset, etc. for batching

def train_step(model, optimizer):
    losses = 0

    for _ in range(n_steps):
        # These aren't technically epochs i guess.
        xb, yb = get_batch('train')

        logits = model(xb, yb, None, None) # No masks for now
        optimizer.zero_grad(set_to_none=True) #?

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), yb.reshape(-1))
        loss.backward()
        
        optimizer.step()
        losses += loss.item() / BATCH_SIZE # check this

    return losses


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [61]:
emb_size = 10
x, y = get_batch('train')
embedding = TokenEmbedding(2, emb_size)
xx = embedding.forward(x)
print(xx.shape, "embedding")
positional = PositionalEncoding(emb_size, 0.1)
xxx = positional.forward(xx)
print(xxx.shape, "positional")

xxxx = model.forward(x, y, None, None)
print(xxxx.shape, "model forward")


torch.Size([4, 16, 10]) embedding
torch.Size([4, 16, 10]) positional
torch.Size([4, 8, 2]) model forward


### TODO

Re-adapt this for prediction/evaluation, and probably incorporate as a function into the transformer model

In [None]:

def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

---

#### Scratchwork

In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
# inp
def translate(model: torch.nn.Module, src):
    """
    Args:
        single
    """
    model.eval()
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))


In [None]:

from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

SRC_LANGUAGE = 'fuck'
TGT_LANGUAGE = 'this'

# Place-holders
token_transform = {}
vocab_transform = {}
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)
    

val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)


# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor

# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [107]:
from torch.utils.data import DataLoader
from torchtext.datasets import multi30k, Multi30k

# Fixme
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)


def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))

def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

ModuleNotFoundError: No module named 'torchtext'