## Developing Notebook

In [38]:
import torch
import torch.nn as nn
import math

In [47]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_modal, vocab_size):
        super(InputEmbeddings, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, d_modal)
        self.d_modal = d_modal
        
    def forward(self, x):
        return self.embeddings(x) * math.sqrt(self.d_modal)

In [48]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, seq_len):
        super(PositionalEncoding, self).__init__()
        self.d_d_modelmodal = d_model
        self.seq_len = seq_len

        positions = torch.arange(0, seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))  # Shape: [d_model/2]

        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(positions.float() * div_term)
        pe[:, 1::2] = torch.cos(positions.float() * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [70]:
import torch.nn.functional as F

class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model, heads):
        super(MultiHeadAttentionBlock, self).__init__() 
        self.d_model = d_model
        self.heads = heads
        
        self.d_k = d_model // heads # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo

    @staticmethod
    def scale_dot_product_attention(d_k, Q, K, V, mask=None):
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
    
        attention_weights = F.softmax(scores, dim=-1)
        attention_output = torch.matmul(attention_weights, V)
    
        return attention_output

    def split_head(self, x):
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.heads, self.d_k).transpose(1, 2)
    
    def combine_head(self, x):
        batch_size, _, seq_length, _ = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.heads * self.d_k)

    def forward(self, Q, K, V, mask):
        Q = self.split_head(self.w_q(Q))
        K = self.split_head(self.w_k(K))
        V = self.split_head(self.w_v(V))

        attention_output = MultiHeadAttentionBlock.scale_dot_product_attention(self.d_k, Q, K, V, mask)
        attention_output = self.combine_head(attention_output)
        return self.w_o(attention_output)


In [50]:
class FeedForwardLayer(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForwardLayer, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(torch.relu(self.linear1(x)))

In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__() 
        self.d_model = d_model
        self.attention = MultiHeadAttentionBlock(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.feed_forward = FeedForwardLayer(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attention_output = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attention_output))
        feed_forward_output = self.feed_forward(x)
        return self.norm2(x + self.dropout(feed_forward_output))

In [None]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.d_model = d_model
        self.attention = MultiHeadAttentionBlock(d_model, num_heads)
        self.attention2 = MultiHeadAttentionBlock(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.feed_forward = FeedForwardLayer(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        # First attention block
        attention_output = self.attention(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attention_output))

        # Second attention block
        attention_output = self.attention2(x, encoder_output, encoder_output, src_mask)
        x = self.norm2(x + self.dropout(attention_output))

        # Feed forward layer
        feed_forward_output = self.feed_forward(x)
        return self.norm3(x + self.dropout(feed_forward_output))
        

In [None]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, N, d_ff, max_seq_length, dropout):
        super().__init__()
        self.src_embedding = InputEmbeddings(d_model, src_vocab_size)
        self.tgt_embedding = InputEmbeddings(d_model, tgt_vocab_size)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(N)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(N)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout_layer = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask
    
    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src = self.dropout_layer(self.positional_encoding(self.src_embedding(src)))
        tgt = self.dropout_layer(self.positional_encoding(self.tgt_embedding(tgt)))
        for layer in self.encoder:
            src = layer(src, src_mask)
        for layer in self.decoder:
            tgt = layer(tgt, src, src_mask, tgt_mask)
        return self.fc(tgt)


In [22]:
from datasets import load_dataset

ds = load_dataset("wmt/wmt14", "cs-en")

In [23]:
train = ds['train']

In [18]:
train = train.map(lambda x: {'en': x['translation']['en'], 'cs': x['translation']['cs']})

Map: 100%|██████████| 953621/953621 [00:16<00:00, 57028.13 examples/s]


In [27]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from transformers import PreTrainedTokenizerFast
from torch.utils.data import DataLoader

dataset = load_dataset("wmt/wmt14", "cs-en")

def get_training_corpus():
    for example in dataset["train"]['translation']:
        yield example["cs"]
        yield example["en"]

tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.BpeTrainer(
    vocab_size=37000,
    special_tokens=["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
)

tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
tokenizer.post_processor = processors.TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> <s> $B:1 </s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

tokenizer.save("bpe_tokenizer.json")
hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file="bpe_tokenizer.json")
hf_tokenizer.pad_token = "<pad>"
hf_tokenizer.mask_token = "<mask>"
hf_tokenizer.unk_token = "<unk>"

print("Tokenizer loaded for Hugging Face integration.")

def tokenize_function(examples):
    # Access the source and target lists
    source_sentences = [translation["cs"] for translation in examples["translation"]]
    target_sentences = [translation["en"] for translation in examples["translation"]]


    # Tokenize source and target
    source = hf_tokenizer(source_sentences, max_length=128, truncation=True, padding="max_length")
    target = hf_tokenizer(target_sentences, max_length=128, truncation=True, padding="max_length")

    # Add labels for the target sequence
    source["labels"] = target["input_ids"]

    return source


# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["translation"])
def batch_by_token_count(dataset, max_tokens=25000):
    """Batch sentences by approximate token count."""
    batches = []
    batch = []
    token_count = 0

    for example in dataset:
        source_len = len(example["input_ids"])
        target_len = len(example["labels"])
        total_len = source_len + target_len

        if token_count + total_len > max_tokens:
            batches.append(batch)
            batch = []
            token_count = 0

        batch.append(example)
        token_count += total_len

    if batch:
        batches.append(batch)

    return batches

# Custom batching
batched_data = batch_by_token_count(tokenized_dataset["train"], max_tokens=25000)
print(f"Number of batches: {len(batched_data)}")

# Convert to DataLoader
train_dataloader = DataLoader(batched_data, batch_size=1, shuffle=True)

# Example: Inspect a batch
for batch in train_dataloader:
    print(batch)
    break




Tokenizer loaded for Hugging Face integration.


Map: 100%|██████████| 953621/953621 [01:24<00:00, 11335.82 examples/s]
Map: 100%|██████████| 3000/3000 [00:00<00:00, 8196.16 examples/s]
Map: 100%|██████████| 3003/3003 [00:00<00:00, 12714.01 examples/s]


Number of batches: 9832
[{'input_ids': [tensor([0]), tensor([2856]), tensor([4724]), tensor([16]), tensor([6109]), tensor([2353]), tensor([18619]), tensor([3384]), tensor([8322]), tensor([16]), tensor([2353]), tensor([17]), tensor([1981]), tensor([8084]), tensor([28088]), tensor([3900]), tensor([18]), tensor([1]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([2]), tensor([

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader

# Load the WMT14 Czech-English dataset
ds = load_dataset("wmt14", "cs-en")

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # example; pick something appropriate

def tokenize_function(examples):
    # This assumes the "translation" column with a dict of "cs" and "en" strings
    src_texts = [translation["cs"] for translation in examples["translation"]]
    tgt_texts = [translation["en"] for translation in examples["translation"]]
    # For a typical seq2seq model, you might tokenize them separately:
    model_inputs = tokenizer(src_texts, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_ds = ds.map(tokenize_function, batched=True)

def collate_fn(batch):
    # Each item in `batch` is already a dict with tensor fields
    return {
        "input_ids": torch.tensor([b["input_ids"] for b in batch], dtype=torch.long),
        "attention_mask": torch.tensor([b["attention_mask"] for b in batch], dtype=torch.long),
        "labels": torch.tensor([b["labels"] for b in batch], dtype=torch.long),
    }

train_dataloader = DataLoader(
    tokenized_ds["train"], 
    batch_size=1, 
    collate_fn=collate_fn, 
    shuffle=True
)


Map: 100%|██████████| 953621/953621 [02:17<00:00, 6937.41 examples/s] 
Map: 100%|██████████| 3000/3000 [00:00<00:00, 13307.44 examples/s]
Map: 100%|██████████| 3003/3003 [00:00<00:00, 13747.16 examples/s]


In [72]:
def train_my_transformer(model, dataloader, num_epochs=3, device="cuda"):
    """
    A simple training loop for your custom Transformer model,
    using teacher forcing and cross-entropy loss (with ignore_index=0).
    """
    model.to(device)
    
    # Assume 0 is the padding index
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        
        for step, batch in enumerate(dataloader):
            # Move data to device
            src = batch["input_ids"].to(device)   # (B, src_len)
            # We won't necessarily use 'attention_mask' for this custom model
            labels = batch["labels"].to(device)   # (B, tgt_len)
            
            # Teacher forcing: input to decoder is everything except last token,
            # targets are everything except the first token.
            tgt_inp = labels[:, :-1]  # (B, tgt_len - 1)
            tgt_out = labels[:, 1:]   # (B, tgt_len - 1)

            optimizer.zero_grad()
            logits = model(src, tgt_inp)  # (B, T-1, vocab_size)
            
            # Reshape to (batch*(T-1), vocab_size) for cross-entropy
            logits = logits.reshape(-1, logits.size(-1))
            # Same reshape for targets
            tgt_out = tgt_out.reshape(-1)
            
            # Compute loss
            loss = criterion(logits, tgt_out)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

            # (Optional) print progress
            if (step + 1) % 100 == 0:
                avg_step_loss = total_loss / (step + 1)
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{len(dataloader)}], "
                      f"Avg Loss: {avg_step_loss:.4f}")
        
        # End of epoch stats
        epoch_loss = total_loss / len(dataloader)
        print(f"=== Epoch [{epoch+1}/{num_epochs}] - Avg Loss: {epoch_loss:.4f} ===")

# ---------------------------
# 4. Putting It All Together
# ---------------------------

# Example of how to instantiate your model:
src_vocab_size = tokenizer.vocab_size  # or larger, depending on subword tokens
tgt_vocab_size = tokenizer.vocab_size
d_model = 256
num_heads = 4
N = 2           # number of encoder/decoder layers
d_ff = 2048
max_seq_length = 128

model = Transformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    N=N,
    d_ff=d_ff,
    max_seq_length=max_seq_length
)

# Train for a few epochs:
train_my_transformer(model, train_dataloader, num_epochs=1, device="cpu")

Epoch [1/1], Step [100/29801], Avg Loss: nan
Epoch [1/1], Step [200/29801], Avg Loss: nan
Epoch [1/1], Step [300/29801], Avg Loss: nan
Epoch [1/1], Step [400/29801], Avg Loss: nan
Epoch [1/1], Step [500/29801], Avg Loss: nan
Epoch [1/1], Step [600/29801], Avg Loss: nan
Epoch [1/1], Step [700/29801], Avg Loss: nan
Epoch [1/1], Step [800/29801], Avg Loss: nan
Epoch [1/1], Step [900/29801], Avg Loss: nan
Epoch [1/1], Step [1000/29801], Avg Loss: nan
Epoch [1/1], Step [1100/29801], Avg Loss: nan
Epoch [1/1], Step [1200/29801], Avg Loss: nan


KeyboardInterrupt: 

In [74]:
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length)

# Generate random sample data
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

In [76]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

model.to("mps")

for epoch in range(10):
    optimizer.zero_grad()
    output = transformer(src_data, tgt_data[:, :-1])
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 7.5081024169921875
Epoch: 2, Loss: 7.779489994049072
Epoch: 3, Loss: 7.347989559173584
Epoch: 4, Loss: 7.262905597686768
Epoch: 5, Loss: 7.179934024810791
Epoch: 6, Loss: 7.066669940948486
Epoch: 7, Loss: 6.957176685333252
Epoch: 8, Loss: 6.8551130294799805
Epoch: 9, Loss: 6.7570319175720215
Epoch: 10, Loss: 6.660093307495117
Epoch: 11, Loss: 6.564731121063232
Epoch: 12, Loss: 6.468922138214111
Epoch: 13, Loss: 6.3711981773376465
Epoch: 14, Loss: 6.27244234085083
Epoch: 15, Loss: 6.1756768226623535
Epoch: 16, Loss: 6.0786590576171875
Epoch: 17, Loss: 5.983967304229736
Epoch: 18, Loss: 5.890192985534668
Epoch: 19, Loss: 5.799063205718994
Epoch: 20, Loss: 5.711126804351807
Epoch: 21, Loss: 5.6204328536987305
Epoch: 22, Loss: 5.522023677825928
Epoch: 23, Loss: 5.430327415466309
Epoch: 24, Loss: 5.342520713806152
Epoch: 25, Loss: 5.248709201812744
Epoch: 26, Loss: 5.1615705490112305
Epoch: 27, Loss: 5.073338985443115
Epoch: 28, Loss: 4.986794471740723
Epoch: 29, Loss: 4.901

In [9]:
import polars as pl
from pytorch.data import load_vocab

df = pl.read_parquet("./data/dataset.parquet")
df.head(100)

vocab = load_vocab("./data/vocab.ende")
df

inputs,targets
list[i64],list[i64]
"[143, 47, … 1]","[816, 226, … 1]"
"[563, 251, … 1]","[1848, 19085, … 1]"
"[376, 16, … 1]","[527, 3873, … 1]"
"[32952, 9, … 1]","[75, 11, … 1]"
"[161, 18627, … 1]","[161, 75, … 1]"
…,…
"[319, 2798, … 1]","[32121, 2668, … 1]"
"[15953, 5, … 1]","[31571, 5, … 1]"
"[1393, 3457, … 1]","[1393, 3457, … 1]"
"[29, 1705, … 1]","[688, 73, … 1]"


In [13]:
src_vocab_size = len(vocab[0])
tgt_vocab_size = len(vocab[0])
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 512
dropout = 0.1
label_smoothing = 0.1

from pytorch.transformer import Transformer
import torch
from torch import optim
from pytorch.train import AttentionIsAllYouNeedSchedule

model = Transformer(
        src_vocab_size,
        tgt_vocab_size,
        d_model,
        num_heads,
        num_layers,
        d_ff,
        max_seq_length,
        dropout,
    )
optimizer = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)

warmpup_steps = 4000
scheduler = AttentionIsAllYouNeedSchedule(
        optimizer, d_model, warmup_steps=warmpup_steps
    )

criterion = torch.nn.CrossEntropyLoss(
        ignore_index=0, label_smoothing=label_smoothing
    )

In [None]:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler

# Ensure required environment variables are set
os.environ["MASTER_ADDR"] = "127.0.0.1"  # Replace with the master node's IP in multi-node setups
os.environ["MASTER_PORT"] = "29500"      # Replace with a free port
os.environ["WORLD_SIZE"] = "4"           # Total number of processes
os.environ["RANK"] = "0"                 # Adjust per process

# Read environment variables
world_size = int(os.environ["WORLD_SIZE"])
rank = int(os.environ["RANK"])

dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank)

torch.cuda.set_device(rank)
device = torch.device("cuda", rank)

# Move to device
model.to(device)

# Wrap with DistributedDataParallel
# (find_unused_parameters=True if your forward pass has unused params)
model = DDP(model, device_ids=[rank], output_device=rank)

In [None]:
checkpoint = torch.load("./pytorch/checkpoints/checkpoint_step_100000.pt")

In [None]:
checkpoint = torch.load("./pytorch/checkpoints/checkpoint_step_100000.pt")
model.load_state_dict(checkpoint["model_state_dict"])