In [19]:
import torch
import torch.nn.functional as F 
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader

# Hyperparameters


In [20]:
USE_CONDITION = False          # <- auf False setzen für unkonditioniert
d_model = 128
n_head = 4
n_layer = 4
d_ff = 512
dropout = 0.2
batch_size = 1024
eval_iters = 500
device = "cuda" if torch.cuda.is_available() else "cpu"

# Create dataset

Schritte:
1. Jeder wert kriegt ein eigenes lambda
2. füge den wert n_lambda mal einem string hinzu
3. shuffle die sequenz
4. erstelle solche strings N mal

In [21]:
values = np.arange(1, 5)
rates = {1: 6.0, 2: 3.0, 3: 1.5, 4: 0.7}
np.random.seed(42)

# einzelne sequenz mit poissonverteilten werten innerhalb der sequenz
def generate_poisson_string(values):
    count = 0
    seq = []
    for val in values:
        count = np.random.poisson(rates[val])
        for _ in range(count):
            seq.append(val)
        count=0
    seq = np.array(seq)
    np.random.shuffle(seq)
    return seq

# hänge N sequenzen aneinander und fülle immer bis zur maximalen länge mit 0 auf
def generate_dataset(N):
    sequences = [generate_poisson_string(values) for _ in range(N)]
    max_len = max(len(seq) for seq in sequences)
    padded_sequences = np.array([
    np.pad(seq, (0, max_len - len(seq)), mode='constant', constant_values=0) for seq in sequences])
    return padded_sequences

data = generate_dataset(1000000)

# Make list of sequences as list

In [22]:
def dataset_to_lol(dataset):
    seqs = []
    for row in dataset:
        zero_idx = np.argmax(row == 0) if np.any(row == 0) else len(row)
        seqs.append(row[:zero_idx])

    return seqs

seqs = dataset_to_lol(data)

In [23]:
# Vokabular der Ausgabewerte
vals = sorted({v for s in seqs for v in s})

# Spezielle Tokens
PAD_ID = 0 
BOS_ID = 1
EOS_ID = 2
offset = 3  # Start-ID für echte Werte
stoi = {v: i+offset for i,v in enumerate(vals)}
itos = {i+offset: v for i,v in enumerate(vals)}

vocab_size = offset + len(vals) 
max_len = max(len(s) for s in seqs)

def encode_example(seq):
    # Tokenize: BOS + seq + EOS, dann pad
    toks = [BOS_ID]
    toks += [stoi[v] for v in seq]
    toks.append(EOS_ID)
    # Ziel ist um 1 nach rechts geschoben
    attn_len = len(toks)
    pad_needed = ( (max_len + 2) - attn_len )
    toks += [PAD_ID] * pad_needed
    x = torch.tensor(toks[:-1], dtype=torch.long)  # inputs
    y = torch.tensor(toks[1:],  dtype=torch.long)  # targets
    return x, y

class SeqDataset(Dataset):
    def __init__(self, seqs):
        self.items = [encode_example(s) for s in seqs]
    def __len__(self): return len(self.items)
    def __getitem__(self, i): return self.items[i]

dataset = SeqDataset(seqs)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

train_size = int(0.8 * len(dataset))  
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, drop_last=False, shuffle=False)

# Model

In [24]:
class CausalTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_head, n_layer, d_ff, dropout):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Parameter(torch.zeros(1, max_len + 1, d_model))  # +1 für BOS
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, 
                                                    nhead=n_head, 
                                                    dim_feedforward=d_ff, 
                                                    dropout=dropout,
                                                    batch_first=True,
                                                    activation='gelu').to(device)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layer, enable_nested_tensor=False).to(device)
        self.ff = nn.Linear(d_model, vocab_size)
        self.ln = nn.LayerNorm(d_model)

    def forward(self, x, targets=None):
        B, T = x.size() #Batch, num sequences/time
        h = self.token_emb(x) + self.pos_emb[:, :T, :]
        causal_mask = torch.tril(torch.ones(T, T)).to(x.device).bool()
        key_padding_mask = (x == PAD_ID)
        h = self.transformer(h, mask=causal_mask, src_key_padding_mask=key_padding_mask)
        h = self.ln(h)
        logits = self.ff(h)
            
        if targets is not None:
            B,T,C = logits.size()
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets, ignore_index=PAD_ID)
            return logits, loss
        else:
            loss=None

        return logits, loss

In [25]:
model = CausalTransformer(vocab_size, d_model, n_head, n_layer, d_ff, dropout).to(device)

# Training

In [26]:
@torch.no_grad()
def estimate_loss(model, train_loader, val_loader, eval_iters=500):
    out = {}
    model.eval()
    for split, loader in [('train', train_loader), ('val', val_loader)]:
        losses = []
        for i, (xb, yb) in enumerate(loader):
            if eval_iters is not None and i >= eval_iters:
                break
            xb, yb = xb.to(device), yb.to(device)
            _, loss = model(xb, yb)
            losses.append(loss.item())
        out[split] = sum(losses) / len(losses) if losses else float('inf')
    model.train()
    return out
            

def train_model(model, train_loader, val_loader, epochs, eval_iters=50, lr=1e-3):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model.to(device)

    #Per epoch losses
    train_losses = []
    val_losses =  []
    
    for epoch in tqdm(range(epochs)):
        metrics = estimate_loss(model, train_loader, val_loader, eval_iters=eval_iters)
        val_losses.append(metrics['val'])
        train_losses.append(metrics['train'])

        if epoch == epochs: break
    
        model.train()
        running_loss = 0.0
        for x, y in tqdm(train_loader):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            _, loss = model(x, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        train_losses.append(avg_train_loss)

    return train_losses, val_losses

In [27]:
trained_model = train_model(model, train_loader, val_loader, epochs=5, eval_iters=eval_iters, lr=1e-3)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/781 [00:00<?, ?it/s]

  0%|          | 0/781 [00:00<?, ?it/s]

  0%|          | 0/781 [00:00<?, ?it/s]

  0%|          | 0/781 [00:00<?, ?it/s]

  0%|          | 0/781 [00:00<?, ?it/s]

In [29]:
train_losses, val_losses = trained_model
train_losses

[nan,
 0.025401572620434444,
 nan,
 0.0005067086936080937,
 nan,
 0.0004050570568209186,
 nan,
 0.0003816378037872675,
 nan,
 0.00043399887130055476]