# Encoder-only Transformer Architektur

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
!pip install biopython

Defaulting to user installation because normal site-packages is not writeable


In [41]:
import sys
import random
import numpy as np
import pandas as pd
import ast
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch import Tensor
import time
import math

sys.path.append('../scripts')
#sys.path.append('/content/drive/MyDrive/PMDS/Notebooks')
import ml_helper

In [42]:
#data_path = '/content/drive/MyDrive/PMDS/Data'
data_path = '../data'

In [43]:
SEED = 42
def set_seed(SEED=SEED):
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
set_seed()

In [44]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Prepare Test and Training Data Loader

In [45]:
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*',
               '_']

aminoacids_to_integer = dict((a, i) for i, a in enumerate(amino_acids))
integer_to_aminoacids = dict((i, a) for i, a in enumerate(amino_acids))

codons = ['TTT', 'TTC', 'TTA', 'TTG', 'TCT', 'TCC', 'TCA', 'TCG', 'TAT', 'TAC', 'TAA', 'TAG', 'TGT', 'TGC', 'TGA',
          'TGG', 'CTT', 'CTC', 'CTA', 'CTG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAT', 'CAC', 'CAA', 'CAG', 'CGT', 'CGC',
          'CGA', 'CGG', 'ATT', 'ATC', 'ATA', 'ATG', 'ACT', 'ACC', 'ACA', 'ACG', 'AAT', 'AAC', 'AAA', 'AAG', 'AGT',
          'AGC', 'AGA', 'AGG', 'GTT', 'GTC', 'GTA', 'GTG', 'GCT', 'GCC', 'GCA', 'GCG', 'GAT', 'GAC', 'GAA', 'GAG',
          'GGT', 'GGC', 'GGA', 'GGG', '___']

codons_to_integer = dict((c, i) for i, c in enumerate(codons))
integer_to_codons = dict((i, c) for i, c in enumerate(codons))

In [46]:
organism = "E.Coli"
min_length = None
max_length = 500

SPEEDS_ADDED = True

train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, add_speeds=SPEEDS_ADDED, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
test_dataset = ml_helper.CodonDataset(organism, "test", min_length, max_length, add_speeds=SPEEDS_ADDED, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge test_dataset: {len(test_dataset)}")

Länge train_dataset: 3561
Länge test_dataset: 864


In [47]:
print(train_dataset[3])
print(train_dataset[3][0].shape)
print(train_dataset[3][1].shape)

(tensor([[ 12.0000,   1.0000],
        [ 16.0000,   1.0000],
        [ 18.0000,   2.5500],
        [  0.0000,   0.8800],
        [ 19.0000,   1.5000],
        [  6.0000,   1.8500],
        [ 13.0000,   2.1000],
        [  3.0000,   1.9000],
        [  2.0000,   1.6500],
        [ 19.0000,   1.5000],
        [ 15.0000,   2.5200],
        [  1.0000,   5.8700],
        [ 10.0000,   1.6800],
        [ 18.0000,   2.5500],
        [  7.0000,   2.7500],
        [  3.0000,   1.9000],
        [ 19.0000,   1.5000],
        [  1.0000,   5.8700],
        [  0.0000,   0.8800],
        [ 19.0000,   1.5000],
        [  3.0000,   1.9000],
        [  7.0000,   2.7500],
        [ 19.0000,   1.5000],
        [ 15.0000,   2.5200],
        [  9.0000,   2.1000],
        [  0.0000,   0.8800],
        [  9.0000,   2.1000],
        [ 11.0000,   1.2500],
        [  3.0000,   1.9000],
        [  7.0000,   2.7500],
        [  6.0000,   1.8500],
        [ 13.0000,   2.1000],
        [ 13.0000,   2.1000],
        [

In [48]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define the encoder-only model

In [49]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [50]:
class EncoderClassifier(nn.Module):
    def __init__(self, embed_dim, num_layers, num_heads, dropout=0.2, pos_enc=False):
        super(EncoderClassifier, self).__init__()

        emb_size = embed_dim
        if SPEEDS_ADDED:
            emb_size -= 1
        self.emb = nn.Embedding(len(amino_acids), emb_size, padding_idx=len(amino_acids)-1)
        self.pos_enc = pos_enc
        self.pos_encoder = PositionalEncoding(embed_dim, dropout)

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=self.encoder_layer,
            num_layers=num_layers,
        )
        self.linear = nn.Linear(embed_dim, len(codons))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x.long()
        if SPEEDS_ADDED:
            x1 = self.emb(x[:, :, 0])
            x2 = x[:, :, 1].unsqueeze(-1)
            x = torch.cat((x1, x2), dim=-1)  # Concatenate along the feature dimension
        else:
            x = self.emb(x)

        if self.pos_enc:
            x = self.pos_encoder(x)  # Add positional encoding
        x = self.encoder(x)
        x = self.dropout(x)
        out = self.linear(x)
        return out

In [51]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROP_OUT = 0.2

In [52]:
model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROP_OUT,
    pos_enc=False
).to(device)
print(model)

EncoderClassifier(
  (emb): Embedding(22, 255, padding_idx=21)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (linear1): Linear(in_features=256, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=256, bias=True)
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
   

In [53]:
# Total parameters and trainable parameters.
def print_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f"{total_params:,} total parameters.")
    total_trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
    print(f"{total_trainable_params:,} training parameters.")

In [54]:
print_parameters(model)

6,597,675 total parameters.
6,597,675 training parameters.


In [55]:
def test_forward_pass(model, data_loader):
  batch_data, batch_label = next(iter(data_loader))
  print(f"input dim: {batch_data.shape}")
  output = model(batch_data)
  print(f"output dim: {output.shape}")

In [56]:
test_forward_pass(model, train_loader)

input dim: torch.Size([32, 500, 2])
output dim: torch.Size([32, 500, 65])


## Define the training methods

In [57]:
def train_model(model, optimizer, criterion, num_epochs, print_batches=0):
    for epoch in range(num_epochs):
        model.train()

        epoch_start_time = time.time()
        batch_start_time = time.time()
        epoch_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            # Clear gradients
            optimizer.zero_grad()

            # Forward pass
            input_data, labels = batch

            output = model(input_data)  # (batch_size, seq_len, num_classes)
            output = output.view(-1, len(codons)) # (batch_size * seq_len, num_classes)

            labels = labels.view(-1).long() # (batch_size, seq_len) -> (batch_size * seq_len)

            # Calculate loss
            loss = criterion(output, labels)
            epoch_loss += loss.item()

            # Backward pass
            loss.backward()

            # Update model parameters
            optimizer.step()

            if print_batches != 0 and batch_idx % print_batches == (print_batches-1):
                batch_time =  round(time.time() - batch_start_time,2)
                print(f'Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Time since last batch print: {batch_time} s')
                batch_start_time = time.time()

        epoch_time = round(time.time() - epoch_start_time,2)
        epoch_loss = round(epoch_loss / len(train_loader),4)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}, Took {epoch_time} s')

## Define the evaluation methods to calculate metrics

In [58]:
from sklearn.metrics import accuracy_score

def compute_accuracy(predictions, labels):
    predictions = np.argmax(predictions, axis=1)

    # Find indices where labels are not equal to the padding value
    non_padding_indices = labels != codons_to_integer['___']

    # Filter out predictions and labels where the label is not padding
    filtered_predictions = predictions[non_padding_indices]
    filtered_labels = labels[non_padding_indices]

    acc = accuracy_score(labels, predictions)
    return acc

In [59]:
def evaluate_model(model, criterion):
    model.eval()  # Set the model to evaluation mode

    total_loss = 0.0

    with torch.no_grad():
        accuracies = []
        for batch_idx, batch in enumerate(test_loader):
             # Forward pass
            input_data, labels = batch

            output = model(input_data)  # (batch_size, seq_len, num_classes)
            output = output.view(-1, len(codons)) # (batch_size * seq_len, num_classes)

            labels = labels.view(-1).long() # (batch_size, seq_len) -> (batch_size * seq_len)

            # Calculate loss
            loss = criterion(output, labels)

            # Compute total loss
            total_loss += loss.item()

            # Compute custom metrics
            accuracy = compute_accuracy(output.cpu(), labels.cpu())
            accuracies.append(accuracy)

    # Compute average loss
    avg_loss = total_loss / len(test_loader)

    # Compute average accuracy
    avg_accuracy = np.mean(accuracies)

    print(f'Average Loss: {avg_loss:.4f}')
    print(f'Average Accuracy: {avg_accuracy:.4f}')

    return output, labels

## Training the model

In [60]:
set_seed()
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.2

In [61]:
model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROPOUT,
    pos_enc=False
).to(device)
print_parameters(model)

6,597,675 total parameters.
6,597,675 training parameters.


In [62]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [63]:
EPOCHS = 10
print("----- Start Training -----")
train_model(model, optimizer, criterion, EPOCHS)

----- Start Training -----
Epoch [1/10], Loss: 0.7026, Took 3.67 s
Epoch [2/10], Loss: 0.5737, Took 3.56 s
Epoch [3/10], Loss: 0.5695, Took 3.57 s
Epoch [4/10], Loss: 0.565, Took 3.56 s
Epoch [5/10], Loss: 0.5621, Took 3.57 s
Epoch [6/10], Loss: 0.5607, Took 3.57 s
Epoch [7/10], Loss: 0.5603, Took 3.57 s
Epoch [8/10], Loss: 0.5607, Took 3.57 s
Epoch [9/10], Loss: 0.5595, Took 3.57 s
Epoch [10/10], Loss: 0.5595, Took 3.56 s


In [64]:
batch_data, batch_label = next(iter(train_loader))
output = model(batch_data)
print(output.shape)

torch.Size([32, 500, 65])


In [65]:
ml_helper.save_model(model, 'encoder_256em_4l_4h_02dr_10ep_speeds', organism)

Model saved as 20240522184144_encoder_256em_4l_4h_02dr_10ep_speeds.pt


In [284]:
model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROPOUT,
    pos_enc=False
).to(device)
model = ml_helper.load_model('encoder_256em_4l_4h_02dr_10ep', organism)

Model loaded: 20240518113523_encoder_256em_4l_4h_02dr_10ep.pt


In [285]:
evaluate_model(model, criterion)

Average Loss: 0.5520
Average Accuracy: 0.7443


(tensor([[ 0.9152,  0.5791,  0.0878,  ..., -0.0355, -0.4697, -0.5985],
         [-1.2141, -1.1749, -0.8081,  ..., -0.5642, -0.7749, -0.1969],
         [-0.3878, -0.4670, -1.3479,  ..., -0.3965,  0.2353,  0.5023],
         ...,
         [-2.1591, -1.8035, -1.4565,  ..., -1.9126, -1.7272, 12.4792],
         [-2.1591, -1.8035, -1.4565,  ..., -1.9126, -1.7272, 12.4792],
         [-2.1591, -1.8035, -1.4565,  ..., -1.9126, -1.7272, 12.4792]],
        device='cuda:0'),
 tensor([35, 52, 40,  ..., 64, 64, 64], device='cuda:0'))

## Hyperparameter tuning

In [296]:
def train_parameter_model(embed_dim, num_encoder_layers, num_heads, dropout, pos_enc, num_epochs):
    set_seed()
    
    model = EncoderClassifier(
        embed_dim=embed_dim,
        num_layers=num_encoder_layers,
        num_heads=num_heads,
        dropout=dropout,
        pos_enc=pos_enc
    ).to(device)
    print_parameters(model)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    print(f"----- Start Training: {embed_dim} emb, {num_encoder_layers} layers, {num_heads} heads, {dropout} dropout, positional encoding: {pos_enc}, {num_epochs} epochs -----")
    train_model(model, optimizer, criterion, num_epochs)

    ml_helper.save_model(model, f'encoder_{embed_dim}em_{num_encoder_layers}l_{num_heads}h_{str(dropout).replace(".","")}dr_{num_epochs}ep{"_posenc" if pos_enc else ""}', organism)

### Dropout

In [297]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
dropouts = [0.1, 0.2, 0.3, 0.4, 0.5]
POS_ENC = False
EPOCHS = 10

In [298]:
for DROPOUT in dropouts:
    train_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)

6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.1 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 0.7416, Took 3.49 s
Epoch [2/10], Loss: 0.5721, Took 3.49 s
Epoch [3/10], Loss: 0.5657, Took 3.49 s
Epoch [4/10], Loss: 0.5641, Took 3.5 s
Epoch [5/10], Loss: 0.5632, Took 3.5 s
Epoch [6/10], Loss: 0.5608, Took 3.5 s
Epoch [7/10], Loss: 0.5612, Took 3.5 s
Epoch [8/10], Loss: 0.5596, Took 3.5 s
Epoch [9/10], Loss: 0.5605, Took 3.5 s
Epoch [10/10], Loss: 0.5602, Took 3.51 s
Model saved as 20240518114512_encoder_256em_4l_4h_01dr_10ep.pt
6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.2 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 0.7472, Took 3.51 s
Epoch [2/10], Loss: 0.5737, Took 3.5 s
Epoch [3/10], Loss: 0.5664, Took 3.5 s
Epoch [4/10], Loss: 0.5642, Took 3.51 s
Epoch [5/10], Loss: 0.5631, Took 3.51 s
Epoch [6/10]

### Positional Encoding

In [299]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.3
pos_enc = [True, False]
EPOCHS = 10

In [300]:
for POS_ENC in pos_enc:
    train_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)

6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: True, 10 epochs -----
Epoch [1/10], Loss: 1.4638, Took 3.51 s
Epoch [2/10], Loss: 0.5765, Took 3.51 s
Epoch [3/10], Loss: 0.5681, Took 3.51 s
Epoch [4/10], Loss: 0.5652, Took 3.51 s
Epoch [5/10], Loss: 0.5642, Took 3.51 s
Epoch [6/10], Loss: 0.5619, Took 3.51 s
Epoch [7/10], Loss: 0.5618, Took 3.51 s
Epoch [8/10], Loss: 0.5612, Took 3.51 s
Epoch [9/10], Loss: 0.5614, Took 3.51 s
Epoch [10/10], Loss: 0.5615, Took 3.52 s
Model saved as 20240518115201_encoder_256em_4l_4h_03dr_10ep_posenc.pt
6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 0.753, Took 3.51 s
Epoch [2/10], Loss: 0.5751, Took 3.51 s
Epoch [3/10], Loss: 0.5666, Took 3.51 s
Epoch [4/10], Loss: 0.5644, Took 3.51 s
Epoch [5/10], Loss: 0.5629, Took 3.51 s

### Embedding Dimension

In [301]:
embed_dims = [32, 64, 128, 256, 512, 1028]
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.3
POS_ENC = False
EPOCHS = 10

In [302]:
for EMBED_DIM in embed_dims:
    train_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)

690,369 total parameters.
690,369 training parameters.
----- Start Training: 32 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 1.4734, Took 1.98 s
Epoch [2/10], Loss: 0.7379, Took 1.97 s
Epoch [3/10], Loss: 0.6394, Took 1.97 s
Epoch [4/10], Loss: 0.6079, Took 1.97 s
Epoch [5/10], Loss: 0.593, Took 1.97 s
Epoch [6/10], Loss: 0.584, Took 1.97 s
Epoch [7/10], Loss: 0.5777, Took 1.97 s
Epoch [8/10], Loss: 0.5742, Took 1.97 s
Epoch [9/10], Loss: 0.5738, Took 1.97 s
Epoch [10/10], Loss: 0.5719, Took 1.97 s
Model saved as 20240518115409_encoder_32em_4l_4h_03dr_10ep.pt
1,411,393 total parameters.
1,411,393 training parameters.
----- Start Training: 64 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 0.9535, Took 2.09 s
Epoch [2/10], Loss: 0.6092, Took 2.09 s
Epoch [3/10], Loss: 0.5858, Took 2.09 s
Epoch [4/10], Loss: 0.576, Took 2.09 s
Epoch [5/10], Loss: 0.5714, Took 2.09 s
Epoch [6/10], 

### Number Encoder Layers and Heads

In [303]:
EMBED_DIM = 256
num_encoder_layers = [2, 4, 8, 16]
num_heads = [2, 4, 8, 16]
DROPOUT = 0.3
POS_ENC = False
EPOCHS = 10

In [304]:
for NUM_ENCODER_LAYERS in num_encoder_layers:
    for NUM_HEADS in num_heads:
        train_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)

3,967,553 total parameters.
3,967,553 training parameters.
----- Start Training: 256 emb, 2 layers, 2 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 0.677, Took 1.77 s
Epoch [2/10], Loss: 0.5755, Took 1.77 s
Epoch [3/10], Loss: 0.568, Took 1.76 s
Epoch [4/10], Loss: 0.5656, Took 1.76 s
Epoch [5/10], Loss: 0.5647, Took 1.77 s
Epoch [6/10], Loss: 0.5625, Took 1.76 s
Epoch [7/10], Loss: 0.5639, Took 1.77 s
Epoch [8/10], Loss: 0.562, Took 1.77 s
Epoch [9/10], Loss: 0.5619, Took 1.77 s
Epoch [10/10], Loss: 0.5615, Took 1.76 s
Model saved as 20240518120122_encoder_256em_2l_2h_03dr_10ep.pt
3,967,553 total parameters.
3,967,553 training parameters.
----- Start Training: 256 emb, 2 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 0.6778, Took 1.81 s
Epoch [2/10], Loss: 0.5752, Took 1.81 s
Epoch [3/10], Loss: 0.5679, Took 1.81 s
Epoch [4/10], Loss: 0.5655, Took 1.81 s
Epoch [5/10], Loss: 0.5638, Took 1.81 s
Epoch [

## Training Drosophila.Melanogaster

In [305]:
organism = "Drosophila.Melanogaster"
min_length = None
max_length = 500

train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
test_dataset = ml_helper.CodonDataset(organism, "test", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge test_dataset: {len(test_dataset)}")

Länge train_dataset: 33071
Länge test_dataset: 8100


In [306]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.3
POS_ENC = False
EPOCHS = 10

In [307]:
train_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)

6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 0.753, Took 3.51 s
Epoch [2/10], Loss: 0.5751, Took 3.51 s
Epoch [3/10], Loss: 0.5666, Took 3.51 s
Epoch [4/10], Loss: 0.5644, Took 3.51 s
Epoch [5/10], Loss: 0.5629, Took 3.51 s
Epoch [6/10], Loss: 0.5609, Took 3.51 s
Epoch [7/10], Loss: 0.5612, Took 3.52 s
Epoch [8/10], Loss: 0.5597, Took 3.51 s
Epoch [9/10], Loss: 0.5603, Took 3.52 s
Epoch [10/10], Loss: 0.5604, Took 3.52 s
Model saved as 20240518122826_encoder_256em_4l_4h_03dr_10ep.pt
