# Encoder-only Transformer Architektur

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
!pip install biopython

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import sys
import random
import numpy as np
import pandas as pd
import ast
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch import Tensor
import time
import math

sys.path.append('../scripts')
#sys.path.append('/content/drive/MyDrive/PMDS/Notebooks')
import ml_helper

In [2]:
#data_path = '/content/drive/MyDrive/PMDS/Data'
data_path = '../data'

In [3]:
SEED = 42
def set_seed(SEED=SEED):
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
set_seed()

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Prepare Test and Training Data Loader

In [5]:
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*',
               '_']

aminoacids_to_integer = dict((a, i) for i, a in enumerate(amino_acids))
integer_to_aminoacids = dict((i, a) for i, a in enumerate(amino_acids))

codons = ['TTT', 'TTC', 'TTA', 'TTG', 'TCT', 'TCC', 'TCA', 'TCG', 'TAT', 'TAC', 'TAA', 'TAG', 'TGT', 'TGC', 'TGA',
          'TGG', 'CTT', 'CTC', 'CTA', 'CTG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAT', 'CAC', 'CAA', 'CAG', 'CGT', 'CGC',
          'CGA', 'CGG', 'ATT', 'ATC', 'ATA', 'ATG', 'ACT', 'ACC', 'ACA', 'ACG', 'AAT', 'AAC', 'AAA', 'AAG', 'AGT',
          'AGC', 'AGA', 'AGG', 'GTT', 'GTC', 'GTA', 'GTG', 'GCT', 'GCC', 'GCA', 'GCG', 'GAT', 'GAC', 'GAA', 'GAG',
          'GGT', 'GGC', 'GGA', 'GGG', '___']

codons_to_integer = dict((c, i) for i, c in enumerate(codons))
integer_to_codons = dict((i, c) for i, c in enumerate(codons))

In [6]:
organism = "E.Coli"
min_length = None
max_length = 500

SPEEDS_ADDED = False

train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, add_speeds=SPEEDS_ADDED, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
test_dataset = ml_helper.CodonDataset(organism, "test", min_length, max_length, add_speeds=SPEEDS_ADDED, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge test_dataset: {len(test_dataset)}")

Länge train_dataset: 3561
Länge test_dataset: 864


In [7]:
print(train_dataset[3])
print(train_dataset[3][0].shape)
print(train_dataset[3][1].shape)

(tensor([12, 16, 18,  0, 19,  6, 13,  3,  2, 19, 15,  1, 10, 18,  7,  3, 19,  1,
         0, 19,  3,  7, 19, 15,  9,  0,  9, 11,  3,  7,  6, 13, 13, 15, 12, 10,
         7, 14, 15,  7, 15,  7, 11, 16, 16,  4, 10,  1, 10,  9,  0,  7, 13,  6,
         5, 10, 15,  7,  7,  0,  9, 15,  9, 13,  7, 11, 14,  0, 15,  2, 10, 14,
        14, 17,  6,  1,  3, 19,  2, 16, 19, 13,  5,  3, 18,  0, 10, 13, 14,  8,
        12, 15,  9, 10,  3,  2, 19,  0, 18,  7, 10, 12, 19, 11,  7, 19,  2, 11,
        11,  5,  1,  8,  0, 12,  0,  5,  6,  0, 10,  6, 11, 19,  0, 10,  7, 13,
        19,  8,  5,  1, 11, 14, 15,  5, 10, 15,  7,  7,  5,  1,  5,  1, 19,  0,
         9,  0,  1,  0, 10, 19,  2,  6, 14,  1, 19, 10, 10, 10,  3,  6, 14, 10,
         7,  0, 10,  3, 10, 11, 10,  1,  6,  5, 12,  5, 10,  6, 10, 11, 11, 10,
         5,  5, 15, 10,  7,  9, 16, 13,  9, 13, 19, 16,  8,  3,  5,  7,  6,  0,
        10, 15, 12, 15,  3,  1, 19,  0, 19, 13,  2,  2,  7,  1,  9,  6,  5, 19,
         3, 15, 14,  1,  3, 10, 18, 12,

In [8]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define the encoder-only model

In [9]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [10]:
class EncoderClassifier(nn.Module):
    def __init__(self, embed_dim, num_layers, num_heads, dropout=0.2, pos_enc=False):
        super(EncoderClassifier, self).__init__()

        emb_size = embed_dim
        if SPEEDS_ADDED:
            emb_size -= 1
        self.emb = nn.Embedding(len(amino_acids), emb_size, padding_idx=len(amino_acids)-1)
        self.pos_enc = pos_enc
        self.pos_encoder = PositionalEncoding(embed_dim, dropout)

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=self.encoder_layer,
            num_layers=num_layers,
        )
        self.linear = nn.Linear(embed_dim, len(codons))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x.long()
        if SPEEDS_ADDED:
            x1 = self.emb(x[:, :, 0])
            x2 = x[:, :, 1].unsqueeze(-1)
            x = torch.cat((x1, x2), dim=-1)  # Concatenate along the feature dimension
        else:
            x = self.emb(x)

        if self.pos_enc:
            x = x.transpose(0, 1)
            x = self.pos_encoder(x)  # Add positional encoding
            x = x.transpose(0, 1)
        x = self.encoder(x)
        x = self.dropout(x)
        out = self.linear(x)
        return out

In [11]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROP_OUT = 0.2

In [12]:
model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROP_OUT,
    pos_enc=True
).to(device)
print(model)

EncoderClassifier(
  (emb): Embedding(22, 256, padding_idx=21)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (linear1): Linear(in_features=256, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=256, bias=True)
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
   

In [13]:
# Total parameters and trainable parameters.
def print_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f"{total_params:,} total parameters.")
    total_trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
    print(f"{total_trainable_params:,} training parameters.")

In [14]:
print_parameters(model)

6,597,697 total parameters.
6,597,697 training parameters.


In [15]:
def test_forward_pass(model, data_loader):
  batch_data, batch_label = next(iter(data_loader))
  print(f"input dim: {batch_data.shape}")
  output = model(batch_data)
  print(f"output dim: {output.shape}")

In [16]:
test_forward_pass(model, train_loader)

input dim: torch.Size([32, 500])
output dim: torch.Size([32, 500, 65])


## Define the training methods

In [17]:
def train_model(model, optimizer, criterion, num_epochs, print_batches=0, print_epochs=True):
    start_time = time.time()
    last_loss = None
    for epoch in range(num_epochs):
        model.train()

        epoch_start_time = time.time()
        batch_start_time = time.time()
        epoch_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            # Clear gradients
            optimizer.zero_grad()

            # Forward pass
            input_data, labels = batch

            output = model(input_data)  # (batch_size, seq_len, num_classes)
            output = output.view(-1, len(codons)) # (batch_size * seq_len, num_classes)

            labels = labels.view(-1).long() # (batch_size, seq_len) -> (batch_size * seq_len)

            # Calculate loss
            loss = criterion(output, labels)
            epoch_loss += loss.item()

            # Backward pass
            loss.backward()

            # Update model parameters
            optimizer.step()

            if print_batches != 0 and batch_idx % print_batches == (print_batches-1):
                batch_time =  round(time.time() - batch_start_time,2)
                print(f'Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Time since last batch print: {batch_time} s')
                batch_start_time = time.time()
        
        epoch_time = round(time.time() - epoch_start_time,2)
        epoch_loss = round(epoch_loss / len(train_loader),4)
        last_loss = epoch_loss
        if print_epochs:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}, Took {epoch_time} s')
    total_time = round(time.time() - start_time,2)
    print(f'Last Loss: {last_loss}, Took {total_time} s')
    return last_loss
    

## Define the evaluation methods to calculate metrics

In [18]:
from sklearn.metrics import accuracy_score

def compute_accuracy(predictions, labels):
    predictions = np.argmax(predictions, axis=1)

    # Find indices where labels are not equal to the padding value
    non_padding_indices = labels != codons_to_integer['___']

    # Filter out predictions and labels where the label is not padding
    filtered_predictions = predictions[non_padding_indices]
    filtered_labels = labels[non_padding_indices]

    acc = accuracy_score(labels, predictions)
    return acc

In [19]:
def evaluate_model(model, criterion):
    model.eval()  # Set the model to evaluation mode

    total_loss = 0.0

    with torch.no_grad():
        accuracies = []
        for batch_idx, batch in enumerate(test_loader):
             # Forward pass
            input_data, labels = batch

            output = model(input_data)  # (batch_size, seq_len, num_classes)
            output = output.view(-1, len(codons)) # (batch_size * seq_len, num_classes)

            labels = labels.view(-1).long() # (batch_size, seq_len) -> (batch_size * seq_len)

            # Calculate loss
            loss = criterion(output, labels)

            # Compute total loss
            total_loss += loss.item()

            # Compute custom metrics
            accuracy = compute_accuracy(output.cpu(), labels.cpu())
            accuracies.append(accuracy)

    # Compute average loss
    avg_loss = total_loss / len(test_loader)

    # Compute average accuracy
    avg_accuracy = np.mean(accuracies)

    print(f'Average Loss: {avg_loss:.4f}')
    print(f'Average Accuracy: {avg_accuracy:.4f}')

    return output, labels

## Training the model

In [20]:
set_seed()
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.3

In [94]:
model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROPOUT,
    pos_enc=True
).to(device)
print_parameters(model)

6,597,697 total parameters.
6,597,697 training parameters.


In [95]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [96]:
EPOCHS = 10
print("----- Start Training -----")
train_model(model, optimizer, criterion, EPOCHS)

----- Start Training -----
Epoch [1/10], Loss: 1.4713, Took 3.55 s
Epoch [2/10], Loss: 0.5759, Took 3.54 s
Epoch [3/10], Loss: 0.5672, Took 3.54 s
Epoch [4/10], Loss: 0.5641, Took 3.54 s
Epoch [5/10], Loss: 0.5627, Took 3.54 s
Epoch [6/10], Loss: 0.5604, Took 3.55 s
Epoch [7/10], Loss: 0.5608, Took 3.54 s
Epoch [8/10], Loss: 0.5593, Took 3.55 s
Epoch [9/10], Loss: 0.5604, Took 3.55 s
Epoch [10/10], Loss: 0.5595, Took 3.55 s


In [98]:
batch_data, batch_label = next(iter(train_loader))
output = model(batch_data)
print(output.shape)

torch.Size([32, 500, 65])


In [99]:
ml_helper.save_model(model, 'encoder_256em_4l_4h_03dr_10ep_posenc', organism)

Model saved as 20240526160959_encoder_256em_4l_4h_03dr_10ep_posenc.pt


In [100]:
model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROPOUT,
    pos_enc=False
).to(device)
model = ml_helper.load_model('encoder_256em_4l_4h_03dr_10ep_posenc', organism)

Model loaded: 20240526160959_encoder_256em_4l_4h_03dr_10ep_posenc.pt


In [101]:
evaluate_model(model, criterion)

Average Loss: 0.5523
Average Accuracy: 0.7434


(tensor([[ 0.6396, -0.1035,  0.8757,  ..., -0.0875, -0.5559,  0.6196],
         [-1.2721, -1.9555, -0.3067,  ..., -0.2736, -0.8093,  0.5727],
         [-1.1088, -1.3988, -0.8205,  ..., -0.3505,  0.1035,  0.4657],
         ...,
         [-1.7422, -1.5923, -1.8738,  ..., -2.4118, -2.2591, 12.4480],
         [-1.7013, -1.5504, -1.8589,  ..., -2.4322, -2.2756, 12.4490],
         [-1.6725, -1.5168, -1.8558,  ..., -2.4479, -2.2876, 12.4499]],
        device='cuda:0'),
 tensor([35, 52, 40,  ..., 64, 64, 64], device='cuda:0'))

## Hyperparameter tuning

In [21]:
def train_parameter_model(embed_dim, num_encoder_layers, num_heads, dropout, pos_enc, num_epochs):
    set_seed()
    
    model = EncoderClassifier(
        embed_dim=embed_dim,
        num_layers=num_encoder_layers,
        num_heads=num_heads,
        dropout=dropout,
        pos_enc=pos_enc
    ).to(device)
    print_parameters(model)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    print(f"----- Start Training: {embed_dim} emb, {num_encoder_layers} layers, {num_heads} heads, {dropout} dropout, positional encoding: {pos_enc}, {num_epochs} epochs -----")
    last_loss = train_model(model, optimizer, criterion, num_epochs, print_epochs=False)

    saved = False
    if last_loss >= 1.5:
        print(f"Did not save following model as loss was too high:")
        print(f'encoder_{embed_dim}em_{num_encoder_layers}l_{num_heads}h_{str(dropout).replace(".","")}dr_{num_epochs}ep{"_posenc" if pos_enc else ""}')
    else:
        saved = True
        ml_helper.save_model(model, f'encoder_{embed_dim}em_{num_encoder_layers}l_{num_heads}h_{str(dropout).replace(".","")}dr_{num_epochs}ep{"_posenc" if pos_enc else ""}', organism)
    return saved

In [22]:
def hyper_parameter_training(embed_dims, num_encoder_layers, num_heads, dropouts, pos_enc, epochs=10):
    not_saved = []
    for EMBED_DIM in embed_dims:
        for NUM_ENCODER_LAYERS in num_encoder_layers:
            for NUM_HEADS in num_heads:
                for DROPOUT in dropouts:
                    for POS_ENC in pos_enc:
                        saved = train_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, epochs)
                        if not saved:
                            not_saved.append(f'encoder_{EMBED_DIM}em_{NUM_ENCODER_LAYERS}l_{NUM_HEADS}h_{str(DROPOUT).replace(".","")}dr_{epochs}ep{"_posenc" if POS_ENC else ""}')
    print("------------")
    print("Not saved as loss too high:")
    print(not_saved)

### E.Coli

#### Dropout

In [25]:
EMBED_DIM = [256]
NUM_ENCODER_LAYERS = [4]
NUM_HEADS = [4]
dropouts = [0.1, 0.2, 0.3, 0.4, 0.5]
POS_ENC = [False]
hyper_parameter_training(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, dropouts, POS_ENC)

6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.1 dropout, positional encoding: False, 10 epochs -----
Last Loss: 0.5602, Took 35.54 s
Model saved as 20240527163942_encoder_256em_4l_4h_01dr_10ep.pt
6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.2 dropout, positional encoding: False, 10 epochs -----
Last Loss: 0.5603, Took 35.54 s
Model saved as 20240527164018_encoder_256em_4l_4h_02dr_10ep.pt
6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Last Loss: 0.5604, Took 35.59 s
Model saved as 20240527164053_encoder_256em_4l_4h_03dr_10ep.pt
6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.4 dropout, positional encoding: False, 10 epochs -----
Last Loss: 0.5605, Took 35.59 s
Model saved as 2024

#### Positional Encoding

In [26]:
EMBED_DIM = [256]
NUM_ENCODER_LAYERS = [4]
NUM_HEADS = [4]
DROPOUTS = [0.3]
pos_enc = [True, False]
hyper_parameter_training(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUTS, pos_enc)

6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: True, 10 epochs -----
Last Loss: 0.5595, Took 35.61 s
Model saved as 20240527164446_encoder_256em_4l_4h_03dr_10ep_posenc.pt
6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Last Loss: 0.5604, Took 35.48 s
Model saved as 20240527164522_encoder_256em_4l_4h_03dr_10ep.pt
------------
Not saved as loss too high:
[]


#### Embedding Dimension

In [None]:
embed_dims = [32, 64, 128, 256, 512, 1028]
NUM_ENCODER_LAYERS = [4]
NUM_HEADS = [4]
DROPOUTS = [0.3]
POS_ENC = [False]
hyper_parameter_training(embed_dims, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUTS, POS_ENC)

#### Number Encoder Layers and Heads

In [None]:
EMBED_DIM = [256]
num_encoder_layers = [2, 4, 8, 16]
num_heads = [2, 4, 8, 16]
DROPOUTS = [0.3]
POS_ENC = [False]
hyper_parameter_training(EMBED_DIM, num_encoder_layers, num_heads, DROPOUTS, POS_ENC)

## Drosophila.Melanogaster

In [20]:
organism = "Drosophila.Melanogaster"
min_length = None
max_length = 500

train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
test_dataset = ml_helper.CodonDataset(organism, "test", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge test_dataset: {len(test_dataset)}")

Länge train_dataset: 33071
Länge test_dataset: 8100


In [None]:
embed_dims = [256, 512]
num_encoder_layers = [4, 8]
num_heads = [4, 8]
DROPOUTS = [0.3]
POS_ENC = [False]
hyper_parameter_training(embed_dims, num_encoder_layers, num_heads, DROPOUTS, POS_ENC)

## Homo.Sapiens

In [23]:
%%time

organism = "Homo.Sapiens"
min_length = None
max_length = 500

train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, cut_data=True, one_hot_aa=False, filter_x=True, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
test_dataset = ml_helper.CodonDataset(organism, "test", min_length, max_length, cut_data=True, one_hot_aa=False, filter_x=True, data_path=data_path, device=device)
print(f"Länge test_dataset: {len(test_dataset)}")

Länge train_dataset: 140902
Länge test_dataset: 35210
CPU times: user 1min 41s, sys: 508 ms, total: 1min 42s
Wall time: 1min 42s


In [25]:
embed_dims = [256, 512]
num_encoder_layers = [4, 6]
num_heads = [4, 8]
DROPOUTS = [0.3]
POS_ENC = [False]
hyper_parameter_training(embed_dims, num_encoder_layers, num_heads, DROPOUTS, POS_ENC)

6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Last Loss: 0.5604, Took 35.08 s
Model saved as 20240527185016_encoder_256em_4l_4h_03dr_10ep.pt
6,597,697 total parameters.
6,597,697 training parameters.
----- Start Training: 256 emb, 4 layers, 8 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Last Loss: 0.5604, Took 40.84 s
Model saved as 20240527185057_encoder_256em_4l_8h_03dr_10ep.pt
9,227,841 total parameters.
9,227,841 training parameters.
----- Start Training: 256 emb, 6 layers, 4 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Last Loss: 2.8327, Took 52.7 s
Did not save following model as loss was too high:
encoder_256em_6l_4h_03dr_10ep
9,227,841 total parameters.
9,227,841 training parameters.
----- Start Training: 256 emb, 6 layers, 8 heads, 0.3 dropout, positional encoding: False, 10 epochs -----
Last Loss: 2.1347, Took 61.23 s
Di