# Encoder-only Transformer Architecture

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
!pip install biopython

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import sys
import random
import numpy as np
import pandas as pd
import ast
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch import Tensor
from torch.nn import TransformerEncoderLayer
import time
import math

sys.path.append('../scripts')
#sys.path.append('/content/drive/MyDrive/PMDS/Notebooks')
import ml_helper
import custom_transformer_encoder as custom_te

In [2]:
# Directory holding the datasets; passed to ml_helper.CodonDataset below.
# Colab alternative (kept for reference):
#data_path = '/content/drive/MyDrive/PMDS/Data'
data_path = '../data'

In [3]:
# Default seed shared by all experiments in this notebook.
SEED = 42


def set_seed(SEED=SEED):
    """Seed every random number generator this notebook relies on.

    Args:
        SEED: integer seed handed to Python's ``random``, NumPy and PyTorch
            (CPU and CUDA). Defaults to the module-level ``SEED``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(SEED)


# Seed once right away so the cells below start from a known state.
set_seed()

In [4]:
# Select the compute device. The second GPU ("cuda:1") is still preferred, but on a
# machine with a single GPU that index does not exist, so fall back to "cuda:0".
# Without CUDA the CPU is used.
device = torch.device(
    f"cuda:{min(1, torch.cuda.device_count() - 1)}" if torch.cuda.is_available() else "cpu"
)
print(device)

cpu


## Prepare Valid and Training Data Loader

In [5]:
# Input vocabulary: one-letter amino-acid codes followed by '*' and '_'.
# '_' (the last entry) is the padding symbol: its index is used as padding_idx of the embedding.
amino_acids = list("ARNDCQEGHILKMFPSTWYV*_")

aminoacids_to_integer = {symbol: index for index, symbol in enumerate(amino_acids)}
integer_to_aminoacids = dict(enumerate(amino_acids))

# Output vocabulary: all 64 base triplets enumerated with the base order T, C, A, G at
# every position, followed by '___', the padding label that the loss and the accuracy
# computation skip.
_BASES = "TCAG"
codons = [first + second + third for first in _BASES for second in _BASES for third in _BASES]
codons.append('___')

codons_to_integer = {triplet: index for index, triplet in enumerate(codons)}
integer_to_codons = dict(enumerate(codons))

In [6]:
# Dataset selection for the first experiments.
organism = "E.Coli"
min_length = None
max_length = 500

# Also read by EncoderClassifier: when True, the embedding is one dimension narrower and
# the second input feature is concatenated to it.
SPEEDS_ADDED = False

# Build the training and validation splits with the project helper and report their sizes.
train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, add_speeds=SPEEDS_ADDED, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
valid_dataset = ml_helper.CodonDataset(organism, "valid", min_length, max_length, add_speeds=SPEEDS_ADDED, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge valid_dataset: {len(valid_dataset)}")

Länge train_dataset: 3555
Länge valid_dataset: 420


In [7]:
# Inspect a single training sample: the whole tuple, then the shapes of its first two elements.
print(train_dataset[3])
print(train_dataset[3][0].shape)
print(train_dataset[3][1].shape)

(tensor([12, 16, 18,  0, 19,  6, 13,  3,  2, 19, 15,  1, 10, 18,  7,  3, 19,  1,
         0, 19,  3,  7, 19, 15,  9,  0,  9, 11,  3,  7,  6, 13, 13, 15, 12, 10,
         7, 14, 15,  7, 15,  7, 11, 16, 16,  4, 10,  1, 10,  9,  0,  7, 13,  6,
         5, 10, 15,  7,  7,  0,  9, 15,  9, 13,  7, 11, 14,  0, 15,  2, 10, 14,
        14, 17,  6,  1,  3, 19,  2, 16, 19, 13,  5,  3, 18,  0, 10, 13, 14,  8,
        12, 15,  9, 10,  3,  2, 19,  0, 18,  7, 10, 12, 19, 11,  7, 19,  2, 11,
        11,  5,  1,  8,  0, 12,  0,  5,  6,  0, 10,  6, 11, 19,  0, 10,  7, 13,
        19,  8,  5,  1, 11, 14, 15,  5, 10, 15,  7,  7,  5,  1,  5,  1, 19,  0,
         9,  0,  1,  0, 10, 19,  2,  6, 14,  1, 19, 10, 10, 10,  3,  6, 14, 10,
         7,  0, 10,  3, 10, 11, 10,  1,  6,  5, 12,  5, 10,  6, 10, 11, 11, 10,
         5,  5, 15, 10,  7,  9, 16, 13,  9, 13, 19, 16,  8,  3,  5,  7,  6,  0,
        10, 15, 12, 15,  3,  1, 19,  0, 19, 13,  2,  2,  7,  1,  9,  6,  5, 19,
         3, 15, 14,  1,  3, 10, 18, 12,

In [8]:
# Mini-batch loaders: training batches are shuffled, validation batches keep their order.
# train_loader / valid_loader are module-level names read by train_model and evaluate_model.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define the encoder-only model

In [9]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    A table ``pe`` of shape ``[max_len, 1, d_model]`` is precomputed: even feature
    indices hold ``sin(position * div_term)`` and odd feature indices hold the
    matching cosine. The table is registered as a buffer, so it follows the module
    across devices but is not a trainable parameter.

    Args:
        d_model: size of the feature dimension. Both even and odd values are supported.
        dropout: dropout probability applied after the encoding has been added.
        max_len: longest sequence (number of positions) that can be encoded.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns; trimming
        # the angles avoids the shape mismatch the unsliced assignment would raise.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x`` with the positional table added and
            dropout applied.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` the table was built for.
        """
        if x.size(0) > self.pe.size(0):
            raise ValueError(
                f"Sequence length {x.size(0)} exceeds the positional encoding max_len {self.pe.size(0)}."
            )
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [10]:
class EncoderClassifier(nn.Module):
    """Encoder-only transformer that predicts one codon class per sequence position.

    Pipeline: amino-acid embedding (optionally concatenated with a per-position speed
    feature when the module-level ``SPEEDS_ADDED`` flag is set) -> optional sinusoidal
    positional encoding -> ``custom_te.CustomTransformerEncoder`` -> dropout -> linear
    layer producing ``len(codons)`` logits.

    Args:
        embed_dim: model width fed to the encoder. With ``SPEEDS_ADDED`` the embedding
            itself is one dimension narrower so that embedding + speed has this width.
        num_layers: number of encoder layers.
        num_heads: number of attention heads per layer.
        dropout: dropout probability of the positional encoder and of the dropout
            applied before the final linear layer.
        pos_enc: whether to add the positional encoding.

    NOTE(review): ``dropout`` is not forwarded to the encoder layers, which therefore
    use the default of ``CustomTransformerEncoderLayer`` — confirm that this is intended.
    """

    def __init__(self, embed_dim, num_layers, num_heads, dropout=0.2, pos_enc=False):
        super(EncoderClassifier, self).__init__()

        emb_size = embed_dim
        if SPEEDS_ADDED:
            # Reserve one feature for the speed value concatenated in forward().
            emb_size -= 1
        # The last amino-acid index ('_') is the padding symbol.
        self.emb = nn.Embedding(len(amino_acids), emb_size, padding_idx=len(amino_acids)-1)
        self.pos_enc = pos_enc
        self.pos_encoder = PositionalEncoding(embed_dim, dropout)

        # Kept as an attribute (and therefore as a registered sub-module) so that the
        # module's parameter names stay the same as before.
        self.encoder_layer = custom_te.CustomTransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            batch_first=True
        )
        self.encoder = custom_te.CustomTransformerEncoder(
            encoder_layer=self.encoder_layer,
            num_layers=num_layers,
        )

        self.linear = nn.Linear(embed_dim, len(codons))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_weights_needed=False):
        """Compute codon logits for a batch.

        Args:
            x: batch-first input. Without speeds a 2-D tensor of amino-acid indices;
                with ``SPEEDS_ADDED`` a 3-D tensor whose last axis holds
                ``(amino-acid index, speed)``.
            attn_weights_needed: when True, the attention weights returned by the
                encoder are returned as well.

        Returns:
            ``out`` with one ``len(codons)``-sized logit vector per position, or the
            tuple ``(out, attn_weights)`` when ``attn_weights_needed`` is True.
        """
        if SPEEDS_ADDED:
            # Split the features BEFORE casting: only the index column becomes integer.
            # Casting the whole tensor to long first would truncate the speed values
            # (assumes speeds can be fractional — TODO confirm against CodonDataset).
            embedded = self.emb(x[:, :, 0].long())
            speeds = x[:, :, 1].unsqueeze(-1).to(embedded.dtype)
            x = torch.cat((embedded, speeds), dim=-1)  # Concatenate along the feature dimension
        else:
            x = self.emb(x.long())

        if self.pos_enc:
            # PositionalEncoding expects a sequence-first layout.
            x = self.pos_encoder(x.transpose(0, 1)).transpose(0, 1)

        encoded = self.encoder(x, attn_weights_needed=attn_weights_needed)
        if attn_weights_needed:
            encoded, attn_weights = encoded
            return self.linear(self.dropout(encoded)), attn_weights
        return self.linear(self.dropout(encoded))


In [11]:
# Hyperparameters of the demo model built in the next cell.
# NOTE: DROP_OUT is only used for that demo model; the training section defines DROPOUT instead.
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROP_OUT = 0.2

In [12]:
# Build the demo model (with positional encoding), move it to the selected device and
# print its module tree.
model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROP_OUT,
    pos_enc=True
).to(device)
print(model)

EncoderClassifier(
  (emb): Embedding(22, 256, padding_idx=21)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder_layer): CustomTransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (linear1): Linear(in_features=256, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=256, bias=True)
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): CustomTransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x CustomTransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=T

In [13]:
def print_parameters(model):
    """Print how many parameters ``model`` has in total and how many are trainable.

    Args:
        model: any object exposing ``parameters()`` like a ``torch.nn.Module``.
    """
    total_count = 0
    trainable_count = 0
    for parameter in model.parameters():
        size = parameter.numel()
        total_count += size
        if parameter.requires_grad:
            trainable_count += size
    print(f"{total_count:,} total parameters.")
    print(f"{trainable_count:,} training parameters.")

In [14]:
# Report the size of the demo model built above.
print_parameters(model)

6,597,697 total parameters.
6,597,697 training parameters.


In [17]:
def test_forward_pass(model, data_loader):
  batch_data, batch_label = next(iter(data_loader))
  print(f"input dim: {batch_data.shape}")
  output, attn_weights = model(batch_data, attn_weights_needed=True)
  print(f"output dim: {output.shape}")
  print(f"attr_weights dim: {attn_weights}")

In [18]:
# Smoke-test one forward pass of the demo model on a training batch.
test_forward_pass(model, train_loader)

input dim: torch.Size([32, 500])
output dim: torch.Size([32, 500, 65])
attr_weights dim: [tensor([[[0.0019, 0.0040, 0.0026,  ..., 0.0020, 0.0024, 0.0015],
         [0.0024, 0.0024, 0.0016,  ..., 0.0015, 0.0017, 0.0012],
         [0.0034, 0.0020, 0.0034,  ..., 0.0015, 0.0012, 0.0018],
         ...,
         [0.0022, 0.0006, 0.0026,  ..., 0.0021, 0.0013, 0.0024],
         [0.0026, 0.0029, 0.0011,  ..., 0.0017, 0.0021, 0.0024],
         [0.0023, 0.0026, 0.0022,  ..., 0.0016, 0.0016, 0.0022]],

        [[0.0012, 0.0018, 0.0019,  ..., 0.0020, 0.0024, 0.0016],
         [0.0028, 0.0039, 0.0053,  ..., 0.0023, 0.0026, 0.0015],
         [0.0033, 0.0060, 0.0041,  ..., 0.0021, 0.0018, 0.0007],
         ...,
         [0.0017, 0.0024, 0.0020,  ..., 0.0022, 0.0022, 0.0024],
         [0.0014, 0.0023, 0.0017,  ..., 0.0022, 0.0025, 0.0026],
         [0.0026, 0.0024, 0.0025,  ..., 0.0022, 0.0024, 0.0023]],

        [[0.0044, 0.0039, 0.0030,  ..., 0.0034, 0.0036, 0.0013],
         [0.0042, 0.0034, 0.0030,

## Define the evaluation methods to calculate metrics

In [18]:
def count_correct_predictions(predictions, labels, padding_idx=None):
    """Count non-padding positions and how many of them were predicted correctly.

    Args:
        predictions: 2-D scores of shape ``(num_positions, num_classes)``; a torch
            tensor or anything ``torch.as_tensor`` accepts (e.g. a NumPy array).
        labels: 1-D class indices of length ``num_positions``.
        padding_idx: label value that marks padding and is excluded from both counts.
            Defaults to the index of the ``'___'`` codon.

    Returns:
        ``(codon_num, correct_codons)``: the number of non-padding positions and the
        number of those whose arg-max prediction equals the label, both as ``int``.
    """
    if padding_idx is None:
        padding_idx = codons_to_integer['___']

    # Stay in torch instead of calling np.argmax on a tensor, which relies on implicit
    # NumPy interop.
    predictions = torch.as_tensor(predictions)
    labels = torch.as_tensor(labels)
    predicted_classes = predictions.argmax(dim=1)

    # Keep only the positions whose label is a real codon.
    non_padding = labels != padding_idx
    codon_num = int(non_padding.sum().item())
    correct_codons = int((predicted_classes[non_padding] == labels[non_padding]).sum().item())
    return codon_num, correct_codons

In [19]:
def evaluate_model(model, criterion, print_scores=True, loss_without_pad=False, data_loader=None):
    """Evaluate ``model`` and return the mean batch loss and the codon accuracy.

    Args:
        model: network returning per-position class scores for a batch of inputs.
        criterion: loss called as ``criterion(scores, labels)`` on flattened tensors.
        print_scores: when True, print the average loss and the accuracy.
        loss_without_pad: currently unused; kept so existing calls keep working.
        data_loader: iterable of ``(inputs, labels)`` batches. Defaults to the
            module-level ``valid_loader``.

    Returns:
        ``(avg_loss, accuracy)``: the unweighted mean of the per-batch losses and the
        share of correctly predicted non-padding codons rounded to 4 decimals. With no
        batches the loss is ``nan``; with no non-padding labels the accuracy is ``0.0``.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    if data_loader is None:
        data_loader = valid_loader

    model.eval()  # Set the model to evaluation mode

    total_loss = 0.0
    num_batches = 0
    codon_num = 0
    correct_codon_num = 0

    with torch.no_grad():
        for input_data, labels in data_loader:
            # Forward pass
            output = model(input_data)  # (batch_size, seq_len, num_classes)
            output = output.reshape(-1, output.size(-1))  # (batch_size * seq_len, num_classes)

            labels = labels.reshape(-1).long()  # (batch_size, seq_len) -> (batch_size * seq_len)

            # Accumulate the batch loss
            total_loss += criterion(output, labels).item()
            num_batches += 1

            # Count codons and correct codon predictions
            codon_num_batch, correct_codons_batch = count_correct_predictions(output.cpu(), labels.cpu())
            codon_num += codon_num_batch
            correct_codon_num += correct_codons_batch

    # Guard both divisions so an empty loader or all-padding labels do not raise.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = round(correct_codon_num / codon_num, 4) if codon_num else 0.0

    if print_scores:
        print(f'Average Batch Loss: {avg_loss:.4f}')
        print(f'Accuracy: {accuracy:.4f}')

    return avg_loss, accuracy

## Define the training methods

In [33]:
def train_model(model, num_epochs, loss_ignore_pad=True, learning_rate=0.0005, validation_stop=True, validation_stop_area=7, print_batches=0, print_epochs=True):
    """Train ``model`` on the module-level ``train_loader`` with Adam and cross-entropy.

    After every epoch the model is evaluated with ``evaluate_model`` and, if enabled,
    an early-stopping check is run on the validation accuracies.

    Args:
        model: network to train; called as ``model(inputs)``.
        num_epochs: maximum number of epochs.
        loss_ignore_pad: when True, the ``'___'`` padding label is ignored by the loss.
        learning_rate: Adam learning rate.
        validation_stop: enable early stopping on the validation accuracy.
        validation_stop_area: window parameter of the early-stopping check; the check
            runs once ``validation_stop_area + 1`` accuracies have been collected.
        print_batches: print a progress line every ``print_batches`` batches (0 = never).
        print_epochs: print a summary line after every epoch.

    Returns:
        ``(last_loss, accuracy, epoch_num)``: mean training loss of the last epoch
        (rounded to 4 decimals), validation accuracy of the last epoch and the number
        of epochs actually run. ``last_loss`` and ``accuracy`` are ``None`` when
        ``num_epochs`` is 0.
    """
    if loss_ignore_pad:
        criterion = torch.nn.CrossEntropyLoss(ignore_index=codons_to_integer['___'])
    else:
        criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    start_time = time.time()
    last_loss = None
    accuracy = None  # defined up front so the summary below also works for num_epochs == 0
    saved_accuracies = []
    epoch_num = 0
    for epoch in range(num_epochs):
        epoch_num = epoch + 1
        # Re-seed with the epoch index so every epoch is reproducible on its own.
        set_seed(epoch)
        model.train()

        epoch_start_time = time.time()
        batch_start_time = time.time()
        epoch_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            # Clear gradients
            optimizer.zero_grad()

            # Forward pass
            input_data, labels = batch

            output = model(input_data)  # (batch_size, seq_len, num_classes)
            output = output.view(-1, len(codons))  # (batch_size * seq_len, num_classes)

            labels = labels.view(-1).long()  # (batch_size, seq_len) -> (batch_size * seq_len)

            # Calculate loss
            loss = criterion(output, labels)
            epoch_loss += loss.item()

            # Backward pass
            loss.backward()

            # Update model parameters
            optimizer.step()

            if print_batches != 0 and batch_idx % print_batches == (print_batches-1):
                batch_time = round(time.time() - batch_start_time, 2)
                print(f'Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Time since last batch print: {batch_time} s')
                batch_start_time = time.time()

        epoch_loss = round(epoch_loss / len(train_loader), 4)
        last_loss = epoch_loss

        avg_eval_loss, accuracy = evaluate_model(model, criterion, print_scores=False)

        epoch_time = round(time.time() - epoch_start_time, 2)
        if print_epochs:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}, Eval Accuracy: {accuracy}, Took {epoch_time} s')

        if validation_stop:
            saved_accuracies.append(accuracy)
            if len(saved_accuracies) == validation_stop_area+1:
                # Sliding window of validation_stop_area + 1 accuracies: stop when the
                # mean of the two newest values is below the mean of the oldest
                # validation_stop_area - 2 values.
                # NOTE(review): the value at index validation_stop_area - 2 is in neither
                # slice, and the message below speaks of "the last N accuracies" —
                # confirm that this window is intended.
                newest = saved_accuracies[validation_stop_area-1:validation_stop_area+1]
                oldest = saved_accuracies[0:validation_stop_area-2]
                if np.average(np.array(newest)) < np.average(np.array(oldest)):
                    print(f'Stopped early after epoch {epoch+1} as validation accuracy was lower than average of the last {validation_stop_area} accuracies.')
                    break
                saved_accuracies.pop(0)

    total_time = round(time.time() - start_time, 2)
    print(f'Last Loss: {last_loss}, Last Eval Accuracy: {accuracy}, Took {total_time} s')
    return last_loss, accuracy, epoch_num

## Training the model

In [21]:
# Re-seed so the weight initialisation below is reproducible, then build the model for
# the single training run (this time without positional encoding) and report its size.
set_seed()
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.3

model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROPOUT,
    pos_enc=False
).to(device)
print_parameters(model)

6,597,697 total parameters.
6,597,697 training parameters.


In [22]:
# Epoch budget for the single-model run.
EPOCHS = 50
print("----- Start Training -----")
# The training call is disabled; uncomment it to train the model defined above.
#train_model(model, EPOCHS)

----- Start Training -----


In [23]:
#ml_helper.save_model(model, f'encoder_256em_4l_4h_03dr_10ep', organism)

In [24]:
#model = ml_helper.load_model(f'encoder_256em_4l_4h_03dr_10ep', organism)

In [25]:
# Loss that ignores positions labelled with the '___' padding codon, for stand-alone evaluation.
criterion = torch.nn.CrossEntropyLoss(ignore_index=codons_to_integer['___'])
# The evaluation call is disabled; uncomment it to score the current model.
#evaluate_model(model, criterion)

## Hyperparameter tuning

In [46]:
def train_parameter_model(embed_dim, num_encoder_layers, num_heads, dropout, pos_enc, num_epochs, print_epochs):
    """Train one model for a single hyperparameter combination and save it if it converged.

    The RNGs are re-seeded first so every configuration starts from the same state.
    The model is saved through ``ml_helper.save_model`` under a name that encodes the
    configuration and the number of epochs actually run, unless the final training
    loss is 2 or higher.

    Args:
        embed_dim: model width.
        num_encoder_layers: number of encoder layers.
        num_heads: number of attention heads.
        dropout: dropout probability passed to ``EncoderClassifier``.
        pos_enc: whether positional encoding is used.
        num_epochs: maximum number of training epochs.
        print_epochs: forwarded to ``train_model``.

    Returns:
        ``(saved, accuracy)``: whether the model was saved and its last validation accuracy.
    """
    set_seed()

    classifier = EncoderClassifier(
        embed_dim=embed_dim,
        num_layers=num_encoder_layers,
        num_heads=num_heads,
        dropout=dropout,
        pos_enc=pos_enc
    ).to(device)

    print(f"----- Start Training: {embed_dim} emb, {num_encoder_layers} layers, {num_heads} heads, {dropout} dropout, positional encoding: {pos_enc}, {num_epochs} epochs -----")
    last_loss, accuracy, epoch_num = train_model(classifier, num_epochs, print_epochs=print_epochs)

    # The name is built once; it carries the number of epochs that were actually trained.
    posenc_tag = "_posenc" if pos_enc else ""
    dropout_tag = str(dropout).replace(".", "")
    model_name = f'encoder_{embed_dim}em_{num_encoder_layers}l_{num_heads}h{posenc_tag}_{dropout_tag}dr_{epoch_num}ep'

    if last_loss >= 2:
        print("Did not save following model as loss was too high:")
        print(model_name)
        return False, accuracy

    ml_helper.save_model(classifier, model_name, organism)
    return True, accuracy

In [47]:
def hyper_parameter_training(embed_dims, num_encoder_layers, num_heads, dropouts, pos_enc, epochs=50, print_epochs=True):
    """Grid-search over every combination of the given hyperparameter values.

    Each combination is trained with ``train_parameter_model``. Combinations are
    visited with ``embed_dims`` varying slowest and ``pos_enc`` varying fastest.

    Args:
        embed_dims: iterable of model widths.
        num_encoder_layers: iterable of encoder layer counts.
        num_heads: iterable of attention head counts.
        dropouts: iterable of dropout probabilities.
        pos_enc: iterable of booleans (use positional encoding or not).
        epochs: maximum number of epochs per run.
        print_epochs: forwarded to the training routine.

    Returns:
        dict mapping a configuration name to the last validation accuracy of that
        run. The names use the requested ``epochs`` value, not the number of epochs
        actually trained.
    """
    accuracies = {}
    not_saved = []

    grid = (
        (width, layers, heads, drop, use_pos)
        for width in embed_dims
        for layers in num_encoder_layers
        for heads in num_heads
        for drop in dropouts
        for use_pos in pos_enc
    )
    for width, layers, heads, drop, use_pos in grid:
        posenc_tag = "_posenc" if use_pos else ""
        dropout_tag = str(drop).replace(".", "")
        model_name = f'encoder_{width}em_{layers}l_{heads}h{posenc_tag}_{dropout_tag}dr_{epochs}ep'

        saved, accuracy = train_parameter_model(width, layers, heads, drop, use_pos, epochs, print_epochs)
        accuracies[model_name] = accuracy
        if not saved:
            not_saved.append(model_name)

    print("------------")
    print("Not saved as loss too high:")
    print(not_saved)
    return accuracies

### E.Coli

In [31]:
# Select E.Coli for the tuning runs below. `organism` is also read by
# train_parameter_model when it saves a model.
organism = "E.Coli"
min_length = None
max_length = 500

# NOTE(review): add_speeds is not passed here, so CodonDataset's default applies —
# confirm that it matches SPEEDS_ADDED, which the model reads.
train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
valid_dataset = ml_helper.CodonDataset(organism, "valid", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge valid_dataset: {len(valid_dataset)}")

# Rebind the module-level loaders that train_model and evaluate_model read.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

Länge train_dataset: 3555
Länge valid_dataset: 420


In [None]:
# Train a single model: every list holds exactly one value, so the grid has one entry.
# NOTE: this rebinds EMBED_DIM, NUM_ENCODER_LAYERS and NUM_HEADS (plain ints in the
# earlier model-building cells) to lists; re-run those cells' own assignments before
# building a model from them again.
EMBED_DIM = [256]
NUM_ENCODER_LAYERS = [4]
NUM_HEADS = [4]
dropouts = [0.3]
POS_ENC = [False]
hyper_parameter_training(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, dropouts, POS_ENC, epochs=100)

#### Dropout

In [34]:
# Sweep the dropout probability (5 values) with all other settings fixed.
# The call uses the default epoch budget of hyper_parameter_training.
EMBED_DIM = [256]
NUM_ENCODER_LAYERS = [4]
NUM_HEADS = [4]
dropouts = [0.1, 0.2, 0.3, 0.4, 0.5]
POS_ENC = [False]
hyper_parameter_training(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, dropouts, POS_ENC)

----- Start Training: 256 emb, 4 layers, 4 heads, 0.1 dropout, positional encoding: False, 50 epochs -----
Epoch [1/50], Loss: 1.1639, Eval Accuracy: 0.4962, Took 3.68 s
Epoch [2/50], Loss: 1.0538, Eval Accuracy: 0.5166, Took 3.68 s
Epoch [3/50], Loss: 1.0461, Eval Accuracy: 0.5178, Took 3.69 s
Epoch [4/50], Loss: 1.0421, Eval Accuracy: 0.52, Took 3.68 s
Epoch [5/50], Loss: 1.0397, Eval Accuracy: 0.5076, Took 3.71 s
Epoch [6/50], Loss: 1.0378, Eval Accuracy: 0.5178, Took 3.71 s
Epoch [7/50], Loss: 1.0361, Eval Accuracy: 0.5162, Took 3.71 s
Epoch [8/50], Loss: 1.0355, Eval Accuracy: 0.5191, Took 3.71 s
Epoch [9/50], Loss: 1.0326, Eval Accuracy: 0.5233, Took 3.71 s
Epoch [10/50], Loss: 1.0331, Eval Accuracy: 0.5229, Took 3.71 s
Epoch [11/50], Loss: 1.0323, Eval Accuracy: 0.5174, Took 3.71 s
Epoch [12/50], Loss: 1.0308, Eval Accuracy: 0.5192, Took 3.72 s
Epoch [13/50], Loss: 1.031, Eval Accuracy: 0.5226, Took 3.72 s
Epoch [14/50], Loss: 1.0299, Eval Accuracy: 0.5203, Took 3.71 s
Epoch [15

{'encoder_256em_4l_4h_01dr_50ep': 0.5183,
 'encoder_256em_4l_4h_02dr_50ep': 0.5189,
 'encoder_256em_4l_4h_03dr_50ep': 0.5182,
 'encoder_256em_4l_4h_04dr_50ep': 0.5182,
 'encoder_256em_4l_4h_05dr_50ep': 0.5174}

#### Embedding Dimension

In [35]:
# Sweep the embedding dimension (6 values) with layers, heads and dropout fixed.
embed_dims = [16, 32, 64, 128, 256, 512]
NUM_ENCODER_LAYERS = [4]
NUM_HEADS = [4]
DROPOUTS = [0.5]
POS_ENC = [False]
hyper_parameter_training(embed_dims, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUTS, POS_ENC)

----- Start Training: 16 emb, 4 layers, 4 heads, 0.5 dropout, positional encoding: False, 50 epochs -----
Epoch [1/50], Loss: 3.0332, Eval Accuracy: 0.5129, Took 2.03 s
Epoch [2/50], Loss: 2.3796, Eval Accuracy: 0.5145, Took 2.02 s
Epoch [3/50], Loss: 1.9794, Eval Accuracy: 0.5145, Took 2.02 s
Epoch [4/50], Loss: 1.7237, Eval Accuracy: 0.5155, Took 2.03 s
Epoch [5/50], Loss: 1.5641, Eval Accuracy: 0.5155, Took 2.02 s
Epoch [6/50], Loss: 1.4599, Eval Accuracy: 0.5145, Took 2.02 s
Epoch [7/50], Loss: 1.3854, Eval Accuracy: 0.5155, Took 2.02 s
Epoch [8/50], Loss: 1.3323, Eval Accuracy: 0.5155, Took 2.02 s
Epoch [9/50], Loss: 1.291, Eval Accuracy: 0.5155, Took 2.03 s
Epoch [10/50], Loss: 1.2609, Eval Accuracy: 0.5155, Took 2.03 s
Epoch [11/50], Loss: 1.2362, Eval Accuracy: 0.5155, Took 2.03 s
Epoch [12/50], Loss: 1.2157, Eval Accuracy: 0.5155, Took 2.03 s
Epoch [13/50], Loss: 1.2004, Eval Accuracy: 0.5155, Took 2.03 s
Epoch [14/50], Loss: 1.1867, Eval Accuracy: 0.5155, Took 2.03 s
Epoch [1

{'encoder_16em_4l_4h_05dr_50ep': 0.5145,
 'encoder_32em_4l_4h_05dr_50ep': 0.5199,
 'encoder_64em_4l_4h_05dr_50ep': 0.5219,
 'encoder_128em_4l_4h_05dr_50ep': 0.5195,
 'encoder_256em_4l_4h_05dr_50ep': 0.5174,
 'encoder_512em_4l_4h_05dr_50ep': 0.506}

#### Number Encoder Layers and Heads

In [36]:
# Joint sweep over the number of encoder layers and attention heads: 4 x 4 = 16 runs.
EMBED_DIM = [64]
num_encoder_layers = [1, 2, 4, 8]
num_heads = [1, 2, 4, 8]
DROPOUTS = [0.5]
POS_ENC = [False]
hyper_parameter_training(EMBED_DIM, num_encoder_layers, num_heads, DROPOUTS, POS_ENC)



----- Start Training: 64 emb, 1 layers, 1 heads, 0.5 dropout, positional encoding: False, 50 epochs -----
Epoch [1/50], Loss: 1.8965, Eval Accuracy: 0.5156, Took 0.51 s
Epoch [2/50], Loss: 1.2653, Eval Accuracy: 0.5155, Took 0.51 s
Epoch [3/50], Loss: 1.1597, Eval Accuracy: 0.5155, Took 0.51 s
Epoch [4/50], Loss: 1.1151, Eval Accuracy: 0.5155, Took 0.51 s
Epoch [5/50], Loss: 1.0909, Eval Accuracy: 0.5166, Took 0.51 s
Epoch [6/50], Loss: 1.0769, Eval Accuracy: 0.5145, Took 0.5 s
Epoch [7/50], Loss: 1.0671, Eval Accuracy: 0.5152, Took 0.51 s
Epoch [8/50], Loss: 1.0622, Eval Accuracy: 0.5154, Took 0.51 s
Stopped early after epoch 8 as validation accuracy was lower than average of the last 7 accuracies.
Last Loss: 1.0622, Last Eval Accuracy: 0.5154, Took 4.05 s
Model saved as 20240603143811_encoder_64em_1l_1h_05dr_8ep.pt
----- Start Training: 64 emb, 1 layers, 2 heads, 0.5 dropout, positional encoding: False, 50 epochs -----
Epoch [1/50], Loss: 1.8958, Eval Accuracy: 0.5162, Took 0.55 s
Ep

{'encoder_64em_1l_1h_05dr_50ep': 0.5154,
 'encoder_64em_1l_2h_05dr_50ep': 0.516,
 'encoder_64em_1l_4h_05dr_50ep': 0.5165,
 'encoder_64em_1l_8h_05dr_50ep': 0.5187,
 'encoder_64em_2l_1h_05dr_50ep': 0.5221,
 'encoder_64em_2l_2h_05dr_50ep': 0.5198,
 'encoder_64em_2l_4h_05dr_50ep': 0.5225,
 'encoder_64em_2l_8h_05dr_50ep': 0.5222,
 'encoder_64em_4l_1h_05dr_50ep': 0.5214,
 'encoder_64em_4l_2h_05dr_50ep': 0.5228,
 'encoder_64em_4l_4h_05dr_50ep': 0.5219,
 'encoder_64em_4l_8h_05dr_50ep': 0.5219,
 'encoder_64em_8l_1h_05dr_50ep': 0.5209,
 'encoder_64em_8l_2h_05dr_50ep': 0.5183,
 'encoder_64em_8l_4h_05dr_50ep': 0.5181,
 'encoder_64em_8l_8h_05dr_50ep': 0.5218}

#### Positional Encoding

In [39]:
# Compare the same configuration with and without positional encoding (2 runs).
EMBED_DIM = [64]
NUM_ENCODER_LAYERS = [2]
NUM_HEADS = [4]
DROPOUTS = [0.5]
pos_enc = [True, False]
hyper_parameter_training(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUTS, pos_enc)

----- Start Training: 64 emb, 2 layers, 4 heads, 0.5 dropout, positional encoding: True, 50 epochs -----
Epoch [1/50], Loss: 2.0597, Eval Accuracy: 0.5154, Took 1.17 s
Epoch [2/50], Loss: 1.2627, Eval Accuracy: 0.5155, Took 1.17 s
Epoch [3/50], Loss: 1.1491, Eval Accuracy: 0.5137, Took 1.16 s
Epoch [4/50], Loss: 1.1068, Eval Accuracy: 0.5155, Took 1.16 s
Epoch [5/50], Loss: 1.0852, Eval Accuracy: 0.5145, Took 1.17 s
Epoch [6/50], Loss: 1.0717, Eval Accuracy: 0.5153, Took 1.17 s
Epoch [7/50], Loss: 1.0629, Eval Accuracy: 0.5159, Took 1.17 s
Epoch [8/50], Loss: 1.0585, Eval Accuracy: 0.5163, Took 1.17 s
Epoch [9/50], Loss: 1.0538, Eval Accuracy: 0.517, Took 1.14 s
Epoch [10/50], Loss: 1.0511, Eval Accuracy: 0.5168, Took 1.14 s
Epoch [11/50], Loss: 1.0485, Eval Accuracy: 0.5164, Took 1.14 s
Epoch [12/50], Loss: 1.0462, Eval Accuracy: 0.5164, Took 1.15 s
Epoch [13/50], Loss: 1.0459, Eval Accuracy: 0.5161, Took 1.15 s
Stopped early after epoch 13 as validation accuracy was lower than averag

{'encoder_64em_2l_4h_05dr_posenc_50ep': 0.5161,
 'encoder_64em_2l_4h_05dr_50ep': 0.5225}

### Drosophila.Melanogaster

In [40]:
# Switch the experiments to Drosophila.Melanogaster. `organism` is also read by
# train_parameter_model when it saves a model.
organism = "Drosophila.Melanogaster"
min_length = None
max_length = 500

# NOTE(review): add_speeds is not passed here, so CodonDataset's default applies —
# confirm that it matches SPEEDS_ADDED, which the model reads.
train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
valid_dataset = ml_helper.CodonDataset(organism, "valid", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge valid_dataset: {len(valid_dataset)}")

# Rebind the module-level loaders that train_model and evaluate_model read.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

Länge train_dataset: 33040
Länge valid_dataset: 4073


In [44]:
# Short grid search (10 epochs per run) over 2 x 2 x 2 x 2 = 16 configurations.
# The returned name -> accuracy mapping is kept in `accuracies`.
embed_dims = [64, 128]
num_encoder_layers = [2, 4]
num_heads = [2, 4]
DROPOUTS = [0.2, 0.5]
POS_ENC = [False]
accuracies = hyper_parameter_training(embed_dims, num_encoder_layers, num_heads, DROPOUTS, POS_ENC, epochs=10, print_epochs=True)

----- Start Training: 64 emb, 2 layers, 2 heads, 0.2 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 1.1824, Eval Accuracy: 0.4967, Took 9.54 s
Epoch [2/10], Loss: 1.081, Eval Accuracy: 0.497, Took 9.79 s
Epoch [3/10], Loss: 1.076, Eval Accuracy: 0.4977, Took 9.09 s
Epoch [4/10], Loss: 1.0746, Eval Accuracy: 0.498, Took 9.1 s
Epoch [5/10], Loss: 1.0737, Eval Accuracy: 0.4975, Took 9.1 s
Epoch [6/10], Loss: 1.073, Eval Accuracy: 0.4972, Took 9.11 s
Epoch [7/10], Loss: 1.0725, Eval Accuracy: 0.4987, Took 9.1 s
Epoch [8/10], Loss: 1.0719, Eval Accuracy: 0.4989, Took 9.1 s
Epoch [9/10], Loss: 1.0715, Eval Accuracy: 0.4982, Took 10.15 s
Epoch [10/10], Loss: 1.071, Eval Accuracy: 0.4991, Took 9.1 s
Last Loss: 1.071, Last Eval Accuracy: 0.4991, Took 93.17 s
Model saved as 20240603150541_encoder_64em_2l_2h_02dr_10ep.pt
----- Start Training: 64 emb, 2 layers, 2 heads, 0.5 dropout, positional encoding: False, 10 epochs -----
Epoch [1/10], Loss: 1.2169, Eval Accuracy: 0.4

In [45]:
# Train the best-suited configuration for longer (up to 100 epochs), once without and
# once with positional encoding. This overwrites `accuracies` from the previous cell.
embed_dims = [64]
num_encoder_layers = [4]
num_heads = [4]
DROPOUTS = [0.2]
POS_ENC = [False, True]
accuracies = hyper_parameter_training(embed_dims, num_encoder_layers, num_heads, DROPOUTS, POS_ENC, epochs=100, print_epochs=True)

----- Start Training: 64 emb, 4 layers, 4 heads, 0.2 dropout, positional encoding: False, 100 epochs -----
Epoch [1/100], Loss: 1.1768, Eval Accuracy: 0.4964, Took 20.78 s
Epoch [2/100], Loss: 1.0809, Eval Accuracy: 0.4962, Took 20.48 s
Epoch [3/100], Loss: 1.0759, Eval Accuracy: 0.4978, Took 20.5 s
Epoch [4/100], Loss: 1.0744, Eval Accuracy: 0.498, Took 20.51 s
Epoch [5/100], Loss: 1.0738, Eval Accuracy: 0.4971, Took 20.5 s
Epoch [6/100], Loss: 1.0727, Eval Accuracy: 0.4968, Took 20.45 s
Epoch [7/100], Loss: 1.0719, Eval Accuracy: 0.4984, Took 20.78 s
Epoch [8/100], Loss: 1.0712, Eval Accuracy: 0.4991, Took 20.79 s
Epoch [9/100], Loss: 1.0712, Eval Accuracy: 0.4979, Took 20.52 s
Epoch [10/100], Loss: 1.0706, Eval Accuracy: 0.4993, Took 20.53 s
Epoch [11/100], Loss: 1.0698, Eval Accuracy: 0.4988, Took 20.77 s
Epoch [12/100], Loss: 1.0691, Eval Accuracy: 0.4987, Took 20.72 s
Epoch [13/100], Loss: 1.0687, Eval Accuracy: 0.4993, Took 20.61 s
Epoch [14/100], Loss: 1.0678, Eval Accuracy: 0.

### Homo.Sapiens

In [48]:
%%time

# Switch the experiments to Homo.Sapiens. `organism` is also read by
# train_parameter_model when it saves a model.
organism = "Homo.Sapiens"
min_length = None
max_length = 500

# NOTE(review): add_speeds is not passed here, so CodonDataset's default applies —
# confirm that it matches SPEEDS_ADDED, which the model reads.
train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge train_dataset: {len(train_dataset)}")
valid_dataset = ml_helper.CodonDataset(organism, "valid", min_length, max_length, cut_data=True, one_hot_aa=False, data_path=data_path, device=device)
print(f"Länge valid_dataset: {len(valid_dataset)}")

# Rebind the module-level loaders that train_model and evaluate_model read.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

Länge train_dataset: 140711
Länge valid_dataset: 17784
CPU times: user 1min 29s, sys: 392 ms, total: 1min 29s
Wall time: 1min 29s


In [50]:
# Train one configuration for up to 100 epochs, once without and once with positional
# encoding. The returned name -> accuracy mapping is shown as the cell output.
embed_dims = [64]
num_encoder_layers = [4]
num_heads = [4]
DROPOUTS = [0.2]
POS_ENC = [False, True]
hyper_parameter_training(embed_dims, num_encoder_layers, num_heads, DROPOUTS, POS_ENC, epochs=100)

----- Start Training: 64 emb, 4 layers, 4 heads, 0.2 dropout, positional encoding: False, 100 epochs -----
Epoch [1/100], Loss: 1.1119, Eval Accuracy: 0.4747, Took 87.29 s
Epoch [2/100], Loss: 1.082, Eval Accuracy: 0.4793, Took 87.42 s
Epoch [3/100], Loss: 1.0793, Eval Accuracy: 0.4808, Took 88.48 s
Epoch [4/100], Loss: 1.077, Eval Accuracy: 0.483, Took 88.25 s
Epoch [5/100], Loss: 1.0746, Eval Accuracy: 0.4845, Took 87.93 s
Epoch [6/100], Loss: 1.072, Eval Accuracy: 0.4867, Took 87.49 s
Epoch [7/100], Loss: 1.0694, Eval Accuracy: 0.4905, Took 87.57 s
Epoch [8/100], Loss: 1.0667, Eval Accuracy: 0.4912, Took 87.8 s
Epoch [9/100], Loss: 1.0641, Eval Accuracy: 0.4933, Took 87.63 s
Epoch [10/100], Loss: 1.0616, Eval Accuracy: 0.4956, Took 88.19 s
Epoch [11/100], Loss: 1.0593, Eval Accuracy: 0.495, Took 88.09 s
Epoch [12/100], Loss: 1.0574, Eval Accuracy: 0.4982, Took 87.49 s
Epoch [13/100], Loss: 1.0556, Eval Accuracy: 0.4986, Took 87.37 s
Epoch [14/100], Loss: 1.0538, Eval Accuracy: 0.498

{'encoder_64em_4l_4h_02dr_100ep': 0.5088,
 'encoder_64em_4l_4h_posenc_02dr_100ep': 0.5214}