# Encoder-only Transformer Architektur

In [7]:
import sys
import random
import numpy as np
import pandas as pd
import ast
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import time

sys.path.append('../scripts')
import ml_helper

In [8]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

## Prepare Test and Training Data Loader

In [9]:
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*',
               '_']

aminoacids_to_integer = dict((a, i) for i, a in enumerate(amino_acids))
integer_to_aminoacids = dict((i, a) for i, a in enumerate(amino_acids))

codons = ['TTT', 'TTC', 'TTA', 'TTG', 'TCT', 'TCC', 'TCA', 'TCG', 'TAT', 'TAC', 'TAA', 'TAG', 'TGT', 'TGC', 'TGA',
          'TGG', 'CTT', 'CTC', 'CTA', 'CTG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAT', 'CAC', 'CAA', 'CAG', 'CGT', 'CGC',
          'CGA', 'CGG', 'ATT', 'ATC', 'ATA', 'ATG', 'ACT', 'ACC', 'ACA', 'ACG', 'AAT', 'AAC', 'AAA', 'AAG', 'AGT',
          'AGC', 'AGA', 'AGG', 'GTT', 'GTC', 'GTA', 'GTG', 'GCT', 'GCC', 'GCA', 'GCG', 'GAT', 'GAC', 'GAA', 'GAG',
          'GGT', 'GGC', 'GGA', 'GGG', '___']

codons_to_integer = dict((c, i) for i, c in enumerate(codons))
integer_to_codons = dict((i, c) for i, c in enumerate(codons))

In [10]:
organism = "E.Coli"
min_length = 100
max_length = 500

train_dataset = ml_helper.CodonDataset(organism, "train", min_length, max_length, one_hot_aa=False)
print(f"Länge train_dataset: {len(train_dataset)}")
test_dataset = ml_helper.CodonDataset(organism, "test", min_length, max_length, one_hot_aa=False)
print(f"Länge test_dataset: {len(test_dataset)}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=['sequence_length'], inplace=True)


Länge train_dataset: 2284


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=['sequence_length'], inplace=True)


Länge test_dataset: 592


In [11]:
print(train_dataset[0])
print(train_dataset[0][0].shape)
print(train_dataset[0][1].shape)

(tensor([12, 16, 18,  0, 19,  6, 13,  3,  2, 19, 15,  1, 10, 18,  7,  3, 19,  1,
         0, 19,  3,  7, 19, 15,  9,  0,  9, 11,  3,  7,  6, 13, 13, 15, 12, 10,
         7, 14, 15,  7, 15,  7, 11, 16, 16,  4, 10,  1, 10,  9,  0,  7, 13,  6,
         5, 10, 15,  7,  7,  0,  9, 15,  9, 13,  7, 11, 14,  0, 15,  2, 10, 14,
        14, 17,  6,  1,  3, 19,  2, 16, 19, 13,  5,  3, 18,  0, 10, 13, 14,  8,
        12, 15,  9, 10,  3,  2, 19,  0, 18,  7, 10, 12, 19, 11,  7, 19,  2, 11,
        11,  5,  1,  8,  0, 12,  0,  5,  6,  0, 10,  6, 11, 19,  0, 10,  7, 13,
        19,  8,  5,  1, 11, 14, 15,  5, 10, 15,  7,  7,  5,  1,  5,  1, 19,  0,
         9,  0,  1,  0, 10, 19,  2,  6, 14,  1, 19, 10, 10, 10,  3,  6, 14, 10,
         7,  0, 10,  3, 10, 11, 10,  1,  6,  5, 12,  5, 10,  6, 10, 11, 11, 10,
         5,  5, 15, 10,  7,  9, 16, 13,  9, 13, 19, 16,  8,  3,  5,  7,  6,  0,
        10, 15, 12, 15,  3,  1, 19,  0, 19, 13,  2,  2,  7,  1,  9,  6,  5, 19,
         3, 15, 14,  1,  3, 10, 18, 12,

In [12]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define the encoder-only model

In [80]:
class EncoderClassifier(nn.Module):
    def __init__(self, embed_dim, num_layers, num_heads):
        super(EncoderClassifier, self).__init__()

        self.emb = nn.Embedding(len(amino_acids), embed_dim, padding_idx=len(amino_acids)-1)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, 
            nhead=num_heads, 
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=self.encoder_layer,
            num_layers=num_layers,
        )
        self.linear = nn.Linear(embed_dim, len(codons))
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = x.long()
        x = self.emb(x)
        x = self.encoder(x)
        x = self.dropout(x)
        out = self.linear(x)
        return out   

In [81]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 3
NUM_HEADS = 4

In [82]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS
).to(device)
print(model)

EncoderClassifier(
  (emb): Embedding(22, 256, padding_idx=21)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (linear1): Linear(in_features=256, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=256, bias=True)
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout

In [83]:
# Total parameters and trainable parameters.
def print_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f"{total_params:,} total parameters.")
    total_trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
    print(f"{total_trainable_params:,} training parameters.")

In [84]:
print_parameters(model)

5,282,625 total parameters.
5,282,625 training parameters.


In [85]:
def test_forward_pass(model, data_loader):
  batch_data, batch_label = next(iter(data_loader))
  print(f"input dim: {batch_data.shape}")
  output = model(batch_data)
  print(f"output dim: {output.shape}")

In [86]:
test_forward_pass(model, train_loader)

input dim: torch.Size([32, 500])
output dim: torch.Size([32, 500, 65])


## Define the training methods

In [87]:
def train_model(model, optimizer, criterion, num_epochs, print_batches=0):
    for epoch in range(num_epochs):
        model.train()

        epoch_start_time = time.time()
        batch_start_time = time.time()
        epoch_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            # Clear gradients
            optimizer.zero_grad()

            # Forward pass
            input_data, labels = batch
            output = model(input_data)

            # Calculate loss
            loss = criterion(output, labels)
            epoch_loss += loss.item()

            # Backward pass
            loss.backward()

            # Update model parameters
            optimizer.step()

            if print_batches != 0 and batch_idx % print_batches == (print_batches-1):
                batch_time =  round(time.time() - batch_start_time,2)
                print(f'Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Time since last batch print: {batch_time} s')
                batch_start_time = time.time()

        epoch_time = round(time.time() - epoch_start_time,2)
        epoch_loss = round(epoch_loss / len(train_loader),4)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}, Took {epoch_time} s')

## Define the evaluation methods to calculate metrics

In [88]:
from sklearn.metrics import accuracy_score

def compute_accuracy(predictions, labels):
    acc = accuracy_score(labels, predictions)
    return acc

In [93]:
def evaluate_model(model, criterion):
    model.eval()  # Set the model to evaluation mode

    total_loss = 0.0

    with torch.no_grad():
        accuracies = []
        for batch_idx, batch in enumerate(test_loader):
            input_data, labels = batch
            labels = labels.view(-1)

            # Forward pass
            outputs = model(input_data)
            outputs = outputs.view(-1, len(codons))
            loss = criterion(outputs, labels)

            # Compute total loss
            total_loss += loss.item()

            # Compute custom metrics
            accuracy = compute_accuracy(outputs.cpu(), labels.cpu())
            accuracies.append(accuracy)

    # Compute average loss
    avg_loss = total_loss / len(test_loader)

    # Compute average accuracy
    avg_accuracy = np.mean(accuracies)

    print(f'Average Loss: {avg_loss:.4f}')
    print(f'Average Accuracy: {avg_accuracy:.4f}')

    return outputs, labels

## Training the model

In [94]:
EMBED_DIM = 64
NUM_ENCODER_LAYERS = 2
NUM_HEADS = 2

In [95]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS
).to(device)
print_parameters(model)

849,089 total parameters.
849,089 training parameters.


In [96]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

EPOCHS = 5
print("----- Start Training -----")
train_model(model, optimizer, criterion, EPOCHS, print_batches=10)

----- Start Training -----


RuntimeError: Expected target size [32, 65], got [32, 500]

In [58]:
batch_data, batch_label = next(iter(train_loader))
output = model(batch_data)
print(output.shape)

torch.Size([32, 500])


In [55]:
evaluate_model(model, criterion)

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and continuous-multioutput targets