# Visualizing the amino-acid embeddings of the trained Encoder model

In [1]:
import sys
import torch
import torch.nn as nn
from torch import Tensor
import math

sys.path.append('../scripts')
import ml_helper as mlh
import Classifier as Classifier

In [2]:
# Pick the best available backend: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

Using mps device


### Data Preparation

In [3]:
# One-letter amino-acid codes; '*' is the last entry of the vocabulary.
# NOTE(review): this list has 21 entries, while the loaded model's embedding table
# is reported as Embedding(22, 64) further down — confirm this vocabulary matches
# the one used for training.
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*']

# Map every symbol to its position in the list.
aminoacids_to_integer = {symbol: index for index, symbol in enumerate(amino_acids)}

# Integer index of every amino acid, in list order, placed on the selected device.
amino_acids_int = torch.tensor(
    [aminoacids_to_integer[symbol] for symbol in amino_acids]
).to(device)
amino_acids_int

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20], device='mps:0')

## Load trained model

In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence-first tensor, then apply dropout.

    The encoding table is precomputed once for ``max_len`` positions and stored
    as the (non-trainable) buffer ``pe`` of shape ``[max_len, 1, d_model]``.
    Even feature columns hold sines, odd feature columns hold cosines.

    Arguments:
        d_model: size of the feature (embedding) dimension; may be even or odd.
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions to precompute; inputs passed to
            ``forward`` must not be longer than this.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # Shape [max_len, ceil(d_model / 2)]: one angle per (position, frequency) pair.
        angles = position * div_term

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns. For an odd d_model that is one
        # fewer than the number of frequencies, so drop the surplus frequency
        # instead of failing with a shape mismatch. For an even d_model the slice
        # is a no-op and the table is unchanged.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x`` with the positional encoding
            added (broadcast over the batch dimension) and dropout applied.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [5]:
class EncoderClassifier(nn.Module):
    """Transformer-encoder classifier that maps each sequence position to codon logits.

    Pipeline: embedding lookup (optionally concatenated with a speed feature),
    optional positional encoding, a stack of Transformer encoder layers,
    dropout, and a final linear projection onto ``len(codons)`` outputs.

    The class reads the module-level names ``amino_acids``, ``codons``,
    ``SPEEDS_ADDED`` and ``PositionalEncoding``.
    NOTE(review): ``codons`` and ``SPEEDS_ADDED`` are not defined in the visible
    part of this notebook — constructing the class or calling ``forward`` needs
    them to exist; confirm where they come from.
    """

    def __init__(self, embed_dim, num_layers, num_heads, dropout=0.2, pos_enc=False):
        super().__init__()

        # When the speed feature is appended in forward(), the embedding gives up
        # one channel so that the concatenated width is again embed_dim.
        emb_size = embed_dim - 1 if SPEEDS_ADDED else embed_dim
        vocab_size = len(amino_acids)
        # The last vocabulary index is used as the padding index.
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=vocab_size - 1)

        self.pos_enc = pos_enc
        self.pos_encoder = PositionalEncoding(embed_dim, dropout)

        # batch_first=True: the encoder works on [batch, seq, feature] tensors.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=self.encoder_layer, num_layers=num_layers
        )
        self.linear = nn.Linear(embed_dim, len(codons))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the output of the final linear layer for input ``x``.

        With ``SPEEDS_ADDED`` the input is indexed as ``x[:, :, 0]`` (embedding
        indices) and ``x[:, :, 1]`` (extra feature); otherwise ``x`` is passed to
        the embedding as a whole.
        """
        # NOTE(review): the whole input is cast to long before the second column
        # is sliced, so fractional values in that column would be truncated —
        # confirm this is intended.
        indices = x.long()
        if SPEEDS_ADDED:
            token_emb = self.emb(indices[:, :, 0])
            speed = indices[:, :, 1].unsqueeze(-1)
            # Concatenate along the feature dimension
            hidden = torch.cat((token_emb, speed), dim=-1)
        else:
            hidden = self.emb(indices)

        if self.pos_enc:
            # PositionalEncoding expects the sequence dimension first, so swap
            # the first two dimensions around the call and swap them back after.
            hidden = self.pos_encoder(hidden.transpose(0, 1)).transpose(0, 1)

        hidden = self.dropout(self.encoder(hidden))
        return self.linear(hidden)

In [6]:
# Organisms that can be selected below.
organisms = ["E.Coli", "Drosophila.Melanogaster", "Homo.Sapiens"]
# Index 2 selects "Homo.Sapiens"; change the index to pick another organism.
organism = organisms[2]
print("organism", organism)

# Load the trained 'encoder' model for the chosen organism through the project helper.
# NOTE(review): how load_model picks and deserializes the stored model is not visible
# here — see ml_helper.load_model.
model = mlh.load_model('encoder', organism, device=device)

organism Homo.Sapiens
Model loaded: 20240603201950_encoder_64em_4l_4h_posenc_02dr_80ep.pt


### Create Embedding Generator

In [7]:
class Embedder(nn.Module):
    """Stand-alone embedding lookup built from the ``emb`` layer of a trained model.

    Arguments:
        model_with_weights: model whose ``emb.weight`` matrix is reused as the
            pretrained embedding table.
    """

    def __init__(self, model_with_weights: EncoderClassifier):
        super().__init__()
        pretrained_weights = model_with_weights.emb.weight
        self.emb = nn.Embedding.from_pretrained(pretrained_weights)

    def forward(self, x):
        """Return the embedding vectors for the indices in ``x``."""
        vectors = self.emb(x)
        return vectors

In [8]:
# Build the embedding-only module from the loaded model and move it to the selected device.
embedder = Embedder(model).to(device)
# Bare expression so the module's repr is shown as the cell output.
embedder

Embedder(
  (emb): Embedding(22, 64)
)

In [9]:
# Look up the embedding vector of every amino-acid index, then detach it from
# autograd and copy it to host memory so it can be converted to a NumPy array.
embedding = embedder(amino_acids_int).detach().cpu().numpy()
print("shape", embedding.shape)
print(embedding)

shape (21, 64)
[[ 1.1042241   0.8326001   0.15882942 ...  0.1425863  -0.9600666
   0.11301643]
 [ 1.0352929   0.6260455   1.4370686  ... -1.5619731   0.71031946
  -1.4426522 ]
 [ 1.5754012   0.7523007  -1.2519578  ... -1.4231119   0.69422084
  -0.14460011]
 ...
 [-1.6650704   0.03668799  0.69063514 ... -1.7696711   0.8733255
  -1.4260384 ]
 [ 0.3931166   0.9061415   0.94544387 ... -1.3964047  -0.76329756
  -1.3973165 ]
 [-0.29485354 -0.27986267  1.0837125  ...  2.0561688  -0.2347232
  -1.3456243 ]]


### Tensorboard

In [10]:
# SummaryWriter writes the TensorBoard event files; loading the extension
# registers the %tensorboard magic used at the end of the notebook.
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

In [11]:
# Export the embedding matrix, labelled with the amino-acid symbols, for the
# TensorBoard projector. The context manager closes the writer on exit, which
# flushes pending events and releases the event file instead of leaving it
# open for the lifetime of the kernel.
# SummaryWriter() without arguments uses its default log directory.
with SummaryWriter() as writer:
    writer.add_embedding(embedding, metadata=amino_acids)

#### Start TensorBoard with
```zsh
tensorboard --logdir=notebooks/runs
```
Adjust the path if necessary.

In [None]:
# Launch TensorBoard inline, reading event files from the `runs` directory.
%tensorboard --logdir runs