# Predicting Codons using the trained Encoder model

In [1]:
import sys
import random
import numpy as np
import pandas as pd
import ast
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch import Tensor
import time
import math

sys.path.append('../scripts')
import ml_helper
import Classifier as Classifier
import Baseline_classifiers as bc

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


### Data Preparation

In [3]:
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*',
               '_']

aminoacids_to_integer = dict((a, i) for i, a in enumerate(amino_acids))
integer_to_aminoacids = dict((i, a) for i, a in enumerate(amino_acids))

codons = ['TTT', 'TTC', 'TTA', 'TTG', 'TCT', 'TCC', 'TCA', 'TCG', 'TAT', 'TAC', 'TAA', 'TAG', 'TGT', 'TGC', 'TGA',
          'TGG', 'CTT', 'CTC', 'CTA', 'CTG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAT', 'CAC', 'CAA', 'CAG', 'CGT', 'CGC',
          'CGA', 'CGG', 'ATT', 'ATC', 'ATA', 'ATG', 'ACT', 'ACC', 'ACA', 'ACG', 'AAT', 'AAC', 'AAA', 'AAG', 'AGT',
          'AGC', 'AGA', 'AGG', 'GTT', 'GTC', 'GTA', 'GTG', 'GCT', 'GCC', 'GCA', 'GCG', 'GAT', 'GAC', 'GAA', 'GAG',
          'GGT', 'GGC', 'GGA', 'GGG', '___']

codons_to_integer = dict((c, i) for i, c in enumerate(codons))
integer_to_codons = dict((i, c) for i, c in enumerate(codons))

In [4]:
organism = "E.Coli"

def group_codons(sequence):
    return [''.join(sequence[i:i+3]) for i in range(0, len(sequence), 3)]

df = pd.read_pickle(f"../data/{organism}/cleanedData_test.pkl")
usage_biases = pd.read_pickle(f"../data/{organism}/usageBias.pkl")
df['codons'] = df['sequence'].apply(group_codons)

display(df.head())

Unnamed: 0,id,description,sequence,translation,seguid,codons
1,lcl|U00096.3_cds_AAC73113.1_2,lcl|U00096.3_cds_AAC73113.1_2 [gene=thrA] [loc...,"(A, T, G, C, G, A, G, T, G, T, T, G, A, A, G, ...","(M, R, V, L, K, F, G, G, T, S, V, A, N, A, E, ...",/p+3Jdgat4Fq0w2rqqay4xg8Bs4,"[ATG, CGA, GTG, TTG, AAG, TTC, GGC, GGT, ACA, ..."
5,lcl|U00096.3_cds_AAC73117.1_6,lcl|U00096.3_cds_AAC73117.1_6 [gene=yaaA] [loc...,"(A, T, G, C, T, G, A, T, T, C, T, T, A, T, T, ...","(M, L, I, L, I, S, P, A, K, T, L, D, Y, Q, S, ...",vJJ0yR31YORqwI12U79SgItYU3U,"[ATG, CTG, ATT, CTT, ATT, TCA, CCT, GCG, AAA, ..."
12,lcl|U00096.3_cds_AAC73124.1_13,lcl|U00096.3_cds_AAC73124.1_13 [gene=yaaI] [lo...,"(A, T, G, A, A, A, T, C, C, G, T, T, T, T, T, ...","(M, K, S, V, F, T, I, S, A, S, L, A, I, S, L, ...",GT2zzYZoFncaOMVxs4CEcLaePdc,"[ATG, AAA, TCC, GTT, TTT, ACG, ATT, TCC, GCC, ..."
17,lcl|U00096.3_cds_AAT48122.1_18,lcl|U00096.3_cds_AAT48122.1_18 [gene=hokC] [lo...,"(A, T, G, A, A, G, C, A, G, C, A, T, A, A, G, ...","(M, K, Q, H, K, A, M, I, V, A, L, I, V, I, C, ...",yfUY1Sxn8BgBfdGY1FQnaroApNY,"[ATG, AAG, CAG, CAT, AAG, GCG, ATG, ATT, GTC, ..."
23,lcl|U00096.3_cds_AAC73135.1_24,lcl|U00096.3_cds_AAC73135.1_24 [gene=yaaY] [lo...,"(A, T, G, T, G, C, C, G, G, C, A, C, T, C, G, ...","(M, C, R, H, S, L, R, S, D, G, A, G, F, Y, Q, ...",m/3aWNuEiWlqAe7cvTcnrZ58efA,"[ATG, TGC, CGG, CAC, TCG, TTA, CGT, AGT, GAT, ..."


In [5]:
min_length = None
max_length = None

df = ml_helper.filter_sequence_length(df, min_length, max_length)

In [6]:
len(df)

771

## Load trained model

In [10]:
SPEEDS_ADDED = True

In [11]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [14]:
class EncoderClassifier(nn.Module):
    def __init__(self, embed_dim, num_layers, num_heads, dropout=0.2, pos_enc=False):
        super(EncoderClassifier, self).__init__()

        emb_size = embed_dim
        if SPEEDS_ADDED:
            emb_size -= 1
        self.emb = nn.Embedding(len(amino_acids), emb_size, padding_idx=len(amino_acids)-1)
        self.pos_enc = pos_enc
        self.pos_encoder = PositionalEncoding(embed_dim, dropout)

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=self.encoder_layer,
            num_layers=num_layers,
        )
        self.linear = nn.Linear(embed_dim, len(codons))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x.long()
        if SPEEDS_ADDED:
            x1 = self.emb(x[:, :, 0])
            x2 = x[:, :, 1].unsqueeze(-1)
            x = torch.cat((x1, x2), dim=-1)  # Concatenate along the feature dimension
        else:
            x = self.emb(x)

        if self.pos_enc:
            x = self.pos_encoder(x)  # Add positional encoding
        x = self.encoder(x)
        x = self.dropout(x)
        out = self.linear(x)
        return out

In [15]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.2

model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROPOUT,
    pos_enc=False
).to(device)
model = ml_helper.load_model('encoder_256em_4l_4h_02dr_10ep_speeds', organism)

Model loaded: 20240522184144_encoder_256em_4l_4h_02dr_10ep_speeds.pt


## Prepare data for max_length cutting and putting together again

In [16]:
def cut_sequence(aa_sequence, max_length):
    aa_sequences = []
    bit_map = "" # 1 if sequence is cut, 0 if not
    if aa_sequence.shape[0] <= max_length:
        aa_sequences = [aa_sequence]
        bit_map = "0"
    elif aa_sequence.shape[0] > max_length:
        aa_splits = ml_helper._split_tensor(aa_sequence, max_length)
        aa_sequences = aa_splits
        bit_map = "1" * (len(aa_splits) - 1) + "0"
    return aa_sequences, bit_map

In [17]:
def rebuild_sequences(sequences, cut_bit_map):
    new_sequences = []
    new_sequence = None
    for i, sequence in enumerate(sequences):
        if new_sequence is None:
            new_sequence = sequence
        elif new_sequence is not None:
            if type(new_sequence) == torch.Tensor:
                new_sequence = torch.cat((new_sequence, sequence))
            elif type(new_sequence) == list:
                new_sequence += sequence

        if cut_bit_map[i] == "0":
            new_sequences.append(new_sequence)
            new_sequence = None
    return new_sequences

In [18]:
def remove_padding(sequence, padding_value):
    return sequence[sequence != padding_value]

In [45]:
padding_pos = 'right'
def prepare_aa_sequence(aa_sequence):
    max_length = 500
    non_cut_aa_sequence = ml_helper.aa_to_int_tensor(aa_sequence, device)
    aa_sequences, bit_map = cut_sequence(non_cut_aa_sequence, max_length)
    for i, aa_sequence in enumerate(aa_sequences):
        aa_sequences[i] = ml_helper.pad_tensor(aa_sequence, max_length, aminoacids_to_integer['_'], padding_pos)
        if SPEEDS_ADDED:
            aa_sequences[i] = ml_helper.add_speed_dimension(aa_sequences[i], device)
    return aa_sequences, bit_map, non_cut_aa_sequence

In [59]:
# Prepare data (pad, convert to tensor)
prepared_amino_seq = []
cut_bit_map = ""
non_cut_aa_sequences = []
i = 0
for seq in df['translation']:
    aa_sequences, bit_map, non_cut_aa_sequence = prepare_aa_sequence(seq)
    prepared_amino_seq += aa_sequences
    cut_bit_map += bit_map
    non_cut_aa_sequences.append(non_cut_aa_sequence)
# create data_loader for batched throughput
batch_size = 32
data_loader = DataLoader(prepared_amino_seq, batch_size=batch_size)

In [60]:
if SPEEDS_ADDED:
    for i, aa_sequence in enumerate(prepared_amino_seq):
        prepared_amino_seq[i] = aa_sequence[:, 0].int()

In [61]:
re_sequences = rebuild_sequences(prepared_amino_seq, cut_bit_map)

In [62]:
for i, rebuild_sequence in enumerate(re_sequences):
    re_sequences[i] = remove_padding(rebuild_sequence, aminoacids_to_integer["_"])

In [63]:
def are_lists_equal(list1, list2):
    if len(list1) != len(list2):
        return False

    for tensor1, tensor2 in zip(list1, list2):
        if tensor1.shape[0] != tensor2.shape[0]:
            return False
        if not torch.allclose(tensor1, tensor2):
            return False

    return True

In [64]:
are_lists_equal(non_cut_aa_sequences, re_sequences)

True

## Testing the codon prediction

In [67]:
def predict_codons(model, aa_sequence_list):
    # Prepare data (pad, convert to tensor)
    prepared_amino_seq = []
    cut_bit_map = ""
    for seq in aa_sequence_list:
        aa_sequences, bit_map, _ = prepare_aa_sequence(seq)
        prepared_amino_seq += aa_sequences
        cut_bit_map += bit_map

    # create data_loader for batched throughput
    batch_size = 32
    data_loader = DataLoader(prepared_amino_seq, batch_size=batch_size)

    model.eval()
    codon_predictions = []
    with torch.no_grad():
        for batch in data_loader:
            output = model(batch)  # (batch_size, seq_len, num_classes)
            for batch_i in range(output.shape[0]):
                predicted_codons = []
                for seq_i in range(output.shape[1]):
                    if SPEEDS_ADDED:
                        aa_num = batch[batch_i][seq_i][0].item()
                    else:
                        aa_num = batch[batch_i][seq_i].item()
                    if aa_num == aminoacids_to_integer['_']:
                        break
                    codon_idx = torch.argmax(output[batch_i][seq_i]).item()
                    codon = integer_to_codons[codon_idx]
                    predicted_codons.append(codon)
                codon_predictions.append(predicted_codons)
    codon_predictions = rebuild_sequences(codon_predictions, cut_bit_map)
    return codon_predictions

In [68]:
amino_seq = df['translation'].head()
batched_predictions = predict_codons(model, amino_seq)

In [69]:
len(batched_predictions[0])

821

## Building the classifier

In [70]:
amino_acids_to_codons = {
    'A': ['GCT', 'GCC', 'GCA', 'GCG'],
    'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'N': ['AAT', 'AAC'],
    'D': ['GAT', 'GAC'],
    'C': ['TGT', 'TGC'],
    'Q': ['CAA', 'CAG'],
    'E': ['GAA', 'GAG'],
    'G': ['GGT', 'GGC', 'GGA', 'GGG'],
    'H': ['CAT', 'CAC'],
    'I': ['ATT', 'ATC', 'ATA'],
    'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
    'K': ['AAA', 'AAG'],
    'M': ['ATG'],
    'F': ['TTT', 'TTC'],
    'P': ['CCT', 'CCC', 'CCA', 'CCG'],
    'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
    'T': ['ACT', 'ACC', 'ACA', 'ACG'],
    'W': ['TGG'],
    'Y': ['TAT', 'TAC'],
    'V': ['GTT', 'GTC', 'GTA', 'GTG'],
    '*': ['TAA', 'TAG', 'TGA']
}

In [71]:
max_weighted_bc = bc.Max_Bias_Baseline_Classifier(usage_biases)
def check_and_replace_codons(aa_sequences, codon_predictions_list):
    total_codons = 0
    not_possible_codons = 0
    used_max_bias = 0
    for i, aa_seq in enumerate(aa_sequences):
        codon_predictions = codon_predictions_list[i]
        for j, aa in enumerate(aa_seq):
            total_codons += 1
            codon_pred = codon_predictions[j]
            max_bias_pred = max_weighted_bc._predict_codon(aa)
            if codon_pred not in amino_acids_to_codons[aa]:
                not_possible_codons += 1
                
                codon_predictions_list[i][j] = max_bias_pred
            else:
                if codon_pred == max_bias_pred:
                    used_max_bias += 1
    max_bias_ratio = used_max_bias / total_codons
    print(f"Model used max bias codon for {max_bias_ratio*100:.2f}% of possible codon predictions")
    not_possible_ratio = not_possible_codons / total_codons
    print(f"Replaced {not_possible_ratio*100:.2f}% of codons")
    return codon_predictions_list

In [72]:
class Encoder_Classifier(Classifier.Classifier):
    def __init__(self, trained_model, seed=42):
        self.model = trained_model
        super().__init__(seed)


    def predict_codons(self, aa_sequences, replace=False):
        predictions_list = predict_codons(self.model, aa_sequences)
        if replace:
            predictions_list = check_and_replace_codons(aa_sequences, predictions_list)
        predictions_matrix = self.pad_and_convert_seq(predictions_list)
        return predictions_matrix

In [74]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4

model = EncoderClassifier(
    embed_dim=EMBED_DIM,
    num_layers=NUM_ENCODER_LAYERS,
    num_heads=NUM_HEADS
).to(device)
model = ml_helper.load_model('encoder_256em_4l_4h_02dr_10ep_speeds', organism)

Model loaded: 20240522184144_encoder_256em_4l_4h_02dr_10ep_speeds.pt


In [75]:
%%time 
encoder_classifier = Encoder_Classifier(model)
amino_seq = df['translation']
true_codons = df['codons']
pred_codons_replaced = encoder_classifier.predict_codons(amino_seq, replace=True)

Model used max bias codon for 86.81% of possible codon predictions
Replaced 0.00% of codons
CPU times: user 8.24 s, sys: 16.5 ms, total: 8.26 s
Wall time: 8.25 s


In [76]:
accuracy = encoder_classifier.calc_accuracy(true_codons, pred_codons_replaced)
print(f"Organismus {organism} - Accuracy: {accuracy}")

Organismus E.Coli - Accuracy: 0.5215032682554798


### Compare with baseline classifier

In [143]:
max_weighted_bc = bc.Max_Bias_Baseline_Classifier(usage_biases)
amino_seq = df['translation'].apply(lambda seq: list(seq))
true_codons = df['codons']
pred_codons = max_weighted_bc.predict_codons(amino_seq)
accuracy = max_weighted_bc.calc_accuracy(true_codons, pred_codons)
print(f"Organismus {organism} - Accuracy: {accuracy}")

Organismus E.Coli - Accuracy: 0.5174658847557534


## Comparison of hyperparameter tuning

In [189]:
def eval_parameter_model(embed_dim, num_encoder_layers, num_heads, dropout, pos_enc, num_epochs):
    start_time = time.time()

    model = EncoderClassifier(
        embed_dim=embed_dim,
        num_layers=num_encoder_layers,
        num_heads=num_heads
    ).to(device)
    model = ml_helper.load_model( f'encoder_{embed_dim}em_{num_encoder_layers}l_{num_heads}h_{str(dropout).replace(".","")}dr_{num_epochs}ep{"_posenc" if pos_enc else ""}', organism)

    encoder_classifier = Encoder_Classifier(model)
    amino_seq = df['translation']
    true_codons = df['codons']
    pred_codons_replaced = encoder_classifier.predict_codons(amino_seq, replace=True)

    accuracy = round(encoder_classifier.calc_accuracy(true_codons, pred_codons_replaced), 4)
    print(f"Accuracy: {accuracy} - Organism: {organism}, Encoder Model - Parameters: {embed_dim} embedding dim, {num_encoder_layers} layers, {num_heads} heads, {num_epochs} epochs")
    print(f"Took {round(time.time() - start_time, 2)} seconds")
    print("")
    return accuracy

### Dropout

In [196]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
dropouts = [0.1, 0.2, 0.3, 0.4, 0.5]
POS_ENC = False
EPOCHS = 10

In [197]:
accuracies_dropout = {}
for DROPOUT in dropouts:
    accuracy = eval_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)
    accuracies_dropout[f'encoder_{EMBED_DIM}em_{NUM_ENCODER_LAYERS}l_{NUM_HEADS}h_{str(DROPOUT).replace(".","")}dr_{EPOCHS}ep{"_posenc" if POS_ENC else ""}'] = accuracy

Model loaded: 20240518114512_encoder_256em_4l_4h_01dr_10ep.pt
Model used max bias codon for 91.08% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5229 - Organism: E.Coli, Encoder Model - Parameters: 256 embedding dim, 4 layers, 4 heads, 10 epochs
Took 7.41 seconds

Model loaded: 20240518114547_encoder_256em_4l_4h_02dr_10ep.pt
Model used max bias codon for 92.80% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.524 - Organism: E.Coli, Encoder Model - Parameters: 256 embedding dim, 4 layers, 4 heads, 10 epochs
Took 7.46 seconds

Model loaded: 20240518114622_encoder_256em_4l_4h_03dr_10ep.pt
Model used max bias codon for 92.57% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5244 - Organism: E.Coli, Encoder Model - Parameters: 256 embedding dim, 4 layers, 4 heads, 10 epochs
Took 7.4 seconds

Model loaded: 20240518114658_encoder_256em_4l_4h_04dr_10ep.pt
Model used max bias codon for 92.65% of possible codon predictions
Replaced 0.00% 

In [198]:
accuracies_dropout

{'encoder_256em_4l_4h_01dr_10ep': 0.5229,
 'encoder_256em_4l_4h_02dr_10ep': 0.524,
 'encoder_256em_4l_4h_03dr_10ep': 0.5244,
 'encoder_256em_4l_4h_04dr_10ep': 0.5239,
 'encoder_256em_4l_4h_05dr_10ep': 0.5218}

In [199]:
max(accuracies_dropout.items(), key=lambda item: item[1])

('encoder_256em_4l_4h_03dr_10ep', 0.5244)

### Positional Encoding

In [200]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.3
pos_enc = [True, False]
EPOCHS = 10

In [201]:
accuracies_pos_enc = {}
for POS_ENC in pos_enc:
    accuracy = eval_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)
    accuracies_pos_enc[f'encoder_{EMBED_DIM}em_{NUM_ENCODER_LAYERS}l_{NUM_HEADS}h_{str(DROPOUT).replace(".","")}dr_{EPOCHS}ep{"_posenc" if POS_ENC else ""}'] = accuracy

Model loaded: 20240518115201_encoder_256em_4l_4h_03dr_10ep_posenc.pt
Model used max bias codon for 88.99% of possible codon predictions
Replaced 0.05% of codons
Accuracy: 0.5171 - Organism: E.Coli, Encoder Model - Parameters: 256 embedding dim, 4 layers, 4 heads, 10 epochs
Took 7.37 seconds

Model loaded: 20240518115236_encoder_256em_4l_4h_03dr_10ep.pt
Model used max bias codon for 92.57% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5244 - Organism: E.Coli, Encoder Model - Parameters: 256 embedding dim, 4 layers, 4 heads, 10 epochs
Took 7.35 seconds



In [202]:
accuracies_pos_enc

{'encoder_256em_4l_4h_03dr_10ep_posenc': 0.5171,
 'encoder_256em_4l_4h_03dr_10ep': 0.5244}

In [203]:
max(accuracies_dropout.items(), key=lambda item: item[1])

('encoder_256em_4l_4h_03dr_10ep', 0.5244)

### Embedding Dimension

In [204]:
embed_dims = [32, 64, 128, 256, 512, 1028]
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.3
POS_ENC = False
EPOCHS = 10

In [205]:
accuracies_emb = {}
for EMBED_DIM in embed_dims:
    accuracy = eval_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)
    accuracies_emb[f'encoder_{EMBED_DIM}em_{NUM_ENCODER_LAYERS}l_{NUM_HEADS}h_{str(DROPOUT).replace(".","")}dr_{EPOCHS}ep{"_posenc" if POS_ENC else ""}'] = accuracy

Model loaded: 20240518115409_encoder_32em_4l_4h_03dr_10ep.pt
Model used max bias codon for 100.00% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5175 - Organism: E.Coli, Encoder Model - Parameters: 32 embedding dim, 4 layers, 4 heads, 10 epochs
Took 7.3 seconds

Model loaded: 20240518115430_encoder_64em_4l_4h_03dr_10ep.pt
Model used max bias codon for 91.54% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5225 - Organism: E.Coli, Encoder Model - Parameters: 64 embedding dim, 4 layers, 4 heads, 10 epochs
Took 7.26 seconds

Model loaded: 20240518115453_encoder_128em_4l_4h_03dr_10ep.pt
Model used max bias codon for 94.71% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5233 - Organism: E.Coli, Encoder Model - Parameters: 128 embedding dim, 4 layers, 4 heads, 10 epochs
Took 7.36 seconds

Model loaded: 20240518115529_encoder_256em_4l_4h_03dr_10ep.pt
Model used max bias codon for 92.57% of possible codon predictions
Replaced 0.00% of

In [206]:
accuracies_emb

{'encoder_32em_4l_4h_03dr_10ep': 0.5175,
 'encoder_64em_4l_4h_03dr_10ep': 0.5225,
 'encoder_128em_4l_4h_03dr_10ep': 0.5233,
 'encoder_256em_4l_4h_03dr_10ep': 0.5244,
 'encoder_512em_4l_4h_03dr_10ep': 0.5213,
 'encoder_1028em_4l_4h_03dr_10ep': 0.5175}

In [207]:
max(accuracies_emb.items(), key=lambda item: item[1])

('encoder_256em_4l_4h_03dr_10ep', 0.5244)

### Number Encoder Layers and Heads

In [208]:
EMBED_DIM = 256
num_encoder_layers = [2, 4, 8, 16]
num_heads = [2, 4, 8, 16]
DROPOUT = 0.3
POS_ENC = False
EPOCHS = 10

In [209]:
accuracies_layers_heads = {}
for NUM_ENCODER_LAYERS in num_encoder_layers:
    for NUM_HEADS in num_heads:
        accuracy = eval_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)
        accuracies_layers_heads[f'encoder_{EMBED_DIM}em_{NUM_ENCODER_LAYERS}l_{NUM_HEADS}h_{str(DROPOUT).replace(".","")}dr_{EPOCHS}ep{"_posenc" if POS_ENC else ""}'] = accuracy

Model loaded: 20240518120122_encoder_256em_2l_2h_03dr_10ep.pt
Model used max bias codon for 90.48% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5228 - Organism: E.Coli, Encoder Model - Parameters: 256 embedding dim, 2 layers, 2 heads, 10 epochs
Took 7.29 seconds

Model loaded: 20240518120140_encoder_256em_2l_4h_03dr_10ep.pt
Model used max bias codon for 91.66% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5241 - Organism: E.Coli, Encoder Model - Parameters: 256 embedding dim, 2 layers, 4 heads, 10 epochs
Took 7.22 seconds

Model loaded: 20240518120201_encoder_256em_2l_8h_03dr_10ep.pt
Model used max bias codon for 92.38% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.5238 - Organism: E.Coli, Encoder Model - Parameters: 256 embedding dim, 2 layers, 8 heads, 10 epochs
Took 7.28 seconds

Model loaded: 20240518120228_encoder_256em_2l_16h_03dr_10ep.pt
Model used max bias codon for 91.50% of possible codon predictions
Replaced 0.0

In [210]:
accuracies_layers_heads

{'encoder_256em_2l_2h_03dr_10ep': 0.5228,
 'encoder_256em_2l_4h_03dr_10ep': 0.5241,
 'encoder_256em_2l_8h_03dr_10ep': 0.5238,
 'encoder_256em_2l_16h_03dr_10ep': 0.5238,
 'encoder_256em_4l_2h_03dr_10ep': 0.5174,
 'encoder_256em_4l_4h_03dr_10ep': 0.5244,
 'encoder_256em_4l_8h_03dr_10ep': 0.5241,
 'encoder_256em_4l_16h_03dr_10ep': 0.5237,
 'encoder_256em_8l_2h_03dr_10ep': 0.5175,
 'encoder_256em_8l_4h_03dr_10ep': 0.5175,
 'encoder_256em_8l_8h_03dr_10ep': 0.5175,
 'encoder_256em_8l_16h_03dr_10ep': 0.5175,
 'encoder_256em_16l_2h_03dr_10ep': 0.5175,
 'encoder_256em_16l_4h_03dr_10ep': 0.5175,
 'encoder_256em_16l_8h_03dr_10ep': 0.5175,
 'encoder_256em_16l_16h_03dr_10ep': 0.5175}

In [211]:
max(accuracies_layers_heads.items(), key=lambda item: item[1])

('encoder_256em_4l_4h_03dr_10ep', 0.5244)

## Testing Drosophila.Melanogaster

In [213]:
organism = "Drosophila.Melanogaster"

def group_codons(sequence):
    return [''.join(sequence[i:i+3]) for i in range(0, len(sequence), 3)]

df = pd.read_pickle(f"../data/{organism}/cleanedData_test.pkl")
usage_biases = pd.read_pickle(f"../data/{organism}/usageBias.pkl")
df['codons'] = df['sequence'].apply(group_codons)

min_length = None
max_length = None

df = ml_helper.filter_sequence_length(df, min_length, max_length)
len(df)

4335

In [214]:
EMBED_DIM = 256
NUM_ENCODER_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0.3
POS_ENC = False
EPOCHS = 10

In [215]:
accuracy = eval_parameter_model(EMBED_DIM, NUM_ENCODER_LAYERS, NUM_HEADS, DROPOUT, POS_ENC, EPOCHS)

Model loaded: 20240518122826_encoder_256em_4l_4h_03dr_10ep.pt
Model used max bias codon for 88.39% of possible codon predictions
Replaced 0.00% of codons
Accuracy: 0.4074 - Organism: Drosophila.Melanogaster, Encoder Model - Parameters: 256 embedding dim, 4 layers, 4 heads, 10 epochs
Took 107.46 seconds



### Baseline Classifier

In [216]:
max_weighted_bc = bc.Max_Bias_Baseline_Classifier(usage_biases)
amino_seq = df['translation'].apply(lambda seq: list(seq))
true_codons = df['codons']
pred_codons = max_weighted_bc.predict_codons(amino_seq)
accuracy = max_weighted_bc.calc_accuracy(true_codons, pred_codons)
print(f"Organismus {organism} - Accuracy: {accuracy}")

Organismus Drosophila.Melanogaster - Accuracy: 0.49196268368089835
