# Using the best classifiers of each ML model

In [53]:
%load_ext autoreload
%autoreload

import pandas as pd
import sys
import torch

sys.path.append('../scripts')
import ml_helper as mlh
import ml_evaluation as mle
import Baseline_classifiers as bc
import encoder as e
import Tcnn as tcnn
import rnn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Prefer the second GPU (index 1) as before, but fall back to the first GPU
# on single-GPU machines and to the CPU when CUDA is unavailable, so that
# moving a model to `device` does not fail on an invalid device ordinal.
if torch.cuda.is_available():
    _gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{_gpu_index}")
else:
    device = torch.device("cpu")

In [11]:
# Organism identifiers; each one is used as a directory name under ../data/.
organisms = [
    "E.Coli",
    "Drosophila.Melanogaster",
    "Homo.Sapiens",
]

# Model names accepted by predict_codons.
models = [
    "Max CUB",
    "RNN",
    "Encoder",
    "TCNN",
]

In [12]:
# Per-organism lookup tables, keyed by organism name:
#   dfs          -> DataFrame read from cleanedData_test.pkl
#   usage_biases -> object read from usageBias.pkl
dfs, usage_biases = {}, {}

def group_codons(sequence):
    """Split a sequence into consecutive chunks of three, joined as strings.

    `sequence` may be a string or a list of single-character strings. A
    trailing chunk shorter than three elements is kept as-is.
    """
    codons = []
    for start in range(0, len(sequence), 3):
        triplet = sequence[start:start + 3]
        codons.append(''.join(triplet))
    return codons

# Load the test DataFrame and the usage-bias object for every organism.
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
for organism in organisms:
    test_frame = pd.read_pickle(f"../data/{organism}/cleanedData_test.pkl")
    dfs[organism] = test_frame
    # Add the 'sequence' column split into three-element codon strings.
    test_frame['codons'] = test_frame['sequence'].apply(group_codons)
    usage_biases[organism] = pd.read_pickle(f"../data/{organism}/usageBias.pkl")

In [51]:
def _strip_padding(codon_preds):
    """Turn an array-like of predictions into lists with '' entries removed."""
    return [[codon for codon in row if codon != ''] for row in codon_preds.tolist()]


def predict_codons(amino_sequences, organism, model):
    """Predict codons for every amino-acid sequence with the chosen model.

    Parameters
    ----------
    amino_sequences : list of lists
        One inner list of amino acids per sequence.
    organism : str
        Key into ``usage_biases`` and the organism handed to
        ``mlh.load_model``.
    model : str
        One of "Max CUB", "RNN", "Encoder" or "TCNN".

    Returns
    -------
    For "Max CUB" and "Encoder": a list of lists of codons with empty-string
    entries removed. For "RNN" and "TCNN": whatever the classifier's
    ``predict_codons`` returns, unmodified.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names (previously this
        case silently returned ``None``).
    """
    if model == "Max CUB":
        baseline = bc.Max_Bias_Baseline_Classifier(usage_biases[organism])
        return _strip_padding(baseline.predict_codons(amino_sequences))

    if model == "RNN":
        # NOTE(review): this sets the organism on the encoder module `e`
        # although the classifier comes from `rnn` — confirm this is intended.
        e.organism = organism
        network = mlh.load_model('rnn', organism, device=device)
        return rnn.RNN_Classifier(network).predict_codons(amino_sequences)

    if model == "Encoder":
        e.organism = organism
        network = mlh.load_model('encoder', organism, device=device)
        encoder_classifier = e.Encoder_Classifier(network)
        return _strip_padding(encoder_classifier.predict_codons(amino_sequences))

    if model == "TCNN":
        # NOTE(review): as for the RNN branch, the organism is set on `e`,
        # not on `tcnn` — confirm this is intended.
        e.organism = organism
        network = mlh.load_model('tcn', organism, device=device)
        return tcnn.Tcn_Classifier(network).predict_codons(amino_sequences)

    raise ValueError(
        f"Unknown model {model!r}; expected one of 'Max CUB', 'RNN', 'Encoder', 'TCNN'"
    )

In [37]:
# Organism used for the example predictions in the following cells.
organism = "E.Coli"
# Expand every entry of the 'translation' column into a list of its elements.
amino_sequences = [list(seq) for seq in dfs[organism]['translation']]
len(amino_sequences)

386

In [26]:
# Baseline predictions for the selected organism; print the number of
# predicted sequences followed by the first prediction.
max_cub_preds = predict_codons(amino_sequences, organism, "Max CUB")
print(len(max_cub_preds), max_cub_preds[0], sep="\n")

386
['ATG', 'CGC', 'CAT', 'CCG', 'CTG', 'GTG', 'ATG', 'GGC', 'AAC', 'TGG', 'AAA', 'CTG', 'AAC', 'GGC', 'AGC', 'CGC', 'CAT', 'ATG', 'GTG', 'CAT', 'GAA', 'CTG', 'GTG', 'AGC', 'AAC', 'CTG', 'CGC', 'AAA', 'GAA', 'CTG', 'GCG', 'GGC', 'GTG', 'GCG', 'GGC', 'TGC', 'GCG', 'GTG', 'GCG', 'ATT', 'GCG', 'CCG', 'CCG', 'GAA', 'ATG', 'TAT', 'ATT', 'GAT', 'ATG', 'GCG', 'AAA', 'CGC', 'GAA', 'GCG', 'GAA', 'GGC', 'AGC', 'CAT', 'ATT', 'ATG', 'CTG', 'GGC', 'GCG', 'CAG', 'AAC', 'GTG', 'GAT', 'CTG', 'AAC', 'CTG', 'AGC', 'GGC', 'GCG', 'TTT', 'ACC', 'GGC', 'GAA', 'ACC', 'AGC', 'GCG', 'GCG', 'ATG', 'CTG', 'AAA', 'GAT', 'ATT', 'GGC', 'GCG', 'CAG', 'TAT', 'ATT', 'ATT', 'ATT', 'GGC', 'CAT', 'AGC', 'GAA', 'CGC', 'CGC', 'ACC', 'TAT', 'CAT', 'AAA', 'GAA', 'AGC', 'GAT', 'GAA', 'CTG', 'ATT', 'GCG', 'AAA', 'AAA', 'TTT', 'GCG', 'GTG', 'CTG', 'AAA', 'GAA', 'CAG', 'GGC', 'CTG', 'ACC', 'CCG', 'GTG', 'CTG', 'TGC', 'ATT', 'GGC', 'GAA', 'ACC', 'GAA', 'GCG', 'GAA', 'AAC', 'GAA', 'GCG', 'GGC', 'AAA', 'ACC', 'GAA', 'GAA', 'GTG', '

In [44]:
# Encoder predictions for the selected organism.
encoder_preds = predict_codons(amino_sequences, organism, "Encoder")
# The variable was originally misspelled; keep the old name as an alias in
# case other cells still refer to it.
enocder_preds = encoder_preds
print(len(encoder_preds))
print(encoder_preds[0])

Model loaded: 20240627091800_encoder_64em_2l_4h_05dr_400ep.pt
386
['ATG', 'CGT', 'CAT', 'CCG', 'CTG', 'GTT', 'ATG', 'GGT', 'AAC', 'TGG', 'AAA', 'CTG', 'AAC', 'GGT', 'TCT', 'CGT', 'CAT', 'ATG', 'GTT', 'CAT', 'GAA', 'CTG', 'GTT', 'TCT', 'AAC', 'CTG', 'CGT', 'AAA', 'GAA', 'CTG', 'GCG', 'GGT', 'GTT', 'GCG', 'GGT', 'TGC', 'GCG', 'GTT', 'GCG', 'ATT', 'GCG', 'CCG', 'CCG', 'GAA', 'ATG', 'TAT', 'ATT', 'GAT', 'ATG', 'GCG', 'AAA', 'CGT', 'GAA', 'GCG', 'GAA', 'GGT', 'TCT', 'CAT', 'ATT', 'ATG', 'CTG', 'GGT', 'GCG', 'CAG', 'AAC', 'GTT', 'GAT', 'CTG', 'AAC', 'CTG', 'TCT', 'GGT', 'GCG', 'TTC', 'ACC', 'GGT', 'GAA', 'ACC', 'TCT', 'GCG', 'GCG', 'ATG', 'CTG', 'AAA', 'GAT', 'ATT', 'GGT', 'GCG', 'CAG', 'TAT', 'ATT', 'ATT', 'ATT', 'GGT', 'CAT', 'TCT', 'GAA', 'CGT', 'CGT', 'ACC', 'TAT', 'CAT', 'AAA', 'GAA', 'TCT', 'GAT', 'GAA', 'CTG', 'ATT', 'GCG', 'AAA', 'AAA', 'TTC', 'GCG', 'GTT', 'CTG', 'AAA', 'GAA', 'CAG', 'GGT', 'CTG', 'ACC', 'CCG', 'GTT', 'CTG', 'TGC', 'ATT', 'GGT', 'GAA', 'ACC', 'GAA', 'GCG', 'GAA', 'A

In [50]:
# TCNN predictions for the selected organism.
# NOTE(review): the recorded output of this cell is an AttributeError
# ('str' object has no attribute 'long'); the TCNN classifier presumably
# expects a different input format — TODO confirm and fix before relying on it.
tcnn_preds = predict_codons(amino_sequences, organism, "TCNN")
print(len(tcnn_preds), tcnn_preds[0], sep="\n")

Model loaded: 20240623012201_tcnn_best_model_acc_0.555.pt


AttributeError: 'str' object has no attribute 'long'

In [54]:
# RNN predictions for the selected organism.
# NOTE(review): the recorded output of this cell is a ValueError
# (too many values to unpack); the cause is not visible in this notebook —
# TODO investigate the RNN classifier's expected input before relying on it.
rnn_preds = predict_codons(amino_sequences, organism, "RNN")
print(len(rnn_preds), rnn_preds[0], sep="\n")

Model loaded: 20240521135840_rnn_hidden128_epochs10_lr0.001_optimSGD.pt


ValueError: too many values to unpack (expected 2)