# Using the best classifiers of each ML model

In [3]:
%load_ext autoreload
%autoreload

import pandas as pd
import sys
import torch

sys.path.append('../scripts')
import ml_helper as mlh
import ml_evaluation as mle
import Baseline_classifiers as bc
import encoder as e
import Tcn as tcn
import rnn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Select the compute device. The second GPU ("cuda:1") is preferred, but that
# ordinal only exists when at least two GPUs are visible; on a single-GPU
# machine fall back to "cuda:0" instead of failing later with an invalid
# device ordinal. Without CUDA everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

In [5]:
# Organism names; each one is used as a sub-directory name under ../data.
organisms = [
    "E.Coli",
    "Drosophila.Melanogaster",
    "Homo.Sapiens",
]

# Model identifiers accepted by predict_codons.
models = [
    "Max CUB",
    "RNN",
    "Encoder",
    "TCN",
]

In [6]:
# Test DataFrames keyed by organism name (filled from cleanedData_test.pkl).
dfs = {}
# Usage-bias objects keyed by organism name (filled from usageBias.pkl).
usage_biases = {}

def group_codons(sequence):
    """Split *sequence* into consecutive chunks of three items, each joined into a string.

    A trailing chunk with fewer than three items is kept as a shorter string.
    An empty sequence yields an empty list.
    """
    codons = []
    for start in range(0, len(sequence), 3):
        triplet = sequence[start:start + 3]
        codons.append(''.join(triplet))
    return codons

# Load the test set and the usage bias for every organism.
for organism in organisms:
    test_frame = pd.read_pickle(f"../data/{organism}/cleanedData_test.pkl")
    dfs[organism] = test_frame
    # Add a 'codons' column holding the 'sequence' column split into triplets.
    test_frame['codons'] = test_frame['sequence'].apply(group_codons)
    usage_biases[organism] = pd.read_pickle(f"../data/{organism}/usageBias.pkl")

In [28]:
# amino_sequences as list of lists
def _drop_empty_entries(rows):
    """Return *rows* with every '' entry removed from each inner list."""
    return [[item for item in row if item != ''] for row in rows]


def predict_codons(amino_sequences, organism, model):
    """Predict codons for amino acid sequences with the selected classifier.

    Args:
        amino_sequences: Amino acid sequences as a list of lists (one inner
            list per sequence).
        organism: Organism name; selects the usage bias ("Max CUB") or the
            saved model to load (all other models).
        model: Name of the classifier to use: "Max CUB", "RNN", "Encoder"
            or "TCN".

    Returns:
        The predicted codons per sequence. For "Max CUB" and "Encoder" the
        classifier output is converted with ``.tolist()`` and empty strings
        are removed; for "RNN" the integer predictions are mapped through
        ``mlh.integer_to_codons``; for "TCN" the classifier output is
        returned unchanged. ``None`` is returned for any other model name.
    """
    codon_preds = None

    if model == "Max CUB":
        max_weighted_bc = bc.Max_Bias_Baseline_Classifier(usage_biases[organism])
        raw_preds = max_weighted_bc.predict_codons(amino_sequences)
        # '' entries are presumably padding - TODO confirm against the classifier.
        codon_preds = _drop_empty_entries(raw_preds.tolist())
    elif model == "RNN":
        # The encoder module's `organism` attribute is set for every neural
        # model, not only the encoder (kept from the original code; presumably
        # shared state read by the helpers - TODO confirm).
        e.organism = organism
        loaded_model = mlh.load_model('rnn', organism, device=device)
        rnn_classifier = rnn.RNN_Classifier(loaded_model)
        raw_preds = rnn_classifier.predict_codons(amino_sequences)
        # Translate integer class labels into codon strings.
        codon_preds = [
            [mlh.integer_to_codons[pred] for pred in sequence_preds]
            for sequence_preds in raw_preds
        ]
    elif model == "Encoder":
        e.organism = organism
        loaded_model = mlh.load_model('encoder', organism, device=device)
        encoder_classifier = e.Encoder_Classifier(loaded_model)
        raw_preds = encoder_classifier.predict_codons(amino_sequences)
        codon_preds = _drop_empty_entries(raw_preds.tolist())
    elif model == "TCN":
        e.organism = organism
        loaded_model = mlh.load_model('tcn', organism, device=device)
        tcn_classifier = tcn.Tcn_Classifier(loaded_model)
        codon_preds = tcn_classifier.predict_codons(amino_sequences)

    return codon_preds

## Testing on new data
- Option 1:
    - Run Notebook 01 with the new organism and the new FASTA file to generate the required cleanedData.pkl file
- Option 2:
    - Enter the amino acid sequences manually in this notebook

In [None]:
# Option 1: Use new cleanedData.pkl file
new_organism = "new organism" # Add name here
df_new = pd.read_pickle(f"../data/{new_organism}/cleanedData.pkl")

# Turn every entry of the 'translation' column into a list of its items,
# giving the list-of-lists input that predict_codons expects.
amino_sequences = [list(seq) for seq in df_new['translation']]

In [29]:
# Option 2: Define amino_sequences manually
# One inner list per sequence, one single-letter amino acid code per element.
amino_sequences = [
    ['M', 'A', 'L'],
]

In [30]:
# Organism whose usage bias / trained models are used for the predictions below.
trained_organism = "E.Coli"

In [31]:
# Predict with the "Max CUB" baseline; print the number of predicted sequences
# and the codons predicted for the first sequence.
max_cub_preds = predict_codons(amino_sequences, trained_organism, "Max CUB")
print(len(max_cub_preds))
print(max_cub_preds[0])

1
['ATG', 'GCG', 'CTG']


In [32]:
# Predict with the Encoder model; print the number of predicted sequences and
# the codons predicted for the first sequence.
# NOTE(review): the variable name is misspelled ("enocder"); it is kept as-is
# because other cells may reference it.
enocder_preds = predict_codons(amino_sequences, trained_organism, "Encoder")
print(len(enocder_preds))
print(enocder_preds[0])

Model loaded: 20240627091800_encoder_64em_2l_4h_05dr_400ep.pt
1
['ATG', 'GCG', 'CTG']


In [13]:
# Predict with the TCN model; print the number of predicted sequences and the
# codons predicted for the first sequence.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# 'Tcnn', while the notebook imports 'Tcn' - re-run the cell to confirm it works.
tcn_preds = predict_codons(amino_sequences, trained_organism, "TCN")
print(len(tcn_preds))
print(tcn_preds[0])

ModuleNotFoundError: No module named 'Tcnn'

In [34]:
# Predict with the RNN model; print the number of predicted sequences and the
# codons predicted for the first sequence.
rnn_preds = predict_codons(amino_sequences, trained_organism, "RNN")
print(len(rnn_preds))
print(rnn_preds[0])

Model loaded: 20240521135840_rnn_hidden128_epochs10_lr0.001_optimSGD.pt
1
['ATG', 'GCG', 'CTG']
