## Antimicrobial resistance prediction

Given sequence of nucleotides predict if they are likely to be a pathogenic islands for a bacteria or not.

Input: Sequence. 

Output: SequenceID Sequence Subtype Class Subclass Accuracy %  

This is just a test run to develop encoding methods and model architecture. For the actual training, download all the entries and "Mix and match the different subclasses" to have a good coverages. 
https://www.ncbi.nlm.nih.gov/pathogens/refgene/#

In [8]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Pre-processing

In [9]:
from Bio import SeqIO

def fasta_to_dataframe(fasta_file, cols=['RefSeq nucleotide', 'Sequence nucleotide']):
    # Parse the fasta file and extract the records
    records = list(SeqIO.parse(fasta_file, "fasta"))
    
    # Create a dictionary with ID and Sequence
    ids = [record.id.split(":")[0] for record in records]
    sequences = [str(record.seq) for record in records]
    data = {
        cols[0]: ids,
        cols[1]: sequences
    }
    
    # Convert the dictionary to a pandas DataFrame
    return pd.DataFrame(data)

# Example usage
nucleotide_fasta = './complete_dataset/data/nucleotide.fna'
AA_fasta = './complete_dataset/data/proteins.faa'

nucleotide_df = fasta_to_dataframe(nucleotide_fasta)
AA_df = fasta_to_dataframe(AA_fasta, cols=['RefSeq protein', 'Sequence protein'])

print(nucleotide_df.head())
print(AA_df.head())

  RefSeq nucleotide                                Sequence nucleotide
0       NC_000913.3  AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
1       NC_000913.3  AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
2       NC_000913.3  AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
3       NC_000913.3  AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
4       NC_000913.3  AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
   RefSeq protein                                   Sequence protein
0  WP_003703066.1  MLIKSEYKPRMLPKEEQVKKPMTSNGRISFVLMAMAVLFACLIARG...
1  WP_025297907.1  MENTVFDVRHTFQLTSTELRELQTLLAVVYGGDFSPEDWDHALGGL...
2  WP_063839881.1  MKDRSHDDSMAEVCRNTSENHWLKTDYRTLFRLCPDGRIERENDPD...
3  WP_071224044.1  MNHRKGNEPTAAALMELHVLAMFTHDGNMQIRTINEPWPGEELAPR...
4  WP_004918308.1  MGIEYRSLHTSQLTLSEKEALYDLLIEGFEGDFSHDDFAHTLGGMH...


In [10]:
general_df = pd.read_csv('./complete_dataset/AMR_genes.tsv',sep='\t')
general_df = general_df.dropna()
general_df = pd.merge(general_df, nucleotide_df, on='RefSeq nucleotide', how='inner')
general_df = pd.merge(general_df, AA_df, on='RefSeq protein', how='inner')


In [11]:
from lib.encoding import prepare_data

# --------------------------
# Example usage
# --------------------------
# Assuming general_df is your dataframe
max_len = 200
X_padded, y, label_encoder = prepare_data(df=general_df, max_len=max_len)
num_classes = len(label_encoder.classes_)


import joblib
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [None]:
from lib.model import ResistanceModel
# Build & train model
trainer = ResistanceModel(max_len=max_len, num_classes=num_classes, model_type="lstm")
history = trainer.train(X_padded, y, epochs=10, batch_size=150, validation_split=0.2)

# Make predictions
preds = trainer.predict(X_padded[:5])
predicted_classes = label_encoder.inverse_transform(preds.argmax(axis=1))
print("Predicted classes:", predicted_classes)

trainer.model.save("./model/resistance_model.h5")


Epoch 1/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 224ms/step - accuracy: 0.9088 - loss: 0.3772 - val_accuracy: 0.3741 - val_loss: 3.4024
Epoch 2/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 215ms/step - accuracy: 0.9730 - loss: 0.0748 - val_accuracy: 0.3562 - val_loss: 5.1824
Epoch 3/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 207ms/step - accuracy: 0.9757 - loss: 0.0612 - val_accuracy: 0.3571 - val_loss: 5.9095
Epoch 4/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 217ms/step - accuracy: 0.9771 - loss: 0.0570 - val_accuracy: 0.3567 - val_loss: 6.5525
Epoch 5/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 237ms/step - accuracy: 0.9774 - loss: 0.0547 - val_accuracy: 0.3556 - val_loss: 6.4038
Epoch 6/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 221ms/step - accuracy: 0.9787 - loss: 0.0522 - val_accuracy: 0.3565 - val_loss: 6.7980
Epoch 7/10

In [None]:
trainer = ResistanceModel(max_len=max_len, num_classes=num_classes, model_type="lstm")

trainer.model.save("./model/resistance_model.keras")


In [None]:
from tensorflow.keras.models import load_model
import joblib
from lib.encoding import GeneEncoder

# Load model + label encoder
model = load_model("./model/resistance_model.keras")
label_encoder = joblib.load("./model/label_encoder.pkl")

# Reuse encoder
encoder = GeneEncoder()

# Example new sequences
new_sequences = [
    "MDEKELIERAGGPVTRGR",   # looks like aminoglycoside
    "MAYEFCEIGESNEYIILA"    # looks like beta-lactam
]

# Encode + pad to same max_len used in training
max_len = 200  # ⚠️ must match training
X_new = encoder.encode_protein_batch(new_sequences, max_len=max_len)

# Predict
preds = model.predict(X_new)
predicted_classes = label_encoder.inverse_transform(preds.argmax(axis=1))

print("Predictions:", predicted_classes)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 344ms/step
Predictions: ['LIPOPEPTIDE' 'QUINOLONE']


2025-09-18 23:27:04.528334: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.
