In [178]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from Bio import SeqIO
import subprocess



In [None]:
def generate_feature_matrix(scoring_path, query_df, reference_df):
    # Extract all unique query and target IDs
    query_ids = [f"{row.id}_{row.type}" for _, row in query_df.iterrows()]
    target_ids = [f"{row.id}_{row.type}" for _, row in reference_df.iterrows()]

    # Initialize a dictionary to hold scores with 0s for all query-target pairs
    scores_dict = {query_id: {target_id: 0.0 for target_id in target_ids} for query_id in query_ids}

    # Populate the scores_dict with bit scores from DIAMOND output
    with open(scoring_path) as f:
        for line in f:
            query_id, subject_id, identity, alignment_length, mismatches, gap_opens, q_start, q_end, s_start, s_end, evalue, bit_score = line.strip().split()
            if query_id in scores_dict and subject_id in scores_dict[query_id]:
                scores_dict[query_id][subject_id] = float(bit_score)
    
    # Convert scores_dict to a feature matrix
    all_scores = []
    for query_id in query_ids:
        # Each row is a list of bit scores for the current query against each target in the reference
        scores = [scores_dict[query_id][target_id] for target_id in target_ids]
        all_scores.append(scores)
    
    # Convert to numpy array for matrix operations
    feature_matrix = np.array(all_scores)

    # Normalize the matrix
    scaler = MinMaxScaler()
    normalized_features = scaler.fit_transform(feature_matrix)
    
    return normalized_features

In [154]:
def fasta_to_dataframe(fasta_file):
    records = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        records.append({"id": record.id.split('|')[0]
                        , "db": record.id.split('|')[2]
                        , "type": record.id.split('|')[3]
                        , "sequence": str(record.seq)})
    return pd.DataFrame(records)

data = fasta_to_dataframe("../data/database/v1/features.fasta")

In [155]:
uniprot_data = data[data['db'] == 'UNIPROT']
card_ardb_data = data[data['db'].isin(['CARD', 'ARDB'])]

In [None]:
# Write CARD and ARDB sequences to a FASTA file to create a DIAMOND database
with open("card_ardb_reference.fasta", "w") as f:
    for i, row in card_ardb_data.iterrows():
        f.write(f">{row['id']}_{row['type']}\n{row['sequence']}\n")

with open("uniprot_sequences.fasta", "w") as f:
    for i, row in uniprot_data.iterrows():
        f.write(f">{row['id']}_{row['type']}\n{row['sequence']}\n")

CompletedProcess(args=['diamond', 'makedb', '--in', 'card_ardb_reference.fasta', '-d', 'card_ardb_db'], returncode=-9)

In [121]:

feature_matrix = generate_feature_matrix('out.tsv', uniprot_data, card_ardb_data)



In [122]:
X_train, X_val, y_train, y_val = train_test_split(feature_matrix, uniprot_data['type'], test_size=0.3, random_state=42)

In [159]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train) 
y_val_encoded = label_encoder.transform(y_val)

In [161]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_encoded, dtype=torch.long)

In [162]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

In [None]:
class DeepARGMLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DeepARGMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 2000)
        self.fc2 = nn.Linear(2000, 1000)
        self.fc3 = nn.Linear(1000, 500)
        self.fc4 = nn.Linear(500, 100)
        self.output = nn.Linear(100, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = self.dropout(x)
        x = torch.relu(self.fc4(x))
        x = self.dropout(x)
        x = self.output(x)
        return torch.softmax(x, dim=1)

In [164]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [169]:
input_dim = X_train.shape[1]
num_classes = uniprot_data['type'].nunique()

model = DeepARGMLP(input_dim=input_dim, output_dim=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [171]:
epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            outputs = model(X_val_batch)
            loss = criterion(outputs, y_val_batch)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_val_batch.size(0)
            correct += (predicted == y_val_batch).sum().item()
    
    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {accuracy:.2f}%")

Epoch 1/5, Loss: 1.8282, Val Loss: 1.8249, Val Accuracy: 99.15%
Epoch 2/5, Loss: 1.8255, Val Loss: 1.8246, Val Accuracy: 99.18%
Epoch 3/5, Loss: 1.8240, Val Loss: 1.8240, Val Accuracy: 99.25%
Epoch 4/5, Loss: 1.8235, Val Loss: 1.8240, Val Accuracy: 99.25%
Epoch 5/5, Loss: 1.8246, Val Loss: 1.8243, Val Accuracy: 99.22%


In [179]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_model = None
best_f1_score = 0  # Track the best F1 score

# Cross-validation loop
for fold, (train_index, val_index) in enumerate(kf.split(X_train_tensor, y_train_tensor)):
    print(f"Starting Fold {fold + 1}")
    
    # Split data into train and validation sets for this fold
    X_fold_train, X_fold_val = X_train_tensor[train_index], X_train_tensor[val_index]
    y_fold_train, y_fold_val = y_train_tensor[train_index], y_train_tensor[val_index]
    
    # Create DataLoaders for this fold
    train_dataset = TensorDataset(X_fold_train, y_fold_train)
    val_dataset = TensorDataset(X_fold_val, y_fold_val)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    
    # Initialize model, loss, and optimizer
    model = DeepARGMLP(input_dim=X_train.shape[1], output_dim=num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
    
    # Train the model for each fold with 100 epochs
    epochs = 100
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        # Evaluate on the fold validation set
        model.eval()
        val_loss = 0.0
        all_predictions = []
        all_labels = []
        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                outputs = model(X_val_batch)
                loss = criterion(outputs, y_val_batch)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(y_val_batch.cpu().numpy())
        
        # Calculate F1 score for this epoch
        fold_f1_score = f1_score(all_labels, all_predictions, average='weighted')
        print(f"Fold {fold + 1} Epoch {epoch + 1}/{epochs}, "
              f"Loss: {running_loss/len(train_loader):.4f}, "
              f"Val Loss: {val_loss/len(val_loader):.4f}, "
              f"F1 Score: {fold_f1_score:.4f}")
    
    # Save the best model based on F1 score
    if fold_f1_score > best_f1_score:
        best_f1_score = fold_f1_score
        best_model = model  # Save the best-performing model from cross-validation



Starting Fold 1
Fold 1 Epoch 1/100, Loss: 2.7042, Val Loss: 2.6973, F1 Score: 0.1643
Fold 1 Epoch 2/100, Loss: 2.6879, Val Loss: 2.6757, F1 Score: 0.1643
Fold 1 Epoch 3/100, Loss: 2.6500, Val Loss: 2.6020, F1 Score: 0.1643
Fold 1 Epoch 4/100, Loss: 2.4477, Val Loss: 2.2479, F1 Score: 0.5824
Fold 1 Epoch 5/100, Loss: 2.1500, Val Loss: 2.1090, F1 Score: 0.5831
Fold 1 Epoch 6/100, Loss: 2.1089, Val Loss: 2.1062, F1 Score: 0.5804
Fold 1 Epoch 7/100, Loss: 2.1049, Val Loss: 2.1037, F1 Score: 0.5776
Fold 1 Epoch 8/100, Loss: 2.1028, Val Loss: 2.1004, F1 Score: 0.5769
Fold 1 Epoch 9/100, Loss: 2.0986, Val Loss: 2.0966, F1 Score: 0.5769
Fold 1 Epoch 10/100, Loss: 2.0952, Val Loss: 2.0929, F1 Score: 0.5769
Fold 1 Epoch 11/100, Loss: 2.0914, Val Loss: 2.0896, F1 Score: 0.5800
Fold 1 Epoch 12/100, Loss: 2.0884, Val Loss: 2.0865, F1 Score: 0.5821
Fold 1 Epoch 13/100, Loss: 2.0853, Val Loss: 2.0835, F1 Score: 0.5823
Fold 1 Epoch 14/100, Loss: 2.0822, Val Loss: 2.0801, F1 Score: 0.5843
Fold 1 Epoch 

In [None]:
from sklearn.metrics import precision_recall_fscore_support

all_predictions = []
all_labels = []
with torch.no_grad():
    for X_val_batch, y_val_batch in val_loader:
        outputs = best_model(X_val_batch)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_val_batch.cpu().numpy())

# Calculate macro precision, recall, and F1 score, as well as per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(
    all_labels, all_predictions, average=None, labels=range(num_classes)
)

avg_precision, avg_recall, avg_f1, avg_support = precision_recall_fscore_support(
    all_labels, all_predictions, average='micro'
)

# Calculate macro-averaged metrics (ignoring class imbalance)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    all_labels, all_predictions, average='macro'
)

weighted_precision, weighted_recall, weighted_f1, weighted_support = precision_recall_fscore_support(
    all_labels, all_predictions, average='weighted'
)

# Print macro-averaged metrics
print(f"Precision: {avg_precision}")
print(f"Recall: {avg_recall}")
print(f"F1 Score: {avg_f1}")

print(f"Macro Precision: {macro_precision}")
print(f"Macro Recall: {macro_recall}")
print(f"Macro F1 Score: {macro_f1}")

print(f"Weighted Precision: {weighted_precision}")
print(f"Weighted Recall: {weighted_recall}")
print(f"Weighted F1 Score: {weighted_f1}")

Precision: 0.9777150031387319
Recall: 0.9777150031387319
F1 Score: 0.9777150031387319
Macro Precision: 0.5416451981395275
Macro Recall: 0.5795250443201113
Macro F1 Score: 0.5557386207392694
Weighted Precision: 0.9662557402712031
Weighted Recall: 0.9777150031387319
Weighted F1 Score: 0.9706750317725013


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [183]:
class_names = label_encoder.classes_

for i, class_name in enumerate(class_names):
    print(f"Class '{class_name}': Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1[i]}, Support: {support[i]}")

Class 'aminoglycoside': Precision: 1.0, Recall: 0.9871794871794872, F1 Score: 0.9935483870967742, Support: 156
Class 'bacitracin': Precision: 1.0, Recall: 1.0, F1 Score: 1.0, Support: 1206
Class 'beta_lactam': Precision: 1.0, Recall: 0.9953314659197012, F1 Score: 0.9976602714085167, Support: 1071
Class 'chloramphenicol': Precision: 1.0, Recall: 0.9836065573770492, F1 Score: 0.9917355371900827, Support: 122
Class 'fosfomycin': Precision: 0.576271186440678, Recall: 1.0, F1 Score: 0.7311827956989247, Support: 68
Class 'fosmidomycin': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 0
Class 'glycopeptide': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 9
Class 'macrolide-lincosamide-streptogramin': Precision: 0.927536231884058, Recall: 0.9922480620155039, F1 Score: 0.9588014981273408, Support: 258
Class 'multidrug': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 47
Class 'mupirocin': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 1
Class 'polymyxin': Precision: 0.9959

In [190]:
model_path = "models/best_lr_model.pth"

# Save the trained model
torch.save(best_model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to models/best_lr_model.pth


In [189]:
output_path = "results/lr_model_predictions.csv"

# After evaluating the best model on the holdout set and collecting predictions
all_predictions = []
all_labels = []
with torch.no_grad():
    for X_val_batch, y_val_batch in val_loader:
        outputs = best_model(X_val_batch)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_val_batch.cpu().numpy())

# Decode labels and predictions to their original class names
true_labels = label_encoder.inverse_transform(all_labels)
predicted_labels = label_encoder.inverse_transform(all_predictions)

# Include the ID (index) from y_val
ids = range(len(y_val))  # Assuming y_val is the original holdout labels array

# Create a DataFrame to store the outputs
outputs_df = pd.DataFrame({
    "ID": ids,
    "True Label": true_labels,
    "Predicted Label": predicted_labels
})

# Save the outputs to a CSV file
outputs_df.to_csv(output_path, index=False)