In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from Bio import SeqIO
import subprocess
import pickle

In [3]:
with open('lr_dna_feature_matrix.pkl', 'rb') as handle:
    feature_matrix = pickle.load(handle)

In [4]:
dna_data = pd.read_csv('all_df_v2.csv')
uniprot_data = dna_data[dna_data['db'] == 'UNIPROT']

In [7]:
def contains_invalid_dna_bases(sequence):
    valid_bases = {'A', 'T', 'C', 'G'}
    return any(base not in valid_bases for base in sequence.upper())

uniprot_data = uniprot_data[~uniprot_data['dna_seq'].apply(contains_invalid_dna_bases)]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, uniprot_data.type, test_size=0.3, random_state=123)


In [15]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train) 
y_test_encoded = label_encoder.transform(y_test)

In [16]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

In [17]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [18]:
class DeepARGMLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DeepARGMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 2000)
        self.fc2 = nn.Linear(2000, 1000)
        self.fc3 = nn.Linear(1000, 500)
        self.fc4 = nn.Linear(500, 100)
        self.output = nn.Linear(100, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = self.dropout(x)
        x = torch.relu(self.fc4(x))
        x = self.dropout(x)
        x = self.output(x)
        return torch.softmax(x, dim=1)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [20]:
input_dim = X_train.shape[1]
num_classes = uniprot_data['type'].nunique()

model = DeepARGMLP(input_dim=input_dim, output_dim=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [22]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_model = None
best_f1_score = 0  # Track the best F1 score

# Cross-validation loop
for fold, (train_index, val_index) in enumerate(kf.split(X_train_tensor, y_train_tensor)):
    print(f"Starting Fold {fold + 1}")
    
    # Split data into train and validation sets for this fold
    X_fold_train, X_fold_val = X_train_tensor[train_index], X_train_tensor[val_index]
    y_fold_train, y_fold_val = y_train_tensor[train_index], y_train_tensor[val_index]
    
    # Move data to the specified device
    X_fold_train = X_fold_train.to(device)
    X_fold_val = X_fold_val.to(device)
    y_fold_train = y_fold_train.to(device)
    y_fold_val = y_fold_val.to(device)
    
    # Create DataLoaders for this fold
    train_dataset = TensorDataset(X_fold_train, y_fold_train)
    val_dataset = TensorDataset(X_fold_val, y_fold_val)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    
    # Initialize model, loss, and optimizer
    model = DeepARGMLP(input_dim=X_train.shape[1], output_dim=num_classes).to(device)  # Move model to device
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
    
    # Train the model for each fold with 100 epochs
    epochs = 100
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move batch to device
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        # Evaluate on the fold validation set
        model.eval()
        val_loss = 0.0
        all_predictions = []
        all_labels = []
        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)  # Move batch to device
                outputs = model(X_val_batch)
                loss = criterion(outputs, y_val_batch)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                all_predictions.extend(predicted.cpu().numpy())  # Move to CPU for metric calculation
                all_labels.extend(y_val_batch.cpu().numpy())
        
        # Calculate F1 score for this epoch
        fold_f1_score = f1_score(all_labels, all_predictions, average='weighted')
        print(f"Fold {fold + 1} Epoch {epoch + 1}/{epochs}, "
              f"Loss: {running_loss/len(train_loader):.4f}, "
              f"Val Loss: {val_loss/len(val_loader):.4f}, "
              f"F1 Score: {fold_f1_score:.4f}")
    
    # Save the best model based on F1 score
    if fold_f1_score > best_f1_score:
        best_f1_score = fold_f1_score
        best_model = model 



Starting Fold 1
Fold 1 Epoch 1/100, Loss: 2.7001, Val Loss: 2.6927, F1 Score: 0.2063
Fold 1 Epoch 2/100, Loss: 2.6820, Val Loss: 2.6677, F1 Score: 0.2063
Fold 1 Epoch 3/100, Loss: 2.6334, Val Loss: 2.5593, F1 Score: 0.2063
Fold 1 Epoch 4/100, Loss: 2.4204, Val Loss: 2.3672, F1 Score: 0.2063
Fold 1 Epoch 5/100, Loss: 2.2953, Val Loss: 2.1834, F1 Score: 0.5636
Fold 1 Epoch 6/100, Loss: 2.1537, Val Loss: 2.1590, F1 Score: 0.5636
Fold 1 Epoch 7/100, Loss: 2.1455, Val Loss: 2.1561, F1 Score: 0.5633
Fold 1 Epoch 8/100, Loss: 2.1416, Val Loss: 2.1529, F1 Score: 0.5566
Fold 1 Epoch 9/100, Loss: 2.1403, Val Loss: 2.1517, F1 Score: 0.5562
Fold 1 Epoch 10/100, Loss: 2.1376, Val Loss: 2.1510, F1 Score: 0.5546
Fold 1 Epoch 11/100, Loss: 2.1378, Val Loss: 2.1503, F1 Score: 0.5512
Fold 1 Epoch 12/100, Loss: 2.1357, Val Loss: 2.1499, F1 Score: 0.5524
Fold 1 Epoch 13/100, Loss: 2.1371, Val Loss: 2.1496, F1 Score: 0.5592
Fold 1 Epoch 14/100, Loss: 2.1349, Val Loss: 2.1488, F1 Score: 0.5508
Fold 1 Epoch 

Fold 2 Epoch 18/100, Loss: 2.1276, Val Loss: 2.1159, F1 Score: 0.5626
Fold 2 Epoch 19/100, Loss: 2.1229, Val Loss: 2.1132, F1 Score: 0.5631
Fold 2 Epoch 20/100, Loss: 2.1187, Val Loss: 2.1073, F1 Score: 0.6366
Fold 2 Epoch 21/100, Loss: 2.1110, Val Loss: 2.0918, F1 Score: 0.6824
Fold 2 Epoch 22/100, Loss: 2.0956, Val Loss: 2.0768, F1 Score: 0.6853
Fold 2 Epoch 23/100, Loss: 2.0853, Val Loss: 2.0718, F1 Score: 0.6824
Fold 2 Epoch 24/100, Loss: 2.0790, Val Loss: 2.0682, F1 Score: 0.6824
Fold 2 Epoch 25/100, Loss: 2.0735, Val Loss: 2.0642, F1 Score: 0.6824
Fold 2 Epoch 26/100, Loss: 2.0690, Val Loss: 2.0599, F1 Score: 0.7186
Fold 2 Epoch 27/100, Loss: 2.0635, Val Loss: 2.0522, F1 Score: 0.7586
Fold 2 Epoch 28/100, Loss: 2.0550, Val Loss: 2.0443, F1 Score: 0.7587
Fold 2 Epoch 29/100, Loss: 2.0499, Val Loss: 2.0354, F1 Score: 0.7606
Fold 2 Epoch 30/100, Loss: 2.0396, Val Loss: 2.0276, F1 Score: 0.7611
Fold 2 Epoch 31/100, Loss: 2.0348, Val Loss: 2.0231, F1 Score: 0.7577
Fold 2 Epoch 32/100,

Fold 3 Epoch 35/100, Loss: 2.0484, Val Loss: 2.0208, F1 Score: 0.7487
Fold 3 Epoch 36/100, Loss: 2.0389, Val Loss: 2.0135, F1 Score: 0.7490
Fold 3 Epoch 37/100, Loss: 2.0330, Val Loss: 2.0092, F1 Score: 0.7502
Fold 3 Epoch 38/100, Loss: 2.0293, Val Loss: 2.0056, F1 Score: 0.7610
Fold 3 Epoch 39/100, Loss: 2.0265, Val Loss: 2.0037, F1 Score: 0.7610
Fold 3 Epoch 40/100, Loss: 2.0248, Val Loss: 2.0012, F1 Score: 0.7606
Fold 3 Epoch 41/100, Loss: 2.0205, Val Loss: 1.9995, F1 Score: 0.7616
Fold 3 Epoch 42/100, Loss: 2.0189, Val Loss: 1.9964, F1 Score: 0.7610
Fold 3 Epoch 43/100, Loss: 2.0140, Val Loss: 1.9911, F1 Score: 0.8069
Fold 3 Epoch 44/100, Loss: 2.0081, Val Loss: 1.9821, F1 Score: 0.8194
Fold 3 Epoch 45/100, Loss: 1.9995, Val Loss: 1.9715, F1 Score: 0.8198
Fold 3 Epoch 46/100, Loss: 1.9889, Val Loss: 1.9657, F1 Score: 0.8198
Fold 3 Epoch 47/100, Loss: 1.9841, Val Loss: 1.9628, F1 Score: 0.8224
Fold 3 Epoch 48/100, Loss: 1.9814, Val Loss: 1.9618, F1 Score: 0.8211
Fold 3 Epoch 49/100,

Fold 4 Epoch 52/100, Loss: 1.9741, Val Loss: 1.9671, F1 Score: 0.8179
Fold 4 Epoch 53/100, Loss: 1.9740, Val Loss: 1.9669, F1 Score: 0.8175
Fold 4 Epoch 54/100, Loss: 1.9728, Val Loss: 1.9666, F1 Score: 0.8176
Fold 4 Epoch 55/100, Loss: 1.9735, Val Loss: 1.9666, F1 Score: 0.8175
Fold 4 Epoch 56/100, Loss: 1.9721, Val Loss: 1.9664, F1 Score: 0.8175
Fold 4 Epoch 57/100, Loss: 1.9737, Val Loss: 1.9662, F1 Score: 0.8200
Fold 4 Epoch 58/100, Loss: 1.9727, Val Loss: 1.9659, F1 Score: 0.8205
Fold 4 Epoch 59/100, Loss: 1.9719, Val Loss: 1.9658, F1 Score: 0.8175
Fold 4 Epoch 60/100, Loss: 1.9709, Val Loss: 1.9655, F1 Score: 0.8205
Fold 4 Epoch 61/100, Loss: 1.9722, Val Loss: 1.9652, F1 Score: 0.8182
Fold 4 Epoch 62/100, Loss: 1.9718, Val Loss: 1.9645, F1 Score: 0.8210
Fold 4 Epoch 63/100, Loss: 1.9705, Val Loss: 1.9637, F1 Score: 0.8198
Fold 4 Epoch 64/100, Loss: 1.9698, Val Loss: 1.9626, F1 Score: 0.8179
Fold 4 Epoch 65/100, Loss: 1.9679, Val Loss: 1.9614, F1 Score: 0.8194
Fold 4 Epoch 66/100,

Fold 5 Epoch 69/100, Loss: 1.9728, Val Loss: 1.9756, F1 Score: 0.8083
Fold 5 Epoch 70/100, Loss: 1.9717, Val Loss: 1.9759, F1 Score: 0.8097
Fold 5 Epoch 71/100, Loss: 1.9706, Val Loss: 1.9755, F1 Score: 0.8083
Fold 5 Epoch 72/100, Loss: 1.9713, Val Loss: 1.9758, F1 Score: 0.8083
Fold 5 Epoch 73/100, Loss: 1.9717, Val Loss: 1.9753, F1 Score: 0.8097
Fold 5 Epoch 74/100, Loss: 1.9728, Val Loss: 1.9754, F1 Score: 0.8104
Fold 5 Epoch 75/100, Loss: 1.9713, Val Loss: 1.9755, F1 Score: 0.8083
Fold 5 Epoch 76/100, Loss: 1.9708, Val Loss: 1.9759, F1 Score: 0.8086
Fold 5 Epoch 77/100, Loss: 1.9707, Val Loss: 1.9759, F1 Score: 0.8086
Fold 5 Epoch 78/100, Loss: 1.9713, Val Loss: 1.9754, F1 Score: 0.8083
Fold 5 Epoch 79/100, Loss: 1.9715, Val Loss: 1.9756, F1 Score: 0.8089
Fold 5 Epoch 80/100, Loss: 1.9716, Val Loss: 1.9752, F1 Score: 0.8096
Fold 5 Epoch 81/100, Loss: 1.9701, Val Loss: 1.9757, F1 Score: 0.8083
Fold 5 Epoch 82/100, Loss: 1.9713, Val Loss: 1.9753, F1 Score: 0.8083
Fold 5 Epoch 83/100,

Fold 6 Epoch 86/100, Loss: 1.9700, Val Loss: 1.9840, F1 Score: 0.8012
Fold 6 Epoch 87/100, Loss: 1.9704, Val Loss: 1.9840, F1 Score: 0.8015
Fold 6 Epoch 88/100, Loss: 1.9687, Val Loss: 1.9835, F1 Score: 0.8014
Fold 6 Epoch 89/100, Loss: 1.9694, Val Loss: 1.9841, F1 Score: 0.8015
Fold 6 Epoch 90/100, Loss: 1.9704, Val Loss: 1.9838, F1 Score: 0.8014
Fold 6 Epoch 91/100, Loss: 1.9704, Val Loss: 1.9833, F1 Score: 0.8014
Fold 6 Epoch 92/100, Loss: 1.9692, Val Loss: 1.9838, F1 Score: 0.8014
Fold 6 Epoch 93/100, Loss: 1.9688, Val Loss: 1.9839, F1 Score: 0.8014
Fold 6 Epoch 94/100, Loss: 1.9720, Val Loss: 1.9836, F1 Score: 0.8014
Fold 6 Epoch 95/100, Loss: 1.9695, Val Loss: 1.9835, F1 Score: 0.8014
Fold 6 Epoch 96/100, Loss: 1.9697, Val Loss: 1.9837, F1 Score: 0.8014
Fold 6 Epoch 97/100, Loss: 1.9695, Val Loss: 1.9836, F1 Score: 0.8014
Fold 6 Epoch 98/100, Loss: 1.9702, Val Loss: 1.9833, F1 Score: 0.8014
Fold 6 Epoch 99/100, Loss: 1.9704, Val Loss: 1.9838, F1 Score: 0.8014
Fold 6 Epoch 100/100

Fold 8 Epoch 3/100, Loss: 2.6577, Val Loss: 2.6194, F1 Score: 0.2076
Fold 8 Epoch 4/100, Loss: 2.4810, Val Loss: 2.3571, F1 Score: 0.2076
Fold 8 Epoch 5/100, Loss: 2.3146, Val Loss: 2.1913, F1 Score: 0.5750
Fold 8 Epoch 6/100, Loss: 2.1630, Val Loss: 2.1362, F1 Score: 0.5747
Fold 8 Epoch 7/100, Loss: 2.1450, Val Loss: 2.1344, F1 Score: 0.5747
Fold 8 Epoch 8/100, Loss: 2.1411, Val Loss: 2.1343, F1 Score: 0.5601
Fold 8 Epoch 9/100, Loss: 2.1416, Val Loss: 2.1341, F1 Score: 0.5576
Fold 8 Epoch 10/100, Loss: 2.1399, Val Loss: 2.1346, F1 Score: 0.5574
Fold 8 Epoch 11/100, Loss: 2.1406, Val Loss: 2.1339, F1 Score: 0.5596
Fold 8 Epoch 12/100, Loss: 2.1385, Val Loss: 2.1338, F1 Score: 0.5567
Fold 8 Epoch 13/100, Loss: 2.1377, Val Loss: 2.1330, F1 Score: 0.5573
Fold 8 Epoch 14/100, Loss: 2.1384, Val Loss: 2.1324, F1 Score: 0.5574
Fold 8 Epoch 15/100, Loss: 2.1369, Val Loss: 2.1311, F1 Score: 0.5574
Fold 8 Epoch 16/100, Loss: 2.1362, Val Loss: 2.1303, F1 Score: 0.5575
Fold 8 Epoch 17/100, Loss: 

Fold 9 Epoch 21/100, Loss: 2.1293, Val Loss: 2.1147, F1 Score: 0.5558
Fold 9 Epoch 22/100, Loss: 2.1270, Val Loss: 2.1113, F1 Score: 0.5571
Fold 9 Epoch 23/100, Loss: 2.1236, Val Loss: 2.1074, F1 Score: 0.5672
Fold 9 Epoch 24/100, Loss: 2.1210, Val Loss: 2.1027, F1 Score: 0.5754
Fold 9 Epoch 25/100, Loss: 2.1187, Val Loss: 2.0971, F1 Score: 0.5754
Fold 9 Epoch 26/100, Loss: 2.1106, Val Loss: 2.0911, F1 Score: 0.6232
Fold 9 Epoch 27/100, Loss: 2.1054, Val Loss: 2.0849, F1 Score: 0.6681
Fold 9 Epoch 28/100, Loss: 2.0989, Val Loss: 2.0731, F1 Score: 0.6921
Fold 9 Epoch 29/100, Loss: 2.0884, Val Loss: 2.0599, F1 Score: 0.6931
Fold 9 Epoch 30/100, Loss: 2.0762, Val Loss: 2.0455, F1 Score: 0.7652
Fold 9 Epoch 31/100, Loss: 2.0642, Val Loss: 2.0342, F1 Score: 0.7645
Fold 9 Epoch 32/100, Loss: 2.0553, Val Loss: 2.0228, F1 Score: 0.7607
Fold 9 Epoch 33/100, Loss: 2.0470, Val Loss: 2.0140, F1 Score: 0.7616
Fold 9 Epoch 34/100, Loss: 2.0386, Val Loss: 2.0081, F1 Score: 0.7636
Fold 9 Epoch 35/100,

Fold 10 Epoch 38/100, Loss: 2.0290, Val Loss: 2.0172, F1 Score: 0.7546
Fold 10 Epoch 39/100, Loss: 2.0247, Val Loss: 2.0146, F1 Score: 0.7535
Fold 10 Epoch 40/100, Loss: 2.0195, Val Loss: 2.0123, F1 Score: 0.7540
Fold 10 Epoch 41/100, Loss: 2.0170, Val Loss: 2.0105, F1 Score: 0.7528
Fold 10 Epoch 42/100, Loss: 2.0154, Val Loss: 2.0080, F1 Score: 0.7527
Fold 10 Epoch 43/100, Loss: 2.0120, Val Loss: 2.0055, F1 Score: 0.7780
Fold 10 Epoch 44/100, Loss: 2.0104, Val Loss: 2.0027, F1 Score: 0.7934
Fold 10 Epoch 45/100, Loss: 2.0046, Val Loss: 1.9978, F1 Score: 0.8115
Fold 10 Epoch 46/100, Loss: 1.9989, Val Loss: 1.9925, F1 Score: 0.8128
Fold 10 Epoch 47/100, Loss: 1.9950, Val Loss: 1.9878, F1 Score: 0.8128
Fold 10 Epoch 48/100, Loss: 1.9904, Val Loss: 1.9843, F1 Score: 0.8128
Fold 10 Epoch 49/100, Loss: 1.9862, Val Loss: 1.9816, F1 Score: 0.8141
Fold 10 Epoch 50/100, Loss: 1.9828, Val Loss: 1.9793, F1 Score: 0.8141
Fold 10 Epoch 51/100, Loss: 1.9808, Val Loss: 1.9783, F1 Score: 0.8139
Fold 1

In [25]:
from sklearn.metrics import precision_recall_fscore_support

all_predictions = []
all_labels = []
with torch.no_grad():
    for X_val_batch, y_val_batch in val_loader:
        outputs = best_model(X_val_batch)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_val_batch.cpu().numpy())

# Calculate macro precision, recall, and F1 score, as well as per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(
    all_labels, all_predictions, average=None, labels=range(num_classes)
)

avg_precision, avg_recall, avg_f1, avg_support = precision_recall_fscore_support(
    all_labels, all_predictions, average='micro'
)

# Calculate macro-averaged metrics (ignoring class imbalance)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    all_labels, all_predictions, average='macro'
)

weighted_precision, weighted_recall, weighted_f1, weighted_support = precision_recall_fscore_support(
    all_labels, all_predictions, average='weighted'
)

# Print macro-averaged metrics
print(f"Precision: {avg_precision}")
print(f"Recall: {avg_recall}")
print(f"F1 Score: {avg_f1}")

print(f"Macro Precision: {macro_precision}")
print(f"Macro Recall: {macro_recall}")
print(f"Macro F1 Score: {macro_f1}")

print(f"Weighted Precision: {weighted_precision}")
print(f"Weighted Recall: {weighted_recall}")
print(f"Weighted F1 Score: {weighted_f1}")

Precision: 0.871900826446281
Recall: 0.871900826446281
F1 Score: 0.871900826446281
Macro Precision: 0.5643069086989727
Macro Recall: 0.5746196221833307
Macro F1 Score: 0.5680843386734744
Weighted Precision: 0.8411475747732174
Weighted Recall: 0.871900826446281
Weighted F1 Score: 0.8550724492499289


In [26]:
class_names = label_encoder.classes_

for i, class_name in enumerate(class_names):
    print(f"Class '{class_name}': Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1[i]}, Support: {support[i]}")

Class 'aminoglycoside': Precision: 0.85, Recall: 0.85, F1 Score: 0.85, Support: 40
Class 'bacitracin': Precision: 0.8657718120805369, Recall: 0.9416058394160584, F1 Score: 0.9020979020979021, Support: 274
Class 'beta_lactam': Precision: 0.925, Recall: 0.925, F1 Score: 0.925, Support: 240
Class 'chloramphenicol': Precision: 0.8076923076923077, Recall: 0.7777777777777778, F1 Score: 0.7924528301886792, Support: 27
Class 'fosfomycin': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 16
Class 'fosmidomycin': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 0
Class 'glycopeptide': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 1
Class 'macrolide-lincosamide-streptogramin': Precision: 0.7246376811594203, Recall: 0.8771929824561403, F1 Score: 0.7936507936507936, Support: 57
Class 'multidrug': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 11
Class 'mupirocin': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 0
Class 'polymyxin': Precision: 0.9056603773584906, Recall: 0.

In [27]:
model_path = "models/best_dna_lr_model.pth"

# Save the trained model
torch.save(best_model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to models/best_dna_lr_model.pth


In [30]:
output_path = "results/lr_dna_model_predictions.csv"

# After evaluating the best model on the holdout set and collecting predictions
all_predictions = []
all_labels = []
with torch.no_grad():
    for X_test_batch, y_test_batch in test_loader:
        outputs = best_model(X_test_batch)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_test_batch.cpu().numpy())

# Decode labels and predictions to their original class names
true_labels = label_encoder.inverse_transform(all_labels)
predicted_labels = label_encoder.inverse_transform(all_predictions)

# Include the ID (index) from y_val
ids = range(len(y_test))  # Assuming y_val is the original holdout labels array
print(len(y_test))
print(len(all_labels))
print(len(all_predictions))

# Create a DataFrame to store the outputs
outputs_df = pd.DataFrame({
    "ID": ids,
    "True Label": true_labels,
    "Predicted Label": predicted_labels
})

# Save the outputs to a CSV file
outputs_df.to_csv(output_path, index=False)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_addmm)

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device
best_model = best_model.to(device)

output_path = "results/lr_dna_model_predictions.csv"

# After evaluating the best model on the holdout set and collecting predictions
all_predictions = []
all_labels = []
with torch.no_grad():
    for X_test_batch, y_test_batch in test_loader:
        # Move input batch and labels to the same device as the model
        X_test_batch = X_test_batch.to(device)
        y_test_batch = y_test_batch.to(device)
        
        outputs = best_model(X_test_batch)
        _, predicted = torch.max(outputs, 1)
        
        # Move predictions and labels to CPU for saving
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_test_batch.cpu().numpy())

# Decode labels and predictions to their original class names
true_labels = label_encoder.inverse_transform(all_labels)
predicted_labels = label_encoder.inverse_transform(all_predictions)

# Include the ID (index) from y_test
ids = range(len(y_test))

# Create a DataFrame to store the outputs
outputs_df = pd.DataFrame({
    "ID": ids,
    "True Label": true_labels,
    "Predicted Label": predicted_labels
})

# Save the outputs to a CSV file
outputs_df.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

3115
3115
3115
Predictions saved to results/lr_dna_model_predictions.csv
