In [1]:
! pip install biopython

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ubuntu/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
! pip install torch

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ubuntu/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import f1_score
import pickle

In [4]:
def split_to_short_reads(fasta_file, output_file, read_length=33):
    short_reads = []
    read_ids = []
    types = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence = str(record.seq)
        arg_type = '_'.join(record.id.split('_')[1:])
        # Generate short reads
        for i in range(0, len(sequence), read_length):
            if i + read_length <= len(sequence):  # Ensure we don't exceed the sequence length
                short_read = Seq(sequence[i:i + read_length])
                read_id = f"{record.id}_pos_{i}"
                short_reads.append(SeqRecord(short_read, id=read_id, description=""))
                read_ids.append(read_id)
                types.append(arg_type)

    # Write the short reads to a new fasta file
    SeqIO.write(short_reads, output_file, "fasta")
    return short_reads,read_ids,types


In [5]:
input_fasta = "uniprot_sequences.fasta"
output_fasta = "short_reads.fasta"
short_reads,read_ids,types = split_to_short_reads(input_fasta, output_fasta)

In [6]:
def fasta_to_dataframe(fasta_file):
    records = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        records.append({"id": record.id.split('|')[0]
                        , "db": record.id.split('|')[2]
                        , "type": record.id.split('|')[3]
                        , "sequence": str(record.seq)})
    return pd.DataFrame(records)

data = fasta_to_dataframe("features.fasta")

uniprot_data = data[data['db'] == 'UNIPROT']
card_ardb_data = data[data['db'].isin(['CARD', 'ARDB'])]

In [13]:
def generate_feature_matrix(scoring_path, read_ids, reference_df):
    # Extract all unique query and target IDs
    query_ids = read_ids
    target_ids = [f"{row.id}_{row.type}" for _, row in reference_df.iterrows()]

    # Initialize a dictionary to hold scores with 0s for all query-target pairs
    scores_dict = {query_id: {target_id: 0.0 for target_id in target_ids} for query_id in query_ids}

    # Populate the scores_dict with bit scores from DIAMOND output
    with open(scoring_path) as f:
        for line in f:
            query_id, subject_id, identity, alignment_length, mismatches, gap_opens, q_start, q_end, s_start, s_end, evalue, bit_score = line.strip().split()
            if query_id in scores_dict and subject_id in scores_dict[query_id]:
                scores_dict[query_id][subject_id] = float(bit_score)
    
    # Convert scores_dict to a feature matrix
    all_scores = []
    for query_id in query_ids:
        # Each row is a list of bit scores for the current query against each target in the reference
        scores = [scores_dict[query_id][target_id] for target_id in target_ids]
        all_scores.append(scores)
    
    # Convert to numpy array for matrix operations
    feature_matrix = np.array(all_scores)

    # Normalize the matrix
    scaler = MinMaxScaler()
    normalized_features = scaler.fit_transform(feature_matrix)
    
    return normalized_features

In [None]:
feature_matrix = generate_feature_matrix('out_sr.tsv', read_ids, card_ardb_data)


In [8]:
feature_matrix.shape

(99253, 4355)

In [7]:
with open('feature_matrix.pkl', 'rb') as handle:
    feature_matrix = pickle.load(handle)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, types, test_size=0.3, random_state=42)

In [10]:
X_train.shape

(69477, 4355)

In [33]:
len(val_dataset)

6947

In [38]:
len(train_dataset)

62530

In [45]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train) 
y_test_encoded = label_encoder.transform(y_test)

In [48]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

In [49]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [14]:
class DeepARGMLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DeepARGMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 2000)
        self.fc2 = nn.Linear(2000, 1000)
        self.fc3 = nn.Linear(1000, 500)
        self.fc4 = nn.Linear(500, 100)
        self.output = nn.Linear(100, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = self.dropout(x)
        x = torch.relu(self.fc4(x))
        x = self.dropout(x)
        x = self.output(x)
        return torch.softmax(x, dim=1)

In [51]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [16]:
input_dim = X_train.shape[1]
num_classes = uniprot_data['type'].nunique()

model = DeepARGMLP(input_dim=input_dim, output_dim=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

In [71]:
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            outputs = model(X_val_batch)
            loss = criterion(outputs, y_val_batch)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_val_batch.size(0)
            correct += (predicted == y_val_batch).sum().item()
    
    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {accuracy:.2f}%")

Epoch 1/10, Loss: 2.3959, Val Loss: 2.2565, Val Accuracy: 55.72%
Epoch 2/10, Loss: 2.2495, Val Loss: 2.2491, Val Accuracy: 56.24%
Epoch 3/10, Loss: 2.2462, Val Loss: 2.2434, Val Accuracy: 56.28%
Epoch 4/10, Loss: 2.1715, Val Loss: 2.1458, Val Accuracy: 66.57%
Epoch 5/10, Loss: 2.1039, Val Loss: 2.0815, Val Accuracy: 73.43%
Epoch 6/10, Loss: 2.0819, Val Loss: 2.0811, Val Accuracy: 73.41%
Epoch 7/10, Loss: 2.0800, Val Loss: 2.0805, Val Accuracy: 73.46%
Epoch 8/10, Loss: 2.0793, Val Loss: 2.0802, Val Accuracy: 73.48%
Epoch 9/10, Loss: 2.0786, Val Loss: 2.0795, Val Accuracy: 73.50%
Epoch 10/10, Loss: 2.0777, Val Loss: 2.0780, Val Accuracy: 73.50%


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [18]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_model = None
best_f1_score = 0  # Track the best F1 score

# Cross-validation loop
for fold, (train_index, val_index) in enumerate(kf.split(X_train_tensor, y_train_tensor)):
    print(f"Starting Fold {fold + 1}")
    
    # Split data into train and validation sets for this fold
    X_fold_train, X_fold_val = X_train_tensor[train_index], X_train_tensor[val_index]
    y_fold_train, y_fold_val = y_train_tensor[train_index], y_train_tensor[val_index]
    
    # Move data to the specified device
    X_fold_train = X_fold_train.to(device)
    X_fold_val = X_fold_val.to(device)
    y_fold_train = y_fold_train.to(device)
    y_fold_val = y_fold_val.to(device)
    
    # Create DataLoaders for this fold
    train_dataset = TensorDataset(X_fold_train, y_fold_train)
    val_dataset = TensorDataset(X_fold_val, y_fold_val)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    
    # Initialize model, loss, and optimizer
    model = DeepARGMLP(input_dim=X_train.shape[1], output_dim=num_classes).to(device)  # Move model to device
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
    
    # Train the model for each fold with 100 epochs
    epochs = 100
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move batch to device
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        # Evaluate on the fold validation set
        model.eval()
        val_loss = 0.0
        all_predictions = []
        all_labels = []
        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)  # Move batch to device
                outputs = model(X_val_batch)
                loss = criterion(outputs, y_val_batch)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                all_predictions.extend(predicted.cpu().numpy())  # Move to CPU for metric calculation
                all_labels.extend(y_val_batch.cpu().numpy())
        
        # Calculate F1 score for this epoch
        fold_f1_score = f1_score(all_labels, all_predictions, average='weighted')
        print(f"Fold {fold + 1} Epoch {epoch + 1}/{epochs}, "
              f"Loss: {running_loss/len(train_loader):.4f}, "
              f"Val Loss: {val_loss/len(val_loader):.4f}, "
              f"F1 Score: {fold_f1_score:.4f}")
    
    # Save the best model based on F1 score
    if fold_f1_score > best_f1_score:
        best_f1_score = fold_f1_score
        best_model = model 



Starting Fold 1
Fold 1 Epoch 1/100, Loss: 2.5345, Val Loss: 2.3232, F1 Score: 0.3994
Fold 1 Epoch 2/100, Loss: 2.2704, Val Loss: 2.2474, F1 Score: 0.4544
Fold 1 Epoch 3/100, Loss: 2.2503, Val Loss: 2.2436, F1 Score: 0.4552
Fold 1 Epoch 4/100, Loss: 2.2477, Val Loss: 2.2423, F1 Score: 0.4478
Fold 1 Epoch 5/100, Loss: 2.2479, Val Loss: 2.2421, F1 Score: 0.4473
Fold 1 Epoch 6/100, Loss: 2.2475, Val Loss: 2.2425, F1 Score: 0.4634
Fold 1 Epoch 7/100, Loss: 2.2460, Val Loss: 2.2418, F1 Score: 0.4572
Fold 1 Epoch 8/100, Loss: 2.2451, Val Loss: 2.2400, F1 Score: 0.4519
Fold 1 Epoch 9/100, Loss: 2.2293, Val Loss: 2.1955, F1 Score: 0.5978
Fold 1 Epoch 10/100, Loss: 2.1588, Val Loss: 2.1425, F1 Score: 0.5967
Fold 1 Epoch 11/100, Loss: 2.1476, Val Loss: 2.1422, F1 Score: 0.6000
Fold 1 Epoch 12/100, Loss: 2.1436, Val Loss: 2.1307, F1 Score: 0.5959
Fold 1 Epoch 13/100, Loss: 2.1042, Val Loss: 2.0826, F1 Score: 0.7008
Fold 1 Epoch 14/100, Loss: 2.0840, Val Loss: 2.0787, F1 Score: 0.7015
Fold 1 Epoch 

Fold 2 Epoch 18/100, Loss: 2.0760, Val Loss: 2.0766, F1 Score: 0.7028
Fold 2 Epoch 19/100, Loss: 2.0675, Val Loss: 2.0608, F1 Score: 0.7406
Fold 2 Epoch 20/100, Loss: 2.0592, Val Loss: 2.0592, F1 Score: 0.7395
Fold 2 Epoch 21/100, Loss: 2.0571, Val Loss: 2.0584, F1 Score: 0.7395
Fold 2 Epoch 22/100, Loss: 2.0550, Val Loss: 2.0571, F1 Score: 0.7398
Fold 2 Epoch 23/100, Loss: 2.0520, Val Loss: 2.0478, F1 Score: 0.7618
Fold 2 Epoch 24/100, Loss: 2.0460, Val Loss: 2.0454, F1 Score: 0.7602
Fold 2 Epoch 25/100, Loss: 2.0431, Val Loss: 2.0448, F1 Score: 0.7601
Fold 2 Epoch 26/100, Loss: 2.0423, Val Loss: 2.0446, F1 Score: 0.7604
Fold 2 Epoch 27/100, Loss: 2.0406, Val Loss: 2.0443, F1 Score: 0.7604
Fold 2 Epoch 28/100, Loss: 2.0403, Val Loss: 2.0442, F1 Score: 0.7600
Fold 2 Epoch 29/100, Loss: 2.0400, Val Loss: 2.0442, F1 Score: 0.7600
Fold 2 Epoch 30/100, Loss: 2.0408, Val Loss: 2.0442, F1 Score: 0.7601
Fold 2 Epoch 31/100, Loss: 2.0395, Val Loss: 2.0442, F1 Score: 0.7601
Fold 2 Epoch 32/100,

Fold 3 Epoch 35/100, Loss: 2.0314, Val Loss: 2.0242, F1 Score: 0.7890
Fold 3 Epoch 36/100, Loss: 2.0294, Val Loss: 2.0235, F1 Score: 0.7894
Fold 3 Epoch 37/100, Loss: 2.0287, Val Loss: 2.0234, F1 Score: 0.7900
Fold 3 Epoch 38/100, Loss: 2.0276, Val Loss: 2.0231, F1 Score: 0.7895
Fold 3 Epoch 39/100, Loss: 2.0272, Val Loss: 2.0230, F1 Score: 0.7895
Fold 3 Epoch 40/100, Loss: 2.0268, Val Loss: 2.0230, F1 Score: 0.7894
Fold 3 Epoch 41/100, Loss: 2.0261, Val Loss: 2.0229, F1 Score: 0.7898
Fold 3 Epoch 42/100, Loss: 2.0264, Val Loss: 2.0231, F1 Score: 0.7895
Fold 3 Epoch 43/100, Loss: 2.0268, Val Loss: 2.0230, F1 Score: 0.7896
Fold 3 Epoch 44/100, Loss: 2.0263, Val Loss: 2.0229, F1 Score: 0.7894
Fold 3 Epoch 45/100, Loss: 2.0253, Val Loss: 2.0233, F1 Score: 0.7896
Fold 3 Epoch 46/100, Loss: 2.0250, Val Loss: 2.0228, F1 Score: 0.7898
Fold 3 Epoch 47/100, Loss: 2.0260, Val Loss: 2.0228, F1 Score: 0.7896
Fold 3 Epoch 48/100, Loss: 2.0248, Val Loss: 2.0229, F1 Score: 0.7898
Fold 3 Epoch 49/100,

Fold 4 Epoch 52/100, Loss: 2.0248, Val Loss: 2.0272, F1 Score: 0.7846
Fold 4 Epoch 53/100, Loss: 2.0237, Val Loss: 2.0272, F1 Score: 0.7846
Fold 4 Epoch 54/100, Loss: 2.0236, Val Loss: 2.0271, F1 Score: 0.7847
Fold 4 Epoch 55/100, Loss: 2.0245, Val Loss: 2.0272, F1 Score: 0.7847
Fold 4 Epoch 56/100, Loss: 2.0236, Val Loss: 2.0271, F1 Score: 0.7847
Fold 4 Epoch 57/100, Loss: 2.0236, Val Loss: 2.0270, F1 Score: 0.7847
Fold 4 Epoch 58/100, Loss: 2.0244, Val Loss: 2.0271, F1 Score: 0.7847
Fold 4 Epoch 59/100, Loss: 2.0234, Val Loss: 2.0271, F1 Score: 0.7847
Fold 4 Epoch 60/100, Loss: 2.0244, Val Loss: 2.0270, F1 Score: 0.7848
Fold 4 Epoch 61/100, Loss: 2.0233, Val Loss: 2.0270, F1 Score: 0.7847
Fold 4 Epoch 62/100, Loss: 2.0242, Val Loss: 2.0270, F1 Score: 0.7848
Fold 4 Epoch 63/100, Loss: 2.0242, Val Loss: 2.0270, F1 Score: 0.7848
Fold 4 Epoch 64/100, Loss: 2.0232, Val Loss: 2.0270, F1 Score: 0.7847
Fold 4 Epoch 65/100, Loss: 2.0231, Val Loss: 2.0269, F1 Score: 0.7847
Fold 4 Epoch 66/100,

Fold 5 Epoch 69/100, Loss: 2.0246, Val Loss: 2.0255, F1 Score: 0.7873
Fold 5 Epoch 70/100, Loss: 2.0235, Val Loss: 2.0255, F1 Score: 0.7873
Fold 5 Epoch 71/100, Loss: 2.0234, Val Loss: 2.0255, F1 Score: 0.7873
Fold 5 Epoch 72/100, Loss: 2.0246, Val Loss: 2.0255, F1 Score: 0.7874
Fold 5 Epoch 73/100, Loss: 2.0233, Val Loss: 2.0255, F1 Score: 0.7874
Fold 5 Epoch 74/100, Loss: 2.0245, Val Loss: 2.0256, F1 Score: 0.7873
Fold 5 Epoch 75/100, Loss: 2.0234, Val Loss: 2.0255, F1 Score: 0.7874
Fold 5 Epoch 76/100, Loss: 2.0244, Val Loss: 2.0254, F1 Score: 0.7876
Fold 5 Epoch 77/100, Loss: 2.0235, Val Loss: 2.0253, F1 Score: 0.7876
Fold 5 Epoch 78/100, Loss: 2.0244, Val Loss: 2.0254, F1 Score: 0.7874
Fold 5 Epoch 79/100, Loss: 2.0234, Val Loss: 2.0254, F1 Score: 0.7874
Fold 5 Epoch 80/100, Loss: 2.0243, Val Loss: 2.0254, F1 Score: 0.7876
Fold 5 Epoch 81/100, Loss: 2.0233, Val Loss: 2.0254, F1 Score: 0.7874
Fold 5 Epoch 82/100, Loss: 2.0232, Val Loss: 2.0253, F1 Score: 0.7874
Fold 5 Epoch 83/100,

Fold 6 Epoch 86/100, Loss: 2.0165, Val Loss: 2.0182, F1 Score: 0.7977
Fold 6 Epoch 87/100, Loss: 2.0174, Val Loss: 2.0180, F1 Score: 0.7979
Fold 6 Epoch 88/100, Loss: 2.0163, Val Loss: 2.0180, F1 Score: 0.7979
Fold 6 Epoch 89/100, Loss: 2.0163, Val Loss: 2.0179, F1 Score: 0.7979
Fold 6 Epoch 90/100, Loss: 2.0173, Val Loss: 2.0179, F1 Score: 0.7979
Fold 6 Epoch 91/100, Loss: 2.0163, Val Loss: 2.0186, F1 Score: 0.7975
Fold 6 Epoch 92/100, Loss: 2.0165, Val Loss: 2.0181, F1 Score: 0.7979
Fold 6 Epoch 93/100, Loss: 2.0173, Val Loss: 2.0179, F1 Score: 0.7979
Fold 6 Epoch 94/100, Loss: 2.0162, Val Loss: 2.0180, F1 Score: 0.7978
Fold 6 Epoch 95/100, Loss: 2.0163, Val Loss: 2.0181, F1 Score: 0.7978
Fold 6 Epoch 96/100, Loss: 2.0162, Val Loss: 2.0181, F1 Score: 0.7978
Fold 6 Epoch 97/100, Loss: 2.0173, Val Loss: 2.0180, F1 Score: 0.7978
Fold 6 Epoch 98/100, Loss: 2.0162, Val Loss: 2.0180, F1 Score: 0.7979
Fold 6 Epoch 99/100, Loss: 2.0161, Val Loss: 2.0180, F1 Score: 0.7979
Fold 6 Epoch 100/100

Fold 8 Epoch 3/100, Loss: 2.2494, Val Loss: 2.2459, F1 Score: 0.4517
Fold 8 Epoch 4/100, Loss: 2.2478, Val Loss: 2.2453, F1 Score: 0.4448
Fold 8 Epoch 5/100, Loss: 2.2468, Val Loss: 2.2450, F1 Score: 0.4502
Fold 8 Epoch 6/100, Loss: 2.2463, Val Loss: 2.2435, F1 Score: 0.4471
Fold 8 Epoch 7/100, Loss: 2.2301, Val Loss: 2.1666, F1 Score: 0.5906
Fold 8 Epoch 8/100, Loss: 2.1568, Val Loss: 2.1452, F1 Score: 0.5930
Fold 8 Epoch 9/100, Loss: 2.1477, Val Loss: 2.1440, F1 Score: 0.5959
Fold 8 Epoch 10/100, Loss: 2.1438, Val Loss: 2.1349, F1 Score: 0.5940
Fold 8 Epoch 11/100, Loss: 2.1107, Val Loss: 2.0867, F1 Score: 0.6984
Fold 8 Epoch 12/100, Loss: 2.0836, Val Loss: 2.0815, F1 Score: 0.6998
Fold 8 Epoch 13/100, Loss: 2.0800, Val Loss: 2.0809, F1 Score: 0.6984
Fold 8 Epoch 14/100, Loss: 2.0798, Val Loss: 2.0806, F1 Score: 0.6984
Fold 8 Epoch 15/100, Loss: 2.0788, Val Loss: 2.0804, F1 Score: 0.6997
Fold 8 Epoch 16/100, Loss: 2.0785, Val Loss: 2.0804, F1 Score: 0.7003
Fold 8 Epoch 17/100, Loss: 

Fold 9 Epoch 21/100, Loss: 2.0786, Val Loss: 2.0731, F1 Score: 0.7094
Fold 9 Epoch 22/100, Loss: 2.0779, Val Loss: 2.0726, F1 Score: 0.7093
Fold 9 Epoch 23/100, Loss: 2.0778, Val Loss: 2.0716, F1 Score: 0.7100
Fold 9 Epoch 24/100, Loss: 2.0758, Val Loss: 2.0676, F1 Score: 0.7102
Fold 9 Epoch 25/100, Loss: 2.0666, Val Loss: 2.0492, F1 Score: 0.7510
Fold 9 Epoch 26/100, Loss: 2.0584, Val Loss: 2.0478, F1 Score: 0.7489
Fold 9 Epoch 27/100, Loss: 2.0566, Val Loss: 2.0474, F1 Score: 0.7495
Fold 9 Epoch 28/100, Loss: 2.0559, Val Loss: 2.0471, F1 Score: 0.7500
Fold 9 Epoch 29/100, Loss: 2.0557, Val Loss: 2.0469, F1 Score: 0.7501
Fold 9 Epoch 30/100, Loss: 2.0546, Val Loss: 2.0464, F1 Score: 0.7499
Fold 9 Epoch 31/100, Loss: 2.0538, Val Loss: 2.0442, F1 Score: 0.7491
Fold 9 Epoch 32/100, Loss: 2.0496, Val Loss: 2.0355, F1 Score: 0.7728
Fold 9 Epoch 33/100, Loss: 2.0454, Val Loss: 2.0337, F1 Score: 0.7729
Fold 9 Epoch 34/100, Loss: 2.0436, Val Loss: 2.0332, F1 Score: 0.7725
Fold 9 Epoch 35/100,

Fold 10 Epoch 38/100, Loss: 2.0535, Val Loss: 2.0519, F1 Score: 0.7435
Fold 10 Epoch 39/100, Loss: 2.0540, Val Loss: 2.0520, F1 Score: 0.7433
Fold 10 Epoch 40/100, Loss: 2.0534, Val Loss: 2.0518, F1 Score: 0.7438
Fold 10 Epoch 41/100, Loss: 2.0539, Val Loss: 2.0519, F1 Score: 0.7436
Fold 10 Epoch 42/100, Loss: 2.0534, Val Loss: 2.0519, F1 Score: 0.7448
Fold 10 Epoch 43/100, Loss: 2.0533, Val Loss: 2.0517, F1 Score: 0.7434
Fold 10 Epoch 44/100, Loss: 2.0532, Val Loss: 2.0517, F1 Score: 0.7436
Fold 10 Epoch 45/100, Loss: 2.0531, Val Loss: 2.0516, F1 Score: 0.7436
Fold 10 Epoch 46/100, Loss: 2.0531, Val Loss: 2.0516, F1 Score: 0.7444
Fold 10 Epoch 47/100, Loss: 2.0533, Val Loss: 2.0507, F1 Score: 0.7446
Fold 10 Epoch 48/100, Loss: 2.0486, Val Loss: 2.0395, F1 Score: 0.7665
Fold 10 Epoch 49/100, Loss: 2.0433, Val Loss: 2.0372, F1 Score: 0.7674
Fold 10 Epoch 50/100, Loss: 2.0412, Val Loss: 2.0365, F1 Score: 0.7679
Fold 10 Epoch 51/100, Loss: 2.0396, Val Loss: 2.0346, F1 Score: 0.7675
Fold 1

In [19]:
from sklearn.metrics import precision_recall_fscore_support

all_predictions = []
all_labels = []
with torch.no_grad():
    for X_val_batch, y_val_batch in val_loader:
        outputs = best_model(X_val_batch)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_val_batch.cpu().numpy())

# Calculate macro precision, recall, and F1 score, as well as per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(
    all_labels, all_predictions, average=None, labels=range(num_classes)
)

avg_precision, avg_recall, avg_f1, avg_support = precision_recall_fscore_support(
    all_labels, all_predictions, average='micro'
)

# Calculate macro-averaged metrics (ignoring class imbalance)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    all_labels, all_predictions, average='macro'
)

weighted_precision, weighted_recall, weighted_f1, weighted_support = precision_recall_fscore_support(
    all_labels, all_predictions, average='weighted'
)

# Print macro-averaged metrics
print(f"Precision: {avg_precision}")
print(f"Recall: {avg_recall}")
print(f"F1 Score: {avg_f1}")

print(f"Macro Precision: {macro_precision}")
print(f"Macro Recall: {macro_recall}")
print(f"Macro F1 Score: {macro_f1}")

print(f"Weighted Precision: {weighted_precision}")
print(f"Weighted Recall: {weighted_recall}")
print(f"Weighted F1 Score: {weighted_f1}")

Precision: 0.8014970490859363
Recall: 0.8014970490859363
F1 Score: 0.8014970490859363
Macro Precision: 0.5427454283262227
Macro Recall: 0.43059994325143236
Macro F1 Score: 0.468465276830728
Weighted Precision: 0.8735935060920231
Weighted Recall: 0.8014970490859363
Weighted F1 Score: 0.8011653461489525


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
class_names = label_encoder.classes_

for i, class_name in enumerate(class_names):
    print(f"Class '{class_name}': Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1[i]}, Support: {support[i]}")

Class 'aminoglycoside': Precision: 0.9941860465116279, Recall: 0.684, F1 Score: 0.8104265402843601, Support: 250
Class 'bacitracin': Precision: 0.6160290421669925, Recall: 1.0, F1 Score: 0.7623984793502678, Support: 2206
Class 'beta_lactam': Precision: 1.0, Recall: 0.785109228711547, F1 Score: 0.8796203796203796, Support: 2243
Class 'chloramphenicol': Precision: 1.0, Recall: 0.6289308176100629, F1 Score: 0.7722007722007722, Support: 159
Class 'fosfomycin': Precision: 1.0, Recall: 0.8333333333333334, F1 Score: 0.9090909090909091, Support: 60
Class 'fosmidomycin': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 1
Class 'glycopeptide': Precision: 0.0, Recall: 0.0, F1 Score: 0.0, Support: 13
Class 'macrolide-lincosamide-streptogramin': Precision: 1.0, Recall: 0.4573170731707317, F1 Score: 0.6276150627615064, Support: 984
Class 'multidrug': Precision: 0.990990990990991, Recall: 0.8270676691729323, F1 Score: 0.901639344262295, Support: 133
Class 'mupirocin': Precision: 0.0, Recall: 0.0,

In [21]:
model_path = "models/best_sr_model.pth"

# Save the trained model
torch.save(best_model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to models/best_sr_model.pth


In [53]:
output_path = "results/sr_model_predictions.csv"

# After evaluating the best model on the holdout set and collecting predictions
all_predictions = []
all_labels = []
with torch.no_grad():
    for X_test_batch, y_test_batch in test_loader:
        outputs = best_model(X_test_batch)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_test_batch.cpu().numpy())

# Decode labels and predictions to their original class names
true_labels = label_encoder.inverse_transform(all_labels)
predicted_labels = label_encoder.inverse_transform(all_predictions)

# Include the ID (index) from y_val
ids = range(len(y_val))  # Assuming y_val is the original holdout labels array
print(len(y_val))
print(len(all_labels))
print(len(all_predictions))

# Create a DataFrame to store the outputs
outputs_df = pd.DataFrame({
    "ID": ids,
    "True Label": true_labels,
    "Predicted Label": predicted_labels
})

# Save the outputs to a CSV file
outputs_df.to_csv(output_path, index=False)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_addmm)

In [29]:
val_loader

<torch.utils.data.dataloader.DataLoader at 0x7f3e60312150>