In [None]:
! conda update -c bioconda diamond

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from Bio import SeqIO
import subprocess



ModuleNotFoundError: No module named 'pandas'

In [42]:
def run_diamond_command(diamond_command):
    try:
        result = subprocess.run(diamond_command, capture_output=True, text=True, check=True)
        print("DIAMOND output:", result.stdout)
        print("DIAMOND errors:", result.stderr)
    except subprocess.CalledProcessError as e:
        print("DIAMOND failed:", e.stderr)
        raise

def generate_feature_matrix(sequences, reference_db_path):
    with open("temp_sequences.fasta", "w") as f:
        for i, seq in enumerate(sequences):
            f.write(f">seq_{i}\n{seq}\n")
    
    # DIAMOND command setup
    diamond_output_file = "temp_diamond_output.tsv"
    diamond_command = [
        "diamond", "blastx", 
        "-q", "temp_sequences.fasta", 
        "-d", reference_db_path, 
        "-o", diamond_output_file, 
        "--outfmt", "6",  # Tabular format
        "--evalue", "1e-10",
        "--max-target-seqs", "500",
        "--threads", "1"
    ]
    
    # Run DIAMOND command and capture output
    run_diamond_command(diamond_command)
    
    # Parse DIAMOND output
    scores_dict = {}
    with open(diamond_output_file) as f:
        for line in f:
            query_id, subject_id, identity, alignment_length, mismatches, gap_opens, q_start, q_end, s_start, s_end, evalue, bit_score = line.strip().split()
            if query_id not in scores_dict:
                scores_dict[query_id] = []
            scores_dict[query_id].append(float(bit_score))
    
    # Convert scores_dict to a feature matrix
    all_scores = []
    for i in range(len(sequences)):
        query_id = f"seq_{i}"
        scores = scores_dict.get(query_id, [])
        all_scores.append(scores)
    
    # Pad with zeros and normalize
    max_hits = max(len(scores) for scores in all_scores)
    feature_matrix = np.array([np.pad(scores, (0, max_hits - len(scores)), 'constant') for scores in all_scores])
    scaler = MinMaxScaler()
    normalized_features = scaler.fit_transform(feature_matrix)
    
    # Clean up temporary files
    subprocess.run(["rm", "temp_sequences.fasta", "temp_diamond_output.tsv"])
    
    return normalized_features

In [3]:
class DeepARGModel(nn.Module):
    def __init__(self, input_dim, output_dim=30):
        super(DeepARGModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 2000)
        self.fc2 = nn.Linear(2000, 1000)
        self.fc3 = nn.Linear(1000, 500)
        self.fc4 = nn.Linear(500, 100)
        self.output = nn.Linear(100, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = self.dropout(x)
        x = torch.relu(self.fc4(x))
        x = self.dropout(x)
        x = self.output(x)
        return torch.softmax(x, dim=1)

In [25]:
def fasta_to_dataframe(fasta_file):
    records = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        records.append({"id": record.id.split('|')[0]
                        , "db": record.id.split('|')[2]
                        , "type": record.id.split('|')[3]
                        , "sequence": str(record.seq)})
    return pd.DataFrame(records)

data = fasta_to_dataframe("../data/database/v1/features.fasta")

In [30]:
uniprot_data = data[data['db'] == 'UNIPROT']
card_ardb_data = data[data['db'].isin(['CARD', 'ARDB'])]

train_df, val_df = train_test_split(uniprot_data, test_size=0.3, random_state=42)

# Write CARD and ARDB sequences to a FASTA file to create a DIAMOND database
with open("card_ardb_reference.fasta", "w") as f:
    for i, row in card_ardb_data.iterrows():
        f.write(f">{row['type']}_{i}\n{row['sequence']}\n")

# Create DIAMOND database for CARD and ARDB
subprocess.run(["diamond", "makedb", "--in", "card_ardb_reference.fasta", "-d", "card_ardb_db"])

CompletedProcess(args=['diamond', 'makedb', '--in', 'card_ardb_reference.fasta', '-d', 'card_ardb_db'], returncode=-9)

In [36]:
reference_db_path = "card_ardb_db"  # DIAMOND database path
X_train = generate_feature_matrix(train_df['sequence'], reference_db_path)
X_val = generate_feature_matrix(val_df['sequence'], reference_db_path)

DIAMOND failed: 


CalledProcessError: Command '['diamond', 'blastx', '-q', 'temp_sequences.fasta', '-d', 'card_ardb_db', '-o', 'temp_diamond_output.tsv', '--outfmt', '6', '--evalue', '1e-10', '--max-target-seqs', '1000', '--threads', '1']' died with <Signals.SIGKILL: 9>.

In [43]:
sampled_sequences = train_df['sequence'].values[:10]
X_train_sample = generate_feature_matrix(sampled_sequences, reference_db_path)


DIAMOND failed: 


CalledProcessError: Command '['diamond', 'blastx', '-q', 'temp_sequences.fasta', '-d', 'card_ardb_db', '-o', 'temp_diamond_output.tsv', '--outfmt', '6', '--evalue', '1e-10', '--max-target-seqs', '500', '--threads', '1']' died with <Signals.SIGKILL: 9>.

In [41]:
! diamond --version


In [None]:
label_mapping = {label: idx for idx, label in enumerate(train_df['label'].unique())}
y_train = train_df['label'].map(label_mapping).values
y_val = val_df['label'].map(label_mapping).values

# Prepare Torch datasets and dataloaders
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
input_dim = X_train.shape[1]
model = DeepARGModel(input_dim=input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
epochs = 20
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            outputs = model(X_val_batch)
            loss = criterion(outputs, y_val_batch)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_val_batch.size(0)
            correct += (predicted == y_val_batch).sum().item()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}, Accuracy: {100 * correct / total:.2f}%")y
