In [None]:
# !pip install biopython
# Environment setup: install the third-party packages this notebook needs.
# NOTE(review): none of these installs pins a version, so a fresh run may pull
# different releases than the ones the notebook was developed with.
# Upgrade Biopython to the latest release, bypassing pip's cache.
!pip install --upgrade --no-cache-dir biopython
!pip install rdkit-pypi
# torch-scatter / torch-sparse are fetched from the PyG wheel index for
# torch 2.2.0 + CUDA 11.8 (see the -f URL); -q keeps pip's output quiet.
# NOTE(review): presumably this must match the installed torch/CUDA build — confirm.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+cu118.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.2.0+cu118.html
!pip install -q torch-geometric
# fair-esm provides the `esm` module imported below.
!pip install fair-esm


In [None]:
import pickle

import esm
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
from joblib import Parallel, delayed
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, Subset, random_split
from torch_geometric.data import Batch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, global_mean_pool
from tqdm import tqdm




In [None]:
# Load the pIC50 targets (used later as the regression labels of the dataset).
pIC50=np.load('/kaggle/input/drug-virus-features/pIC50.npy')
# Load the pickled drug graphs; each entry is later unpacked as
# (mol_size, nodes, edges, edges_type) by drug_graph_to_data.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only load this file if its origin is trusted.
with open("/kaggle/input/drug-virus-features/drug_graphs.pkl", "rb") as f:
    drug_graphs = pickle.load(f)



In [None]:

def esm_model(model, alphabet, seq):
    """Run a single sequence through an ESM-style model and return its contacts.

    Args:
        model: Callable invoked as
            ``model(tokens, repr_layers=[16], return_contacts=True)`` that
            returns a mapping with a ``"contacts"`` entry.
        alphabet: Object whose ``get_batch_converter()`` returns the tokenizer.
        seq: Sequence string; it is tokenized as a one-item batch labelled
            ``"protein"``.

    Returns:
        The ``"contacts"`` entry of the model output, returned unchanged
        (assumed to be shaped ``[1, L, L]`` — TODO confirm against the model).
    """
    tokenize = alphabet.get_batch_converter()
    _labels, _strs, tokens = tokenize([("protein", seq)])

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(tokens, repr_layers=[16], return_contacts=True)

    return output["contacts"]

In [None]:
def split_sequence(seq, window_size=1000, stride=500):
    """Cut ``seq`` into (possibly overlapping) windows.

    Windows start every ``stride`` positions and span at most ``window_size``
    characters. Generation stops as soon as a window reaches the end of the
    sequence, or when a window would hold fewer than two characters.

    Args:
        seq: Sequence to split.
        window_size: Maximum length of each window.
        stride: Distance between consecutive window starts.

    Returns:
        List of ``(start_offset, window_text)`` tuples in ascending offset order.
    """
    total = len(seq)
    pieces = []
    for offset in range(0, total, stride):
        stop = offset + window_size
        if stop > total:
            stop = total
        if stop - offset < 2:
            # Fragment too short to be useful; nothing further is produced.
            return pieces
        pieces.append((offset, seq[offset:stop]))
        if stop == total:
            # The end of the sequence has been covered.
            return pieces
    return pieces

In [None]:
def protein_graph(model, alphabet, seq, threshold=0.5, window_size=1000, stride=500):
    """Build a residue-level contact graph for one protein sequence.

    The sequence is cut into windows with ``split_sequence``; each window is
    scored with ``esm_model`` and every residue pair whose predicted contact
    probability exceeds ``threshold`` becomes a directed edge. Because windows
    overlap whenever ``stride < window_size``, the same pair can be predicted
    by more than one window; such pairs are emitted once, keeping the highest
    probability seen.

    Args:
        model: ESM-style model forwarded to ``esm_model``.
        alphabet: Alphabet forwarded to ``esm_model``.
        seq: Amino-acid sequence string.
        threshold: Minimum (exclusive) contact probability for an edge. The
            comparison is done on the tensor, i.e. in the contact map's dtype.
        window_size: Maximum window length passed to ``split_sequence``.
        stride: Window step passed to ``split_sequence``.

    Returns:
        Tuple ``(node_features, edge_index, edge_attr)`` where
        ``node_features`` is a ``[len(seq), 20]`` one-hot tensor,
        ``edge_index`` is a list of ``[i, j]`` residue index pairs and
        ``edge_attr`` is the list of matching probabilities (Python floats).
    """
    aa_dict = {aa: i for i, aa in enumerate("ACDEFGHIKLMNPQRSTVWY")}
    L = len(seq)

    # One-hot node features for the full sequence. Characters outside the
    # 20-letter alphabet fall back to index 0 (i.e. they are encoded like "A").
    node_features = torch.eye(20)[[aa_dict.get(aa, 0) for aa in seq]]  # [L, 20]

    # (global_i, global_j) -> best probability; dict insertion order preserves
    # the window-by-window, row-major order in which pairs are first seen.
    best_prob = {}

    for start_idx, subseq in split_sequence(seq, window_size, stride):
        contact_map = esm_model(model, alphabet, subseq)[0]  # [L_window, L_window]
        L_win = len(subseq)
        window_map = contact_map[:L_win, :L_win]

        # Vectorised replacement for the per-cell Python loop; torch.nonzero
        # returns indices in row-major order.
        hits = torch.nonzero(window_map > threshold)
        if hits.numel() == 0:
            continue
        probs = window_map[hits[:, 0], hits[:, 1]].tolist()

        for (i, j), prob in zip(hits.tolist(), probs):
            global_i = start_idx + i
            global_j = start_idx + j
            if global_i >= L or global_j >= L:
                continue
            key = (global_i, global_j)
            if prob > best_prob.get(key, float("-inf")):
                best_prob[key] = prob

    edge_index = [[i, j] for i, j in best_prob]
    edge_attr = list(best_prob.values())

    return node_features, edge_index, edge_attr

In [None]:
# Read the virus–drug interaction table. Later cells use its
# 'Protein_Sequence' and 'Virus_Organism' columns.
df = pd.read_csv("/kaggle/input/virus-drug/virus_drug_interactions.csv")

In [None]:
# Load the `esm1b_t33_650M_UR50S` pretrained model together with its alphabet
# (the alphabet supplies the batch converter used by esm_model).
# NOTE(review): the name `model` is rebound to the DrugTargetGNN in the
# training cell further down, so re-running the protein-graph cells after
# training would no longer see the ESM model.
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
# Switch the module to evaluation mode before inference.
model.eval()
# One protein sequence per row of the interaction table.
protein_sequences = df['Protein_Sequence']

In [None]:

def compute_graph(protein):
    """Build the contact graph for one protein sequence.

    Thin wrapper around ``protein_graph`` that reads the module-level
    ``model`` and ``alphabet`` so it can be called with a single argument.

    Args:
        protein: Amino-acid sequence string.

    Returns:
        The ``(node_features, edge_index, edge_attr)`` tuple from ``protein_graph``.
    """
    return protein_graph(model, alphabet, protein)

# Run with 4 joblib workers (n_jobs=4), one task per protein sequence.
# NOTE(review): every task needs the global ESM `model`; depending on the
# joblib backend this may copy the model into each worker — confirm memory use.
protein_graphs = Parallel(n_jobs=4)(
    delayed(compute_graph)(protein) for protein in tqdm(protein_sequences, desc="Processing proteins")
)

# Number of graphs, and the length of the first result tuple
# (node_features, edge_index, edge_attr).
print(len(protein_graphs), len(protein_graphs[0]))


In [None]:
# 70 / 15 / 15 train / validation / test split over sample positions.
# NOTE(review): `protein_features` is not assigned anywhere in the visible part
# of this notebook (only `protein_graphs` is) — confirm where it comes from,
# otherwise this cell fails on a fresh kernel.
total_size = len(protein_features)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
# The test split takes the remainder so the three sizes always sum to total_size.
test_size = total_size - train_size - val_size

all_indices = list(range(total_size))
# random_split returns three Subset objects over `all_indices`; the fixed
# generator seed makes the split reproducible.
train_indices, val_indices, test_indices = random_split(
    all_indices, [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42)
)


In [None]:
# Fit the PCA on the training rows only, so that validation/test samples do not
# leak into the projection.
# `train_indices` is a Subset produced by random_split; materialise it as an
# explicit integer array instead of relying on implicit sequence coercion
# when it is used as an index.
train_rows = np.asarray(list(train_indices), dtype=np.int64)
train_protein = protein_features[train_rows]

# random_state pins any randomised SVD solver so the 50-component projection is
# reproducible (same seed as the data split).
pca = PCA(n_components=50, random_state=42)
pca.fit(train_protein)

# Project every sample (train, validation and test) with the train-fitted PCA.
protein_pca = pca.transform(protein_features)
protein_pca = torch.tensor(protein_pca, dtype=torch.float32)



In [None]:
def drug_graph_to_data(drug_graph):
    """Convert one stored drug graph into a PyG ``Data`` object.

    Args:
        drug_graph: 4-tuple ``(mol_size, nodes, edges, edges_type)``. ``nodes``
            holds per-node feature rows, ``edges`` is a list of ``[src, dst]``
            pairs and ``edges_type`` has one value per edge. ``mol_size`` is
            unpacked but not used.

    Returns:
        ``Data`` with ``x`` of shape ``[num_nodes, node_features]``,
        ``edge_index`` of shape ``[2, num_edges]`` and ``edge_attr`` of shape
        ``[num_edges, 1]``. A graph without edges yields ``edge_index`` of
        shape ``[2, 0]``.
    """
    mol_size, nodes, edges, edges_type = drug_graph
    x = torch.tensor(nodes, dtype=torch.float)  # [num_nodes, node_features]

    # reshape(-1, 2) is a no-op for a regular [num_edges, 2] list, but turns an
    # empty edge list (shape [0]) into [0, 2] so the transpose is a valid [2, 0].
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()  # [2, num_edges]
    edge_attr = torch.tensor(edges_type, dtype=torch.float).unsqueeze(1)  # [num_edges, 1]

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    return data

In [None]:
def protein_graph_to_data(protein_graph):
    """Convert a ``(node_features, edge_index, edge_attr)`` tuple into a PyG ``Data``.

    Args:
        protein_graph: 3-tuple whose first item is the node-feature tensor
            (used as-is), second item a list of ``[src, dst]`` pairs and third
            item a list with one value per edge.

    Returns:
        ``Data`` with ``edge_index`` of shape ``[2, num_edges]`` and
        ``edge_attr`` of shape ``[num_edges, 1]``. A graph without edges
        yields ``edge_index`` of shape ``[2, 0]``.
    """
    node_features, edge_index, edge_attr = protein_graph
    x = node_features

    # reshape(-1, 2) leaves a regular [num_edges, 2] list untouched, but makes an
    # empty edge list (shape [0]) transpose into a valid [2, 0] tensor.
    edge_index = torch.tensor(edge_index, dtype=torch.long).reshape(-1, 2).t().contiguous()  # [2, num_edges]
    edge_attr = torch.tensor(edge_attr, dtype=torch.float).unsqueeze(1)  # [num_edges, 1]

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    return data

In [None]:
class DrugProteinDataset(torch.utils.data.Dataset):
    """Index-aligned dataset of (protein features, drug graph, pIC50 value).

    The three containers are indexed with the same position, so sample ``i``
    pairs ``protein_features[i]`` with ``drug_graphs[i]`` and ``pIC50_values[i]``.

    Args:
        protein_features: Indexable container of per-sample protein feature
            vectors (tensor or array-like).
        drug_graphs: Indexable container of raw drug graphs accepted by
            ``drug_graph_to_data``.
        pIC50_values: Indexable container of scalar targets; its length
            defines the dataset length.
    """

    def __init__(self, protein_features, drug_graphs, pIC50_values):
        self.protein_features = protein_features
        self.drug_graphs = drug_graphs
        self.pIC50_values = pIC50_values

    def __len__(self):
        """Return the number of samples (length of the target container)."""
        return len(self.pIC50_values)

    def __getitem__(self, idx):
        """Return ``(protein_feature, drug_graph, pIC50_value)`` for sample ``idx``."""
        # as_tensor converts array-likes and scalars, but does not copy-construct
        # from an existing tensor (torch.tensor(tensor) warns on every sample).
        protein_feature = torch.as_tensor(self.protein_features[idx], dtype=torch.float)
        drug_graph = drug_graph_to_data(self.drug_graphs[idx])
        pIC50_value = torch.as_tensor(self.pIC50_values[idx], dtype=torch.float)
        return protein_feature, drug_graph, pIC50_value

def custom_collate(batch):
    """Collate ``(protein_feature, drug_graph, label)`` samples into one batch.

    Args:
        batch: Sequence of samples; items 0, 1 and 2 of each sample are the
            protein feature tensor, the drug graph and the label tensor.

    Returns:
        Tuple ``(protein_feats, batch_drug_graphs, labels)``: stacked protein
        features, the graphs merged by ``Batch.from_data_list``, and the
        stacked labels.
    """
    proteins, graphs, targets = [], [], []
    for sample in batch:
        proteins.append(sample[0])
        graphs.append(sample[1])
        targets.append(sample[2])

    protein_feats = torch.stack(proteins)  # [batch_size, protein_feature_dim]
    labels = torch.stack(targets)          # [batch_size]

    # Merge the individual graphs into a single batched graph object.
    batch_drug_graphs = Batch.from_data_list(graphs)

    return protein_feats, batch_drug_graphs, labels


In [None]:
class DrugTargetGNN(nn.Module):
    """Two-branch regressor: a GCN over the drug graph plus an MLP over protein features.

    The drug branch applies three ``GCNConv`` layers, mean-pools nodes per
    graph and projects to ``hidden_dim``. The protein branch is a three-layer
    MLP ending in ``hidden_dim``. Both embeddings are concatenated and fed to
    a final MLP with a single regression output.

    Args:
        node_feature_dim: Number of features per drug-graph node.
        protein_feature_dim: Length of the protein feature vector.
        hidden_dim: Width of each branch embedding.
    """

    def __init__(self, node_feature_dim=78, protein_feature_dim=50, hidden_dim=128):
        super().__init__()
        # GNN layers for drug graph
        self.conv1 = GCNConv(node_feature_dim, node_feature_dim)
        self.conv2 = GCNConv(node_feature_dim, node_feature_dim*2)
        self.conv3 = GCNConv(node_feature_dim*2, node_feature_dim*4)

        # Projects the pooled graph embedding down to hidden_dim
        self.lineargraph = nn.Linear(node_feature_dim*4, hidden_dim)

        # MLP for protein features
        self.protein_mlp = nn.Sequential(
            nn.Linear(protein_feature_dim, protein_feature_dim*2),
            nn.ReLU(),
            nn.Linear(protein_feature_dim*2, protein_feature_dim*4),
            nn.ReLU(),
            nn.Linear(protein_feature_dim*4, hidden_dim)
        )

        # Final layers for combined features
        self.final_mlp = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Linear(hidden_dim//2, 1)  # regression output for pIC50
        )

    def forward(self, protein_feat, drug_graph):
        """Predict one value per (protein, drug) pair.

        Args:
            protein_feat: Tensor of shape ``[batch_size, protein_feature_dim]``.
            drug_graph: Batched graph exposing ``x``, ``edge_index``,
                ``edge_attr`` and ``batch``.

        Returns:
            Tensor of shape ``[batch_size]`` (also for a batch of one sample).
        """
        # GNN on drug graph; edge_attr is passed as the third positional
        # argument of each GCNConv call.
        x, edge_index, edge_attr = drug_graph.x, drug_graph.edge_index, drug_graph.edge_attr
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x = F.relu(self.conv2(x, edge_index, edge_attr))
        x = F.relu(self.conv3(x, edge_index, edge_attr))

        x = global_mean_pool(x, drug_graph.batch)  # [batch_size, node_feature_dim*4]
        x = F.relu(self.lineargraph(x))            # [batch_size, hidden_dim]

        # Protein feature embedding
        p = self.protein_mlp(protein_feat)  # [batch_size, hidden_dim]

        # Combine embeddings
        combined = torch.cat([x, p], dim=1)
        out = self.final_mlp(combined)  # [batch_size, 1]
        # Drop only the trailing singleton dimension: a bare squeeze() would
        # also collapse a batch of size 1 into a 0-dim scalar.
        return out.squeeze(-1)  # [batch_size]

In [None]:

# Full dataset: PCA-reduced protein features paired position-wise with the
# drug graphs and pIC50 targets.
dataset = DrugProteinDataset(protein_pca, drug_graphs, pIC50)

# Restrict the dataset to the train / validation / test positions produced by
# random_split earlier in the notebook.
train_dataset = Subset(dataset, train_indices)
val_dataset = Subset(dataset, val_indices)
test_dataset = Subset(dataset, test_indices)


# Only the training loader shuffles; custom_collate merges the per-sample drug
# graphs into a single batched graph.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=custom_collate)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate)

In [None]:
# dataset = DrugProteinDataset(protein_features, drug_graphs, pIC50)

# # Step 1: Load the original CSV and get the row indices of the top 3 viruses
# df = pd.read_csv('/kaggle/input/virus-drug/virus_drug_interactions.csv')
# top_3_viruses = df['Virus_Organism'].value_counts().head(3).index
# top_3_indices = df[df['Virus_Organism'].isin(top_3_viruses)].index.tolist()

# # Step 2: Get remaining indices
# all_indices = set(range(len(df)))
# remaining_indices = list(all_indices - set(top_3_indices))

# # Step 3: Split remaining into val and test (50/50)
# remaining_size = len(remaining_indices)
# val_size = remaining_size // 2
# test_size = remaining_size - val_size  # ensures no rounding errors

# val_indices, test_indices = random_split(
#     remaining_indices,
#     [val_size, test_size],
#     generator=torch.Generator().manual_seed(42)
# )

# # Step 4: Build subsets
# train_dataset = Subset(dataset, top_10_indices)
# val_dataset = Subset(dataset, val_indices)
# test_dataset = Subset(dataset, test_indices)

# # Step 5: Build DataLoaders
# train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=custom_collate)
# val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate)
# test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate)

In [None]:
# Train the drug–target regressor for 20 epochs with Adam and an MSE loss.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): this rebinds the global `model`, which previously held the ESM
# model used by compute_graph; the protein-graph cells cannot be re-run after
# this cell without reloading ESM.
model = DrugTargetGNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()  # for regression

model.train()
for epoch in range(20):
    # Sum of per-batch losses for this epoch.
    total_loss = 0
    for protein_feat, drug_graph, values in train_loader:
        # Move the whole batch to the training device.
        protein_feat = protein_feat.to(device)
        drug_graph = drug_graph.to(device)
        values = values.to(device)

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(protein_feat, drug_graph)
        loss = criterion(outputs, values)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the batch losses (every batch weighted equally, whatever its size).
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

In [None]:

def evaluate(model, dataloader, device):
    """Compute regression metrics of ``model`` over every batch of ``dataloader``.

    The model is switched to evaluation mode (and left in it) and run without
    gradient tracking.

    Args:
        model: Callable ``model(protein_feats, drug_graphs)`` with an ``eval()`` method.
        dataloader: Iterable of ``(protein_feats, drug_graphs, values)`` batches;
            each element must support ``.to(device)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Dict with keys ``"MSE"``, ``"RMSE"`` and ``"Pearson"``.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for protein_feats, drug_graphs, values in dataloader:
            protein_feats = protein_feats.to(device)
            drug_graphs = drug_graphs.to(device)
            values = values.to(device)

            outputs = model(protein_feats, drug_graphs)
            # Flatten to 1-D: a batch holding a single sample can come back as a
            # 0-dim tensor, which torch.cat refuses to concatenate.
            all_preds.append(outputs.cpu().reshape(-1))
            all_labels.append(values.cpu().reshape(-1))

    preds = torch.cat(all_preds).numpy()
    values = torch.cat(all_labels).numpy()

    mse = mean_squared_error(values, preds)
    rmse = mse ** 0.5
    pearson_corr, _ = pearsonr(values, preds)

    return {
        "MSE": mse,
        "RMSE": rmse,
        "Pearson": pearson_corr
    }


In [None]:
# Evaluate the trained model on the validation split.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# evaluate() returns a dict with MSE, RMSE and Pearson correlation.
val_metrics = evaluate(model, val_loader, device)
print("Validation Metrics:", val_metrics)

In [None]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/kaggle/input/virus-drug/virus_drug_interactions.csv')

# How many of the most frequent Virus_Organism values to keep.
TOP_N_VIRUSES = 3

# Count the number of occurrences for each unique Virus_Organism
virus_counts = df['Virus_Organism'].value_counts()

# Most frequent Virus_Organisms (value_counts sorts by descending frequency)
top_3_viruses = virus_counts.head(TOP_N_VIRUSES).index

# Row indices (in the original DataFrame) that belong to those viruses
top_3_indices = df[df['Virus_Organism'].isin(top_3_viruses)].index.tolist()

# Backward-compatible aliases: earlier revisions of this cell exposed the same
# top-3 selection under "top_10" names.
top_10_viruses = top_3_viruses
top_10_indices = top_3_indices

# Report how many rows were selected
print(f"Number of rows belonging to the top {TOP_N_VIRUSES} Virus_Organisms in the original file:")
print(len(top_3_indices))


In [None]:
# Number of rows belonging to the three most frequent viruses.
# NOTE(review): `top_3_indices` must already be defined by an earlier cell —
# verify on a fresh kernel (Restart & Run All).
print(len(top_3_indices))