In [None]:
import numpy as np
from collections import Counter
import pandas as pd 
from sklearn.cluster import KMeans
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster
from tqdm.notebook import tqdm
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,roc_auc_score
import gc
import joblib
from joblib import Parallel, delayed
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import os
import itertools
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import roc_curve, auc
import itertools
import seaborn as sns


# Kaggle input locations of the cleaned train/test CSV files.
# NOTE(review): "gasaid" presumably refers to the GISAID database — confirm.
file_name_ncbi_datas="/kaggle/input/ncbi-data-csv/ncbi_cleaned_train_data.csv"
file_name_gasaid_datas="/kaggle/input/ncbi-data-csv/gasaid_cleaned_train_data.csv"
ncbi_test_file_name="/kaggle/input/ncbi-data-csv/ncbi_cleaned_test_data.csv"
gasaid_test_file_name="/kaggle/input/ncbi-data-csv/gasaid_cleaned_test_data.csv"
covid_test="/kaggle/input/ncbi-data-csv/test_covid.csv"

# Additional test CSVs, one per historical influenza pandemic (named in the file names).
Asian_flu_test="/kaggle/input/testing-pandemics-csv/Asian Flu (1957-1958).csv"
hong_kong_flu_test="/kaggle/input/testing-pandemics-csv/Hong Kong (1968-1970).csv"
spanish_flu_test="/kaggle/input/testing-pandemics-csv/Spanish Flu (1918-1920).csv"
pdmh1n1_flu_test="/kaggle/input/testing-pandemics-csv/Swine Flu (2009-2010).csv"

In [None]:
from __future__ import print_function  # future statements must be the first statement of the cell

import argparse
from collections import defaultdict
from itertools import chain

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils
from torch.autograd import Variable
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from tqdm import tqdm
# from gensim.models import Word2Vec,KeyedVectors
from gensim.models import FastText,KeyedVectors


# Seed the global PyTorch and NumPy random number generators.
torch.manual_seed(42)
np.random.seed(42)
# Device that the model and every batch are moved to by the cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Auto-detect GPU


In [None]:

# need validation
# N is read by ASW() as the target number of subsequences per sequence.
N=193
# NOTE(review): the SG_ prefix presumably stands for the skip-gram FastText embedding — confirm.
SG_EMBEDD_SIZE=30
SG_WINDOW=5
# Transformer Parameters
# torch's multi-head attention requires the model width (EMBEDDING_SIZE) to be divisible by N_HEAD.
N_HEAD = 5         # Number of attention heads
ENCODER_N_LAYERS = 2       # Number of transformer layers
EMBEDDING_SIZE=SG_EMBEDD_SIZE  # passed to GatedAttention as the transformer d_model
INTERMIDIATE_DIM=512  # hidden width of the gated-attention layers in GatedAttention
BATCH_SIZE=32  # number of viruses (outer bags) per DataLoader batch

# Subsequence length; the name matches ASW()'s `l_sub` argument.
l_sub=10

In [None]:
def ASW(sequence, l_sub, n=None):
    """
    Cut `sequence` into up to `n` evenly strided windows of length `l_sub`.

    The stride is `(len(sequence) - l_sub) // (n - 1)`, clamped to at least 1,
    so the windows spread across the sequence. Every returned window has
    exactly `l_sub` characters.

    Parameters:
        sequence (str): The original viral sequence.
        l_sub (int): The length of each subsequence.
        n (int | None): The number of subsequences to generate. When omitted,
            the module-level constant `N` is used.

    Returns:
        list[str]: The subsequences in order of their start position. Empty
        when the sequence is shorter than `l_sub` or `n` is not positive.
    """
    if n is None:
        n = N

    l = len(sequence)

    if n > 1:
        # Clamp to 1: a short sequence would otherwise give a zero stride
        # (range() raises) or a negative one (windows from the wrong end).
        l_stride = max(1, (l - l_sub) // (n - 1))
    else:
        l_stride = 1

    # Stop after n windows or at the last start that still fits a full window.
    stop = min(n * l_stride, l - l_sub + 1)
    return [sequence[start:start + l_sub] for start in range(0, stop, l_stride)]


In [None]:
# 0->human  1-> animals
class GatedAttention(nn.Module):
    """
    Two-level gated-attention multiple-instance classifier.

    Instances are encoded by a transformer encoder, pooled into one vector per
    inner bag (sequence) with gated attention, pooled again into one vector per
    outer bag (virus), and classified by a 1-D CNN + MLP head ending in a
    sigmoid. The file's label convention is 0 -> human, 1 -> animals.

    Parameters:
        N_HEAD (int): Number of attention heads of each encoder layer.
        ENCODER_N_LAYERS (int): Number of stacked encoder layers.
        EMBEDDING_SIZE (int): Width of every instance vector (transformer d_model).
        INTERMIDIATE_DIM (int): Hidden width of the gated-attention layers.
    """

    def __init__(self,N_HEAD,ENCODER_N_LAYERS,EMBEDDING_SIZE,INTERMIDIATE_DIM):
        super().__init__()
        self.M = EMBEDDING_SIZE
        self.L = INTERMIDIATE_DIM
        self.ENCODER_N_LAYERS=ENCODER_N_LAYERS
        self.ATTENTION_BRANCHES = 1
        self.N_HEAD=N_HEAD

        # embedding
        # Attribute names are part of the saved state_dict; do not rename them.
        self.encoder_layer = TransformerEncoderLayer(d_model=self.M, nhead=self.N_HEAD)
        self.transformer_encoder = TransformerEncoder(self.encoder_layer, num_layers=self.ENCODER_N_LAYERS)

        # instance level
        self.attention_V_1 = nn.Sequential(
            nn.Linear(self.M, self.L), # matrix V
            nn.Tanh()
        )

        self.attention_U_1 = nn.Sequential(
            nn.Linear(self.M, self.L), # matrix U
            nn.Sigmoid()
        )

        self.attention_w_1 = nn.Linear(self.L, self.ATTENTION_BRANCHES) # matrix w (or vector w if self.ATTENTION_BRANCHES==1)

        # bag level
        self.attention_V_2 = nn.Sequential(
            nn.Linear(self.M, self.L), # matrix V
            nn.Tanh()
        )

        self.attention_U_2 = nn.Sequential(
            nn.Linear(self.M, self.L), # matrix U
            nn.Sigmoid()
        )

        self.attention_w_2 = nn.Linear(self.L, self.ATTENTION_BRANCHES) # matrix w (or vector w if self.ATTENTION_BRANCHES==1)

        # classifier
        self.classifier = nn.Sequential(
            nn.Conv1d(in_channels=self.ATTENTION_BRANCHES, out_channels=128, kernel_size=4, padding='same'),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5, padding='same'),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.AvgPool1d(kernel_size=2),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=7, padding='same'),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.AvgPool1d(kernel_size=2),
            nn.Flatten(),  # Converts to 1D before fully connected layers
            # The two AvgPool1d(2) layers shrink the length M to M // 4.
            nn.Linear(128 * ((self.M) // 4), 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, datas,ids,Seq_ids):
        """
        Classify every outer bag contained in one flattened batch.

        Parameters:
            datas (Tensor): Instance vectors, one row of width EMBEDDING_SIZE
                per instance.
            ids (Tensor): Outer-bag (virus) id of every instance.
            Seq_ids (Tensor): Inner-bag (sequence) id of every instance.

        All three tensors must live on the same device. Bags are discovered
        with `torch.unique_consecutive`, while membership is the mask
        `ids == bag` over the whole batch, so the instances of a bag are
        expected to be contiguous.

        Returns:
            tuple: `(Y_prob, Y_hat, A_vec_all, A_vec_2_all)`, where `Y_prob`
            holds one probability per outer bag (shape `[bags, 1]`), `Y_hat`
            is `Y_prob >= 0.5` as floats, and the two lists hold the softmax
            attention weights of every inner bag and every outer bag.
        """
        A_vec_2_all=[]
        A_vec_all=[]

        #### STEP 1: embeddings
        datas = datas.float()  # Ensure correct dtype
        instances = self.transformer_encoder(datas)

        #### STEP 2: INSTANCE-LEVEL ATTENTION ####
        A_V = self.attention_V_1(instances)
        A_U = self.attention_U_1(instances)
        A = self.attention_w_1(A_V * A_U)
        A = torch.transpose(A, 1, 0)
        inner_bags = torch.unique_consecutive(Seq_ids)

        # Allocate on the inputs' own device/dtype rather than a global `device`,
        # so the module works wherever its inputs are.
        output = instances.new_empty((len(inner_bags), self.M))
        super_ids = ids.new_empty(len(inner_bags))
        for i, bag in enumerate(inner_bags):
            in_bag = Seq_ids == bag
            A_vec = F.softmax(A[0][in_bag], dim=0)
            A_vec_all.append(A_vec)
            # Attention-weighted sum of the bag's instances.
            output[i] = torch.matmul(A_vec, instances[in_bag])
            # Every instance of a sequence shares one virus id; take the first.
            super_ids[i] = ids[in_bag][0]

        ### STEP 3: BAG-LEVEL ATTENTION ####
        A_V_2 = self.attention_V_2(output)
        A_U_2 = self.attention_U_2(output)
        A_2 = self.attention_w_2(A_V_2 * A_U_2)
        A_2 = torch.transpose(A_2, 1, 0)

        outer_bags = torch.unique_consecutive(super_ids)
        output2 = output.new_empty((len(outer_bags), self.M))

        for i, bag in enumerate(outer_bags):
            in_bag = super_ids == bag
            A_vec_2 = F.softmax(A_2[0][in_bag], dim=0)
            A_vec_2_all.append(A_vec_2)
            output2[i] = torch.matmul(A_vec_2, output[in_bag])

        ### STEP 4: CLASSIFICATION ####
        output2 = output2.unsqueeze(1)  # Add a channel dimension for Conv1d

        Y_prob = self.classifier(output2)  # Shape: [number of outer bags, 1]
        Y_hat = torch.ge(Y_prob, 0.5).float()  # Convert probabilities to binary predictions
        return Y_prob, Y_hat, A_vec_all, A_vec_2_all

In [None]:
class MILDataset(Dataset):
    """
    Bag-of-bags dataset: one item per virus (outer bag).

    Parameters:
        datas (Tensor): Instance features, one row per instance.
        ids (Tensor): 1-D virus (outer bag) id of every instance.
        seq_ids (Tensor): 1-D sequence (inner bag) id of every instance.
        labels (Tensor): Label of every instance; the label of a virus is the
            label of its first instance.

    Items are ordered by sorted unique virus id (the order `torch.unique`
    returns).
    """

    def __init__(self, datas, ids, seq_ids, labels):
        self.datas = datas  # Instance features
        self.ids = ids # Virus (outer bag) IDs
        self.seq_ids = seq_ids  # Sequence (inner bag) IDs
        self.labels = labels.to("cpu")  # Labels at the virus (outer bag) level

        # Unique IDs for outer bags (viruses) and, per instance, the index of its bag
        self.unique_virus_ids, self.virus_indices = torch.unique(self.ids, return_inverse=True)

        # Unique IDs for inner bags (sequences) and, per instance, the index of its bag
        self.unique_seq_ids, self.seq_indices = torch.unique(self.seq_ids, return_inverse=True)

        # For every virus / sequence: ascending instance indices that belong to it
        self.virus_bag_indices_list = self._group_indices(self.virus_indices, len(self.unique_virus_ids))
        self.seq_bag_indices_list = self._group_indices(self.seq_indices, len(self.unique_seq_ids))

        # Labels assigned at the virus level (each virus gets one label)
        self.virus_labels = [self.labels[indices[0]] for indices in self.virus_bag_indices_list]

        # Precomputed bag-of-bags structure: virus id -> sequence id of each of its instances
        self.virus_seq_map = {}
        for i, virus_id in enumerate(self.unique_virus_ids):
            self.virus_seq_map[virus_id.item()] = self.seq_ids[self.virus_bag_indices_list[i]].tolist()

        # Per virus: its id repeated once per instance, as a long tensor
        self.precomputed_virus_ids = [torch.full((indices.shape[0],), self.unique_virus_ids[i], dtype=torch.long)
                                      for i, indices in enumerate(self.virus_bag_indices_list)]

        self.datas = self.datas.cpu()

    @staticmethod
    def _group_indices(inverse, num_groups):
        """Return, for each group 0..num_groups-1, the ascending positions where `inverse` equals it."""
        # One stable sort groups positions by bag and keeps them ascending inside
        # each bag, replacing a full scan of `inverse` per bag.
        order = torch.sort(inverse, stable=True).indices
        sizes = torch.bincount(inverse, minlength=num_groups).tolist()
        return [chunk.to("cpu") for chunk in torch.split(order, sizes)]

    def __len__(self):
        return len(self.unique_virus_ids)  # Number of unique viruses (outer bags)

    def __getitem__(self, index):
        """ Return outer bag (virus), inner bags (sequences), and instance-level data. """

        # Get all instance indices belonging to this virus
        virus_instance_indices = self.virus_bag_indices_list[index]

        # Look the sequence ids up under the same key they were stored with.
        seq_ids_in_virus = self.virus_seq_map[self.unique_virus_ids[index].item()]

        return {
            "virus_id": self.precomputed_virus_ids[index],
            "virus_data": self.datas[virus_instance_indices],
            "virus_label": self.virus_labels[index],
            "seq_id": seq_ids_in_virus
        }


def collate_fn(batch):
    """
    Custom collate function for Bag-of-Bags MIL.

    Flattens a list of MILDataset items into one batch by concatenating the
    instances of all viruses in order.

    Parameters:
        batch (list[dict]): Items with keys "virus_id", "virus_data",
            "virus_label" and "seq_id".

    Returns:
        tuple[Tensor, Tensor, Tensor, Tensor]: float32 CPU tensors
        `(instance data, virus id per instance, sequence id per instance,
        label per virus)`.
    """
    if not batch:
        # Same result as collating nothing: four empty float tensors.
        return tuple(torch.tensor([], dtype=torch.float) for _ in range(4))

    all_virus_labels = []
    all_virus_seq_ids = []
    for item in batch:
        all_virus_seq_ids.extend(item["seq_id"])
        all_virus_labels.append(item["virus_label"])

    # Concatenate tensors directly instead of round-tripping every value
    # through Python lists.
    batch_virus_datas = torch.cat(
        [torch.as_tensor(item["virus_data"]) for item in batch]
    ).to(device="cpu", dtype=torch.float)
    # NOTE(review): ids are returned as float32, which represents integers
    # exactly only up to 2**24 — confirm ids stay below that.
    batch_virus_ids = torch.cat(
        [torch.as_tensor(item["virus_id"]) for item in batch]
    ).to(device="cpu", dtype=torch.float)

    batch_virus_labels = torch.tensor(all_virus_labels, dtype=torch.float)
    batch_seq_ids = torch.tensor(all_virus_seq_ids, dtype=torch.float)

    return batch_virus_datas, batch_virus_ids,batch_seq_ids, batch_virus_labels


In [None]:
def test_deep(dataloader):
    """
    Evaluate the module-level `model` on every batch of `dataloader`.

    Prints the accuracy and a classification report, then draws the confusion
    matrix with `con_matrix`. Batches are moved to the module-level `device`.

    Parameters:
        dataloader: Iterable yielding `(data, virus_ids, seq_ids, labels)`
            batches, as produced by `collate_fn`.

    Returns:
        tuple[list, list]: `(probs, true_lables)` — the predicted probability
        and the true label of every virus, in evaluation order.
    """
    model.eval()
    correct = 0
    total_samples = 0
    output = []
    probs = []
    true_lables = []
    with torch.no_grad():
        for batch_data, batch_ids, batch_seq_ids, batch_labels in tqdm(dataloader, desc="Processing Batches"):
            batch_data = batch_data.to(device)
            batch_ids = batch_ids.to(device)
            batch_seq_ids = batch_seq_ids.to(device)
            batch_labels = batch_labels.to(device)

            Y_prob, Y_hat, _, _ = model(batch_data, batch_ids, batch_seq_ids)

            # Flatten [B, 1] predictions to match the 1-D labels before collecting.
            Y_hat = Y_hat.view_as(batch_labels)
            output += [int(p) for p in Y_hat.cpu().tolist()]
            true_lables += batch_labels.cpu().tolist()
            probs += Y_prob.squeeze(1).cpu().tolist()

            correct += (Y_hat == batch_labels).sum().item()
            total_samples += batch_labels.size(0)  # Track the total number of samples processed

    if total_samples == 0:
        # Nothing to score; avoid dividing by zero below.
        print("No samples to evaluate.")
        return probs, true_lables

    acc = correct / total_samples * 100
    print(f'acc: {acc:.1f}%')
    print("\nClassification Report:")
    # Fixing labels=[0, 1] keeps the report valid when a test set contains only
    # one class; otherwise the two target_names would not match.
    y_true = [int(t) for t in true_lables]
    print(classification_report(y_true, output, labels=[0, 1],
                                target_names=["Human", "Animal"], zero_division=0))

    con_matrix(y_true, output)
    # plot_roc_curve(true_lables, output)
    return probs, true_lables

In [None]:
def read_data_from_file(filename):
    """
    Load a two-column (ID, sequence) CSV and keep only usable protein sequences.

    A row is kept when its sequence is a string longer than 200 characters
    made only of the 20 standard amino-acid letters (upper case). The ID is
    split on "|": the first field becomes `Seq_ID`, the last field becomes
    `Class`, and all fields after the first, joined without a separator
    (so including the class), become `Virus_ID`. IDs without "|" yield
    empty strings.

    Parameters:
        filename (str): Path to the CSV file; it must have exactly two columns.

    Returns:
        pandas.DataFrame: Columns "Sequence", "Virus_ID", "Seq_ID", "Class",
        "Length", keeping the original row index of the retained rows.
    """
    file_path = os.path.abspath(filename)

    # Read CSV file
    df = pd.read_csv(file_path)

    # Rename columns
    df.columns = ["ID", "Sequence"]

    allowed = set("ACDEFGHIKLMNPQRSTVWY")

    def is_usable(seq):
        return isinstance(seq, str) and len(seq) > 200 and set(seq) <= allowed

    def parse_id(identifier):
        # Non-string or separator-free IDs carry no parsable fields.
        if not isinstance(identifier, str) or "|" not in identifier:
            return "", "", ""
        fields = identifier.split("|")
        return "".join(fields[1:]), fields[0], fields[-1]

    # One filtering pass, then copy so the new columns are written to an
    # independent frame rather than a slice of the raw one.
    keep = df["Sequence"].apply(is_usable).astype(bool)
    df = df[keep].copy()

    # Extract fields
    parsed = [parse_id(identifier) for identifier in df["ID"]]
    df["Virus_ID"] = [fields[0] for fields in parsed]
    df["Seq_ID"] = [fields[1] for fields in parsed]
    df["Class"] = [fields[2] for fields in parsed]
    df["Length"] = [len(seq) for seq in df["Sequence"]]

    return df[["Sequence", "Virus_ID", "Seq_ID", "Class", "Length"]]

In [None]:
def remove_common_sequences(df1, df2, column='Sequence'):
    """
    Return a copy of `df1` without the rows whose `column` value occurs in `df2`.

    Parameters:
        df1 (pandas.DataFrame): Frame to clean.
        df2 (pandas.DataFrame): Frame whose `column` values must not remain in `df1`.
        column (str): Column compared between the two frames.

    Returns:
        pandas.DataFrame: A new frame; `df1` and `df2` are left untouched.
    """
    common_mask = df1[column].isin(df2[column])
    count = int(common_mask.sum())
    if count:
        # The rows are dropped from df1 (the returned frame), not from df2.
        print(f"{count} common sequences found. Removing them from df1...")
    else:
        print("No common sequences found between the two DataFrames.")

    return df1[~common_mask].copy()

In [None]:
def remove_duplicates(df, subset=None, show_duplicates=True):
    """
    Drop duplicated rows, keeping the first occurrence, and report how many were found.

    Parameters:
        df (pandas.DataFrame): Frame to de-duplicate.
        subset: Column label(s) used to detect duplicates; all columns when None.
        show_duplicates (bool): Accepted but not used by this function.

    Returns:
        pandas.DataFrame: `df` itself when it has no duplicates, otherwise a
        new frame without them.
    """
    dup_flags = df.duplicated(subset=subset)
    n_dup = dup_flags.sum()

    if n_dup == 0:
        print("✅ No duplicated rows found.")
        return df

    print(f"⚠️ Found {n_dup} duplicated row(s). Removing them...")
    return df.drop_duplicates(subset=subset)

In [None]:
# Apply ASW 
def create_data_loader(datas,labels,ids,seq_ids,ft_model):
    """
    Tokenize the inputs, wrap them in a MILDataset and return its DataLoader.

    The arguments are forwarded to `tokenize`, whose four results (embeddings,
    virus ids, sequence ids, labels) feed `MILDataset`. The loader uses
    `BATCH_SIZE`, keeps the dataset order (no shuffling), loads in the main
    process and batches with `collate_fn`.
    """
    tokenized = tokenize(datas,ids,seq_ids,labels,ft_model)
    instance_embeddings, virus_ids, sequence_ids, bag_labels = tokenized
    return DataLoader(
        MILDataset(instance_embeddings, virus_ids, sequence_ids, bag_labels),
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn,
    )

In [None]:
# Rebuild the network with the notebook's hyperparameters and restore its saved weights.
model = GatedAttention(N_HEAD, ENCODER_N_LAYERS, EMBEDDING_SIZE, INTERMIDIATE_DIM).to(device)
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load("/kaggle/input/virogen-model/pytorch/default/1/model_weights.pth", map_location=device))
model.eval()
# FastText model later handed to create_data_loader() as `ft_model`.
ft_model = FastText.load("/kaggle/input/virogen-model/pytorch/default/1/ft_skipgram.model")

In [None]:
# Load both cleaned test sets and stack them into one frame with a fresh 0..n-1 index.
df_test_gasaid=read_data_from_file(gasaid_test_file_name)
df_test_ncbi=read_data_from_file(ncbi_test_file_name)

df_test = pd.concat([df_test_gasaid, df_test_ncbi], ignore_index=True)
print(len(df_test))

In [None]:
# Drop rows that are identical in every column, then show the remaining row count.
df_test = remove_duplicates(df_test)

print(len(df_test))

In [None]:
# Load the per-pandemic test sets.
# NOTE(review): spanish_flu_test is defined with the other paths but is not loaded in this cell — confirm that is intentional.
df_Asian_flu=read_data_from_file(Asian_flu_test)
df_hong_kong_flu=read_data_from_file(hong_kong_flu_test)
df_pdmh1n1_flu=read_data_from_file(pdmh1n1_flu_test)
df_covid= read_data_from_file(covid_test)


In [None]:
# Number of sequences kept in each pandemic test set after filtering.
print(len(df_Asian_flu))
print(len(df_hong_kong_flu))
print(len(df_pdmh1n1_flu))
print(len(df_covid))


In [None]:
# Parameters
k = 4  # k-mer size
m = 3  # minimizer size
pseudo_count = 0.1  # added to every PFM cell by comp_ppm / comp_pwm
# The 20 standard amino-acid one-letter codes.
alphabet = ['A','C','D', 'E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
alphabet = sorted(set(alphabet))
alphabet_size = len(alphabet)
# Row index of each letter in the PFM / PWM matrices.
char_to_idx = {c: i for i, c in enumerate(alphabet)}
alphabet_size

In [None]:
def get_all_combinations(alphabet, k):
    """Return every length-`k` string over `alphabet`, in `itertools.product` order."""
    words = []
    for letters in product(alphabet, repeat=k):
        words.append(''.join(letters))
    return words

# All the m-mers (not k-mers!) combinations
# len(combos) == alphabet_size ** m; Virus2Vec uses it as its feature-vector width.
combos = get_all_combinations(alphabet, m)
combo_index = {mer: i for i, mer in enumerate(combos)}  # Faster lookup

def get_kmers(seq, k):
    """Return all overlapping length-`k` windows of `seq`, left to right (empty if `seq` is shorter than `k`)."""
    last_start = len(seq) - k
    kmers = []
    for start in range(last_start + 1):
        kmers.append(seq[start:start + k])
    return kmers

def get_minimizer(kmer, m):
    """
    Return the lexicographically smallest length-`m` window found in `kmer`
    or in its reversed string.

    Raises ValueError when `kmer` is shorter than `m` (there is no window).
    """
    return min(
        strand[start:start + m]
        for strand in (kmer, kmer[::-1])
        for start in range(len(strand) - m + 1)
    )

def comp_minimizers(seq, k, m):
    """Return the minimizer (see `get_minimizer`) of every k-mer of `seq`, in sequence order."""
    minimizers = []
    for kmer in get_kmers(seq, k):
        minimizers.append(get_minimizer(kmer, m))
    return minimizers

def get_alphabet_count(col, alphabet):
    """Count how often each letter of `alphabet` occurs in `col`; returns an array aligned with `alphabet`."""
    occurrences = Counter(col)
    # Counter yields 0 for letters that never occur.
    return np.array([occurrences[letter] for letter in alphabet])

def get_p_c():
    """
    Return the background probability of every letter of the module-level
    `alphabet`: its number of coding codons divided by 61 (the table's total).
    Letters missing from the table count as one codon.
    """
    # Amino acids grouped by how many codons encode them.
    residues_by_codon_count = {
        1: "MW",
        2: "CDEFHKNQY",
        3: "I",
        4: "AGPTV",
        6: "LRS",
    }
    codon_count = {}
    for count, residues in residues_by_codon_count.items():
        for residue in residues:
            codon_count[residue] = count

    probabilities = []
    for residue in alphabet:
        probabilities.append(codon_count.get(residue, 1) / 61)
    return np.array(probabilities)

def comp_ppm(pfm):
    """Turn a position frequency matrix into column-wise probabilities after adding `pseudo_count` to every cell."""
    smoothed = pfm + pseudo_count
    # The tiny constant guards against a zero column total.
    column_totals = np.sum(smoothed, axis=0) + 1e-9
    return smoothed / column_totals
    
def comp_pwm(pfm, p_c):
    """
    Return the log2-odds position weight matrix of `pfm`.

    The pseudocount-smoothed position probabilities (see `comp_ppm`) are
    divided row-wise by the background probabilities `p_c` (one per alphabet
    letter) before taking log2.
    """
    background = p_c[:, np.newaxis]  # one background value per matrix row
    return np.log2(comp_ppm(pfm) / background)

def comp_mmers_score(mmer, pwm):
    """Score an m-mer as the sum of `pwm[row of letter, position]` over its letters, e.g. W("AC") = W['A'][0] + W['C'][1]."""
    rows = [char_to_idx[letter] for letter in mmer]
    return sum(pwm[row, position] for position, row in enumerate(rows))


In [None]:
# Module-level buffers kept for any other cell that still reads them;
# Pwm2Vec itself works on its own local matrix.
pfm = np.zeros((alphabet_size, m))
v = np.zeros(len(combos))
def Pwm2Vec(S, alphabet=alphabet, k=k, m=m):
    """
    Embed every sequence as the flattened position probability matrix of its minimizers.

    For each sequence the minimizers of all k-mers are collected, the letters
    at each of the `m` minimizer positions are counted, and the counts are
    normalized with `comp_ppm`.

    Parameters:
        S: Iterable of sequences with a length (e.g. a list or pandas Series).
        alphabet: Letters that define the matrix rows.
        k (int): k-mer size.
        m (int): Minimizer size.

    Returns:
        np.ndarray: float32 array of shape `(len(S), len(alphabet) * m)`.
    """
    n_letters = len(alphabet)
    V = np.zeros((len(S), n_letters * m), dtype=np.float32)
    # Sized from the arguments so non-default `alphabet`/`m` values work and no
    # state is shared between calls.
    counts = np.zeros((n_letters, m))
    for j, seq in enumerate(tqdm(S, desc="Processing sequences")):
        minimizers = comp_minimizers(seq, k, m)  # List of minimizers (m-mers)

        for i in range(m):
            # counts[c][i] = occurrences of letter c at position i across all minimizers;
            # every column is fully overwritten, so no reset is needed.
            counts[:, i] = get_alphabet_count([mmer[i] for mmer in minimizers], alphabet)

        V[j, :] = comp_ppm(counts).flatten()  # Assign the computed vector directly to row j
    return V

In [None]:
# Module-level buffers kept for any other cell that still reads them;
# Virus2Vec itself works on its own local arrays.
pfm = np.zeros((alphabet_size, m))
v = np.zeros(len(combos))
def Virus2Vec(S, alphabet=alphabet, k=k, m=m):
    """
    Embed every sequence as PWM-weighted minimizer scores.

    For each sequence the minimizers of all k-mers are collected, a log-odds
    PWM is built from them (`comp_pwm`), every minimizer is scored against
    that PWM and the scores are accumulated at the minimizer's index in
    `combo_index`.

    Parameters:
        S: Iterable of sequences with a length (e.g. a list or pandas Series).
        alphabet: Letters that define the PWM rows.
        k (int): k-mer size.
        m (int): Minimizer size.

    The module-level tables `combos`, `combo_index`, `char_to_idx` and
    `get_p_c()` are built for the default alphabet and `m`; other values need
    matching tables.

    Returns:
        np.ndarray: float32 array of shape `(len(S), len(combos))`.
    """
    V = np.zeros((len(S), len(combos)), dtype=np.float32)
    p_c = get_p_c()  # background probabilities do not depend on the sequence
    counts = np.zeros((len(alphabet), m))
    scores = np.zeros(len(combos))  # feature vector of size |Σ|^m
    for j, seq in enumerate(tqdm(S, desc="Processing sequences")):
        minimizers = comp_minimizers(seq, k, m)  # List of minimizers (m-mers)

        for i in range(m):
            # counts[c][i] = occurrences of letter c at position i across all minimizers
            counts[:, i] = get_alphabet_count([mmer[i] for mmer in minimizers], alphabet)

        pwm = comp_pwm(counts, p_c)

        # Add each minimizer's score to its slot, in sequence order.
        scores.fill(0)
        for mmer in minimizers:
            scores[combo_index[mmer]] += comp_mmers_score(mmer, pwm)
        V[j, :] = scores  # Assign the computed vector directly to row j
    return V

In [None]:
# Hydrophobicity (h1) and Hydrophilicity (h2)
# Per-residue property values read by apaac(); keys are one-letter amino-acid codes.
hydrophobicity = {
    'A': 0.62,  'C': 0.29,  'D': -0.90, 'E': -0.74, 'F': 1.19,
    'G': 0.48,  'H': -0.40, 'I': 1.38,  'K': -1.50, 'L': 1.06,
    'M': 0.64,  'N': -0.78, 'P': 0.12,  'Q': -0.85, 'R': -2.53,
    'S': -0.18, 'T': -0.05, 'V': 1.08,  'W': 0.81,  'Y': 0.26
}

hydrophilicity = {
    'A': -0.50, 'C': -1.00, 'D': 3.00,  'E': 3.00,  'F': -2.50,
    'G': 0.00,  'H': -0.50, 'I': -1.80, 'K': 3.00,  'L': -1.80,
    'M': -1.30, 'N': 0.20,  'P': 0.00,  'Q': 0.20,  'R': 3.00,
    'S': 0.30,  'T': -0.40, 'V': -1.50, 'W': -3.40, 'Y': -2.30
}

# Order of the 20 composition entries produced by apaac().
amino_acids = ['A','C','D', 'E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

In [None]:
#30
def apaac(sequence, lambda_value=10, weight=0.05):
    """
    Feature APAAC (Amphiphilic Pseudo Amino Acid Composition)

    Combines the amino acid composition of a protein sequence with
    sequence-order correlation factors derived from two physicochemical
    properties, hydrophobicity and hydrophilicity.

    Components:
    - AAC: frequency of each of the 20 amino acids in the (upper-cased) sequence.
    - Correlation factors: for every lag from 1 to `lambda_value`, the mean over
      all residue pairs that far apart of the averaged squared differences in
      hydrophobicity and hydrophilicity.

    Parameters:
    - sequence (str): Protein sequence using one-letter amino acid codes.
    - lambda_value (int): Number of sequence-order correlation factors to compute.
    - weight (float): Weighting factor that controls the influence of sequence-order info.

    Returns:
    - np.ndarray: Vector of length 20 + lambda_value — the AAC scaled by
      (1 - weight * sum of factors), followed by weight * factors.
    """
    sequence = sequence.upper()
    seq_len = len(sequence)

    # Standard amino acid composition (AAC)
    aac = np.array([sequence.count(aa) / seq_len for aa in amino_acids])

    # Sequence-order correlation factors, one per lag
    lambda_correlation = np.zeros(lambda_value)
    for lag in range(1, lambda_value + 1):
        sum_corr = 0
        # Pair every residue with the one `lag` positions further along.
        for left, right in zip(sequence, sequence[lag:]):
            h1_corr = (hydrophobicity[left] - hydrophobicity[right])**2
            h2_corr = (hydrophilicity[left] - hydrophilicity[right])**2
            sum_corr += (h1_corr + h2_corr) / 2  # Average correlation
        # The small constants keep the ratio defined when there are no pairs.
        lambda_correlation[lag-1] = ((sum_corr+1e-7) / ((seq_len - lag)+1e-7))

    # Normalize and combine features
    scaled_aac = aac * (1 - weight * sum(lambda_correlation))
    return np.concatenate((scaled_aac, weight * lambda_correlation))

In [None]:
# Residue classes used by ctdd(); iteration order fixes the feature order.
secondary_structure = {
"Helix": set("EALMQKRH"),
"Strand": set("VIYCWFT"),
"Coil": set("GNPSD"),
}

#15
def ctdd(sequence):
    """
    Feature CTDD (Distribution Descriptor of Amino Acid Properties)

    The "Distribution" part of the CTD (Composition, Transition, Distribution)
    descriptor: where the residues of each class in `secondary_structure`
    sit along the sequence.

    For each class, in dictionary order, five values are produced: the 0-based
    position, divided by the sequence length, of the first occurrence, of the
    occurrences at 25%, 50% and 75% of the class's occurrence list, and of the
    last occurrence. A class with no occurrence contributes five zeros.

    Parameters:
    - sequence (str): Protein sequence using one-letter amino acid codes.

    Returns:
    - list: A feature vector of 15 values.
    """
    seq_len = len(sequence)
    descriptor = []
    for residue_class in secondary_structure.values():
        hits = [pos for pos, residue in enumerate(sequence) if residue in residue_class]

        if len(hits) == 0:  # class absent from this sequence
            descriptor += [0, 0, 0, 0, 0]
            continue

        # first, 25%, 50%, 75%, last occurrence
        landmarks = [hits[0]]
        landmarks += [hits[int(len(hits) * quantile)] for quantile in (0.25, 0.50, 0.75)]
        landmarks.append(hits[-1])

        descriptor += [position / seq_len for position in landmarks]

    return descriptor

In [None]:
# Groups based on their dipoles and side-chain volumes
amino_acid_groups = {
'A': 1, 'G': 1, 'V': 1,  # Group 1
'I': 2, 'L': 2, 'F': 2, 'P': 2,  # Group 2
'Y': 3, 'M': 3, 'T': 3, 'S': 3,  # Group 3
'H': 4, 'N': 4, 'Q': 4, 'W': 4,  # Group 4
'R': 5, 'K': 5,  # Group 5
'D': 6, 'E': 6,  # Group 6
'C': 7   # Group 7
 }

#343
def ctriad(sequence):
    """
    Feature #343: Conjoint Triad (CTriad)

    Maps every residue to one of the 7 groups of `amino_acid_groups` (residues
    outside the table are skipped), slides a window of three over the grouped
    sequence and returns the relative frequency of each of the
    7 x 7 x 7 = 343 possible group triads.

    Parameters:
    - sequence (str): Protein sequence using one-letter amino acid codes.

    Returns:
    - np.ndarray: 343 normalized triad frequencies; the triad (a, b, c) of
      0-based groups sits at index a*49 + b*7 + c. All zeros when fewer than
      three residues are mapped.
    """
    # Reduced alphabet: 0-based group number per known residue
    group_ids = [amino_acid_groups[aa] - 1 for aa in sequence if aa in amino_acid_groups]

    # Overlapping triads, counted by pattern
    triad_counts = Counter(zip(group_ids, group_ids[1:], group_ids[2:]))
    total_triads = max(len(group_ids) - 2, 0)

    triad_vector = np.zeros(7 * 7 * 7)
    for (first, second, third), count in triad_counts.items():
        triad_vector[(first * 7 + second) * 7 + third] = count / total_triads  # Normalize frequency

    return triad_vector

In [None]:
#400
def dde(sequence):
    """
    Feature #400: DDE (Dipeptide Deviation from Expected Mean)

    Measures, for each of the 400 possible dipeptides (20 x 20), how far its
    observed frequency lies from the frequency expected if residues occurred
    independently.

    Components:
    - Dc: observed dipeptide frequency, count / (L - 1).
    - Tm: expected frequency, the product of the two residue frequencies.
    - Tv: variance term, Tm * (1 - Tm) / L (0 when Tm is 0).
    - DDE: (Dc - Tm) / sqrt(Tv), or 0 when Tv is not positive.

    Parameters:
    - sequence (str): Protein sequence using one-letter amino acid codes.

    Returns:
    - list: 400 DDE values ordered like `itertools.product` over
      'ACDEFGHIKLMNPQRSTVWY' (AA, AC, AD, ...).
    """
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    seq_len = len(sequence)

    residue_counts = Counter(sequence)
    residue_freq = {}
    for residue in residues:
        residue_freq[residue] = residue_counts.get(residue, 0) / seq_len

    pair_counts = Counter(sequence[i:i+2] for i in range(seq_len - 1))

    scores = []
    for first, second in itertools.product(residues, repeat=2):
        observed = pair_counts.get(first + second, 0) / (seq_len - 1)
        expected = residue_freq[first] * residue_freq[second]
        variance = (expected * (1 - expected)) / seq_len if expected > 0 else 0
        if variance > 0:
            scores.append((observed - expected) / (variance ** 0.5))
        else:
            scores.append(0)

    return scores

In [None]:
#5
# Per-residue property scales; keys are one-letter amino-acid codes.
hydrophobicity_Kyte_Doolittle  = {
    'A': 1.8,  'C': 2.5,  'D': -3.5, 'E': -3.5, 'F': 2.8,
    'G': -0.4, 'H': -3.2, 'I': 4.5,  'K': -3.9, 'L': 3.8,
    'M': 1.9,  'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
    'S': -0.8, 'T': -0.7, 'V': 4.2,  'W': -0.9, 'Y': -1.3
}

Hydrophilicity_Hopp_Woods_scale  = {
    'A': -0.5, 'C': -1.0, 'D': 3.0,  'E': 3.0,  'F': -2.5,
    'G': 0.0,  'H': -0.5, 'I': -1.8, 'K': 3.0,  'L': -1.8,
    'M': -1.3, 'N': 0.2,  'P': 0.0,  'Q': 0.2,  'R': 3.0,
    'S': 0.3,  'T': -0.4, 'V': -1.5, 'W': -3.4, 'Y': -2.3
}
Polarity_Scale = {
    'A': 8.1,  'C': 5.5,  'D': 13.0, 'E': 12.3, 'F': 5.2,
    'G': 9.0,  'H': 10.4, 'I': 5.2,  'K': 11.3, 'L': 4.9,
    'M': 5.7,  'N': 11.6, 'P': 8.0,  'Q': 10.5, 'R': 10.5,
    'S': 9.2,  'T': 8.6,  'V': 5.9,  'W': 5.4,  'Y': 6.2
}
Molecular_Weight = {
    'A': 89.09,  'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19,
    'G': 75.07,  'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17,
    'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20,
    'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.23, 'Y': 181.19
}


def geary_autocorrelation(sequence, max_lag=5, property_dict=hydrophobicity_Kyte_Doolittle):
    """
    Feature #5: Geary Autocorrelation Descriptor

    Quantifies, for every lag d from 1 to `max_lag`, how similar the values of
    one physicochemical property are between residues d positions apart,
    following Geary's C:

    Geary(d) = (N - 1) * Σ[(Pᵢ - Pᵢ₊d)²] / [2 * (N - d) * Σ(Pᵢ - mean(P))²]

    A lag whose denominator is zero yields 0.

    Parameters:
    - sequence (str): Protein sequence using one-letter amino acid codes.
    - max_lag (int): Maximum distance (lag) to consider (default = 5).
    - property_dict (dict): Maps amino acids to a numeric property value;
      residues missing from it count as 0.

    Returns:
    - list: `max_lag` Geary autocorrelation values, for lags 1..max_lag.
    """
    prop_values = np.array([property_dict.get(aa, 0) for aa in sequence])  # unknown residues -> 0
    N = len(prop_values)
    mean_p = np.mean(prop_values)
    # Spread around the mean; identical for every lag.
    total_variation = np.sum((prop_values - mean_p) ** 2)

    results = []
    for d in range(1, max_lag + 1):
        numerator = np.sum((prop_values[:-d] - prop_values[d:]) ** 2)
        denominator = 2 * (N - d) * total_variation
        if denominator != 0:
            results.append((N - 1) * numerator / denominator)
        else:
            results.append(0)

    return results

In [None]:
# ---- Total width: APAAC + CTDD + CTriad + DDE + Geary ----
vec_size = 30 + 15 + 343 + 400 + 5
def Features(S):
    """
    Build the hand-crafted feature matrix of a collection of sequences.

    Each row concatenates, in this order, APAAC (30 values), CTDD (15),
    CTriad (343), DDE (400) and Geary autocorrelation (5), all computed with
    the extractors' default arguments.

    Returns:
        np.ndarray: Array of shape `(len(S), vec_size)`.
    """
    # (extractor, number of values it produces), in output order
    extractors = (
        (apaac, 30),
        (ctdd, 15),
        (ctriad, 343),
        (dde, 400),
        (geary_autocorrelation, 5),
    )

    V = np.zeros((len(S), vec_size))
    for row, seq in enumerate(tqdm(S, desc="Processing sequences")):
        start = 0
        for extract, width in extractors:
            V[row, start:start + width] = extract(seq)
            start += width

    return V

In [None]:
def get_feature_vector(df, algo=""):
    """
    Apply the feature extractor `algo` to the "Sequence" column of `df`.

    Parameters:
        df (pandas.DataFrame): Frame with a "Sequence" column.
        algo (callable): Function taking the sequences and returning their
            feature matrix (e.g. Virus2Vec, Pwm2Vec, Features). It must be
            supplied; the default is not a usable extractor.

    Returns:
        Whatever `algo` returns.

    Raises:
        TypeError: If `algo` is not callable.
    """
    if not callable(algo):
        raise TypeError(
            f"algo must be a callable feature extractor, got {type(algo).__name__}"
        )
    return algo(df['Sequence'])

In [None]:
def cls_report(y_test,preds):
    """Print the classification report (undefined metrics shown as 0) and the accuracy of `preds` against `y_test`."""
    report = classification_report(y_test, preds, zero_division=0)
    print(report)
    accuracy = accuracy_score(y_test, preds)
    print("Accuracy:", accuracy)

def get_preds(model_name,X_test,y_test,Verbose=False):
    """Predict with the model registered under `model_name` in `models`.

    Parameters:
    - model_name: Key into the notebook-level `models` dict.
    - X_test: Features passed to the model's `predict`.
    - y_test: True labels; only used for the report when `Verbose` is set.
    - Verbose: When truthy, print a header and the classification report.

    Returns:
    - The output of `models[model_name].predict(X_test)`.
    """
    estimator = models[model_name]
    predictions = estimator.predict(X_test)
    if not Verbose:
        return predictions

    print(f"🎯 {model_name} Results:")
    cls_report(y_test, predictions)
    return predictions

    
def con_matrix(y_test,preds):
    """Plot the 2x2 confusion matrix of `preds` against `y_test`.

    Labels are encoded as 0 = Human, 1 = Animal.

    Parameters:
    - y_test: True labels.
    - preds: Predicted labels, aligned with `y_test`.
    """
    # Pin the label set: without it, data that contains a single class yields a
    # 1x1 matrix, which cannot be drawn with the two display labels below.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])

    # Create and display the confusion matrix with readable class names
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Human', 'Animal'])
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.show()

In [None]:
# Maps each classical-ML model key (the same keys used in `models`) to the
# function that turns a column of sequences into that model's input features.
feature_extractors = {
    "Virus2VecModel": Virus2Vec,
    "PwmVecModel": Pwm2Vec,
    "FeaturesModel": Features,  
}

# Get the Labels

In [None]:
# Binary target: 0 when the class is "human" (case-insensitive), 1 otherwise
is_non_human = df_test["Class"].str.lower() != "human"
y_test = np.array(is_non_human.astype(int))

In [None]:
# Registry of fitted models keyed by name; the classical models are added in the cells below.
models = {"deepLearning": model}

In [None]:
# Load the pre-trained classifier for Virus2Vec features and register it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load trusted artifacts.
Virus2VecModel = joblib.load('/kaggle/input/virogenml/scikitlearn/default/11/RF_Virus2vec.pkl')
models["Virus2VecModel"] = Virus2VecModel
# Build the Virus2Vec feature representation of the test sequences
Virus2Vec_feature_vector = get_feature_vector(df_test,Virus2Vec)
# Predict on the test set; Verbose=True also prints the classification report
Virus2Vec_preds = get_preds('Virus2VecModel',Virus2Vec_feature_vector,y_test,Verbose=True)
con_matrix(y_test,Virus2Vec_preds)

In [None]:
# Load the pre-trained classifier for the hand-crafted descriptors and register it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load trusted artifacts.
FeaturesModel = joblib.load('/kaggle/input/virogenml/scikitlearn/default/11/SVM_ML_Features.pkl')
models["FeaturesModel"] = FeaturesModel
# Build the descriptor matrix (see `Features`) for the test sequences
FeaturesModel_feature_vector = get_feature_vector(df_test,Features)
# Predict on the test set; Verbose=True also prints the classification report
FeaturesModel_preds = get_preds('FeaturesModel',FeaturesModel_feature_vector,y_test,Verbose=True)
con_matrix(y_test,FeaturesModel_preds)

In [None]:
# Load the pre-trained classifier for Pwm2Vec features and register it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load trusted artifacts.
PwmVecModel = joblib.load('/kaggle/input/virogenml/scikitlearn/default/11/SVM_PWM2Vec.pkl')
models["PwmVecModel"] = PwmVecModel
# Build the Pwm2Vec feature representation of the test sequences
Pwm2Vec_feature_vector = get_feature_vector(df_test,Pwm2Vec)
# Predict on the test set; Verbose=True also prints the classification report
Pwm2Vec_preds = get_preds('PwmVecModel',Pwm2Vec_feature_vector,y_test,Verbose=True)
con_matrix(y_test,Pwm2Vec_preds)

In [None]:
def tokenize(datas,ids,seq_ids,labels,ft_model):
    """Turn sequences into per-k-mer FastText embeddings with aligned metadata.

    Each sequence is split with `ASW(sequence, l_sub)`, every resulting k-mer is
    embedded with `ft_model.wv`, and `ids`, `seq_ids` and `labels` are each
    repeated `N` times per sequence so they can be paired with the k-mers.

    Parameters:
    - datas: Sequences; must expose `.tolist()` (e.g. a pandas Series).
    - ids: Per-sequence numeric group ids.
    - seq_ids: Per-sequence numeric sequence ids.
    - labels: Per-sequence numeric labels.
    - ft_model: Model exposing `wv[...]` lookups (a FastText model in this notebook).

    Returns:
    - (embeddings, ids, seq_ids, labels) as torch tensors on `device`.
    """
    kmers_per_sequence = [ASW(sequence, l_sub) for sequence in datas.tolist()]

    # NOTE(review): assumes ASW yields exactly N k-mers per sequence, otherwise the
    # repeated metadata no longer lines up with the embeddings — confirm.
    labels = np.repeat(labels, N).tolist()
    ids = np.repeat(ids, N).tolist()
    seq_ids = np.repeat(seq_ids, N).tolist()

    # Apply FastText (CBOW); the lookup is left to handle unknown k-mers itself,
    # so no vocabulary set needs to be built here.
    embeddings = np.array([
        ft_model.wv[k]
        for kmers in tqdm(kmers_per_sequence, desc="FastText inference")
        for k in kmers
    ])

    embeddings = torch.tensor(embeddings).to(device)
    ids = torch.tensor(ids).to(device)
    seq_ids = torch.tensor(seq_ids).to(device)
    labels = torch.tensor(labels).to(device)
    return embeddings,ids,seq_ids,labels

In [None]:
# Integer-encode the grouping keys for the deep model.
# Index [1] of np.unique(..., return_inverse=True) is, for every row, the
# position of its value among the sorted unique values.
test_deep_ids = np.unique(df_test["Virus_ID"], return_inverse=True)[1]
test_deep_seq_ids = np.unique(
    df_test["Seq_ID"] + " " + df_test["Virus_ID"],
    return_inverse=True,
)[1]

# Raw sequences fed to the deep model
test_deep_datas = df_test['Sequence']


In [None]:
# Bundle the test sequences, labels and integer-encoded ids into a loader for the deep model
test_loader = create_data_loader(test_deep_datas, y_test,test_deep_ids,test_deep_seq_ids,ft_model)

In [None]:
# Run the deep model over the test loader
# (presumably returns predictions and the matching true labels — confirm against test_deep)
pred_deep,true_deep=test_deep(test_loader)

In [None]:
# Expand the deep model's per-id predictions to one prediction per test row.
# NOTE(review): assumes pred_deep holds one entry per unique id, in ascending
# id order — confirm against test_deep.
test_deep_ids = np.array(test_deep_ids)
unique_ids = np.unique(test_deep_ids)

# id → prediction lookup (zip stops at the shorter of the two inputs)
seq_pred_map = dict(zip(unique_ids, pred_deep))

# Look up every row's id to obtain row-level predictions
seq_level_preds = np.array([seq_pred_map[row_id] for row_id in test_deep_ids])
print(len(seq_level_preds))

In [None]:
def plot_models_roc_curves(models: dict, model_inputs: dict, y_test):
    """
    Plots ROC curves for multiple classifiers, one colour per model.

    The score used for each model is oriented so that higher means "more
    likely Human (class 0)":
    - `predict_proba(X)[:, 0]` when the model exposes `predict_proba`
      (assumes column 0 is class 0);
    - otherwise `-decision_function(X)` when the model exposes
      `decision_function` (e.g. an SVC fitted without probability estimates);
    - otherwise `1 - seq_level_preds`, the notebook-level deep-learning
      predictions.

    Parameters:
    - models (dict): Keys are model names, values are fitted model instances.
    - model_inputs (dict): Keys are model names, values are corresponding X_test feature vectors.
    - y_test: True labels (0 = Human, 1 = Animal). Class 0 is treated as the positive class.
    """
    plt.figure(figsize=(10, 7))
    distinct_colors = sns.color_palette("hls", len(models))  # Consistent colors

    for i, (name, model) in enumerate(models.items()):
        X = model_inputs[name]

        # Pick the score source explicitly. A model that merely lacks
        # predict_proba must not be scored with the deep model's predictions,
        # and errors raised inside predict_proba must not be swallowed.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X)[:, 0]  # Probability of class 0 (Human)
        elif hasattr(model, "decision_function"):
            # Positive margins favour the second class, so negate for class 0
            y_score = -model.decision_function(X)
        else:
            # Deep model: reuse its precomputed row-level predictions
            y_score = 1 - seq_level_preds

        # Compute ROC metrics
        fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=0)
        roc_auc = auc(fpr, tpr)

        # Plot ROC curve
        plt.plot(fpr, tpr, lw=2, color=distinct_colors[i],
                 label=f'{name} (AUC = {roc_auc:.2f})')

    # Plot random guess line
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2, label='Chance')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves (Human = Positive Class)')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
def plot_precision_vs_recall_max_precision_at_full_recall(models, model_inputs, y_test):
    """
    Plots precision vs recall curves for multiple models (with different feature vectors),
    and highlights the max precision at recall = 1.0 point for each model.

    Class 0 is the positive class. The score used for each model is oriented so
    that higher means "more likely class 0":
    - `predict_proba(X)[:, 0]` when the model exposes `predict_proba`
      (assumes column 0 is class 0);
    - otherwise `-decision_function(X)` when the model exposes
      `decision_function` (e.g. an SVC fitted without probability estimates);
    - otherwise `1 - seq_level_preds`, the notebook-level deep-learning
      predictions.

    Parameters:
    - models: dict of model_name → trained model
    - model_inputs: dict of model_name → corresponding X_test
    - y_test: true labels (same for all models)
    """
    plt.figure(figsize=(10, 7))
    distinct_colors = sns.color_palette("hls", len(models))  # distinct colors per model

    for i, (name, model) in enumerate(models.items()):
        X = model_inputs[name]

        # Pick the score source explicitly. A model that merely lacks
        # predict_proba must not be scored with the deep model's predictions.
        if hasattr(model, "predict_proba"):
            y_scores = model.predict_proba(X)[:, 0]
        elif hasattr(model, "decision_function"):
            # Positive margins favour the second class, so negate for class 0
            y_scores = -model.decision_function(X)
        else:
            # Deep model: reuse its precomputed row-level predictions
            y_scores = 1 - seq_level_preds

        # Calculate precision-recall curve
        precision, recall, _ = precision_recall_curve(y_test, y_scores, pos_label=0)

        # Get max precision at recall = 1.0
        recall_1_indices = np.where(recall == 1.0)[0]
        if len(recall_1_indices) > 0:
            max_prec_at_recall_1 = np.max(precision[recall_1_indices])
            plt.scatter(max_prec_at_recall_1, 1.0, color=distinct_colors[i], marker='o',
                        edgecolor='black', s=100,
                        label=f'{name} Max Prec @ R=1.0: {max_prec_at_recall_1:.2f}')
        else:
            print(f'{name} never reaches 100% recall.')

        # Plot Precision (X) vs Recall (Y) — note the axes are swapped relative
        # to the conventional recall-on-X layout
        plt.plot(precision, recall, label=name, color=distinct_colors[i])

    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.title('Precision vs Recall Curves (Highlight Max Precision @ Recall=1.0)')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# Test-set feature matrix for each entry in `models`, keyed by the same names.
# "deepLearning" has no feature matrix (None): the plotting helpers use the
# precomputed `seq_level_preds` for it instead.
model_inputs = {
    "Virus2VecModel": Virus2Vec_feature_vector,
    "FeaturesModel": FeaturesModel_feature_vector,
    "PwmVecModel": Pwm2Vec_feature_vector,
    "deepLearning":None
}

In [None]:
# Precision/recall comparison of all registered models on the test set
plot_precision_vs_recall_max_precision_at_full_recall(models, model_inputs, y_test)

In [None]:
# ROC comparison of all registered models on the test set
plot_models_roc_curves(models, model_inputs, y_test)

In [None]:
def predict_deep(df,y_true):
    """Run the deep model on `df` and return one prediction per row.

    Uses the notebook-level `model`, `ft_model`, `device` and
    `create_data_loader`.

    Parameters:
    - df: Table with 'Virus_ID', 'Seq_ID' and 'Sequence' columns.
    - y_true: Labels aligned with the rows of `df`; only passed to the loader.

    Returns:
    - np.ndarray with, for every row of `df`, the prediction mapped to that
      row's 'Virus_ID'.
    """
    # Integer-encode the grouping keys: return_inverse gives, per row, the
    # position of its value among the sorted unique values.
    _, test_deep_ids = np.unique(df["Virus_ID"], return_inverse=True)
    _, test_deep_seq_ids = np.unique(df["Seq_ID"] + " " + df["Virus_ID"], return_inverse=True)
    test_deep_datas = df['Sequence']

    DL = create_data_loader(test_deep_datas, y_true, test_deep_ids, test_deep_seq_ids, ft_model)

    model.eval()
    output = []
    with torch.no_grad():
        # The labels in each batch are not needed for inference
        for batch_data, batch_ids, batch_seq_ids, _ in tqdm(DL, desc="Processing Batches"):
            batch_data = batch_data.to(device)
            batch_ids = batch_ids.to(device)
            batch_seq_ids = batch_seq_ids.to(device)
            # The model returns four values; only the second (Y_hat) is kept
            _, Y_hat, _, _ = model(batch_data, batch_ids, batch_seq_ids)
            output += Y_hat.cpu().tolist()

    # NOTE(review): assumes `output` holds one prediction per unique Virus_ID, in
    # ascending encoded-id order; zip stops at the shorter input, so a count
    # mismatch is not reported here — confirm against the model and loader.
    unique_ids = np.unique(test_deep_ids)
    seq_pred_map = dict(zip(unique_ids, output))

    # Expand the per-id predictions back to one entry per row
    seq_level_preds = np.array([seq_pred_map[seq_id] for seq_id in test_deep_ids])
    return seq_level_preds
    

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_models_on_pandemics(pandemic_dfs, model_ml_name, alphabet=alphabet, k=k, m=m):
    """Show side-by-side confusion matrices (classical model vs deep model) per dataset.

    Parameters:
    - pandemic_dfs: dict of display name → table with 'Class' and 'Sequence'
      columns (plus the columns `predict_deep` reads).
    - model_ml_name: Key present in both `models` and `feature_extractors`.
    - alphabet, k, m: Accepted but not used in the body; their defaults are
      taken from the notebook globals when the function is defined.
    """
    class_names = ["Human", "Animal"]

    for flu_name, df in pandemic_dfs.items():
        extractor = feature_extractors[model_ml_name]
        features = get_feature_vector(df, extractor)
        # 0 = human, 1 = anything else
        y_true = (df["Class"].str.lower() != "human").astype(int).values

        ml_preds = get_preds(model_ml_name, features, y_true)  # classical model
        dl_preds = predict_deep(df, y_true)                    # deep model

        # Labels are pinned so both matrices are always 2x2
        panels = (
            ('Machine Learning', confusion_matrix(y_true, ml_preds, labels=[0, 1]), 'Blues'),
            ('Deep Learning', confusion_matrix(y_true, dl_preds, labels=[0, 1]), 'Greens'),
        )

        fig, axs = plt.subplots(1, 2, figsize=(12, 5))
        fig.suptitle(f'{flu_name} - Confusion Matrices', fontsize=16)

        for ax, (panel_title, cm, cmap) in zip(axs, panels):
            sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, ax=ax,
                        xticklabels=class_names, yticklabels=class_names)
            ax.set_title(panel_title)
            ax.set_xlabel('Predicted')
            ax.set_ylabel('True')

        # Leave room at the top for the suptitle
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()


In [None]:
# Datasets to evaluate, keyed by the label shown in each figure title.
# NOTE(review): a Spanish Flu test path is defined at the top of the notebook,
# but no Spanish Flu frame is listed here — confirm whether that is intentional.
pandemics = {
    "Asian Flu": df_Asian_flu,
    "Hong Kong Flu": df_hong_kong_flu,
    "Swine Flu": df_pdmh1n1_flu,
    "covid Flu": df_covid,
}

# Compare the Virus2Vec classical model with the deep model on every dataset above
evaluate_models_on_pandemics(pandemics, model_ml_name="Virus2VecModel")