In [2]:
!pip install --upgrade --no-cache-dir biopython
!pip install rdkit-pypi
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+cu118.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.2.0+cu118.html
!pip install -q torch-geometric



Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m69.1 MB/s

In [None]:
import numpy as np
from collections import Counter
import pandas as pd 
from Bio.Align import substitution_matrices
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster
from tqdm import tqdm
from joblib import Parallel, delayed
import itertools
# BLOSUM62 substitution matrix, loaded once at module level and read by get_similarity.
blosum62 = substitution_matrices.load('BLOSUM62')

In [None]:
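# APAAC descriptor: 30 features (20 composition terms + 10 correlation factors with the default lambda_value=10)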
def apaac(sequence, lambda_value=10, weight=0.05):
    """Compute an amphiphilic pseudo-amino-acid-composition style descriptor.

    The result concatenates a weighted 20-term amino acid composition with
    ``lambda_value`` sequence-order correlation factors built from squared
    differences of hydrophobicity and hydrophilicity between residues that are
    ``i`` positions apart.

    NOTE(review): the scaling used here (composition multiplied by
    ``1 - weight * sum(correlations)``, squared-difference correlations) differs
    from the formulation usually published as APAAC — confirm the variant is
    intended before comparing against other tools.

    Parameters
    ----------
    sequence : str
        Protein sequence; it is upper-cased before processing. Characters that
        are not one of the 20 standard residues are ignored in the correlation
        terms and do not contribute to the composition counts.
    lambda_value : int, default 10
        Number of correlation lags (1 .. lambda_value).
    weight : float, default 0.05
        Weight applied to the correlation factors.

    Returns
    -------
    numpy.ndarray
        Vector of length ``20 + lambda_value``. For an empty sequence the
        composition part is all zeros; for any lag that has no residue pair
        (``len(sequence) <= lag``) the correlation factor is 0.
    """
    hydrophobicity = {
        'A': 0.62,  'C': 0.29,  'D': -0.90, 'E': -0.74, 'F': 1.19,
        'G': 0.48,  'H': -0.40, 'I': 1.38,  'K': -1.50, 'L': 1.06,
        'M': 0.64,  'N': -0.78, 'P': 0.12,  'Q': -0.85, 'R': -2.53,
        'S': -0.18, 'T': -0.05, 'V': 1.08,  'W': 0.81,  'Y': 0.26
    }

    hydrophilicity = {
        'A': -0.50, 'C': -1.00, 'D': 3.00,  'E': 3.00,  'F': -2.50,
        'G': 0.00,  'H': -0.50, 'I': -1.80, 'K': 3.00,  'L': -1.80,
        'M': -1.30, 'N': 0.20,  'P': 0.00,  'Q': 0.20,  'R': 3.00,
        'S': 0.30,  'T': -0.40, 'V': -1.50, 'W': -3.40, 'Y': -2.30
    }

    amino_acids = list(hydrophobicity.keys())
    sequence = sequence.upper()
    seq_len = len(sequence)

    # Amino acid composition. An empty sequence yields zeros instead of
    # raising ZeroDivisionError.
    if seq_len == 0:
        aac = np.zeros(len(amino_acids))
    else:
        aac = np.array([sequence.count(aa) / seq_len for aa in amino_acids])

    # Correlation factors, one per lag.
    lambda_correlation = []
    for i in range(1, lambda_value + 1):
        n_pairs = seq_len - i
        if n_pairs <= 0:
            # The sequence is too short to contain any pair at this lag; the
            # unguarded division would fail (n_pairs == 0) or flip sign
            # (n_pairs < 0).
            lambda_correlation.append(0.0)
            continue

        sum_corr = 0
        for j in range(n_pairs):
            left, right = sequence[j], sequence[j + i]
            if left in hydrophobicity and right in hydrophobicity:
                h1_corr = (hydrophobicity[left] - hydrophobicity[right]) ** 2
                h2_corr = (hydrophilicity[left] - hydrophilicity[right]) ** 2
                sum_corr += (h1_corr + h2_corr) / 2  # average of the two properties
        # The small epsilon keeps the factor strictly positive, as in the
        # original implementation.
        lambda_correlation.append((sum_corr + 1e-7) / n_pairs)

    lambda_correlation = np.array(lambda_correlation)

    # Combine the weighted composition and the weighted correlation factors.
    apaac_vector = np.concatenate(
        (aac * (1 - weight * sum(lambda_correlation)), weight * lambda_correlation)
    )

    return apaac_vector

In [None]:
# CTD distribution descriptor: 15 features (3 residue classes x 5 positions)
def ctdd(sequence):
    """Return the 15-value distribution descriptor over three residue classes.

    For each class (Helix, Strand, Coil) the 0-based positions of the residues
    belonging to that class are collected, and five landmarks are reported as a
    fraction of the sequence length: the first occurrence, the occurrences at
    the 25 %, 50 % and 75 % marks of the class's position list, and the last
    occurrence. A class with no residues contributes five zeros.

    Matching is case-sensitive; the sequence is not upper-cased here.

    Parameters
    ----------
    sequence : str
        Protein sequence.

    Returns
    -------
    list
        15 numbers, five per class in the order Helix, Strand, Coil.
    """
    residue_classes = (
        ("Helix", frozenset("EALMQKRH")),
        ("Strand", frozenset("VIYCWFT")),
        ("Coil", frozenset("GNPSD")),
    )
    quartiles = (0.25, 0.50, 0.75)

    total_length = len(sequence)
    descriptor = []

    for _label, members in residue_classes:
        hits = [idx for idx, residue in enumerate(sequence) if residue in members]

        if hits:
            # first occurrence, the three quartile occurrences, last occurrence
            landmarks = [hits[0]]
            landmarks += [hits[int(len(hits) * q)] for q in quartiles]
            landmarks.append(hits[-1])
            descriptor.extend(pos / total_length for pos in landmarks)
        else:
            # no residue of this class in the sequence
            descriptor.extend([0, 0, 0, 0, 0])

    return descriptor



In [None]:
# Conjoint triad descriptor: 343 features (7 x 7 x 7 residue-class triads)
def ctriad(sequence):
    """Return the conjoint-triad frequency vector of a protein sequence.

    Residues are mapped to seven classes (based on dipoles and side-chain
    volumes), residues outside those classes are dropped, and every window of
    three consecutive class indices is counted. Each count is divided by the
    total number of windows.

    Parameters
    ----------
    sequence : str
        Protein sequence (case-sensitive; only upper-case one-letter codes are
        recognised).

    Returns
    -------
    numpy.ndarray
        Flat array of length 343 (7 * 7 * 7). All zeros when fewer than three
        recognised residues are present.
    """
    # One string per class; the position in the tuple is the 0-based class index.
    class_members = ("AGV", "ILFP", "YMTS", "HNQW", "RK", "DE", "C")
    class_index = {
        residue: idx
        for idx, members in enumerate(class_members)
        for residue in members
    }

    encoded = [class_index[residue] for residue in sequence if residue in class_index]

    # Sliding windows of three consecutive class indices.
    windows = list(zip(encoded, encoded[1:], encoded[2:]))
    n_windows = len(windows)

    frequencies = np.zeros((7, 7, 7))
    for window, occurrences in Counter(windows).items():
        frequencies[window] = occurrences / n_windows

    return frequencies.flatten()



In [None]:
# k-mer cluster frequency features: 10 features (one per cluster with num_clusters=10)
def get_similarity(seq1, seq2):
    """Sum the BLOSUM62 scores of the position-wise residue pairs of two sequences.

    Pairs are formed with ``zip``, so the comparison stops at the end of the
    shorter sequence. For each pair the score is looked up in the module-level
    ``blosum62`` matrix as ``(a, b)``, falling back to ``(b, a)`` and finally to
    ``-4`` when neither lookup succeeds.

    Parameters
    ----------
    seq1, seq2 : str
        Sequences to compare.

    Returns
    -------
    number
        Total substitution score (0 when either sequence is empty).
    """
    total = 0
    for a, b in zip(seq1, seq2):
        # The reversed-order lookup doubles as the default for the direct one.
        reverse_score = blosum62.get((b, a), -4)
        total += blosum62.get((a, b), reverse_score)
    return total


def extract_subsequences(protein_seqs, k=5):
    """Collect the overlapping k-mers of every protein sequence.

    Parameters
    ----------
    protein_seqs : iterable of str
        Protein sequences.
    k : int, default 5
        Length of each subsequence.

    Returns
    -------
    tuple
        ``(unique_kmers, per_protein)`` where ``unique_kmers`` is the set of all
        distinct k-mers seen and ``per_protein`` is a list holding, for each
        input sequence in order, the list of its k-mers. Sequences shorter than
        ``k`` contribute an empty list.
    """
    per_protein = []
    unique_kmers = set()

    for seq in tqdm(protein_seqs,desc='extract_subsequences'):
        windows = []
        for start in range(len(seq) - k + 1):
            windows.append(seq[start:start + k])

        per_protein.append(windows)
        unique_kmers.update(windows)  # pooled across proteins for clustering

    return unique_kmers, per_protein

def compute_similarity_matrix(subsequences, n_jobs=-1):
    """Build the symmetric all-pairs similarity matrix of the given subsequences.

    Each row of the upper triangle is computed in a joblib worker via
    ``get_similarity`` and mirrored into the lower triangle.

    The work and the result are quadratic in ``len(subsequences)``: the returned
    array alone holds n * n float64 values.

    Parameters
    ----------
    subsequences : sequence of str
        Indexable collection of subsequences (e.g. a list of k-mers).
    n_jobs : int, default -1
        Passed to ``joblib.Parallel``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, n)`` with ``result[i, j] == result[j, i]``.
    """
    n = len(subsequences)
    similarity_matrix = np.zeros((n, n))

    def compute_row(i):
        # Return only the upper-triangular tail (columns i..n-1). Shipping back
        # a zero-padded full row would make the collected results as large as
        # the matrix itself.
        tail = np.array(
            [get_similarity(subsequences[i], subsequences[j]) for j in range(i, n)],
            dtype=float,
        )
        return i, tail

    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_row)(i) for i in tqdm(range(n), desc="Computing rows")
    )

    for i, tail in results:
        similarity_matrix[i, i:] = tail
        similarity_matrix[i:, i] = tail  # mirror into the lower triangle

    return similarity_matrix

def convert_to_distance_matrix(similarity_matrix):
    """Turn a similarity matrix into a distance matrix scaled to [0, 1].

    Similarities are min-max normalised and inverted, so the largest similarity
    maps to distance 0 and the smallest to distance 1. The diagonal is then
    forced to 0.

    When every entry of the input is identical (max == min) the normalisation
    is undefined; a zero matrix is returned instead of an array of NaNs.

    Parameters
    ----------
    similarity_matrix : array-like
        Square matrix of pairwise similarities.

    Returns
    -------
    numpy.ndarray
        Float matrix of the same shape with zeros on the diagonal.
    """
    similarity_matrix = np.asarray(similarity_matrix, dtype=float)
    lowest = np.min(similarity_matrix)
    spread = np.max(similarity_matrix) - lowest

    if spread == 0:
        # All similarities are equal, so all items are treated as
        # indistinguishable; avoids the 0/0 that would fill the result with NaN.
        return np.zeros_like(similarity_matrix)

    distance_matrix = 1 - (similarity_matrix - lowest) / spread
    np.fill_diagonal(distance_matrix, 0)  # ensure self-distance is 0
    return distance_matrix

def cluster_subsequences(distance_matrix, num_clusters=10):
    """Cluster items by average-linkage hierarchical clustering.

    Parameters
    ----------
    distance_matrix : numpy.ndarray
        Square, symmetric distance matrix with a zero diagonal.
    num_clusters : int, default 10
        Upper bound on the number of flat clusters (``maxclust`` criterion).

    Returns
    -------
    numpy.ndarray
        Cluster label for each row of ``distance_matrix``.
    """
    # linkage expects the condensed (upper-triangle) form of the matrix.
    tree = linkage(squareform(distance_matrix), method='average')
    return fcluster(tree, num_clusters, criterion='maxclust')

def generate_feature_vectors(protein_subsequences, cluster_labels, all_subsequences, num_clusters):
    """Build, for each protein, the normalised histogram of its k-mers' clusters.

    Parameters
    ----------
    protein_subsequences : iterable of list of str
        K-mers of each protein, one list per protein.
    cluster_labels : sequence of int
        1-based cluster label for each entry of ``all_subsequences``, in the
        same order.
    all_subsequences : sequence of str
        The k-mers that were clustered.
    num_clusters : int
        Length of each output vector.

    Returns
    -------
    numpy.ndarray
        One row per protein. Each row sums to 1, or is all zeros when none of
        the protein's k-mers has a cluster label.
    """
    # k-mer -> cluster label lookup
    cluster_of = {}
    for position, kmer in enumerate(all_subsequences):
        cluster_of[kmer] = cluster_labels[position]

    rows = []
    for kmers in tqdm(protein_subsequences, desc='generate_feature_vectors'):
        histogram = np.zeros(num_clusters)

        for kmer in kmers:
            label = cluster_of.get(kmer)
            if label is None:
                continue
            histogram[label - 1] += 1  # labels are 1-based

        n_assigned = histogram.sum()
        rows.append(histogram / n_assigned if n_assigned > 0 else histogram)

    return np.array(rows)



In [None]:
# Dipeptide deviation from expected mean (DDE): 400 features (20 x 20 dipeptides)
def dde(sequence):
    """Compute a dipeptide-deviation-from-expected-mean descriptor.

    For each of the 400 dipeptides over the 20 standard residues, the observed
    dipeptide frequency ``Dc`` is compared with an expected frequency ``Tm``
    (product of the two residues' observed frequencies in this sequence) and
    scaled by the square root of ``Tv = Tm * (1 - Tm) / L``.

    NOTE(review): the variance here is divided by ``L`` and ``Tm`` is derived
    from the sequence's own residue frequencies; the commonly published DDE
    definition differs on both points — confirm the variant is intended.

    Parameters
    ----------
    sequence : str
        Protein sequence (case-sensitive).

    Returns
    -------
    dict_values
        400 values ordered by ``itertools.product('ACDEFGHIKLMNPQRSTVWY', repeat=2)``.
        A dipeptide whose variance term is 0 gets 0. Sequences shorter than two
        residues contain no dipeptide and yield 400 zeros.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    dipeptides = [''.join(pair) for pair in itertools.product(amino_acids, repeat=2)]

    L = len(sequence)
    if L < 2:
        # No dipeptide exists; the divisions by L and L - 1 below would raise
        # ZeroDivisionError.
        return dict.fromkeys(dipeptides, 0).values()

    aa_counts = Counter(sequence)
    aa_freq = {aa: aa_counts.get(aa, 0) / L for aa in amino_acids}

    # Observed dipeptide composition over the L - 1 overlapping windows.
    dipeptide_counts = Counter([sequence[i:i+2] for i in range(L-1)])
    Dc = {dp: dipeptide_counts.get(dp, 0) / (L-1) for dp in dipeptides}

    # Expected frequency and its variance term.
    Tm = {dp: aa_freq[dp[0]] * aa_freq[dp[1]] for dp in dipeptides}
    Tv = {dp: (Tm[dp] * (1 - Tm[dp])) / L if Tm[dp] > 0 else 0 for dp in dipeptides}

    DDE = {dp: (Dc[dp] - Tm[dp]) / (Tv[dp] ** 0.5) if Tv[dp] > 0 else 0 for dp in dipeptides}

    return DDE.values()

In [None]:
# Geary autocorrelation: 5 features (lags 1..5 with the default max_lag=5)
# Per-residue physico-chemical property tables keyed by one-letter amino acid
# code. Each table lists its values in the residue order
# A C D E F G H I K L M N P Q R S T V W Y.

# Kyte-Doolittle hydrophobicity; default property for geary_autocorrelation.
hydrophobicity_Kyte_Doolittle = dict(zip(
    "ACDEFGHIKLMNPQRSTVWY",
    (
        1.8, 2.5, -3.5, -3.5, 2.8,    # A C D E F
        -0.4, -3.2, 4.5, -3.9, 3.8,   # G H I K L
        1.9, -3.5, -1.6, -3.5, -4.5,  # M N P Q R
        -0.8, -0.7, 4.2, -0.9, -1.3,  # S T V W Y
    ),
))

# Hopp-Woods hydrophilicity.
Hydrophilicity_Hopp_Woods_scale = dict(zip(
    "ACDEFGHIKLMNPQRSTVWY",
    (
        -0.5, -1.0, 3.0, 3.0, -2.5,   # A C D E F
        0.0, -0.5, -1.8, 3.0, -1.8,   # G H I K L
        -1.3, 0.2, 0.0, 0.2, 3.0,     # M N P Q R
        0.3, -0.4, -1.5, -3.4, -2.3,  # S T V W Y
    ),
))

# Side-chain polarity. NOTE(review): the source scale is not stated — confirm.
Polarity_Scale = dict(zip(
    "ACDEFGHIKLMNPQRSTVWY",
    (
        8.1, 5.5, 13.0, 12.3, 5.2,    # A C D E F
        9.0, 10.4, 5.2, 11.3, 4.9,    # G H I K L
        5.7, 11.6, 8.0, 10.5, 10.5,   # M N P Q R
        9.2, 8.6, 5.9, 5.4, 6.2,      # S T V W Y
    ),
))

# Molecular weight per residue. NOTE(review): units presumably g/mol — confirm.
Molecular_Weight = dict(zip(
    "ACDEFGHIKLMNPQRSTVWY",
    (
        89.09, 121.15, 133.10, 147.13, 165.19,   # A C D E F
        75.07, 155.16, 131.17, 146.19, 131.17,   # G H I K L
        149.21, 132.12, 115.13, 146.15, 174.20,  # M N P Q R
        105.09, 119.12, 117.15, 204.23, 181.19,  # S T V W Y
    ),
))



def geary_autocorrelation(sequence, max_lag=5, property_dict=hydrophobicity_Kyte_Doolittle):
    """Compute Geary autocorrelation values of a residue property along a sequence.

    For each lag ``d`` in ``1 .. max_lag`` the value is::

        (N - 1) * sum((P[i] - P[i + d]) ** 2) / (2 * (N - d) * sum((P[i] - mean(P)) ** 2))

    where ``P`` holds the per-residue property values and ``N`` is their count.
    A lag whose denominator is 0 yields 0.

    Parameters
    ----------
    sequence : str
        Protein sequence. Residues missing from ``property_dict`` are given the
        value 0.
    max_lag : int, default 5
        Largest lag to evaluate.
    property_dict : dict, default hydrophobicity_Kyte_Doolittle
        Mapping from residue letter to property value.

    Returns
    -------
    dict_values
        ``max_lag`` values in order of increasing lag.
    """
    values = np.array([property_dict.get(residue, 0) for residue in sequence])
    n_residues = len(values)

    # Total squared deviation from the mean; it does not depend on the lag.
    total_sq_dev = np.sum((values - np.mean(values)) ** 2)

    by_lag = {}
    for lag in range(1, max_lag + 1):
        sq_diff_sum = np.sum((values[:-lag] - values[lag:]) ** 2)
        scale = 2 * (n_residues - lag) * total_sq_dev
        if scale != 0:
            by_lag[f'Geary_Lag_{lag}'] = (n_residues - 1) * sq_diff_sum / scale
        else:
            by_lag[f'Geary_Lag_{lag}'] = 0

    return by_lag.values()

In [None]:
# Load the virus-drug interaction table.
df = pd.read_csv("/kaggle/input/virus-drug/virus_drug_interactions.csv")
# Discard the first column (presumably a saved row index — verify against the CSV).
df = df.drop(columns=df.columns[0])
# Preview the first few rows.
print(df.head())

# Inputs (sequences) and regression target used by the following cells.
protein_sequences = df["Protein_Sequence"]
pIC50 = df["pIC50"]

In [None]:

# Build one cluster-frequency feature vector per protein from its overlapping k-mers.
k = 5
num_clusters = 10
# extract subsequences (k-mers): the set of unique k-mers plus the per-protein k-mer lists
all_subsequences, protein_subsequences = extract_subsequences(protein_sequences, k)

# Freeze the set into a list so every k-mer has a fixed index that is shared by
# the similarity matrix, the cluster labels and generate_feature_vectors.
all_subsequences=list(all_subsequences)
# NOTE(review): this is a dense all-pairs comparison. The recorded run of this
# cell shows 56,409 unique k-mers and a multi-hour ETA, and an n x n float64
# matrix of that size is roughly 25 GB — confirm this is feasible or reduce n.
similarity_matrix = compute_similarity_matrix(all_subsequences)
distance_matrix = convert_to_distance_matrix(similarity_matrix)

subsequence_cluster_labels = cluster_subsequences(distance_matrix, num_clusters)
# generate feature vectors for each protein
feature_vectors = generate_feature_vectors(protein_subsequences, subsequence_cluster_labels, all_subsequences, num_clusters)

# Sanity check: number of proteins and length of each vector (num_clusters).
print(len(feature_vectors),len(feature_vectors[0]))


extract_subsequences: 100%|██████████| 19451/19451 [00:02<00:00, 7268.26it/s]
Computing rows:   0%|          | 164/56409 [01:30<8:29:24,  1.84it/s]

In [None]:

# Concatenate every descriptor into one feature row per protein. With the
# defaults used in this notebook the row holds 30 (apaac) + 15 (ctdd) +
# 343 (ctriad) + 400 (dde) + 5 (geary) + 10 (k-mer clusters) = 803 values.
protein_features=[]
# Wrap the sequences themselves (not the enumerate generator) so tqdm can read
# their length and display a total and an ETA.
for i, protein in enumerate(tqdm(protein_sequences, desc='protein_features')):
    vec=[]
    vec.extend(apaac(protein))
    vec.extend(ctdd(protein))
    vec.extend(ctriad(protein))
    vec.extend(dde(protein))
    vec.extend(geary_autocorrelation(protein))
    # feature_vectors was built in the same order as protein_sequences.
    vec.extend(feature_vectors[i])
    protein_features.append(vec)

In [None]:
# Convert the feature rows to an array and write them to the working directory.
protein_features = np.array(protein_features)
np.save("protein_features.npy", protein_features)

# Same for the regression targets.
pIC50 = np.array(pIC50)
np.save("pIC50.npy", pIC50)
