In [2]:
import numpy as np
from collections import Counter

In [3]:

def apaac(sequence, lambda_value=10, weight=0.05):
    
    # Hydrophobicity (h1) and Hydrophilicity (h2)
    hydrophobicity = {
        'A': 0.62,  'C': 0.29,  'D': -0.90, 'E': -0.74, 'F': 1.19,
        'G': 0.48,  'H': -0.40, 'I': 1.38,  'K': -1.50, 'L': 1.06,
        'M': 0.64,  'N': -0.78, 'P': 0.12,  'Q': -0.85, 'R': -2.53,
        'S': -0.18, 'T': -0.05, 'V': 1.08,  'W': 0.81,  'Y': 0.26
    }
    
    hydrophilicity = {
        'A': -0.50, 'C': -1.00, 'D': 3.00,  'E': 3.00,  'F': -2.50,
        'G': 0.00,  'H': -0.50, 'I': -1.80, 'K': 3.00,  'L': -1.80,
        'M': -1.30, 'N': 0.20,  'P': 0.00,  'Q': 0.20,  'R': 3.00,
        'S': 0.30,  'T': -0.40, 'V': -1.50, 'W': -3.40, 'Y': -2.30
    }
    
    
    amino_acids = list(hydrophobicity.keys())
    sequence = sequence.upper()  
    
    # Compute standard amino acid composition (AAC)
    aac = np.array([sequence.count(aa) / len(sequence) for aa in amino_acids])

    # Compute sequence-order correlation factors
    lambda_correlation = []
    for i in range(1, lambda_value + 1):
        sum_corr = 0
        for j in range(len(sequence) - i):
            if sequence[j] in amino_acids and sequence[j + i] in amino_acids:
                h1_corr = (hydrophobicity[sequence[j]] - hydrophobicity[sequence[j + i]])**2
                h2_corr = (hydrophilicity[sequence[j]] - hydrophilicity[sequence[j + i]])**2
                sum_corr += (h1_corr + h2_corr) / 2  # Average correlation
        lambda_correlation.append(sum_corr / (len(sequence) - i))

    lambda_correlation = np.array(lambda_correlation)

    # Normalize and combine features
    apaac_vector = np.concatenate((aac * (1 - weight * sum(lambda_correlation)), weight * lambda_correlation))
    
    return apaac_vector

In [4]:
seq = "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWYX"
apaac_vector = apaac(seq)
print("APAAC Feature Vector:", apaac_vector)
print("Feature Vector Length:", len(apaac_vector))


APAAC Feature Vector: [-0.06343104 -0.06343104 -0.06343104 -0.06343104 -0.06343104 -0.06343104
 -0.06343104 -0.06343104 -0.06343104 -0.06343104 -0.06343104 -0.06343104
 -0.06343104 -0.06343104 -0.06343104 -0.06343104 -0.06343104 -0.06343104
 -0.06343104 -0.06343104  0.20651625  0.24157526  0.20727237  0.33520682
  0.24398625  0.1566355   0.27018647  0.18447924  0.18053359  0.27394452]
Feature Vector Length: 30


In [6]:

def ctdd(sequence):
    secondary_structure = {
    "Helix": set("EALMQKRH"),
    "Strand": set("VIYCWFT"),
    "Coil": set("GNPSD"),
    }

    ctdd_vector = []
    sequence_length=len(sequence)
    for class_name, amino_acids in secondary_structure.items():
        positions = [i for i, aa in enumerate(sequence) if aa in amino_acids]
        
        if not positions:  # If no amino acid of this class is found
            ctdd_vector.extend([0, 0, 0, 0, 0])
            continue

        # Calculate the five key positions (first, 25%, 50%, 75%, last)
        first = positions[0] / sequence_length
        p25 = positions[int(len(positions) * 0.25)] / sequence_length
        p50 = positions[int(len(positions) * 0.50)] / sequence_length
        p75 = positions[int(len(positions) * 0.75)] / sequence_length
        last = positions[-1] / sequence_length

        ctdd_vector.extend([first, p25, p50, p75, last])

    return ctdd_vector



In [7]:
seq = "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWYX"
ctdd_vector = ctdd(seq)
print("CTDD Feature Vector:", ctdd_vector)
print("Feature Vector Length:", len(ctdd_vector))

CTDD Feature Vector: [0.0, 0.21951219512195122, 0.4878048780487805, 0.7073170731707317, 0.8292682926829268, 0.024390243902439025, 0.3902439024390244, 0.5121951219512195, 0.8780487804878049, 0.9512195121951219, 0.04878048780487805, 0.2682926829268293, 0.5365853658536586, 0.7560975609756098, 0.8536585365853658]
Feature Vector Length: 15


In [10]:

def ctriad(sequence):
    #Grpups based on their dipoles and side-chain volumes
    amino_acid_groups = {
    'A': 1, 'G': 1, 'V': 1,  # Group 1
    'I': 2, 'L': 2, 'F': 2, 'P': 2,  # Group 2
    'Y': 3, 'M': 3, 'T': 3, 'S': 3,  # Group 3
    'H': 4, 'N': 4, 'Q': 4, 'W': 4,  # Group 4
    'R': 5, 'K': 5,  # Group 5
    'D': 6, 'E': 6,  # Group 6
    'C': 7   # Group 7
     }
    # Convert sequence to reduced alphabet (group numbers)
    reduced_seq = [amino_acid_groups[aa] - 1 for aa in sequence if aa in amino_acid_groups]
    # Extract triads
    triads = [tuple(reduced_seq[i:i+3]) for i in range(len(reduced_seq) - 2)]
   
    # Count occurrences of each triad
    triad_counts = Counter(triads)

    # Normalize counts
    total_triads = len(triads)
    triad_vector = np.zeros((7, 7, 7))  # 7^3 possible triads

    for triad, count in triad_counts.items():
        triad_vector[triad] = count / total_triads  # Normalize frequency

    return triad_vector.flatten()



In [11]:

seq = "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWYX"
ctriad_vector = ctriad(seq)

print("CTriad Feature Vector:", ctriad_vector)
print("Feature Vector Length:", len(ctriad_vector))

CTriad Feature Vector: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.05263158 0.05263158
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.05263158
 0.         0.         0.         0.         0.05263158 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.05263158 0.         0.         0.         0.         0.
 0.         0.         0.05263158 0.         0.         0.
 0.05263158 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.   