In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# Example CDR H3 sequences, keyed by the identifier used to label each row
cdr_seqs = dict(
    Ab1_H3="ARDGYGMDV",
    Ab2_H3="ARDGYYFDY",
    Ab3_H3="ARETYYAMN",
    Ab4_H3="ARDFWGQGT",
    Ab5_H3="ARDFYGFAY",
)

# Kyte-Doolittle hydrophobicity scale, keyed by upper-case one-letter residue code
hydro_scale = {
    'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
    'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
    'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
    'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3
}

# Residue classes tallied by extract_features
charged = set("DEKR")
polar = set("STNQ")
hydrophobic = set("AILMFWV")

def extract_features(seq):
    """Compute composition and hydrophobicity features for one sequence.

    Parameters
    ----------
    seq : str
        Amino-acid sequence in one-letter codes. Lookups are case-sensitive;
        any character missing from ``hydro_scale`` contributes 0.0 to the
        hydrophobicity average and is not counted in any residue class.

    Returns
    -------
    dict
        ``length``, ``avg_hydrophobicity``, ``charged_count``,
        ``polar_count``, ``hydrophobic_count`` and one ``freq_<aa>`` entry
        (fraction of the sequence) for every residue in ``hydro_scale``.
        For an empty sequence ``avg_hydrophobicity`` is NaN and every
        frequency is 0.0, rather than raising ``ZeroDivisionError``.
    """
    aa_counts = Counter(seq)
    length = len(seq)

    # np.mean([]) warns and the frequency division would fail on length 0,
    # so the empty sequence is handled explicitly.
    if length:
        avg_hydro = np.mean([hydro_scale.get(aa, 0.0) for aa in seq])
    else:
        avg_hydro = np.nan

    freqs = {
        f"freq_{aa}": (aa_counts[aa] / length if length else 0.0)
        for aa in hydro_scale
    }

    # Counter returns 0 for absent residues, so the tally built above can be
    # reused instead of re-scanning the sequence once per residue.
    features = {
        "length": length,
        "avg_hydrophobicity": avg_hydro,
        "charged_count": sum(aa_counts[r] for r in charged),
        "polar_count": sum(aa_counts[r] for r in polar),
        "hydrophobic_count": sum(aa_counts[r] for r in hydrophobic),
        **freqs
    }
    return features

# Compute one feature dict per CDR, then turn the CDR name (the row index
# produced by orient="index") into an ordinary "CDR" column.
per_cdr_features = {
    cdr_name: extract_features(cdr_seq) for cdr_name, cdr_seq in cdr_seqs.items()
}
df = (
    pd.DataFrame.from_dict(per_cdr_features, orient="index")
    .reset_index()
    .rename(columns={"index": "CDR"})
)

print(df.head())

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project every feature column except the CDR label onto two principal components
X = df.drop(columns=["CDR"])
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]

plt.figure(figsize=(6, 5))
plt.scatter(pc1, pc2)
# Annotate each point with the CDR it belongs to (rows of X_pca follow df's row order)
for name, x_coord, y_coord in zip(df["CDR"], pc1, pc2):
    plt.text(x_coord, y_coord, name)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA of CDR Features")
plt.grid(True)
plt.show()

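## Adding structural features

This section extends the sequence-based features with structure-derived ones (SASA and contacts with the antigen) computed from a PDB file for each antibody.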

In [None]:
import pymol
from pymol import cmd
import pandas as pd
import numpy as np
from collections import Counter

# Example CDR sequences and the PDB file name associated with each entry
cdr_seqs = dict(
    Ab1_H3="ARDGYGMDV",
    Ab2_H3="ARDGYYFDY",
    Ab3_H3="ARETYYAMN",
    Ab4_H3="ARDFWGQGT",
    Ab5_H3="ARDFYGFAY",
)
pdb_files = dict(
    Ab1_H3="Ab1.pdb",
    Ab2_H3="Ab2.pdb",
    Ab3_H3="Ab3.pdb",
    Ab4_H3="Ab4.pdb",
    Ab5_H3="Ab5.pdb",
)

# Kyte-Doolittle hydrophobicity scale (same as before), one residue per line
hydro_scale = {
    "A": 1.8,   # Ala
    "C": 2.5,   # Cys
    "D": -3.5,  # Asp
    "E": -3.5,  # Glu
    "F": 2.8,   # Phe
    "G": -0.4,  # Gly
    "H": -3.2,  # His
    "I": 4.5,   # Ile
    "K": -3.9,  # Lys
    "L": 3.8,   # Leu
    "M": 1.9,   # Met
    "N": -3.5,  # Asn
    "P": -1.6,  # Pro
    "Q": -3.5,  # Gln
    "R": -4.5,  # Arg
    "S": -0.8,  # Ser
    "T": -0.7,  # Thr
    "V": 4.2,   # Val
    "W": -0.9,  # Trp
    "Y": -1.3,  # Tyr
}

def extract_features(seq, pdb_file, ab_chain="H", antigen_chain="A"):
    """Combine sequence-based and structure-based features for one CDR.

    Parameters
    ----------
    seq : str
        Amino-acid sequence in one-letter codes. Lookups are case-sensitive;
        characters missing from ``hydro_scale`` contribute 0.0 to the
        hydrophobicity average.
    pdb_file : str
        Passed unchanged to ``get_sasa_and_contacts``.
    ab_chain, antigen_chain : str, optional
        Chain identifiers forwarded to ``get_sasa_and_contacts``. They default
        to "H" and "A", the values that used to be hard-coded here.

    Returns
    -------
    dict
        ``length``, ``avg_hydrophobicity``, ``charged_count``,
        ``polar_count``, ``hydrophobic_count``, ``sasa`` and
        ``contacts_with_antigen``. ``avg_hydrophobicity`` is NaN for an
        empty sequence.

    Notes
    -----
    NOTE(review): ``get_sasa_and_contacts`` is not defined in the visible
    part of this notebook. It must exist in the kernel before this function
    is called, otherwise the call raises ``NameError``. It is assumed to
    return a ``(sasa, contacts)`` pair; confirm against its definition.

    NOTE(review): this definition replaces the one-argument
    ``extract_features(seq)`` defined earlier in the notebook; re-running the
    earlier cell that calls it with one argument will fail after this cell.
    """
    # Sequence-based features
    aa_counts = Counter(seq)
    length = len(seq)

    # np.mean([]) emits a RuntimeWarning; return NaN directly instead.
    if length:
        avg_hydro = np.mean([hydro_scale.get(aa, 0.0) for aa in seq])
    else:
        avg_hydro = np.nan

    charged = set("DEKR")
    polar = set("STNQ")
    hydrophobic = set("AILMFWV")

    # Counter returns 0 for absent residues, so the tally above replaces
    # one full scan of the sequence per residue.
    num_charged = sum(aa_counts[r] for r in charged)
    num_polar = sum(aa_counts[r] for r in polar)
    num_hydrophobic = sum(aa_counts[r] for r in hydrophobic)

    # Structural features: SASA and contacts with antigen
    sasa, contact = get_sasa_and_contacts(pdb_file, ab_chain, antigen_chain)

    # Return combined features
    return {
        "length": length,
        "avg_hydrophobicity": avg_hydro,
        "charged_count": num_charged,
        "polar_count": num_polar,
        "hydrophobic_count": num_hydrophobic,
        "sasa": sasa,
        "contacts_with_antigen": contact,
    }

# Compute sequence + structure features per CDR (each CDR name also selects
# its PDB file), then turn the CDR name index into an ordinary "CDR" column.
structural_features_by_cdr = {
    cdr_name: extract_features(cdr_seq, pdb_files[cdr_name])
    for cdr_name, cdr_seq in cdr_seqs.items()
}
features_df = (
    pd.DataFrame.from_dict(structural_features_by_cdr, orient="index")
    .reset_index()
    .rename(columns={"index": "CDR"})
)

print(features_df.head())