In [1]:
import warnings
from Bio.PDB import MMCIFParser, DSSP
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from pathlib import Path
import pandas as pd

In [2]:
# Globally ignore Biopython's PDBConstructionWarning so structure parsing
# does not flood the cell output.
warnings.simplefilter(action="ignore", category=PDBConstructionWarning)

In [None]:
# Configuration: external DSSP executable plus input/output locations
# (all paths are relative to the notebook's working directory).
dssp_exe = "mkdssp"
structure_dir = Path("../data/raw/structures_cif")
output_path = Path("../data/processed/dssp_residue_features.csv")

In [6]:
# Load both positive and unlabeled residue-level data.
# pdb_id is read explicitly as str: IDs that look like scientific notation
# (e.g. "1e10") can otherwise be coerced to floats by pandas' type inference,
# which would silently break the "<pdb_id>.cif" file lookup later on.
pos_df = pd.read_csv(
    "../data/processed/BioLiP_positives_residue_level_with_duplicates.csv",
    dtype={"pdb_id": str},
)
unlab_df = pd.read_csv(
    "../data/processed/BioLiP_unlabeled_residues_with_duplicates.csv",
    dtype={"pdb_id": str},
)

# Get all unique pdb_ids across both sets; rows with a missing pdb_id are
# dropped so that NaN never ends up being treated as a structure name.
all_pdb_ids = pd.concat([pos_df["pdb_id"], unlab_df["pdb_id"]]).dropna().unique()

In [7]:
# Alias used by the extraction loop and the final summary; this is the same
# array object as all_pdb_ids, not a copy.
unique_pdb_ids = all_pdb_ids

In [8]:
# Shared state for the extraction loop: the list that collects one dict per
# residue, and an mmCIF parser created with QUIET=True.
dssp_feature_rows = list()
parser = MMCIFParser(QUIET=True)

In [9]:
# Run DSSP on every structure and collect one feature row per residue.
#
# Biopython's DSSP object maps (chain_id, residue_id) keys to a tuple laid out as:
#    0 DSSP index         1 amino acid          2 secondary structure
#    3 relative ASA       4 phi                 5 psi
#    6 NH-->O_1 relidx    7 NH-->O_1 energy
#    8 O-->NH_1 relidx    9 O-->NH_1 energy
#   10 NH-->O_2 relidx   11 NH-->O_2 energy
#   12 O-->NH_2 relidx   13 O-->NH_2 energy
# The tuple carries no absolute ASA; Biopython stores that value on the residue
# itself under xtra["EXP_DSSP_ASA"].
DSSP_AA_IDX = 1
DSSP_SS_IDX = 2
DSSP_REL_ASA_IDX = 3
DSSP_PHI_IDX = 4
DSSP_PSI_IDX = 5
DSSP_NH_O1_ENERGY_IDX = 7
DSSP_O_NH1_ENERGY_IDX = 9
DSSP_NH_O2_ENERGY_IDX = 11
DSSP_O_NH2_ENERGY_IDX = 13

# One-letter residue classes used for the binary flags (unchanged membership).
AROMATIC_AA = frozenset("FYWH")
POLAR_AA = frozenset("STNQYC")
CHARGED_AA = frozenset("RKDEH")
HYDROPHOBIC_AA = frozenset("AVILMFWY")

# Start from an empty accumulator so re-running this cell does not append a
# second copy of every row.
dssp_feature_rows = []

for pdb_id in unique_pdb_ids:
    cif_file = structure_dir / f"{pdb_id}.cif"
    if not cif_file.exists():
        print(f"[!] Missing structure: {pdb_id}")
        continue

    # Parsing and DSSP can fail in several ways (the external program producing
    # no output, lookup errors while mapping DSSP records back onto the model),
    # so any failure skips the structure instead of aborting the whole run.
    try:
        structure = parser.get_structure(pdb_id, cif_file)
        model = structure[0]  # only the first model is analysed
        dssp = DSSP(model, cif_file, dssp=dssp_exe)
    except Exception as e:
        print(f"[!] Skipping {pdb_id}: DSSP failed – {e}")
        continue

    for key in dssp.keys():
        chain_id, res_id = key
        _hetflag, resnum, icode = res_id
        # Residue number followed by the insertion code, if there is one.
        residue_id = f"{resnum}{icode.strip()}"

        dssp_data = dssp[key]
        aa = dssp_data[DSSP_AA_IDX]

        # Absolute ASA lives on the residue, not in the DSSP tuple. If the
        # residue cannot be looked up under this exact id, leave it empty.
        try:
            absolute_sasa = model[chain_id][res_id].xtra.get("EXP_DSSP_ASA")
        except KeyError:
            absolute_sasa = None

        dssp_feature_rows.append({
            "pdb_id": pdb_id,
            "chain_id": chain_id,
            "residue_id": residue_id,
            "residue_name": aa,
            "secondary_structure": dssp_data[DSSP_SS_IDX],
            "absolute_sasa": absolute_sasa,
            # NOTE(review): Biopython may report relative ASA as the string "NA"
            # when it has no reference maximum for the residue - verify before
            # treating this column as numeric.
            "relative_asa": dssp_data[DSSP_REL_ASA_IDX],
            "phi": dssp_data[DSSP_PHI_IDX],
            "psi": dssp_data[DSSP_PSI_IDX],
            "hbond_NH_O1_energy": dssp_data[DSSP_NH_O1_ENERGY_IDX],
            "hbond_NH_O2_energy": dssp_data[DSSP_NH_O2_ENERGY_IDX],
            "hbond_O_NH1_energy": dssp_data[DSSP_O_NH1_ENERGY_IDX],
            "hbond_O_NH2_energy": dssp_data[DSSP_O_NH2_ENERGY_IDX],
            "is_aromatic": int(aa in AROMATIC_AA),
            "is_polar": int(aa in POLAR_AA),
            "is_charged": int(aa in CHARGED_AA),
            "is_hydrophobic": int(aa in HYDROPHOBIC_AA),
        })

[!] Skipping 1bxr: DSSP failed – '>'
[!] Skipping 1c3o: DSSP failed – '>'


empty protein, or no valid complete residues



[!] Skipping 1c53: DSSP failed – DSSP failed to produce an output
[!] Skipping 1ct9: DSSP failed – '>'


empty protein, or no valid complete residues



[!] Skipping 1efg: DSSP failed – DSSP failed to produce an output


empty protein, or no valid complete residues



[!] Skipping 1ffk: DSSP failed – DSSP failed to produce an output
[!] Skipping 1hfe: DSSP failed – '>'
[!] Skipping 1iru: DSSP failed – '>'
[!] Skipping 1izl: DSSP failed – '>'


empty protein, or no valid complete residues



[!] Skipping 1j5a: DSSP failed – DSSP failed to produce an output
[!] Skipping 1kfl: DSSP failed – '>'


empty protein, or no valid complete residues



[!] Skipping 1lbg: DSSP failed – DSSP failed to produce an output


empty protein, or no valid complete residues



[!] Skipping 1nkw: DSSP failed – DSSP failed to produce an output


empty protein, or no valid complete residues



[!] Skipping 1nwx: DSSP failed – DSSP failed to produce an output
[!] Skipping 1pyh: DSSP failed – '>'


empty protein, or no valid complete residues



[!] Skipping 1qzv: DSSP failed – DSSP failed to produce an output
[!] Skipping 1rvv: DSSP failed – '>'
[!] Skipping 1s4d: DSSP failed – '>'
[!] Skipping 1w2b: DSSP failed – '>'
[!] Skipping 2aaz: DSSP failed – '>'
[!] Skipping 2axt: DSSP failed – '>'
[!] Skipping 2bte: DSSP failed – '>'
[!] Skipping 2d32: DSSP failed – '>'
[!] Skipping 2d3a: DSSP failed – '>'
[!] Skipping 2dns: DSSP failed – '>'
[!] Skipping 2ef5: DSSP failed – '>'
[!] Skipping 2efu: DSSP failed – '>'
[!] Skipping 2fug: DSSP failed – '>'
[!] Skipping 2j28: DSSP failed – '>'
[!] Skipping 2j5t: DSSP failed – '>'
[!] Skipping 2jd6: DSSP failed – '>'
[!] Skipping 2nv2: DSSP failed – '>'
[!] Skipping 2vtb: DSSP failed – '>'
[!] Skipping 2wqj: DSSP failed – '>'
[!] Missing structure: 2wwn
[!] Skipping 2zzs: DSSP failed – '>'
[!] Skipping 3a0b: DSSP failed – '>'
[!] Skipping 3a0h: DSSP failed – '>'
[!] Skipping 3ab4: DSSP failed – '>'
[!] Skipping 3be7: DSSP failed – '>'
[!] Skipping 3cc2: DSSP failed – '>'
[!] Skipping 3cf5:

empty protein, or no valid complete residues



[!] Skipping 4aqv: DSSP failed – DSSP failed to produce an output
[!] Skipping 4bts: DSSP failed – '>'
[!] Skipping 4c2m: DSSP failed – '>'
[!] Skipping 4ce4: DSSP failed – '>'
[!] Skipping 4ctf: DSSP failed – '>'
[!] Skipping 4d5l: DSSP failed – '>'
[!] Skipping 4d5y: DSSP failed – '>'
[!] Skipping 4d61: DSSP failed – '>'
[!] Skipping 4dx9: DSSP failed – '>'
[!] Skipping 4eu2: DSSP failed – '>'
[!] Skipping 4f86: DSSP failed – '>'
[!] Skipping 4h5f: DSSP failed – '>'
[!] Skipping 4hea: DSSP failed – '>'
[!] Skipping 4hh4: DSSP failed – '>'
[!] Skipping 4il6: DSSP failed – '>'
[!] Skipping 4io9: DSSP failed – '>'
[!] Skipping 4izg: DSSP failed – '>'
[!] Skipping 4l6v: DSSP failed – '>'
[!] Skipping 4lrh: DSSP failed – '>'
[!] Skipping 4n9f: DSSP failed – '>'
[!] Skipping 4p5h: DSSP failed – '>'
[!] Skipping 4pe5: DSSP failed – '>'
[!] Skipping 4r4n: DSSP failed – '>'
[!] Skipping 4s0r: DSSP failed – '>'
[!] Skipping 4u8u: DSSP failed – '>'
[!] Skipping 4uer: DSSP failed – '>'


empty protein, or no valid complete residues



[!] Skipping 4uy0: DSSP failed – DSSP failed to produce an output
[!] Skipping 4v3p: DSSP failed – '>'
[!] Skipping 4v4g: DSSP failed – '>'
[!] Skipping 4v4i: DSSP failed – '>'
[!] Skipping 4v4n: DSSP failed – '>'
[!] Skipping 4v4o: DSSP failed – '>'
[!] Skipping 4v4p: DSSP failed – '>'
[!] Skipping 4v4s: DSSP failed – '>'
[!] Skipping 4v4v: DSSP failed – '>'
[!] Skipping 4v5i: DSSP failed – '>'
[!] Skipping 4v5k: DSSP failed – '>'
[!] Skipping 4v5o: DSSP failed – '>'
[!] Skipping 4v5p: DSSP failed – '>'
[!] Skipping 4v5z: DSSP failed – '>'
[!] Skipping 4v61: DSSP failed – '>'
[!] Skipping 4v63: DSSP failed – '>'
[!] Skipping 4v6a: DSSP failed – '>'
[!] Skipping 4v6f: DSSP failed – '>'
[!] Skipping 4v6g: DSSP failed – '>'
[!] Skipping 4v6i: DSSP failed – '>'
[!] Skipping 4v6m: DSSP failed – '>'
[!] Skipping 4v6u: DSSP failed – '>'
[!] Skipping 4v6v: DSSP failed – '>'
[!] Skipping 4v6w: DSSP failed – '>'
[!] Skipping 4v6x: DSSP failed – '>'
[!] Skipping 4v7e: DSSP failed – '>'


empty protein, or no valid complete residues



[!] Skipping 4v7f: DSSP failed – DSSP failed to produce an output
[!] Skipping 4v7h: DSSP failed – '>'
[!] Skipping 4v7j: DSSP failed – '>'
[!] Skipping 4v7m: DSSP failed – '>'
[!] Skipping 4v7n: DSSP failed – '>'
[!] Skipping 4v7p: DSSP failed – '>'
[!] Skipping 4v7r: DSSP failed – '>'
[!] Skipping 4v88: DSSP failed – '>'
[!] Skipping 4v8i: DSSP failed – '>'
[!] Skipping 4v8k: DSSP failed – '>'
[!] Skipping 4v8m: DSSP failed – '>'
[!] Skipping 4v8p: DSSP failed – '>'
[!] Skipping 4v8s: DSSP failed – '>'
[!] Skipping 4v8z: DSSP failed – '>'
[!] Skipping 4v92: DSSP failed – '>'
[!] Skipping 4v98: DSSP failed – '>'
[!] Skipping 4v99: DSSP failed – '>'
[!] Skipping 4v9f: DSSP failed – '>'
[!] Skipping 4v9g: DSSP failed – '>'
[!] Skipping 4v9h: DSSP failed – '>'
[!] Skipping 4v9i: DSSP failed – '>'
[!] Skipping 4w2e: DSSP failed – '>'
[!] Skipping 4w4g: DSSP failed – '>'
[!] Skipping 4wfb: DSSP failed – '>'
[!] Skipping 4wht: DSSP failed – '>'
[!] Skipping 4wjg: DSSP failed – '>'
[!] Skipp

empty protein, or no valid complete residues



[!] Skipping 5lqw: DSSP failed – DSSP failed to produce an output
[!] Skipping 5lqy: DSSP failed – '>'
[!] Skipping 5lqz: DSSP failed – '>'
[!] Skipping 5lzb: DSSP failed – '>'
[!] Skipping 5lzt: DSSP failed – '>'
[!] Skipping 5lzw: DSSP failed – '>'
[!] Skipping 5m1j: DSSP failed – '>'
[!] Skipping 5m32: DSSP failed – '>'


empty protein, or no valid complete residues



[!] Skipping 5m5i: DSSP failed – DSSP failed to produce an output
[!] Skipping 5mc6: DSSP failed – '>'
[!] Skipping 5mdx: DSSP failed – '>'
[!] Skipping 5mdy: DSSP failed – '>'
[!] Skipping 5mkf: DSSP failed – '>'
[!] Skipping 5mkn: DSSP failed – '>'
[!] Skipping 5mlc: DSSP failed – '>'
[!] Skipping 5mmi: DSSP failed – '>'
[!] Skipping 5mmj: DSSP failed – '>'
[!] Skipping 5mp9: DSSP failed – '>'
[!] Skipping 5mps: DSSP failed – '>'
[!] Skipping 5mqf: DSSP failed – '>'
[!] Skipping 5mrc: DSSP failed – '>'
[!] Skipping 5myj: DSSP failed – '>'
[!] Skipping 5n5e: DSSP failed – '>'
[!] Skipping 5ndg: DSSP failed – '>'
[!] Skipping 5ndv: DSSP failed – '>'
[!] Skipping 5nif: DSSP failed – '>'
[!] Skipping 5njt: DSSP failed – '>'
[!] Skipping 5nrl: DSSP failed – '>'
[!] Skipping 5nwy: DSSP failed – '>'
[!] Skipping 5nyw: DSSP failed – '>'
[!] Skipping 5o61: DSSP failed – '>'
[!] Skipping 5o9z: DSSP failed – '>'
[!] Skipping 5oa1: DSSP failed – '>'
[!] Skipping 5oa3: DSSP failed – '>'
[!] Skipp

empty protein, or no valid complete residues



[!] Skipping 6ywe: DSSP failed – '>'
[!] Skipping 6yws: DSSP failed – '>'
[!] Skipping 6ywv: DSSP failed – '>'
[!] Skipping 6ywx: DSSP failed – '>'
[!] Skipping 6ywy: DSSP failed – '>'
[!] Skipping 6yxx: DSSP failed – '>'
[!] Skipping 6yxy: DSSP failed – '>'
[!] Skipping 6z1p: DSSP failed – '>'
[!] Skipping 6z5r: DSSP failed – '>'
[!] Skipping 6z5s: DSSP failed – '>'
[!] Skipping 6z6j: DSSP failed – '>'
[!] Skipping 6z7n: DSSP failed – '>'
[!] Skipping 6zj3: DSSP failed – '>'
[!] Skipping 6zm5: DSSP failed – '>'
[!] Skipping 6zm7: DSSP failed – '>'
[!] Skipping 6zon: DSSP failed – '>'
[!] Skipping 6zpo: DSSP failed – '>'
[!] Skipping 6zqa: DSSP failed – '>'
[!] Skipping 6zqb: DSSP failed – '>'
[!] Skipping 6zqg: DSSP failed – '>'
[!] Skipping 6ztl: DSSP failed – '>'
[!] Skipping 6zu5: DSSP failed – '>'
[!] Skipping 6zxd: DSSP failed – '>'
[!] Skipping 6zxf: DSSP failed – '>'
[!] Skipping 6zxg: DSSP failed – '>'
[!] Skipping 6zzx: DSSP failed – '>'
[!] Skipping 7a01: DSSP failed – '>'
[

In [None]:
# Convert the accumulated per-residue rows to a DataFrame
dssp_df = pd.DataFrame(dssp_feature_rows)

# Save result (create the target directory first if it does not exist yet)
output_path.parent.mkdir(parents=True, exist_ok=True)
dssp_df.to_csv(output_path, index=False)

# unique_pdb_ids also contains structures that were skipped (missing file or
# DSSP failure), so count the structures that actually contributed rows.
# An empty result has no "pdb_id" column at all, hence the guard.
n_structures_ok = dssp_df["pdb_id"].nunique() if "pdb_id" in dssp_df.columns else 0
print(
    f"[✓] Extracted features for {len(dssp_df)} residues from "
    f"{n_structures_ok} of {len(unique_pdb_ids)} structures."
)