In this notebook we implement approaches to finding cryptic binding sites
based on the predicted structures downloaded from AlphaFold DB.

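The pockets analysed here are read from the `HACKATHON_DATA` table; each row describes one pocket as a residue selection on the experimental PDB structure and on the corresponding AlphaFold model.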

1. **pLDDT-based approach** - we look for pockets with a low pLDDT score,
which indicates low confidence in the predicted structure and therefore a
higher probability of conformational changes

In [2]:
import sys
from pathlib import Path

# Make the repository root importable so that project-local packages can be
# imported. The root is taken to be two levels above the current working
# directory.
project_root = Path().resolve().parents[1]
# Guard the insertion so that re-running this cell does not pile up duplicate
# entries in sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [3]:
import re

from Bio.PDB import PDBParser
import pandas as pd
from tqdm import tqdm

from src.constants import HACKATHON_DATA, ALPHAFOLD_STRUCTURES_DIR

In [4]:
# Load the table stored at HACKATHON_DATA and preview its first rows.
data = pd.read_csv(HACKATHON_DATA)
data.head()

Unnamed: 0,pdb_id,query_poi,chain,uniprot_id,pdb_pocket_selection,alphafold_pocket_selection
0,3kr4,K_BES_1003,K,Q8IL11,3kr4 and ( (chain K and resi 374+379+386+392+3...,AF-Q8IL11-F1-model_v4 and ( resi 374+379+386+3...
1,3kun,B_HEM_139,B,Q9NAV8,3kun and ( (chain B and resi 24+31+34+35+36+51...,AF-Q9NAV8-F1-model_v4 and ( resi 25+32+35+36+3...
2,5t67,A_SAH_502,A,B3TMQ9,5t67 and ( (chain A and resi 72+76+78+80+90+11...,AF-B3TMQ9-F1-model_v4 and ( resi 72+76+78+80+9...
3,3kr6,A_FFQ_500,A,P0A749,3kr6 and ( (chain A and resi 22+23+49+91+114+1...,AF-P0A749-F1-model_v4 and ( resi 22+23+49+91+1...
4,3kpz,A_ZNE_525,A,P11473,3kpz and ( (chain A and resi 143+147+150+227+2...,AF-P11473-F1-model_v4 and ( resi 143+147+150+2...


In [5]:
def extract_residues(selection_str: str) -> list[int]:
    """
    Extract the residue indices from an `alphafold_pocket_selection` string.

    The string is expected to contain a fragment of the form
    'resi 1+2+3+4+5'. Only the first such fragment is used.

    Args:
        selection_str: Selection string containing a 'resi ...' fragment.

    Returns:
        The residue indices in the order they appear in the string, or an
        empty list if no 'resi ...' fragment is found.
    """
    match = re.search(r'resi\s+([0-9+]+)', selection_str)
    if match is None:
        return []
    # The character class also admits stray '+' signs (a trailing or doubled
    # separator), which split() turns into empty tokens. Skip those instead of
    # letting int('') raise ValueError.
    return [int(token) for token in match.group(1).split('+') if token]

### 1. pLDDT based approach

In [6]:
def extract_plddt_from_pdb(pdb_file: Path) -> list[float]:
    """
    Extract per-residue pLDDT values from a PDB file downloaded from AlphaFold DB.

    Only the first chain of the first model in the file is read.

    Args:
        pdb_file: Path to the PDB file.

    Returns:
        A list with one pLDDT value per residue, in the order in which the
        residues are iterated in the chain. The list is purely positional:
        residue numbers are not returned alongside the values.
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("model", pdb_file)

    # AlphaFold predictions contain one model and one chain
    model = next(structure.get_models())
    chain = next(model.get_chains())

    residue_plddt_scores = []

    for residue in chain:
        atoms = list(residue.get_atoms())
        # All atoms in a residue have the same pLDDT in AF outputs, so
        # we get the pLDDT value from the B-factor field of the first atom
        plddt = atoms[0].get_bfactor()
        residue_plddt_scores.append(plddt)

    return residue_plddt_scores

In [33]:
def residues_mean_plddt(residues: list[int], plddt_scores: list[float]) -> float:
    """
    Calculate the mean pLDDT score for a list of residues.

    Args:
        residues: 1-indexed residue numbers to average over.
        plddt_scores: Per-residue pLDDT values, where element `i` holds the
            score of residue number `i + 1`.

    Returns:
        The arithmetic mean of the pLDDT values of the selected residues.

    Raises:
        ValueError: If `residues` is empty.
        IndexError: If a residue number is smaller than 1 or larger than
            `len(plddt_scores)`.
    """
    if not residues:
        raise ValueError("cannot compute the mean pLDDT of an empty residue selection")

    n_scores = len(plddt_scores)
    selected_scores = []
    for r in residues:
        # Residues are 1-indexed. Reject r < 1 explicitly: plddt_scores[r - 1]
        # would otherwise wrap around to the end of the list and silently
        # return the score of an unrelated residue.
        if not 1 <= r <= n_scores:
            raise IndexError(
                f"residue {r} is outside the structure (valid range 1..{n_scores})"
            )
        selected_scores.append(plddt_scores[r - 1])
    return sum(selected_scores) / len(selected_scores)

Now we iterate over all AlphaFold pocket selections in our dataset and compute the mean pLDDT of each pocket.
If it is below the threshold (low-confidence prediction = possibly high structural variation), we classify the selected pocket as *cryptic*.

In [8]:
# One row per distinct (UniProt ID, AlphaFold pocket selection) pair.
pockets_plddt_df = (
    data
    .loc[:, ['uniprot_id', 'alphafold_pocket_selection']]
    .drop_duplicates()
)
pockets_plddt_df.head()

Unnamed: 0,uniprot_id,alphafold_pocket_selection
0,Q8IL11,AF-Q8IL11-F1-model_v4 and ( resi 374+379+386+3...
1,Q9NAV8,AF-Q9NAV8-F1-model_v4 and ( resi 25+32+35+36+3...
2,B3TMQ9,AF-B3TMQ9-F1-model_v4 and ( resi 72+76+78+80+9...
3,P0A749,AF-P0A749-F1-model_v4 and ( resi 22+23+49+91+1...
4,P11473,AF-P11473-F1-model_v4 and ( resi 143+147+150+2...


In [46]:
# Mean pLDDT per row of pockets_plddt_df, in row order. None marks a pocket
# for which no value could be computed.
pocket_mean_plddt = []

# Several pockets can belong to the same protein, so each AlphaFold PDB file is
# parsed at most once and its pLDDT profile reused. None marks a missing file.
plddt_by_uniprot = {}

for idx, row in tqdm(pockets_plddt_df.iterrows(), total=len(pockets_plddt_df)):
    uniprot_id = row['uniprot_id']
    residues = extract_residues(row['alphafold_pocket_selection'])

    if uniprot_id not in plddt_by_uniprot:
        pdb_file = ALPHAFOLD_STRUCTURES_DIR / f"{uniprot_id}.pdb"
        if pdb_file.exists():
            plddt_by_uniprot[uniprot_id] = extract_plddt_from_pdb(pdb_file)
        else:
            # Reported once per protein rather than once per pocket.
            print(f"Warning: PDB file for {uniprot_id} not found. Skipping.")
            plddt_by_uniprot[uniprot_id] = None

    plddt_scores = plddt_by_uniprot[uniprot_id]
    if plddt_scores is None:
        pocket_mean_plddt.append(None)
        continue

    try:
        pocket_mean_plddt.append(residues_mean_plddt(residues, plddt_scores))
    except (IndexError, ValueError, ZeroDivisionError) as e:
        # Expected failures only: a residue number outside the structure or an
        # empty selection. Anything else is a real bug and should surface.
        print(f"Error calculating mean pLDDT for {idx}, {uniprot_id}: {e}")
        pocket_mean_plddt.append(None)



 14%|█▎        | 4661/34260 [01:39<11:11, 44.06it/s]



 14%|█▎        | 4679/34260 [01:39<08:13, 59.94it/s]



 59%|█████▉    | 20210/34260 [06:44<03:43, 62.90it/s]

Error calculating mean pLDDT for 36266, Q7YTB0: list index out of range


 59%|█████▉    | 20225/34260 [06:44<04:22, 53.41it/s]

Error calculating mean pLDDT for 36299, Q7YTB0: list index out of range


 75%|███████▌  | 25793/34260 [08:41<02:53, 48.94it/s]

Error calculating mean pLDDT for 46941, P9WFR9: list index out of range


 75%|███████▌  | 25815/34260 [08:41<02:31, 55.61it/s]

Error calculating mean pLDDT for 46985, P9WFR9: list index out of range


 75%|███████▌  | 25850/34260 [08:42<02:36, 53.79it/s]

Error calculating mean pLDDT for 47038, P9WFR9: list index out of range


 75%|███████▌  | 25866/34260 [08:43<02:33, 54.77it/s]

Error calculating mean pLDDT for 47054, P9WFR9: list index out of range
Error calculating mean pLDDT for 47062, P9WFR9: list index out of range


100%|██████████| 34260/34260 [11:30<00:00, 49.60it/s]


In [47]:
# Attach the per-pocket means as a new column. The list is matched to the rows
# by position, so it must contain exactly one entry per row of the frame.
pockets_plddt_df['mean_plddt'] = pocket_mean_plddt

In [48]:
# Display the pockets table, including the mean_plddt column.
pockets_plddt_df

Unnamed: 0,uniprot_id,alphafold_pocket_selection,mean_plddt
0,Q8IL11,AF-Q8IL11-F1-model_v4 and ( resi 374+379+386+3...,98.145500
1,Q9NAV8,AF-Q9NAV8-F1-model_v4 and ( resi 25+32+35+36+3...,96.951364
2,B3TMQ9,AF-B3TMQ9-F1-model_v4 and ( resi 72+76+78+80+9...,97.025217
3,P0A749,AF-P0A749-F1-model_v4 and ( resi 22+23+49+91+1...,90.284167
4,P11473,AF-P11473-F1-model_v4 and ( resi 143+147+150+2...,95.835385
...,...,...,...
64377,Q16698,AF-Q16698-F1-model_v4 and ( resi 66+69+70+71+8...,97.465417
64379,Q16698,AF-Q16698-F1-model_v4 and ( resi 66+69+70+71+7...,95.601538
64380,Q96X16,AF-Q96X16-F1-model_v4 and ( resi 396+398+401+4...,97.343750
64384,P69834,AF-P69834-F1-model_v4 and ( resi 84+85+86+87+9...,92.081333


In [59]:
import matplotlib.pyplot as plt

# The 200 pockets with the lowest mean pLDDT.
(
    pockets_plddt_df['mean_plddt']
    .sort_values()  # ascending by default
    .head(200)
)

29214    36.030000
63841    39.570000
25502    41.672500
10644    46.850000
25504    49.870000
           ...    
9722     78.664286
42080    78.681111
41316    78.718824
35251    78.762000
34118    78.780455
Name: mean_plddt, Length: 200, dtype: float64

In [49]:
# Pockets with mean pLDDT below this threshold will be considered cryptic.
CRYPTIC_PLDDT_THRESHOLD = 70.0
# Note: a missing mean_plddt (NaN) compares as False here, so pockets without a
# computed mean are labelled non-cryptic rather than left undetermined.
pockets_plddt_df['cryptic'] = pockets_plddt_df['mean_plddt'].lt(CRYPTIC_PLDDT_THRESHOLD)