## RFdiffusion3 (RFD3)生成骨架 >> ProteinMPNN生成序列 >> RosettaFold3 (RF3)验证结构
- author：alex_fang（方鹏飞）
- email：alex_fang@zju.edu.cn
- github: https://github.com/pf-f
- data：2026/01/26
- tel: 18888917100

In [1]:
import os
# RXT2060禁用cuequivariance
os.environ["SHOULD_USE_CUEQUIVARIANCE"] = "0"
os.environ["CUEQUIVARIANCE_USE_FALLBACK"] = "1"          
os.environ["DISABLE_CUEQUIVARIANCE"] = "1"
# 强制fp16
os.environ["TORCH_DTYPE"] = "fp16"                       
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print(os.environ.get("SHOULD_USE_CUEQUIVARIANCE"))
print(os.environ.get("DISABLE_CUEQUIVARIANCE"))

0
1


In [2]:
# 目录
folders = ['out_cdr', 'out_cif']
for folder in folders:
    os.makedirs(os.path.join(os.getcwd(), folder), exist_ok=True)

In [3]:
# 量化ram/cpu/time
%load_ext memory_profiler
%load_ext autotime

# %%gpumem
from IPython.core.magic import register_cell_magic
import pynvml
import time

pynvml.nvmlInit()

@register_cell_magic
def gpumem(line, cell):
    handle = pynvml.nvmlDeviceGetHandleByIndex(0) 

    def get_mem():
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return info.used / 1024**3, info.total / 1024**3  # GB

    used_before, total = get_mem()
    print(f"Before: {used_before:.2f} GB / {total:.2f} GB")

    start = time.time()
    get_ipython().run_cell(cell)
    elapsed = time.time() - start

    used_after, _ = get_mem()
    print(f"After : {used_after:.2f} GB / {total:.2f} GB")
    print(f"Δ     : {used_after - used_before:+.2f} GB")
    print(f"Time  : {elapsed:.2f} s")

time: 30.6 ms (started: 2026-01-26 01:26:44 +08:00)


In [4]:
# AtomWorks可视化模块
from atomworks.io.utils.visualize import view

Environment variable CCD_MIRROR_PATH not set. Will not be able to use function requiring this variable. To set it you may:
  (1) add the line 'export VAR_NAME=path/to/variable' to your .bashrc or .zshrc file
  (2) set it in your current shell with 'export VAR_NAME=path/to/variable'
  (3) write it to a .env file in the root of the atomworks.io repository
Environment variable PDB_MIRROR_PATH not set. Will not be able to use function requiring this variable. To set it you may:
  (1) add the line 'export VAR_NAME=path/to/variable' to your .bashrc or .zshrc file
  (2) set it in your current shell with 'export VAR_NAME=path/to/variable'
  (3) write it to a .env file in the root of the atomworks.io repository


time: 727 ms (started: 2026-01-26 01:26:44 +08:00)


### Step1：RFdiffusion3 >> 骨架设计

In [5]:
%%gpumem
%%memit

from lightning.fabric import seed_everything
from rfd3.engine import RFD3InferenceConfig, RFD3InferenceEngine
seed_everything(0)

import json
from rfd3.engine import RFD3InferenceConfig, RFD3InferenceEngine

# rfd3模型config
spec = json.load(open('/home/alex/aidd/PDL1-4ZQK/protein_binder_design.json'))['pdl1_clean'] 

config = RFD3InferenceConfig(
    specification=spec,      
    diffusion_batch_size=2,
)
engine = RFD3InferenceEngine(**config)

# 生成2条backbone
outputs = engine.run(
    inputs='/home/alex/aidd/PDL1-4ZQK/pd_l1_clean.pdb', 
    out_dir='/home/alex/aidd/PDL1-4ZQK/out_cdr',
    n_batches=1,            
)

Before: 1.66 GB / 12.00 GB


  self.pid = os.fork()
01:26:49 DEBUG transforms: Debug mode is on
Seed set to 0
01:26:50 INFO rfd3.engine: [rank: 0] Outputs will be written to /home/alex/aidd/PDL1-4ZQK/out_cdr.
01:26:50 INFO rfd3.engine: [rank: 0] Prevalidating design specification for example: pd_l1_clean
01:26:51 INFO rfd3.engine: [rank: 0] Found 0 existing example IDs in the output directory.
Using bfloat16 Automatic Mixed Precision (AMP)
  .index_reduce(
01:27:29 INFO rfd3.engine: [rank: 0] Finished inference batch in 25.50 seconds.
01:27:30 INFO rfd3.engine: [rank: 0] Outputs for pd_l1_clean_0_model_0 written to /home/alex/aidd/PDL1-4ZQK/out_cdr/pd_l1_clean_0_model_0.
01:27:30 INFO rfd3.engine: [rank: 0] Outputs for pd_l1_clean_0_model_1 written to /home/alex/aidd/PDL1-4ZQK/out_cdr/pd_l1_clean_0_model_1.


peak memory: 8275.20 MiB, increment: 8081.49 MiB
time: 45.3 s (started: 2026-01-26 01:26:44 +08:00)
After : 3.92 GB / 12.00 GB
Δ     : +2.26 GB
Time  : 45.27 s
time: 45.3 s (started: 2026-01-26 01:26:44 +08:00)


In [6]:
# 骨架CIF可视化
from pathlib import Path
from atomworks.io.utils.io_utils import load_any
import biotite.structure as struc

cif_files = sorted(Path('out_cdr').glob('pd_l1_clean*_model_*.cif.gz'))
first_cif = cif_files[0]
atom_array = load_any(str(first_cif), model=1)
print(f'Loaded {first_cif.name}  ({atom_array.shape[0]} atoms)')

view(atom_array)

Loaded pd_l1_clean_0_model_0.cif.gz  (971 atoms)


<py3Dmol.view at 0x7ef896236f00>

time: 1.71 s (started: 2026-01-26 01:27:30 +08:00)


### Step2：ProteinMPNN >> 序列填充

In [7]:
%%gpumem
%%memit

from mpnn.inference_engines.mpnn import MPNNInferenceEngine
from biotite.structure import get_residue_starts
from biotite.sequence import ProteinSequence


# MPNN模型config
engine_config = {
    "model_type": "protein_mpnn", 
    "is_legacy_weights": True, 
    "out_directory": None, 
    "write_structures": False,
    "write_fasta": False,
}

# 输入json的config
input_configs = [
    {
        "batch_size": 10, 
        "remove_waters": True,
    }
]

# 生成10条backbone序列
model = MPNNInferenceEngine(**engine_config)
mpnn_outputs = model.run(input_dicts=input_configs, atom_arrays=[atom_array])

# 序列显示
print(f"Generated {len(mpnn_outputs)} designed sequences:\n")

for i, item in enumerate(mpnn_outputs):
    res_starts = get_residue_starts(item.atom_array)
    seq_1letter = ''.join(
        ProteinSequence.convert_letter_3to1(res_name)
        for res_name in item.atom_array.res_name[res_starts]
    )
    print(f"Sequence {i+1}: {seq_1letter}")

Before: 3.93 GB / 12.00 GB


  self.pid = os.fork()


Generated 10 designed sequences:

Sequence 1: MKGLELLKKLEEEEAEFKVSVPEDTYVVKLGSTASISCLFPVGESLNVEELILLWMKDGKIIIYWEKGKLIESLVDPEYHDRASLVKSKLPEGVAELVIKDVTEADAGTYTCVVSYNGSDFKKIKVIVES
Sequence 2: MEGLELLARLEEEEAAFKVSVPEDTYNVALGSTASITCNFPVGESLDESKLIVLWTKDGKIIIYWEKGKLLKELVDPRYLNRASLDKSSLPKGEATLTITNVTLADAGTYTCLVSYDGADSKTITLVVNA
Sequence 3: MRALELYAQLQAEEAEFKVSVPQSVYTVQEGSDMSISCLFPVGESLDVSELIVLWMKDGKVIIYWEKGEVLWELVDPRYHDRASLVKEELPKGVAKLQIKNVTAEDAGTYTCLVSYNGSDSVKIEVKVVA
Sequence 4: SSGLEALKKLLEEEAAFKVSVPQTTYNVQAGSTMSITCNFPVGESLKVDELTVLWQKDGKIIIYYEKGKVLEELVDPEYHERASLDLSSLPEGKATLTIKNVTAEDAGTYTCTVKYNGADHAKIQVNVLS
Sequence 5: SEGLELLEKLLKEKAAFTVSVPTSTYNVKEGSTMSISCYFPVGESLDVSKLIVLWTKDGEIIIYWERGKVLESLVDPRYHNRASLDLSSLPNGKATLTISNVTAEDAGTYTCLVKYNGADFKTIKVNVVA
Sequence 6: MEALKLLKELEEKEKEFKVSVPEDVYNVKLGSTASITCYFPVGDSLDVSKLKVLWMKDGKVIIYWEKGEVLWEKVDPRYHERASLDESSLPEGKATLIIKDVTEEDAGTYTCLVSYNGADFKKIKVNVVS
Sequence 7: MKALELLAKLKAERAAFKVSVPSSTYKVELGSTMSITCLFPVGDSLDVENLIVLWTKDGKIIIYWERGEVRESLVDPEFHERAYLDLSSLPQGRAT

### Step3：RosettaFold3 >> 结构回测

In [8]:
%%gpumem
%%memit

from rf3.inference_engines.rf3 import RF3InferenceEngine
from rf3.utils.inference import InferenceInput


# RF3模型config
inference_engine = RF3InferenceEngine(
    ckpt_path='rf3',
    verbose=False,
    n_recycles=1,              
    diffusion_batch_size=1,    
    num_steps=20              
)

# MPNN序列 >> RF3 re-fold
input_structure = InferenceInput.from_atom_array(atom_array, example_id="pdl1_clean")
rf3_outputs = inference_engine.run(inputs=input_structure)

print(f"Output keys: {rf3_outputs.keys()}")
print(f"Number of models for 'example_protein': {len(rf3_outputs['pdl1_clean'])}")

# RF3预测top-rank[0]
rf3_output = rf3_outputs["pdl1_clean"][0]

# RF3Output可视化
print(f"RF3Output contains:")
print(f"  - atom_array: {len(rf3_output.atom_array)} atoms")
print(f"  - summary_confidences: {list(rf3_output.summary_confidences.keys())}")
print(f"  - confidences: {list(rf3_output.confidences.keys()) if rf3_output.confidences else None}")

view(rf3_output.atom_array)

Before: 3.92 GB / 12.00 GB


  self.pid = os.fork()
01:27:35 INFO rf3.inference_engines.rf3: [rank: 0] Loading checkpoint from /home/alex/.foundry/checkpoints/rf3_foundry_01_24_latest_remapped.ckpt...
Using bfloat16 Automatic Mixed Precision (AMP)
01:27:44 INFO rf3.inference_engines.rf3: [rank: 0] Found 1 structures to predict!
01:27:44 INFO rf3.inference_engines.rf3: [rank: 0] Predicting structure 1/1: pdl1_clean


Output keys: dict_keys(['pdl1_clean'])
Number of models for 'example_protein': 1
RF3Output contains:
  - atom_array: 1018 atoms
  - summary_confidences: ['chain_ptm', 'chain_pair_pae_min', 'chain_pair_pde_min', 'chain_pair_pae', 'chain_pair_pde', 'overall_plddt', 'overall_pde', 'overall_pae', 'ptm', 'iptm', 'has_clash', 'ranking_score']
  - confidences: ['atom_chain_ids', 'atom_plddts', 'pae', 'token_chain_ids', 'token_res_ids']
peak memory: 9876.35 MiB, increment: 1633.80 MiB
time: 23.4 s (started: 2026-01-26 01:27:34 +08:00)
After : 6.02 GB / 12.00 GB
Δ     : +2.11 GB
Time  : 23.37 s
time: 23.4 s (started: 2026-01-26 01:27:34 +08:00)


### Step4：任务评估

In [9]:
# 模型quality confidences
summary = rf3_output.summary_confidences

print("=== Summary Confidences ===")
print(f"  Overall pLDDT:    {summary['overall_plddt']:.3f}")
print(f"  Overall PAE:      {summary['overall_pae']:.2f} A")
print(f"  Overall PDE:      {summary['overall_pde']:.3f}")
print(f"  pTM:              {summary['ptm']:.3f}")
print(f"  ipTM:             {summary.get('iptm', 'N/A (single chain)')}")
print(f"  Ranking score:    {summary['ranking_score']:.3f}")
print(f"  Has clash:        {summary['has_clash']}")

=== Summary Confidences ===
  Overall pLDDT:    0.712
  Overall PAE:      13.52 A
  Overall PDE:      4.523
  pTM:              0.488
  ipTM:             0.23222367465496063
  Ranking score:    0.283
  Has clash:        False
time: 1.4 ms (started: 2026-01-26 01:27:57 +08:00)


In [10]:
# 原子/残基confidences
conf = rf3_output.confidences

print("=== Per-Atom/Residue Confidences ===")
print(f"  atom_plddts:      {len(conf['atom_plddts'])} values (one per atom)")
print(f"  atom_chain_ids:   {len(conf['atom_chain_ids'])} values")
print(f"  token_chain_ids:  {len(conf['token_chain_ids'])} values (one per residue)")
print(f"  token_res_ids:    {len(conf['token_res_ids'])} values")
print(f"  PAE matrix:       {len(conf['pae'])}x{len(conf['pae'][0])}")

# 前10原子pLDDT scores
import numpy as np
print(f"\nFirst 10 atom pLDDTs: {np.round(conf['atom_plddts'][:10], 2).tolist()}")

=== Per-Atom/Residue Confidences ===
  atom_plddts:      1018 values (one per atom)
  atom_chain_ids:   1018 values
  token_chain_ids:  130 values (one per residue)
  token_res_ids:    130 values
  PAE matrix:       130x130

First 10 atom pLDDTs: [0.67, 0.69, 0.69, 0.66, 0.67, 0.64, 0.62, 0.56, 0.55, 0.67]
time: 145 ms (started: 2026-01-26 01:27:57 +08:00)


### Step5： RMSD评估 & cif导出

In [11]:
# 5a：带模板全局RMSD
from biotite.structure import rmsd, superimpose
from atomworks.constants import PROTEIN_BACKBONE_ATOM_NAMES
import numpy as np

# RFD3 backbone vs RF3-predicted结构比较
aa_generated = atom_array              
aa_refolded = rf3_output.atom_array    

# 保留骨干原子(N, CA, C, O)
bb_generated = aa_generated[np.isin(aa_generated.atom_name, PROTEIN_BACKBONE_ATOM_NAMES)]
bb_refolded = aa_refolded[np.isin(aa_refolded.atom_name, PROTEIN_BACKBONE_ATOM_NAMES)]

# 叠加结构计算RMSD
bb_refolded_fitted, _ = superimpose(bb_generated, bb_refolded)
rmsd_value = rmsd(bb_generated, bb_refolded_fitted)

print(f"Backbone RMSD: {rmsd_value:.2f} A")
print(f"\nInterpretation: {'Excellent' if rmsd_value < 1.0 else 'Good' if rmsd_value < 2.0 else 'Moderate'} designability")

print("="*40)

# 5b:仅保留cdr的RMSD
from biotite.structure import rmsd, superimpose
from atomworks.constants import PROTEIN_BACKBONE_ATOM_NAMES
import numpy as np

aa_generated = atom_array            
aa_refolded = rf3_output.atom_array    

# 只保留chain A（CDR-H3）
mask_a = (atom_array.chain_id == "A")
mask_a_refolded = (rf3_output.atom_array.chain_id == "A")

aa_generated_cdr = atom_array[mask_a]
aa_refolded_cdr = rf3_output.atom_array[mask_a_refolded]

bb_generated = aa_generated_cdr[np.isin(aa_generated_cdr.atom_name, PROTEIN_BACKBONE_ATOM_NAMES)]
bb_refolded  = aa_refolded_cdr[np.isin(aa_refolded_cdr.atom_name, PROTEIN_BACKBONE_ATOM_NAMES)]

# 仅计算CDR
bb_refolded_fitted, _ = superimpose(bb_generated, bb_refolded)
rmsd_value = rmsd(bb_generated, bb_refolded_fitted)

print(f"CDR-H3 Backbone RMSD (chain A only): {rmsd_value:.2f} Å")
print(f"\nInterpretation: {'Excellent' if rmsd_value < 1.2 else 'Good' if rmsd_value < 2.0 else 'Moderate' if rmsd_value < 3.0 else 'Poor'} designability")


Backbone RMSD: 7.20 A

Interpretation: Moderate designability
CDR-H3 Backbone RMSD (chain A only): 0.18 Å

Interpretation: Excellent designability
time: 200 ms (started: 2026-01-26 01:27:58 +08:00)


In [12]:
# 导出cif
from atomworks.io.utils.io_utils import to_cif_file

to_cif_file(aa_generated, "/home/alex/aidd/PDL1-4ZQK/out_cif/pdl1_clean_generated.cif")
to_cif_file(aa_refolded, "/home/alex/aidd/PDL1-4ZQK/out_cif/pdl1_clean_refolded.cif")

to_cif_file(aa_generated_cdr, "/home/alex/aidd/PDL1-4ZQK/out_cif/pdl1_clean_generated_cdr.cif")
to_cif_file(aa_refolded_cdr, "/home/alex/aidd/PDL1-4ZQK/out_cif/pdl1_clean_refolded_cdr.cif")



'/home/alex/aidd/PDL1-4ZQK/out_cif/pdl1_clean_refolded_cdr.cif'

time: 230 ms (started: 2026-01-26 01:27:58 +08:00)


### pymol叠加结果

![Pymol gen_ref_CDR-H3_Cα_only叠加结果](./pymolFigs/gen_ref_CDR-H3_Cα_only.png)
Pymol gen_ref_CDR-H3_Cα_only叠加结果

![Pymol gen_ref_full_Cα_only_叠加结果](./pymolFigs/gen_ref_full_Cα_only.png)
Pymol gen_ref_full_Cα_only叠加结果

![Pymol gen_ref_CDR-H3_CNCAO叠加结果](./pymolFigs/gen_ref_CDR-H3_CNCAO.png)
Pymol gen_ref_CDR-H3_CNCAO叠加结果

![Pymol gen_ref_full_CNCAO_叠加结果](./pymolFigs/gen_ref_full_CNCAO.png)
Pymol gen_ref_full_CNCAO叠加结果