# Prepare Inputs

Objectives
- Download PDB of interest
- Generate mutant structures of protein of interest using PyRosetta

In [5]:
from pathlib import Path
import requests
from Bio import SeqIO
import random

ModuleNotFoundError: No module named 'requests'

## Download input PDB

In [6]:
# Input parameters
# PDB_ID: RCSB PDB entry ID; used below to build the download URL and the local file name.
PDB_ID = '6FUD'

In [2]:
# Prepare directories
# Downloaded structures are written to this folder; exist_ok=True lets the cell be re-run safely.
input_dir = Path('inputs')
input_dir.mkdir(parents=True, exist_ok=True)

In [14]:
# Download input PDB
pdb_id = PDB_ID.upper() # PDB IDs are typically uppercase
base_url = 'https://files.rcsb.org/download/'
url = f'{base_url}{pdb_id}.pdb'

pdb_path = input_dir / f'{PDB_ID}.pdb'
# Stream the file to disk. The context manager releases the connection when done,
# and the timeout keeps the cell from hanging forever on a stalled connection.
with requests.get(url, stream=True, timeout=30) as response:
    # Fail loudly on 4xx/5xx so an HTML error page is never saved as a .pdb file
    # and then reported as a successful download.
    response.raise_for_status()
    with open(pdb_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

print(f"Successfully downloaded {pdb_id} to {pdb_path}")

Successfully downloaded 6FUD to inputs/6FUD.pdb


### Wrap the download in a reusable function

In [25]:
def download_pdb_from_id(pdb_id: str, target_dir: str | Path='inputs'):
    """Download a PDB file from RCSB into ``target_dir``.

    Args:
        pdb_id (str): PDB ID of the PDB file you wish to download. It is upper-cased
            before being used in the URL and in the output file name.
        target_dir (str | Path): Directory in which the downloaded PDB is saved.
            It is created (including parents) if it does not exist.

    Returns:
        Path: Location of the downloaded file, ``<target_dir>/<PDB_ID>.pdb``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (for example, when the ID does not exist).
    """
    # Prepare directories. Path() accepts both str and Path, so no type check is needed
    # (an exact `type(x) == Path` test never matches: Path() builds PosixPath/WindowsPath).
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Download input PDB
    pdb_id = pdb_id.upper()
    base_url = 'https://files.rcsb.org/download/'
    url = f'{base_url}{pdb_id}.pdb'
    pdb_path = target_dir / f'{pdb_id}.pdb'

    # The context manager closes the streamed connection; the timeout avoids hanging forever.
    with requests.get(url, stream=True, timeout=30) as response:
        # Check the status before opening the file so a failed request leaves nothing on disk
        # and is never reported as a success.
        response.raise_for_status()
        with open(pdb_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    print(f"Successfully downloaded {pdb_id} to {pdb_path}")
    return pdb_path

In [28]:
# Use the notebook-level PDB_ID constant so changing it in one place updates every step.
download_pdb_from_id(PDB_ID)

Successfully downloaded 6FUD to inputs/6FUD.pdb


## Extract sequences

In [27]:
# Extract sequences from PDB file
def extract_seq_from_pdb(pdb_path: str | Path):
    """Collect the per-chain sequences found in a PDB file's SEQRES records.

    Args:
        pdb_path (str | Path): PDB file to parse.

    Returns:
        dict: Maps each chain ID to its sequence as a plain string. If two records
            resolve to the same chain ID, the later one wins.
    """
    # The 'pdb-seqres' parser yields one record per chain listed in SEQRES.
    # The chain ID is taken from annotations['chain']; when that is missing, the text
    # after the last ':' of record.id (e.g. "1FAT:A" -> "A") is used instead.
    return {
        rec.annotations.get("chain", rec.id.split(':')[-1]): str(rec.seq)
        for rec in SeqIO.parse(pdb_path, "pdb-seqres")
    }

In [30]:
# Parse the downloaded structure; the result maps chain ID -> SEQRES sequence string.
sequences = extract_seq_from_pdb(f'inputs/{PDB_ID}.pdb')
sequences

{'A': 'GPGGEMQKIVFKIPMVDDKSRTKAMSLVASTVGVHSVAIAGDLRDQVVVVGDGIDSINLVSALRKKVGPAMFLEVSQVKED',
 'B': 'METGNKYIEKRAIDLSRERDPNFFDNADIPVPECFWFMFKNNVRQDAGTCYSSWKMDMKVGPNWVHIKSDDNCNLSGDFPPGWIVLGKKRPGF'}

## Generate random mutations using PyRosetta

Resources:
- https://nbviewer.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/06.08-Point-Mutation-Scan.ipynb

In [3]:
import pyrosetta
from pyrosetta import pose_from_pdb, Pose, get_fa_scorefxn, get_score_function
from pyrosetta.toolbox import cleanATOM
from pyrosetta.rosetta.protocols.relax import FastRelax
import os
pyrosetta.init()

┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2025 [Rosetta PyRosetta4.conda.ubuntu.cxx11thread.serialization.Ubuntu.python313.Release 2025.06+release.029c6a159b896477003a14f78f472d4cd2cead46 2025-02-04T15:14:13] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.conda.ubuntu.cxx11thread.ser

In [6]:
def clean_and_relax_pdb(pdb_path: str | Path, output_dir: str | Path):
    """
    Clean a PDB file with PyRosetta and relax the resulting structure.

    The input is passed through `cleanATOM`, the cleaned file is loaded into a Pose,
    FastRelax is applied with the relax constrained to the starting coordinates, and
    the relaxed Pose is written to `output_dir`.

    Args:
        pdb_path (str | Path):
            Path to the input PDB file to be cleaned and relaxed.
        output_dir (str | Path):
            Directory where the relaxed PDB file will be saved. Created if missing.

    Returns:
        preppedPose (pyrosetta.rosetta.core.pose.Pose):
            The cleaned and relaxed Pose object.

    Output:
        - The cleaned structure is read from `<original>.clean.pdb`, next to the input file.
        - The relaxed structure is saved as `<original>.relax.pdb` in `output_dir`.

    Notes:
        - The full-atom score function returned by `get_fa_scorefxn()` is used for the relax.
        - The relaxation step can take a while (the author measured about 64 s for this input).
    """
    source_pdb = Path(pdb_path)
    relaxed_dir = Path(output_dir)
    relaxed_dir.mkdir(parents=True, exist_ok=True)

    # Work out both file locations up front so the steps below read linearly.
    cleaned_pdb = source_pdb.parent / (source_pdb.stem + '.clean.pdb')
    relaxed_pdb = relaxed_dir / (source_pdb.stem + '.relax.pdb')

    # Clean the PDB file, then load the cleaned copy into a fresh pose
    cleanATOM(source_pdb)
    preppedPose = Pose()
    preppedPose.assign(pose_from_pdb(str(cleaned_pdb)))

    # Relax the structure (runtime: 64 s)
    fast_relax = FastRelax()
    fast_relax.set_scorefxn(get_fa_scorefxn())
    fast_relax.constrain_relax_to_start_coords(True)
    fast_relax.apply(preppedPose)

    preppedPose.dump_pdb(str(relaxed_pdb))
    return preppedPose

In [61]:
def mutate_and_pack(pose: pyrosetta.rosetta.core.pose.Pose, position: int, amino_acid: str): 
    """Mutate one residue of ``pose`` in place using a PackRotamersMover.

    Args:
        pose (pyrosetta.rosetta.core.pose.Pose): Pose to modify; it is changed in place.
        position (int): Index of the residue to mutate, as passed to ResidueIndexSelector.
        amino_acid (str): Amino acid(s) to keep as allowed identities at ``position``.

    Returns:
        None.

    Notes:
        When the environment variable ``DEBUG`` is set to a non-empty value the packer is
        built but never applied, so ``pose`` is left untouched.
    """
    # Short aliases for the deeply nested PyRosetta namespaces used below.
    selectors = pyrosetta.rosetta.core.select.residue_selector
    task = pyrosetta.rosetta.core.pack.task
    ops = task.operation

    # The residue being mutated
    target = selectors.ResidueIndexSelector()
    target.set_index(position)

    # The mutated residue plus its spatial neighbours
    neighborhood = selectors.NeighborhoodResidueSelector()
    neighborhood.set_focus_selector(target)
    neighborhood.set_include_focus_in_subset(True)

    # Every residue except the mutated one
    everything_else = selectors.NotResidueSelector(target)

    # At the mutated position only the requested amino acid(s) remain allowed
    keep_only_requested = ops.RestrictAbsentCanonicalAASRLT()
    keep_only_requested.aas_to_keep(amino_acid)

    # Task operations are pushed in the same order as before: defaults first, then
    # the packing restriction, then the design restrictions.
    task_factory = task.TaskFactory()
    for task_op in (
        ops.InitializeFromCommandline(),
        ops.IncludeCurrent(),
        ops.NoRepackDisulfides(),
        # disable packing; the trailing True is the selector-flip flag, which presumably
        # applies this to residues outside the neighbourhood (confirm against PyRosetta docs)
        ops.OperateOnResidueSubset(ops.PreventRepackingRLT(), neighborhood, True),
        # disable design everywhere except the mutated residue
        ops.OperateOnResidueSubset(ops.RestrictToRepackingRLT(), everything_else),
        # enable design at the mutated residue
        ops.OperateOnResidueSubset(keep_only_requested, target),
    ):
        task_factory.push_back(task_op)

    # create Packer
    packer = pyrosetta.rosetta.protocols.minimization_packing.PackRotamersMover()
    packer.task_factory(task_factory)

    # perform The Move (skipped when DEBUG is set)
    if os.getenv("DEBUG"):
        return
    packer.apply(pose)

In [129]:
def generate_mutants(structure: str | Path | pyrosetta.rosetta.core.pose.Pose, output_dir: str | Path, mut_range: tuple, min: int=1, max: int=5, n: int=10):
    """
    Generate a library of random protein mutants by introducing random point mutations
    within a specified residue range, using PyRosetta.

    Args:
        structure (str | Path | pyrosetta.rosetta.core.pose.Pose): 
            Input structure as a PDB file path or a PyRosetta Pose object. Assumes input PDB has been
            cleaned and relaxed. A Pose passed in is cloned and never modified.
        output_dir (str | Path): 
            Directory where mutant PDB files will be saved. Created if missing.
        mut_range (tuple): 
            Tuple (start, end) specifying the residue indices (1-based, inclusive start, inclusive end) 
            where mutations are allowed.
        min (int, optional): 
            Minimum number of mutations per mutant. Default is 1.
        max (int, optional): 
            Maximum number of mutations per mutant. Default is 5. Values larger than the
            number of residues in `mut_range` are capped to that number.
        n (int, optional): 
            Number of unique mutants to generate. Default is 10.

    Raises:
        TypeError: If `structure` is neither a path nor a Pose.
        ValueError: If `min` exceeds `max` or the number of mutable residues.
        RuntimeError: If no new unique sequence is found after many consecutive attempts
            (e.g. the sequence space is exhausted, or packing is disabled via DEBUG).

    Notes:
        - Only standard amino acids are considered for mutations.
        - Each mutant will have a random number of mutations between `min` and `max`.
        - Mutants with duplicate sequences are not saved.
        - Output PDB files are named according to their mutation pattern, with mutations
          joined by '-' in the order they were applied (e.g., 'A123G-R45K.pdb').
        - `min` and `max` shadow the Python builtins inside this function; the names are
          kept for backward compatibility with existing callers.
    """
    # Consecutive duplicate draws tolerated before giving up; prevents an endless loop
    # when no further unique sequence can be produced.
    MAX_CONSECUTIVE_DUPLICATES = 1000

    # Make output directory a Path object and make it if not present
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY" # All 20 standard amino acids

    # Set original pose depending on if input structure is a Pose object or a PDB.
    # isinstance is required here: `type(x) == str | Path` compares a type against a
    # union object and is never true, which used to reject every path input.
    if isinstance(structure, pyrosetta.rosetta.core.pose.Pose):
        original_pose = structure.clone()
    elif isinstance(structure, (str, Path)):
        original_pose = pose_from_pdb(str(structure))
    else:
        raise TypeError("Ensure input strcture is either a path to a PDB or a Pose object")
    
    # Generate a list of mutable residue indices. 1-indexed as this is what Rosetta requires.
    mutable_residue_indices = list(range(mut_range[0], mut_range[1]+1))
    n_sites = len(mutable_residue_indices)

    if min > max:
        raise ValueError(f"min ({min}) must not be greater than max ({max})")
    if min > n_sites:
        raise ValueError(
            f"min ({min}) exceeds the number of mutable residues ({n_sites}) in mut_range {tuple(mut_range)}"
        )
    # random.sample cannot draw more sites than exist, so cap the upper bound.
    # (The builtin min() is shadowed by the `min` parameter, hence the conditional.)
    upper = max if max <= n_sites else n_sites

    generated_mutants_count = 0
    consecutive_duplicates = 0
    unique_mutant_sequences = set() # To store mutant sequences and avoid duplicates

    while generated_mutants_count < n:
        current_pose = original_pose.clone() # get a fresh copy of the original pose
        num_mut = random.randint(min, upper) # randomly select number of mutations this variant will receive

        mutation_sites = random.sample(mutable_residue_indices, num_mut) # select the sites to be mutated
        mutation_details = [] # To store wt_aa, position, new_aa

        for site_idx in mutation_sites:
            wt_aa = current_pose.residue(site_idx).name1()
            # Filter instead of list.remove(): a non-canonical wild-type residue is not in
            # AMINO_ACIDS and would make remove() raise.
            candidate_mut_aa = [aa for aa in AMINO_ACIDS if aa != wt_aa]
            mut_aa = random.choice(candidate_mut_aa)
            mutation_details.append((wt_aa, site_idx, mut_aa))
            mutate_and_pack(current_pose, site_idx, mut_aa)
    
        # check for duplicates
        current_sequence = current_pose.sequence()
        if current_sequence in unique_mutant_sequences:
            consecutive_duplicates += 1
            if consecutive_duplicates >= MAX_CONSECUTIVE_DUPLICATES:
                raise RuntimeError(
                    f"Gave up after {MAX_CONSECUTIVE_DUPLICATES} consecutive duplicate mutants; "
                    f"only {generated_mutants_count} of {n} unique mutants were generated"
                )
            continue

        consecutive_duplicates = 0
        unique_mutant_sequences.add(current_sequence)
        generated_mutants_count += 1

        mutation_string = '-'.join([f'{details[0]}{details[1]}{details[2]}' for details in mutation_details])
        current_pose.dump_pdb(str(output_dir / f'{mutation_string}.pdb'))
        

In [None]:
def get_chains_summary(pose: pyrosetta.rosetta.core.pose.Pose):
    """
    Build one summary record per chain of a PyRosetta Pose.

    Each record is a dictionary with the keys:
        - chain_id: Chain identifier as returned by `get_chains`
        - chain_letter: Chain letter for that identifier (e.g., 'A', 'B')
        - resnums_list: List of residue numbers reported for the chain
        - resnums_range: Tuple of (smallest, largest) entry of `resnums_list`

    Args:
        pose (pyrosetta.rosetta.core.pose.Pose): The input Pose object.

    Returns:
        list[dict]: One dictionary per chain, in the order `get_chains` reports them.
    """
    pose_api = pyrosetta.rosetta.core.pose

    def summarize(chain_id):
        # Look up the letter first, then the residue numbers, as the per-chain loop did.
        letter = pose_api.get_chain_from_chain_id(chain_id, pose)
        members = list(pose_api.get_resnums_for_chain_id(pose, chain_id))
        return {
            'chain_id': chain_id,
            'chain_letter': letter,
            'resnums_list': members,
            'resnums_range': (min(members), max(members)),
        }

    return [summarize(chain_id) for chain_id in list(pose_api.get_chains(pose))]

In [10]:
# Build the path from PDB_ID (as the sequence-extraction cell does) rather than hard-coding the ID.
relaxed_pose = clean_and_relax_pdb(f'inputs/{PDB_ID}.pdb', 'inputs')

core.import_pose.import_pose: File 'inputs/6FUD.clean.pdb' automatically determined to be of type PDB
core.conformation.Conformation: Found disulfide between residues 99 138
core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
protocols.relax.FastRelax: CMD: repeat  -290.539  0  0  0.55
protocols.relax.FastRelax: CMD: coord_cst_weight  -290.539  0  0  0.55
protocols.relax.FastRelax: CMD: scale:fa_rep  -382.411  0  0  0.022
core.pack.task: Packer task: initialize from command line()
core.pack.pack_rotamers: built 4083 rotamers at 158 positions.
core.pack.pack_rotamers: Requesting all available threads for interaction graph computation.
core.pack.interaction_graph.interaction_graph_factory: Instantiating DensePDInteractionGraph
core.pack.rotamer_set.RotamerSets: Completed interaction graph pre-calculation in 1 available threads (1 had been requested).
protocols.relax.FastRelax: CMD: repack  -533.954  0  0  0.022
protocols.rel

In [131]:
import pandas as pd
# Tabulate the per-chain records so a chain's residue range can be looked up by letter.
chain_summary = pd.DataFrame(get_chains_summary(relaxed_pose))
# Sanity check: the range stored for chain B comes back as a plain tuple,
# which is the type generate_mutants declares for its mut_range argument.
type(chain_summary.loc[chain_summary['chain_letter'] == 'B', 'resnums_range'].values[0])

tuple

In [None]:
# Generate 3 unique mutants, restricting mutation sites to chain B's residue range; PDBs are written to 'mutant_library'.
generate_mutants(relaxed_pose, 'mutant_library', chain_summary.loc[chain_summary['chain_letter'] == 'B', 'resnums_range'].values[0], n=3)

core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
core.pack.task: Packer task: initialize from command line()
core.pack.pack_rotamers: built 473 rotamers at 19 positions.
core.pack.pack_rotamers: Requesting all available threads for interaction graph computation.
core.pack.interaction_graph.interaction_graph_factory: Instantiating PDInteractionGraph
core.pack.rotamer_set.RotamerSets: Completed interaction graph pre-calculation in 1 available threads (1 had been requested).
core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
core.pack.task: Packer task: initialize from command line()
core.pack.pack_rotamers: built 224 rotamers at 17 positions.
core.pack.pack_rotamers: Requesting all available threads for interaction graph computation.
core.pack.interaction_graph.interaction_graph_factory: Instantiating PDInteractionGraph
core.pack.rotamer_set.RotamerSets: Completed interaction graph pre-calculation in 1 available threads (1 had been requested).
core.scoring.ScoreFunctionFa