# Prepare Inputs

Objectives
- Download PDB of interest
- Generate mutant structures of protein of interest using PyRosetta

In [1]:
from pathlib import Path
import random
import requests
from Bio import SeqIO

ModuleNotFoundError: No module named 'requests'

## Download input PDB

In [6]:
# Input parameters
# ID of the structure to fetch from files.rcsb.org; it is also used to build the local file name.
PDB_ID = '6FUD'

In [2]:
# Prepare directories
# The downloaded PDB is written into this folder; mkdir does nothing if the folder already exists.
input_dir = Path('inputs')
input_dir.mkdir(parents=True, exist_ok=True)

In [14]:
# Download input PDB
pdb_id = PDB_ID.upper() # PDB IDs are typically uppercase
base_url = 'https://files.rcsb.org/download/'
url = f'{base_url}{pdb_id}.pdb'

# The file name deliberately keeps PDB_ID as typed, because later cells read f'inputs/{PDB_ID}.pdb'.
pdb_path = input_dir / f'{PDB_ID}.pdb'

# timeout: do not let the cell hang forever on a dead connection.
# raise_for_status(): without it an HTTP error page (e.g. 404 for an unknown ID) would be
# written to disk and reported as a successful download.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    with open(pdb_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

print(f"Successfully downloaded {pdb_id} to {pdb_path}")

Successfully downloaded 6FUD to inputs/6FUD.pdb


### Wrap the download in a reusable function

In [25]:
def download_pdb_from_id(pdb_id: str, target_dir: str | Path='inputs'):
    """Download a PDB file from RCSB and save it as ``<target_dir>/<PDB_ID>.pdb``.

    Args:
        pdb_id (str): PDB ID of the PDB file you wish to download. It is upper-cased
            before being used in the URL and in the output file name.
        target_dir (str | Path): Directory in which the downloaded PDB is saved.
            It is created (including parents) if it does not exist.

    Raises:
        requests.HTTPError: If RCSB answers with an error status (e.g. 404 for an
            unknown ID). Nothing is written to disk in that case.
    """
    # Prepare directories. Path() accepts both str and Path, so no type branching is needed.
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Download input PDB
    pdb_id = pdb_id.upper()
    base_url = 'https://files.rcsb.org/download/'
    url = f'{base_url}{pdb_id}.pdb'
    pdb_path = target_dir / f'{pdb_id}.pdb'

    # The status check happens before the file is opened, so a failed request
    # never leaves an HTML error page behind under a .pdb name.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(pdb_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    print(f"Successfully downloaded {pdb_id} to {pdb_path}")

In [28]:
# Try the helper with its default target directory ('inputs')
download_pdb_from_id('6FUD')

Successfully downloaded 6FUD to inputs/6FUD.pdb


## Extract sequences

In [27]:
# Extract sequences from PDB file
def extract_seq_from_pdb(pdb_path: str | Path):
    """Return the sequence of every chain listed in the SEQRES records of a PDB file.

    Args:
        pdb_path (str | Path): PDB file handed to ``SeqIO.parse`` with the
            'pdb-seqres' format.

    Returns:
        dict: Maps chain ID to its sequence as a plain string. If two records
        resolve to the same chain ID, the later one wins.
    """
    # The chain ID comes from annotations['chain'] when present; otherwise it is
    # the part of record.id after the last ':' (ids typically look like "1FAT:A").
    return {
        entry.annotations.get("chain", entry.id.split(':')[-1]): str(entry.seq)
        for entry in SeqIO.parse(pdb_path, "pdb-seqres")
    }

In [30]:
# Per-chain SEQRES sequences of the downloaded structure (dict: chain ID -> sequence string)
sequences = extract_seq_from_pdb(f'inputs/{PDB_ID}.pdb')
sequences

{'A': 'GPGGEMQKIVFKIPMVDDKSRTKAMSLVASTVGVHSVAIAGDLRDQVVVVGDGIDSINLVSALRKKVGPAMFLEVSQVKED',
 'B': 'METGNKYIEKRAIDLSRERDPNFFDNADIPVPECFWFMFKNNVRQDAGTCYSSWKMDMKVGPNWVHIKSDDNCNLSGDFPPGWIVLGKKRPGF'}

## Generate random mutations using PyRosetta

Resources:
- https://nbviewer.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/06.08-Point-Mutation-Scan.ipynb

In [1]:
import pyrosetta
from pyrosetta import pose_from_pdb, Pose, get_fa_scorefxn, get_score_function
from pyrosetta.toolbox import cleanATOM
from pyrosetta.rosetta.protocols.relax import FastRelax
import os
from pathlib import Path
import pandas as pd
import random
pyrosetta.init()

┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2025 [Rosetta PyRosetta4.conda.ubuntu.cxx11thread.serialization.Ubuntu.python313.Release 2025.06+release.029c6a159b896477003a14f78f472d4cd2cead46 2025-02-04T15:14:13] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.conda.ubuntu.cxx11thread.ser

In [2]:
def clean_and_relax_pdb(pdb_path: str | Path, output_dir: str | Path):
    """
    Clean a PDB file with `cleanATOM`, relax it with FastRelax, and save the result.

    Steps:
        1. `cleanATOM` is run on the input file; the cleaned structure is then read
           from `<stem>.clean.pdb` next to the input.
        2. The cleaned structure is copied into a fresh Pose.
        3. FastRelax is applied with the full-atom score function from
           `get_fa_scorefxn()` and with relaxation constrained to the starting coordinates.
        4. The relaxed pose is written to `<output_dir>/<stem>.relax.pdb`.

    Args:
        pdb_path (str | Path):
            Path to the input PDB file to be cleaned and relaxed.
        output_dir (str | Path):
            Directory for the relaxed PDB file; created if it does not exist.

    Returns:
        pyrosetta.rosetta.core.pose.Pose:
            The cleaned and relaxed Pose object.

    Notes:
        - The relaxation step can take a while for large structures.
    """
    source_pdb = Path(pdb_path)
    relaxed_dir = Path(output_dir)
    relaxed_dir.mkdir(parents=True, exist_ok=True)

    # Clean the input, then load the cleaned file that sits beside it
    cleanATOM(source_pdb)
    cleaned_pdb = source_pdb.parent / (source_pdb.stem + '.clean.pdb')
    loaded_pose = pose_from_pdb(str(cleaned_pdb))

    # Work on a copy so the freshly loaded pose itself is never modified
    preppedPose = Pose()
    preppedPose.assign(loaded_pose)

    # Relax the structure (the author recorded a runtime of 64 s for their input)
    fast_relax = FastRelax()
    fast_relax.set_scorefxn(get_fa_scorefxn())
    fast_relax.constrain_relax_to_start_coords(True)
    fast_relax.apply(preppedPose)

    relaxed_pdb = relaxed_dir / (source_pdb.stem + '.relax.pdb')
    preppedPose.dump_pdb(str(relaxed_pdb))
    return preppedPose

In [3]:
def mutate_and_pack(pose: pyrosetta.rosetta.core.pose.Pose, position: int, amino_acid: str):
    """Mutate one residue of ``pose`` in place and run the rotamer packer.

    Args:
        pose: Pose to modify; it is changed in place and nothing is returned.
        position (int): Pose index of the residue to mutate.
        amino_acid (str): One-letter code of the amino acid to place at ``position``.

    Notes:
        When the environment variable ``DEBUG`` is set to a non-empty value, the
        packer is configured but not applied, so the pose is left untouched.
    """
    selectors = pyrosetta.rosetta.core.select.residue_selector
    task_ops = pyrosetta.rosetta.core.pack.task.operation

    # The residue being mutated
    target_site = selectors.ResidueIndexSelector()
    target_site.set_index(position)

    # The mutated residue plus its spatial neighbours
    neighbourhood = selectors.NeighborhoodResidueSelector()
    neighbourhood.set_focus_selector(target_site)
    neighbourhood.set_include_focus_in_subset(True)

    # Everything except the mutated residue
    all_but_target = selectors.NotResidueSelector(target_site)

    # Task operations are pushed in the same order as the original recipe
    task_factory = pyrosetta.rosetta.core.pack.task.TaskFactory()
    task_factory.push_back(task_ops.InitializeFromCommandline())
    task_factory.push_back(task_ops.IncludeCurrent())
    task_factory.push_back(task_ops.NoRepackDisulfides())

    # Prevent repacking; the trailing True presumably flips the selection so that
    # residues OUTSIDE the neighbourhood are frozen (confirm against the PyRosetta docs)
    freeze_rlt = task_ops.PreventRepackingRLT()
    task_factory.push_back(task_ops.OperateOnResidueSubset(freeze_rlt, neighbourhood, True))

    # No design anywhere but the target: those residues may only repack
    task_factory.push_back(task_ops.OperateOnResidueSubset(
        task_ops.RestrictToRepackingRLT(), all_but_target))

    # At the target, keep only the requested amino acid
    keep_only_requested = task_ops.RestrictAbsentCanonicalAASRLT()
    keep_only_requested.aas_to_keep(amino_acid)
    task_factory.push_back(task_ops.OperateOnResidueSubset(keep_only_requested, target_site))

    # Build the packer from the task factory
    rotamer_packer = pyrosetta.rosetta.protocols.minimization_packing.PackRotamersMover()
    rotamer_packer.task_factory(task_factory)

    # Skip the actual move when DEBUG is set
    if os.getenv("DEBUG"):
        return
    rotamer_packer.apply(pose)

In [4]:
def generate_mutants(structure: str | Path | pyrosetta.rosetta.core.pose.Pose, output_dir: str | Path, mut_range: tuple, min: int=1, max: int=5, n: int=10):
    """
    Generate a library of random protein mutants by introducing random point mutations
    within a specified residue range, using PyRosetta.

    Args:
        structure (str | Path | pyrosetta.rosetta.core.pose.Pose):
            Input structure as a PDB file path or a PyRosetta Pose object. Assumes input PDB has been
            cleaned and relaxed. A Pose passed in is cloned and never modified.
        output_dir (str | Path):
            Directory where mutant PDB files will be saved (created if missing).
        mut_range (tuple):
            Tuple (start, end) specifying the residue indices (1-based, inclusive start, inclusive end)
            where mutations are allowed.
        min (int, optional):
            Minimum number of mutations per mutant. Default is 1. Must be at least 1.
        max (int, optional):
            Maximum number of mutations per mutant. Default is 5. Values larger than the
            number of residues in `mut_range` are clamped to that number.
        n (int, optional):
            Number of unique mutants to generate. Default is 10.

    Raises:
        TypeError: If `structure` is neither a path nor a Pose.
        ValueError: If `mut_range` is empty, `min` is below 1, or `min` exceeds the
            (clamped) maximum number of mutations.

    Output:
        - One PDB file per mutant, named by its mutations joined with '-'
          (e.g., 'A123G-R45K.pdb').
        - 'mutant_library.csv' in `output_dir` with columns 'mut' and 'sequence'.

    Notes:
        - Only standard amino acids are considered for mutations.
        - Each mutant will have a random number of mutations between `min` and `max`.
        - Mutants with duplicate sequences are not saved.
        - The loop runs until `n` unique sequences exist, so `n` must not exceed the
          number of distinct mutants the range allows.
    """
    # Make output directory a Path object and make it if not present
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY" # All 20 standard amino acids

    # Set original pose depending on if input structure is a Pose object or a PDB.
    # isinstance is required here: comparing type(x) with `str | Path` is always False.
    if isinstance(structure, pyrosetta.rosetta.core.pose.Pose):
        original_pose = structure.clone()
    elif isinstance(structure, (str, Path)):
        original_pose = pose_from_pdb(str(structure))
    else:
        raise TypeError(
            "Ensure input structure is either a path to a PDB or a Pose object "
            f"(got {type(structure).__name__})"
        )

    # Generate a list of mutable residue indices. 1-indexed as this is what Rosetta requires.
    mutable_residue_indices = list(range(mut_range[0], mut_range[1]+1))
    n_sites = len(mutable_residue_indices)
    if n_sites == 0:
        raise ValueError(f"mut_range {mut_range!r} contains no residues")
    if min < 1:
        raise ValueError("min must be at least 1; a mutant needs at least one mutation")
    # `min`/`max` shadow the builtins inside this function, so clamp with a conditional.
    # random.sample cannot draw more sites than the range contains.
    upper = max if max <= n_sites else n_sites
    if min > upper:
        raise ValueError(
            f"min ({min}) exceeds the largest usable number of mutations ({upper})"
        )

    generated_mutants_count = 0
    unique_mutant_sequences = set() # To store mutant sequences and avoid duplicates
    mutant_data = [] # To store mutation details and sequences for CSV

    while generated_mutants_count < n:
        current_pose = original_pose.clone() # get a fresh copy of the original pose
        num_mut = random.randint(min, upper) # randomly select number of mutations this variant will receive

        mutation_sites = random.sample(mutable_residue_indices, num_mut) # select the sites to be mutated
        mutation_details = [] # To store wt_aa, position, new_aa

        for site_idx in mutation_sites:
            wt_aa = current_pose.residue(site_idx).name1()
            # Filtering (instead of list.remove) also works when the wild-type letter
            # is not one of the 20 standard amino acids.
            candidate_mut_aa = [aa for aa in AMINO_ACIDS if aa != wt_aa]
            mut_aa = random.choice(candidate_mut_aa)
            mutation_details.append((wt_aa, site_idx, mut_aa))
            mutate_and_pack(current_pose, site_idx, mut_aa)

        # check for duplicates; a duplicate does not count towards n
        current_sequence = current_pose.sequence()
        if current_sequence in unique_mutant_sequences:
            continue
        unique_mutant_sequences.add(current_sequence)
        generated_mutants_count += 1

        mutation_string = '-'.join([f'{details[0]}{details[1]}{details[2]}' for details in mutation_details])
        current_pose.dump_pdb(str(output_dir / f'{mutation_string}.pdb'))
        mutant_data.append({
            'mut': mutation_string,
            'sequence': current_sequence
        })

    mut_df = pd.DataFrame(mutant_data)
    mut_df.to_csv(str(output_dir / 'mutant_library.csv'), index=False)

In [5]:
def get_chains_summary(pose: pyrosetta.rosetta.core.pose.Pose):
    """
    Describe every chain of a PyRosetta Pose.

    Each chain is reported as a dictionary with these keys:
        - chain_id: Numeric chain identifier
        - chain_letter: Chain letter (e.g., 'A', 'B')
        - resnums_list: List of residue indices for the chain
        - resnums_range: Tuple of (min_resnum, max_resnum) for the chain

    Args:
        pose (pyrosetta.rosetta.core.pose.Pose): The input Pose object.

    Returns:
        list[dict]: One summary dictionary per chain, in the order returned by
        `get_chains`.
    """
    pose_api = pyrosetta.rosetta.core.pose

    def _describe(chain_id):
        # Look up the letter first, then the residues, and derive the range from them
        letter = pose_api.get_chain_from_chain_id(chain_id, pose)
        members = list(pose_api.get_resnums_for_chain_id(pose, chain_id))
        return {
            'chain_id': chain_id,
            'chain_letter': letter,
            'resnums_list': members,
            'resnums_range': (min(members), max(members))
        }

    return [_describe(chain_id) for chain_id in list(pose_api.get_chains(pose))]

In [6]:
# Clean and relax the downloaded structure; the relaxed PDB is saved next to it in 'inputs'
relaxed_pose = clean_and_relax_pdb('inputs/6FUD.pdb', 'inputs')

core.chemical.GlobalResidueTypeSet: Finished initializing fa_standard residue type set.  Created 985 residue types
core.chemical.GlobalResidueTypeSet: Total time to initialize 0.419787 seconds.
core.import_pose.import_pose: File 'inputs/6FUD.clean.pdb' automatically determined to be of type PDB
core.conformation.Conformation: Found disulfide between residues 99 138
protocols.relax.RelaxScriptManager: Reading relax scripts list from database.
core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
core.scoring.etable: Starting energy table calculation
core.scoring.etable: smooth_etable: changing atr/rep split to bottom of energy well
core.scoring.etable: smooth_etable: spline smoothing lj etables (maxdis = 6)
core.scoring.etable: smooth_etable: spline smoothing solvation etables (max_dis = 6)
core.scoring.etable: Finished calculating energy tables.
basic.io.database: Database file opened: scoring/score_functions/hbonds/ref2015_params/HBPoly1D.csv
basic.io.database: Database file opene

In [7]:
import pandas as pd
# One row per chain, so a residue range can be looked up by chain letter in the next cell
chain_summary = pd.DataFrame(get_chains_summary(relaxed_pose))
chain_summary

Unnamed: 0,chain_id,chain_letter,resnums_list,resnums_range
0,1,A,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","(1, 77)"
1,2,B,"[78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 8...","(78, 158)"


In [11]:
# Build 300 unique mutants, mutating only within chain B's residue range; output goes to 'mutant_library'
generate_mutants(relaxed_pose, 'mutant_library', chain_summary.loc[chain_summary['chain_letter'] == 'B', 'resnums_range'].values[0], n=300)

Generate mutant structures for Pikh1-AVR-PikC
- RCSB structure downloaded and modified to only include one Pikh1 HMA and AVR-PikC molecule
- Will be cleaned, relaxed, then mutated.

In [8]:
# Clean and relax the trimmed structure, then tabulate its chains.
# Note: this rebinds relaxed_pose and chain_summary from the 6FUD cells above.
relaxed_pose = clean_and_relax_pdb('inputs/h1-C_rcsb_cut.pdb', 'inputs')
chain_summary = pd.DataFrame(get_chains_summary(relaxed_pose))
chain_summary

core.chemical.GlobalResidueTypeSet: Finished initializing fa_standard residue type set.  Created 985 residue types
core.chemical.GlobalResidueTypeSet: Total time to initialize 0.600726 seconds.
core.import_pose.import_pose: File 'inputs/h1-C_rcsb_cut.clean.pdb' automatically determined to be of type PDB
core.conformation.Conformation: Found disulfide between residues 96 135
protocols.relax.RelaxScriptManager: Reading relax scripts list from database.
core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
core.scoring.etable: Starting energy table calculation
core.scoring.etable: smooth_etable: changing atr/rep split to bottom of energy well
core.scoring.etable: smooth_etable: spline smoothing lj etables (maxdis = 6)
core.scoring.etable: smooth_etable: spline smoothing solvation etables (max_dis = 6)
core.scoring.etable: Finished calculating energy tables.
basic.io.database: Database file opened: scoring/score_functions/hbonds/ref2015_params/HBPoly1D.csv
basic.io.database: Database f

Unnamed: 0,chain_id,chain_letter,resnums_list,resnums_range
0,1,B,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","(1, 74)"
1,2,C,"[75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 8...","(75, 154)"


In [9]:
# Build 300 unique mutants within chain B's residue range of this structure; output goes to 'mutant_library/h1-c'
generate_mutants(relaxed_pose, 'mutant_library/h1-c', chain_summary.loc[chain_summary['chain_letter'] == 'B', 'resnums_range'].values[0], n=300)

core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
core.pack.task: Packer task: initialize from command line()
core.pack.pack_rotamers: built 127 rotamers at 14 positions.
core.pack.pack_rotamers: Requesting all available threads for interaction graph computation.
core.pack.interaction_graph.interaction_graph_factory: Instantiating PDInteractionGraph
core.pack.rotamer_set.RotamerSets: Completed interaction graph pre-calculation in 1 available threads (1 had been requested).
core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
core.pack.task: Packer task: initialize from command line()
core.pack.pack_rotamers: built 585 rotamers at 20 positions.
core.pack.pack_rotamers: Requesting all available threads for interaction graph computation.
core.pack.interaction_graph.interaction_graph_factory: Instantiating PDInteractionGraph
core.pack.rotamer_set.RotamerSets: Completed interaction graph pre-calculation in 1 available threads (1 had been requested).
core.scoring.ScoreFunctionFa

In [19]:
import pyrosetta
from pyrosetta import pose_from_pdb, get_fa_scorefxn
import os
from pathlib import Path
import pandas as pd
import random
from tqdm import tqdm
import concurrent.futures
import csv

In [None]:
# =============================================================================
# WORKER FUNCTION
# =============================================================================
def apply_mutations_and_relax(args):
    """
    Apply a fixed list of point mutations to a structure and save the result.

    The worker makes no random choices itself: it performs exactly the mutations
    it is given, in the given order.

    Args:
        args (tuple): ``(relaxed_pdb_path, mutations_to_apply, relax_mutant, output_dir)``
            - relaxed_pdb_path (str | Path): PDB file of the starting structure.
            - mutations_to_apply (list[tuple[int, str]]): ``(position, new_aa)`` pairs,
              with positions in pose numbering.
            - relax_mutant (bool): If true, run FastRelax (constrained to the start
              coordinates) on the mutated pose before saving.
            - output_dir (str | Path): Directory the mutant PDB is written to.

    Returns:
        tuple[str, str]: ``(mutation_string, final_sequence)``, where mutation_string
        joins ``<wt><position><new>`` entries with '-' and is also the PDB file stem.
    """
    relaxed_pdb_path, mutations_to_apply, relax_mutant, output_dir = args
    # Accept a plain string too; the `/` operator below needs a Path.
    output_dir = Path(output_dir)

    # Each worker still needs its own quiet PyRosetta instance
    pyrosetta.init("-run:constant_seed -jran 1 -mute all -out:level 0")

    pose = pose_from_pdb(str(relaxed_pdb_path))

    mutation_details = []
    # Apply the pre-determined list of mutations
    for position, new_aa in mutations_to_apply:
        # Record the wild-type letter before the residue is replaced
        wt_aa = pose.residue(position).name1()
        mutation_details.append((wt_aa, position, new_aa))
        mutate_and_pack(pose, position, new_aa)

    # Optional relaxation of the final structure
    if relax_mutant:
        relax = FastRelax()
        scorefxn = get_fa_scorefxn()
        relax.set_scorefxn(scorefxn)
        relax.constrain_relax_to_start_coords(True)
        relax.apply(pose)

    final_sequence = pose.sequence()
    mutation_string = '-'.join([f'{wt}{pos}{mut}' for wt, pos, mut in mutation_details])

    # The worker is responsible for saving the PDB
    output_pdb_path = output_dir / f'{mutation_string}.pdb'
    pose.dump_pdb(str(output_pdb_path))

    return (mutation_string, final_sequence)


In [None]:
# =============================================================================
# MANAGER FUNCTION 
# =============================================================================
def generate_mutants_parallel(
    structure_path: str | Path, 
    output_dir: str | Path, 
    mut_range: tuple, 
    min_mut: int = 1, 
    max_mut: int = 5, 
    n: int = 10,
    relax_after_mutation: bool = False,
    num_workers: int | None = None
):
    """
    Generate unique random mutants in parallel, resuming from an existing library.

    Mutation sets are chosen up front and checked for sequence uniqueness (against
    each other and against 'mutant_library.csv' in `output_dir`) before any
    PyRosetta work is submitted to the worker processes.

    Args:
        structure_path (str | Path): Input PDB. Its relaxed form is expected at
            `<output_dir>/<stem>.relax.pdb` and is created with
            `clean_and_relax_pdb` if missing.
        output_dir (str | Path): Directory for the relaxed structure, the mutant
            PDB files and 'mutant_library.csv' (created if missing).
        mut_range (tuple): (start, end) pose indices, 1-based and inclusive, where
            mutations are allowed.
        min_mut (int): Minimum number of mutations per mutant (at least 1).
        max_mut (int): Maximum number of mutations per mutant; clamped to the
            number of residues in `mut_range`.
        n (int): Total number of unique mutants wanted, counting those already
            listed in the CSV.
        relax_after_mutation (bool): Forwarded to the worker; relax each mutant
            after mutation.
        num_workers (int | None): Number of worker processes. Defaults to all
            cores minus two, but at least one.

    Raises:
        ValueError: If `mut_range` is empty, `min_mut` is below 1, or `min_mut`
            exceeds the (clamped) maximum.

    Notes:
        The pre-generation loop runs until enough unique sequences are found, so
        `n` must not exceed the number of distinct mutants the range allows.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # --- Step 1: Prepare base structure and load existing data ---
    relaxed_pdb_path = output_dir / (Path(structure_path).stem + '.relax.pdb')
    if not relaxed_pdb_path.exists():
        pyrosetta.init()
        print("Preparing and relaxing the initial structure...")
        clean_and_relax_pdb(structure_path, output_dir)
    
    # Load the WT sequence once for pre-computation checks
    wt_pose = pose_from_pdb(str(relaxed_pdb_path))
    wt_sequence = list(wt_pose.sequence()) # Convert to a list for easy modification

    csv_path = output_dir / 'mutant_library.csv'
    # A 0-byte file has no header yet and would make pd.read_csv raise, so treat it as new.
    csv_has_content = csv_path.exists() and csv_path.stat().st_size > 0
    existing_sequences = set()
    if csv_has_content:
        existing_df = pd.read_csv(csv_path)
        existing_sequences = set(existing_df['sequence'])
    num_already_have = len(existing_sequences)
    num_needed = n - num_already_have

    if num_needed <= 0:
        print(f"Already have {num_already_have} unique mutants. Nothing to do.")
        return

    # --- Step 2: Pre-generate unique mutation lists ---
    mutable_indices = list(range(mut_range[0], mut_range[1] + 1))
    if not mutable_indices:
        raise ValueError(f"mut_range {mut_range!r} contains no residues")
    if min_mut < 1:
        raise ValueError("min_mut must be at least 1; a mutant needs at least one mutation")
    # random.sample cannot draw more sites than the range contains
    max_sites = min(max_mut, len(mutable_indices))
    if min_mut > max_sites:
        raise ValueError(
            f"min_mut ({min_mut}) exceeds the largest usable number of mutations ({max_sites})"
        )

    print(f"Pre-generating {num_needed} unique mutation sets to run...")
    jobs_to_submit = []
    AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

    # The context manager closes the progress bar even if an error interrupts the loop
    with tqdm(total=num_needed, desc="Finding Unique Mutations") as pbar_pregen:
        while len(jobs_to_submit) < num_needed:
            num_mut = random.randint(min_mut, max_sites)
            mutation_sites = random.sample(mutable_indices, num_mut)
            
            mutations_to_apply = []
            temp_sequence = wt_sequence[:] # Make a copy
            
            for site_idx in mutation_sites:
                # Pose indices are 1-based, the sequence list is 0-based
                original_aa = temp_sequence[site_idx - 1]
                possible_muts = [aa for aa in AMINO_ACIDS if aa != original_aa]
                new_aa = random.choice(possible_muts)
                mutations_to_apply.append((site_idx, new_aa))
                temp_sequence[site_idx - 1] = new_aa # Update the temp sequence
            
            final_predicted_sequence = "".join(temp_sequence)

            # Check if this sequence already exists BEFORE doing any Rosetta work
            if final_predicted_sequence not in existing_sequences:
                jobs_to_submit.append(mutations_to_apply)
                existing_sequences.add(final_predicted_sequence) # Add to set to avoid self-duplication
                pbar_pregen.update(1)

    print(f"Generated {len(jobs_to_submit)} unique jobs to submit.")

    # --- Step 3: Configure workers and run parallel jobs ---
    if num_workers is None:
        # os.cpu_count() may return None when the count cannot be determined
        num_workers = max(1, (os.cpu_count() or 1) - 2)
    
    args_list = [(relaxed_pdb_path, job, relax_after_mutation, output_dir) for job in jobs_to_submit]

    num_written = 0
    with open(csv_path, mode='a', newline='') as f:
        writer = csv.writer(f)
        # Key the header on the file's content, not on the row count: a CSV holding
        # only a header (e.g. after an interrupted run) must not get a second one.
        if not csv_has_content:
            writer.writerow(['mut', 'sequence'])

        with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
            results_iterator = executor.map(apply_mutations_and_relax, args_list)
            for result in tqdm(results_iterator, total=len(jobs_to_submit), desc="Executing Mutations"):
                if result:
                    writer.writerow(result)
                    # Flush per row so finished mutants survive an interrupted run
                    f.flush()
                    num_written += 1

    print(f"\nProcess complete. Added {num_written} new mutants.")

In [22]:
# Get chains summary
# Load the previously relaxed structure from disk instead of re-running the relaxation
pose = pose_from_pdb('inputs/h1-C_rcsb_cut.relax.pdb')
chain_summary = pd.DataFrame(get_chains_summary(pose))
chain_summary

core.import_pose.import_pose: File 'inputs/h1-C_rcsb_cut.relax.pdb' automatically determined to be of type PDB
core.conformation.Conformation: Found disulfide between residues 96 135


Unnamed: 0,chain_id,chain_letter,resnums_list,resnums_range
0,1,B,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","(1, 74)"
1,2,C,"[75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 8...","(75, 154)"


In [36]:
# Define your parameters
PDB_FILE = 'inputs/h1-C_rcsb_cut.pdb'
OUTPUT_DIR = 'mutant_library/h1-c_relaxed'
# n is the total library size: mutants already listed in OUTPUT_DIR's CSV count towards it
generate_mutants_parallel(
    structure_path=PDB_FILE,
    output_dir=OUTPUT_DIR, # Save to a different directory
    mut_range=chain_summary.loc[chain_summary['chain_letter'] == 'B', 'resnums_range'].values[0],
    n=109,
    relax_after_mutation=True,
    num_workers=4
)

core.import_pose.import_pose: File 'mutant_library/h1-c_relaxed/h1-C_rcsb_cut.relax.pdb' automatically determined to be of type PDB
core.conformation.Conformation: Found disulfide between residues 96 135
Already have 109 unique mutants. Nothing to do.


## Update CSV file if some structures weren't saved correctly

In [27]:
import pandas as pd
from pathlib import Path
import pyrosetta
from pyrosetta import pose_from_pdb
from tqdm import tqdm
import concurrent.futures
import os

def read_pdb_info(pdb_path: Path):
    """
    Read one PDB file and report its name and sequence.

    Intended as a worker for a process pool: it initializes its own quiet
    PyRosetta instance on every call.

    Args:
        pdb_path (Path): The path to the PDB file.

    Returns:
        tuple | None: ``(mutation_string, sequence)``, where mutation_string is the
        file name without the .pdb extension. If anything fails, a warning is
        printed and None is returned.
    """
    quiet_flags = "-run:constant_seed -jran 1 -mute all -out:level 0"
    try:
        # Every worker process needs its own PyRosetta instance
        pyrosetta.init(quiet_flags)

        # File stem doubles as the mutation string
        file_stem = pdb_path.stem

        # Load the structure and read its sequence
        residues = pose_from_pdb(str(pdb_path)).sequence()
    except Exception as e:
        # Any problem with this one file is reported, not raised
        print(f"Warning: Could not process file {pdb_path.name}. Error: {e}")
        return None
    return (file_stem, residues)

def update_csv_from_folder(directory_path: str | Path, num_workers: int | None = None):
    """
    Scans a directory for .pdb files, reads their sequences in parallel,
    and creates/overwrites a 'mutant_library.csv' file with the contents.

    Args:
        directory_path (str | Path): The path to the folder containing mutant PDBs.
        num_workers (int | None, optional): Number of CPU cores to use. 
                                            Defaults to a safe number (all cores - 2).
    """
    target_dir = Path(directory_path)
    if not target_dir.is_dir():
        print(f"Error: Directory not found at '{target_dir}'")
        return

    # Find all .pdb files in the target directory
    pdb_files = list(target_dir.glob("*.pdb"))
    if not pdb_files:
        print(f"No .pdb files found in '{target_dir}'. Nothing to do.")
        return

    print(f"Found {len(pdb_files)} PDB files to process...")

    # Configure number of workers
    if num_workers is None:
        num_workers = max(1, os.cpu_count() - 2)
    
    mutant_data = []
    
    # Process files in parallel
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Use tqdm to show a progress bar
        results_iterator = executor.map(read_pdb_info, pdb_files)
        for result in tqdm(results_iterator, total=len(pdb_files), desc="Reading PDBs"):
            if result:
                mutant_data.append({'mut': result[0], 'sequence': result[1]})

    if not mutant_data:
        print("Could not extract information from any PDB files.")
        return

    # Create a DataFrame and remove any potential duplicates
    final_df = pd.DataFrame(mutant_data)
    final_df.drop_duplicates(subset=['sequence'], inplace=True)
    
    # Save the final, synchronized CSV file
    csv_path = target_dir / 'mutant_library.csv'
    final_df.to_csv(csv_path, index=False)
    
    print(f"\nProcess complete.")
    print(f"Successfully created/updated '{csv_path}' with {len(final_df)} unique mutants.")

In [None]:
# Initialize PyRosetta once in the main process
pyrosetta.init()

# Set this to the path of your folder containing the mutant PDB files
MUTANT_FOLDER_PATH = 'mutant_library/h1-c_relaxed' 

# Run the update function
# (this creates or overwrites mutant_library.csv inside MUTANT_FOLDER_PATH)
update_csv_from_folder(MUTANT_FOLDER_PATH)