# Workbook for working up the Biopython/modelcif packages for bulk PDB alignments

In [None]:
from Bio.PDB import *
import sys
import os
import multiprocessing as mp
from functools import partial
import time

# Configuration
# Input directory: every file in it whose name ends in '.pdb' is processed,
# and the first entry of the resulting file list serves as the alignment reference.
directory_to_parse = './distance_filtered'  # Change to point to directory with your PDB files
# Output directory: created if missing; each structure is written here as 'aligned_<original name>'.
directory_to_save = './distance_filtered_aligned' # Change to point to directory where you want to save aligned files
# Number of worker processes in the multiprocessing pool that runs the alignments.
num_processes = mp.cpu_count()  # Use all available CPU cores, or set to a specific number

def align_structure(pdb_file, reference_ca_coords, directory_to_parse, directory_to_save):
    """
    Superimpose one PDB file onto the reference CA coordinates and save it.

    The structure is read from ``directory_to_parse``; its atoms named 'CA'
    are fitted onto ``reference_ca_coords`` (paired by list position), the
    resulting transform is applied to every atom of the structure, and the
    result is written to ``directory_to_save`` as ``aligned_<pdb_file>``.

    Returns a status string rather than raising: a success message that
    includes the RMSD, or a message starting with "Error" when the CA counts
    differ, fewer than three CA atoms are present, or an exception occurs.
    """
    try:
        pdb_parser = PDBParser(QUIET=True)
        fitter = Superimposer()

        # Read the structure that is to be moved onto the reference
        source_path = os.path.join(directory_to_parse, pdb_file)
        target = pdb_parser.get_structure(pdb_file, source_path)

        # Collect every atom named 'CA', in the order the structure yields them
        target_ca = []
        for candidate in target.get_atoms():
            if candidate.get_id() == 'CA':
                target_ca.append(candidate)

        n_reference = len(reference_ca_coords)
        n_target = len(target_ca)

        if n_reference != n_target:
            return f"Error: The number of CA atoms in '{pdb_file}' does not match the reference structure. Reference: {n_reference}, Target: {n_target}"

        if n_target < 3:
            return f"Error: At least 3 CA atoms are required for alignment in '{pdb_file}'."

        # Superimposer works on atom objects, so wrap each bare reference
        # coordinate in a placeholder CA atom.
        from Bio.PDB.Atom import Atom
        anchors = [
            Atom('CA', xyz, 0, 1, ' ', 'CA', serial, 'C')
            for serial, xyz in enumerate(reference_ca_coords)
        ]

        # Fit on the CA pairs, then move the whole structure
        fitter.set_atoms(anchors, target_ca)
        fitter.apply(target.get_atoms())

        # Write the transformed structure next to the other aligned files
        destination = os.path.join(directory_to_save, f"aligned_{pdb_file}")
        writer = PDBIO()
        writer.set_structure(target)
        writer.save(destination)

        return f"Aligned structure saved to {destination} (RMSD: {fitter.rms:.2f} Å)"

    except Exception as err:
        # Workers report failures as text so one bad file cannot abort the pool
        return f"Error processing {pdb_file}: {err!s}"

# Main execution
# The __main__ guard is required for multiprocessing: under the "spawn" and
# "forkserver" start methods each worker re-imports this module, and without
# the guard every worker would try to run the whole pipeline again.
if __name__ == "__main__":
    print(f"Starting parallel PDB alignment with {num_processes} processes...")
    start_time = time.time()

    # Validate the input directory first, so that a bad path does not leave an
    # empty output directory behind. isdir() also rejects a regular file.
    if not os.path.isdir(directory_to_parse):
        print(f"Error: The directory '{directory_to_parse}' does not exist.")
        sys.exit(1)

    # Check if the directory is empty
    directory_entries = os.listdir(directory_to_parse)
    if not directory_entries:
        print(f"Error: The directory '{directory_to_parse}' is empty.")
        sys.exit(1)

    # Collect the PDB files. os.listdir() returns entries in arbitrary order,
    # so sort them: the first file is the reference and must be reproducible.
    pdb_files = sorted(f for f in directory_entries if f.endswith('.pdb'))
    if not pdb_files:
        print(f"Error: No PDB files found in the directory '{directory_to_parse}'.")
        sys.exit(1)

    if len(pdb_files) < 2:
        print("Error: At least two PDB files are required for alignment.")
        sys.exit(1)

    print(f"Found {len(pdb_files)} PDB files to process")

    # Ensure the output directory exists (exist_ok avoids a check-then-create race)
    os.makedirs(directory_to_save, exist_ok=True)

    # Get the reference structure (first file) and extract CA coordinates
    parser = PDBParser(QUIET=True)
    reference_structure = parser.get_structure("reference", os.path.join(directory_to_parse, pdb_files[0]))
    reference_ca_atoms = [atom for atom in reference_structure.get_atoms() if atom.get_id() == 'CA']
    reference_ca_coords = [atom.get_coord() for atom in reference_ca_atoms]

    print(f"Reference structure: {pdb_files[0]} with {len(reference_ca_coords)} CA atoms")

    # Save the reference structure (unchanged) so the output set is complete
    io = PDBIO()
    io.set_structure(reference_structure)
    output_file_ref = os.path.join(directory_to_save, f"aligned_{pdb_files[0]}")
    io.save(output_file_ref)
    print(f"Reference structure saved to {output_file_ref}")

    # Bind the arguments shared by every task; pool.map supplies the file name
    align_func = partial(align_structure,
                         reference_ca_coords=reference_ca_coords,
                         directory_to_parse=directory_to_parse,
                         directory_to_save=directory_to_save)

    # Process files in parallel (excluding the reference file)
    files_to_process = pdb_files[1:]
    print(f"Processing {len(files_to_process)} files in parallel...")

    with mp.Pool(processes=num_processes) as pool:
        results = pool.map(align_func, files_to_process)

    # Print results. align_structure reports failures as strings that START
    # with "Error"; testing the prefix (not a substring) keeps a file whose
    # name contains "Error" from being counted as a failure.
    successful_alignments = 0
    failed_alignments = 0

    for result in results:
        print(result)
        if result.startswith("Error"):
            failed_alignments += 1
        else:
            successful_alignments += 1

    end_time = time.time()
    print(f"\nAlignment process completed in {end_time - start_time:.2f} seconds")
    print(f"Processed {len(pdb_files)} files using {num_processes} parallel processes")
    print(f"Successful alignments: {successful_alignments}")
    print(f"Failed alignments: {failed_alignments}")

    # Optional: Calculate alignment statistics from the successful results
    if successful_alignments > 0:
        print("\nAlignment Statistics:")
        rmsd_values = []
        for result in results:
            if result.startswith("Error") or "RMSD: " not in result:
                continue
            try:
                # The RMSD is the last "RMSD: " field of the message; splitting
                # from the right ignores any such text inside the file name.
                rmsd_str = result.rsplit("RMSD: ", 1)[1].split(" Å")[0]
                rmsd_values.append(float(rmsd_str))
            except (IndexError, ValueError):
                # Malformed message: leave it out of the statistics
                continue

        if rmsd_values:
            import numpy as np
            print(f"  Mean RMSD: {np.mean(rmsd_values):.2f} Å")
            print(f"  Min RMSD: {np.min(rmsd_values):.2f} Å")
            print(f"  Max RMSD: {np.max(rmsd_values):.2f} Å")
            print(f"  Std RMSD: {np.std(rmsd_values):.2f} Å")