# Workbook to work up the Biopython/modelcif packages for bulk PDB alignments

In [None]:
from Bio.PDB import *
import sys
import os
import multiprocessing as mp
from functools import partial
import time

# Configuration
# Input directory: every entry in it whose name ends in '.cif' is processed.
directory_to_parse = './testcif'  # Change to point to the directory with your CIF files
# Output directory: created if missing; results are written here as 'aligned_<name>'.
directory_to_save = './aligned_cif' # Change to point to the directory where aligned files are saved
# Number of worker processes in the multiprocessing pool.
num_processes = mp.cpu_count()  # Use all available CPU cores, or set to a specific number

def align_structure(cif_file, reference_ca_coords, directory_to_parse, directory_to_save):
    """
    Superimpose one CIF structure onto the reference CA coordinates and save it.

    The file `cif_file` is read from `directory_to_parse`. Its atoms named
    'CA' are fitted onto `reference_ca_coords` (paired by list position), the
    resulting transformation is applied to every atom of the structure, and
    the result is written to `directory_to_save` as 'aligned_<cif_file>'.

    Returns a status string: a confirmation containing the output path on
    success, or a message starting with "Error" on failure. Exceptions are
    not propagated; they are converted into such an error message.
    """
    try:
        cif_parser = MMCIFParser(QUIET=True)
        fitter = Superimposer()

        # Read the structure that is going to be moved
        source_path = os.path.join(directory_to_parse, cif_file)
        moving_structure = cif_parser.get_structure(cif_file, source_path)

        # Collect every atom named 'CA' from the moving structure
        moving_ca = []
        for candidate in moving_structure.get_atoms():
            if candidate.get_id() == 'CA':
                moving_ca.append(candidate)

        # Guard clauses: the two CA sets must pair up one-to-one and contain
        # enough points for a fit
        moving_count = len(moving_ca)
        if len(reference_ca_coords) != moving_count:
            return f"Error: The number of CA atoms in '{cif_file}' does not match the reference structure."
        if moving_count < 3:
            return f"Error: At least 3 CA atoms are required for alignment in '{cif_file}'."

        # Superimposer works on atom objects, so wrap each reference
        # coordinate in a placeholder CA atom
        from Bio.PDB.Atom import Atom
        fixed_ca = [
            Atom('CA', xyz, 0, 1, ' ', 'CA', serial, 'C')
            for serial, xyz in enumerate(reference_ca_coords)
        ]

        # Fit on the CA pairs, then move the whole structure
        fitter.set_atoms(fixed_ca, moving_ca)
        fitter.apply(moving_structure.get_atoms())

        # Write the transformed structure next to the other aligned files
        output_file = os.path.join(directory_to_save, f"aligned_{cif_file}")
        writer = MMCIFIO()
        writer.set_structure(moving_structure)
        writer.save(output_file)

        return f"Aligned structure saved to {output_file}"

    except Exception as e:
        return f"Error processing {cif_file}: {str(e)}"

# Main execution
# The guard is required for multiprocessing: with the spawn/forkserver start
# methods each worker re-imports the main module, and without the guard it
# would re-run this whole script (and try to start its own pool).
if __name__ == "__main__":
    print(f"Starting parallel alignment with {num_processes} processes...")
    start_time = time.time()

    # Validate the input first so a failed run does not leave an empty
    # output directory behind.
    if not os.path.exists(directory_to_parse):
        print(f"Error: The directory '{directory_to_parse}' does not exist.")
        sys.exit(1)

    directory_entries = os.listdir(directory_to_parse)
    if not directory_entries:
        print(f"Error: The directory '{directory_to_parse}' is empty.")
        sys.exit(1)

    # os.listdir returns entries in arbitrary order; sort them so that the
    # reference structure (the first file) is the same on every run.
    cif_files = sorted(f for f in directory_entries if f.endswith('.cif'))
    if not cif_files:
        print(f"Error: No CIF files found in the directory '{directory_to_parse}'.")
        sys.exit(1)

    if len(cif_files) < 2:
        print("Error: At least two CIF files are required for alignment.")
        sys.exit(1)

    print(f"Found {len(cif_files)} CIF files to process")

    # Get the reference structure (first file) and extract CA coordinates
    parser = MMCIFParser(QUIET=True)
    reference_structure = parser.get_structure("reference", os.path.join(directory_to_parse, cif_files[0]))
    reference_ca_atoms = [atom for atom in reference_structure.get_atoms() if atom.get_id() == 'CA']
    reference_ca_coords = [atom.get_coord() for atom in reference_ca_atoms]

    print(f"Reference structure: {cif_files[0]} with {len(reference_ca_coords)} CA atoms")

    # Every worker rejects alignments with fewer than 3 CA atoms, so stop
    # here instead of letting each file fail individually.
    if len(reference_ca_coords) < 3:
        print(f"Error: At least 3 CA atoms are required for alignment in '{cif_files[0]}'.")
        sys.exit(1)

    # Ensure the output directory exists
    os.makedirs(directory_to_save, exist_ok=True)

    # Save the reference structure (unchanged)
    io = MMCIFIO()
    io.set_structure(reference_structure)
    output_file_ref = os.path.join(directory_to_save, f"aligned_{cif_files[0]}")
    io.save(output_file_ref)
    print(f"Reference structure saved to {output_file_ref}")

    # Prepare the partial function with fixed arguments
    align_func = partial(align_structure,
                         reference_ca_coords=reference_ca_coords,
                         directory_to_parse=directory_to_parse,
                         directory_to_save=directory_to_save)

    # Process files in parallel (excluding the reference file)
    files_to_process = cif_files[1:]
    print(f"Processing {len(files_to_process)} files in parallel...")

    with mp.Pool(processes=num_processes) as pool:
        results = pool.map(align_func, files_to_process)

    # Print results
    for result in results:
        print(result)

    # align_structure reports every failure as a string starting with
    # "Error"; count those so the summary does not claim success wrongly.
    failed_results = [result for result in results if result.startswith("Error")]

    end_time = time.time()
    if failed_results:
        print(f"\nAlignment process completed with {len(failed_results)} error(s) in {end_time - start_time:.2f} seconds")
    else:
        print(f"\nAlignment process completed successfully in {end_time - start_time:.2f} seconds")
    print(f"Processed {len(cif_files)} files using {num_processes} parallel processes")


