# Workbook for filtering and plotting Boltz-2 output data

In [None]:
# Imports

import os
import pandas as pd
import json
import shutil
import numpy as np
import modelcif.reader
import modelcif
import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
from filtering_functions import *
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing as mp
from functools import partial
import time

In [None]:
# Setup parameters
#
# Every value in this cell is read by the later cells of the workbook.

# Entries of the root folder where the boltz-2 output folders are located
folder_list = os.listdir('./')

# Boltz-2 confidence value used to filter results
confidence_value = 0.7

# Boltz-2 affinity value used to filter results
# (this is the predicted binding affinity, not the experimental one)
affinity_value = 0.1

##### May want to change these variables #####
# 157 is Histidine, 89 is Tyrosine, 211 is Phe
# Note: the distance array starts at 0, so amino acid numbers from ChimeraX will be +1

binding_site_residue1 = 157
# Leave the next two as None to skip filtering by a second / third residue
binding_site_residue2 = None
binding_site_residue3 = None
distance_threshold = 15.0

In [None]:
# Parse every boltz-2 output folder into its own temporary CSV, merge those
# CSVs into one DataFrame, and copy the referenced model files into a single
# directory.
for folder in folder_list:
    if not folder.startswith('boltz_results_'):
        continue
    # The predictions sub-folder is named after the third '_'-separated token
    # of the output folder name.
    predictions_folder = os.path.join(folder, 'predictions/', folder.split('_')[2])
    if not os.path.exists(predictions_folder):
        print(f"Predictions folder does not exist: {predictions_folder}")
        continue
    print(f"Processing folder: {predictions_folder}")  # Debugging statement
    results_df = parse_boltz2_results(os.path.join('./', predictions_folder))
    results_df.to_csv(f"{folder}_results.csv", index=False)

# Combine all the per-folder CSV files into a single DataFrame
file_list = [f for f in os.listdir('./') if f.endswith('_results.csv')]

combined_df = combine_csv_results(file_list)
combined_df.to_csv('boltz_results_combined.csv', index=False)
print("Combined results saved to 'boltz_results_combined.csv'")  # Debugging statement

# Remove the temporary per-folder .csv files
for csv_file in file_list:
    if os.path.exists(csv_file):
        os.remove(csv_file)

# Copy every model file referenced by the combined results to a new directory.
# shutil.copy is used instead of shelling out to `cp` so that paths containing
# spaces or shell metacharacters are handled correctly and failures raise
# instead of being silently reported as successful copies.
combined_output_dir = 'combined_models'
os.makedirs(combined_output_dir, exist_ok=True)
for model_file in combined_df['model_path'].unique():
    if os.path.isfile(model_file):
        shutil.copy(model_file, combined_output_dir)
        print(f"Copied {model_file} to {combined_output_dir}")
    else:
        print(f"File does not exist: {model_file}")  # Debugging statement




In [None]:
# Calculate the hydrogen bonds between the Receptor and Ligand for each model
# and append the results to the combined DataFrame.

# Populate pdb_file_list with the path of every PDB file found under
# <folder>/predictions/<id>/ (also reused by the distance-filtering cell).
pdb_file_list = []
for folder in folder_list:
    if not folder.startswith('boltz_results_'):
        continue
    # Extract the identifier (third '_'-separated token) from the folder name
    folder_id = folder.split('_')[2]
    predictions_folder = os.path.join(folder, 'predictions', folder_id)
    if not os.path.exists(predictions_folder):
        continue
    pdb_file_list.extend(
        os.path.join(predictions_folder, f)
        for f in os.listdir(predictions_folder)
        if f.endswith('.pdb')
    )

print(f"Found {len(pdb_file_list)} PDB files to process")

# Use all available CPU cores. The original cell also computed a value capped
# at 8 but immediately overwrote it, so this is the value that took effect.
# If memory is limited, replace with e.g. min(mp.cpu_count(), 8).
num_processes = mp.cpu_count()
print(f"Starting hydrogen bond analysis with {num_processes} processes...")

start_time = time.time()

with mp.Pool(processes=num_processes) as pool:
    results = pool.map(process_hydrogen_bonds, pdb_file_list)

# Filter out None results and collect hydrogen bond data
hydrogen_bond_data = [result for result in results if result is not None]

end_time = time.time()
print(f"Hydrogen bond analysis completed in {end_time - start_time:.2f} seconds")
print(f"Successfully processed {len(hydrogen_bond_data)} files")

# pd.concat fails with an unhelpful "No objects to concatenate" on an empty
# list; raise the same exception type with an actionable message instead.
if not hydrogen_bond_data:
    raise ValueError(
        "No hydrogen bond data was produced: no PDB files were found or "
        "every file failed to process."
    )

# One row per model, indexed by the name carried on each result
hydrogen_bond_df = pd.concat(hydrogen_bond_data, axis=1).transpose()

# Rename the columns
hydrogen_bond_df.columns = ['num_hbonds']  # Set the column name for the hydrogen bond count
hydrogen_bond_df.index.name = 'model_path'  # Set the index name to 'model_path'

# Prefix paths with './' so they can be matched against combined_df['model_path']
hydrogen_bond_df.index = hydrogen_bond_df.index.map(lambda x: f"./{x}" if not x.startswith("./") else x)

hydrogen_bond_df.to_csv('hydrogen_bond_data.csv', index=True)
print("Hydrogen bond data saved to 'hydrogen_bond_data.csv'")

# Combine the hydrogen bond data with the combined DataFrame. Dropping any
# existing 'num_hbonds' column first keeps the cell safe to re-run (otherwise
# the merge would create 'num_hbonds_x' / 'num_hbonds_y').
combined_df = pd.merge(
    combined_df.drop(columns=['num_hbonds'], errors='ignore'),
    hydrogen_bond_df,
    left_on='model_path',
    right_index=True,
    how='left',
)

# Save the combined DataFrame with hydrogen bond data to a new CSV file
combined_df.to_csv('boltz_results_combined_with_hbonds.csv', index=False)

In [None]:
# Calculate the distance from the center of mass for each model for distance filtering
#
# For every PDB file: take the CA positions of chain A and the atoms of chain
# B (the ligand), compute the ligand center of mass, and store the distance of
# every CA atom to it. Models that pass filter_by_sites are copied to
# 'distance_filtered'.

# `pdb` is not imported in the notebook's import cell; import it here so this
# cell does not depend on the name leaking through `from filtering_functions import *`.
import biotite.structure.io.pdb as pdb

distance_filtered_dir = 'distance_filtered'
os.makedirs(distance_filtered_dir, exist_ok=True)

distance_data = []  # One pd.Series of distances per model, named by its file path

for file_path in pdb_file_list:
    if not os.path.isfile(file_path):  # Skip if it's not a file
        print(f"Skipping non-file: {file_path}")  # Debugging statement
        continue

    try:
        pdb_file_obj = pdb.PDBFile.read(file_path)
        pdb_data = pdb_file_obj.get_structure()

        CA_positions = extract_ca_positions(pdb_data, chain_id='A')
        Lig_positions = extract_atoms_by_chain(pdb_data, chain_id='B')
        Lig_COM = calculate_center_of_mass(Lig_positions)
        distances = calculate_distances_to_com(CA_positions, Lig_COM)

        # Append the distances and filename to the list
        distance_data.append(pd.Series(distances, name=file_path))

        if filter_by_sites(distances, binding_site_residue1, binding_site_residue2, binding_site_residue3, distance_threshold):
            # shutil.copy instead of a shell `cp` so unusual characters in the
            # path cannot break (or inject into) a shell command.
            shutil.copy(file_path, distance_filtered_dir)
    except FileNotFoundError as e:
        print(f"Error processing file {file_path}: {e}")

# pd.concat fails with an unhelpful "No objects to concatenate" on an empty
# list; raise the same exception type with an actionable message instead.
if not distance_data:
    raise ValueError(
        "No distance data was produced: pdb_file_list is empty or no file "
        "could be read."
    )

# Combine all the distance data into a single DataFrame (one row per model)
distance_df = pd.concat(distance_data, axis=1).transpose()

# Write the distance data to a CSV file. This happens before the index is
# renamed/normalised below, so the CSV keeps the raw file paths and an unnamed
# first column; the plotting cell reads it back in that form.
distance_df.to_csv('distance_matrices.csv', index=True)

# Name the index and prefix paths with './' so they match combined_df['model_path']
distance_df.index.name = 'model_path'
distance_df.index = distance_df.index.map(lambda x: f"./{x}" if not x.startswith("./") else x)

# Reset the index to make 'model_path' a regular column
distance_df = distance_df.reset_index()

# Define the binding site residue column to extract
binding_site_residue_column = binding_site_residue1  # Ensure the column name matches the residue identifier

if binding_site_residue_column in distance_df.columns:
    # Group by model_path and select the mean distance for the specified residue
    reduced_distance_df = distance_df.groupby('model_path', as_index=False)[binding_site_residue_column].mean()
    reduced_distance_df.rename(columns={binding_site_residue_column: 'distance_to_residue'}, inplace=True)

    # Save the reduced DataFrame to a CSV file
    reduced_distance_df.to_csv('reduced_distance_data.csv', index=False)
    print("Reduced distance data saved to 'reduced_distance_data.csv'")
else:
    print(f"Column '{binding_site_residue_column}' not found in distance_df.")
    # Previously this path left reduced_distance_df undefined and the merge
    # below raised NameError. Merge an empty frame instead so combined_df
    # still gains a (NaN-filled) 'distance_to_residue' column.
    reduced_distance_df = pd.DataFrame({
        'model_path': pd.Series(dtype='object'),
        'distance_to_residue': pd.Series(dtype='float64'),
    })

# Combine the distance data with the combined DataFrame. Dropping any existing
# 'distance_to_residue' column first keeps the cell safe to re-run.
combined_df = pd.merge(
    combined_df.drop(columns=['distance_to_residue'], errors='ignore'),
    reduced_distance_df,
    on='model_path',
    how='left',
)
combined_df.to_csv('boltz_results_combined_with_hbonds_distances.csv', index=False)
print("Combined DataFrame with distances saved to 'boltz_results_combined_with_hbonds_distances.csv'")




In [None]:
# Plot distribution of confidence scores and affinity prediction values
# as two side-by-side histograms (with KDE) and save the figure to disk.
score_panels = (
    ('confidence_score', 'Distribution of Confidence Scores', 'Confidence Score'),
    ('affinity_pred_value', 'Distribution of Affinity Prediction Values', 'Affinity Prediction Value'),
)

plt.figure(figsize=(12, 6))
for panel_number, (column, panel_title, x_label) in enumerate(score_panels, start=1):
    plt.subplot(1, 2, panel_number)
    sns.histplot(combined_df[column], bins=30, kde=True)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.savefig('boltz_results_distribution.png')
plt.show()

In [None]:
# Plot the distribution of distances to specific residues
# The distance data is expected to be in a CSV file named 'distance_matrices.csv'

distance_data = pd.read_csv('distance_matrices.csv')

# The distance data has the first column as the model name and each subsequent
# column as the distance to a specific residue.

# Column positions to plot. These use plot-local names on purpose: the
# original cell reassigned binding_site_residue1..3 and distance_threshold,
# silently changing the setup parameters used by the filtering cells.
# NOTE(review): the positions are used with .iloc on a frame whose column 0 is
# the model name, so position p selects the CSV column labelled p - 1 -
# confirm this matches the intended residue numbering.
plot_residue_positions = (157, 90, 212, 57)

plt.figure(figsize=(10, 10))

for panel_number, residue_position in enumerate(plot_residue_positions, start=1):
    plt.subplot(2, 2, panel_number)
    sns.histplot(distance_data.iloc[:, residue_position], bins=50, kde=True)
    plt.title(f'Distribution of Distances to Residue {residue_position}')
    plt.xlabel('Mean ligand distance to Residue (angstroms)')
    plt.ylabel('Frequency')
    plt.grid()

plt.tight_layout()
plt.savefig('distance_distribution_plots.png')
plt.show()

# Summarise every residue column (all columns after the model name) by its
# mean and standard deviation across all models.
residue_distances = distance_data.iloc[:, 1:]

distance_summary = pd.DataFrame({
    'Residue': residue_distances.columns,
    'Mean Distance': residue_distances.mean().to_numpy(),
    'Standard Deviation': residue_distances.std().to_numpy(),
})

# Plot the mean and standard deviation
plt.figure(figsize=(40, 10))
plt.errorbar(distance_summary['Residue'], distance_summary['Mean Distance'], yerr=distance_summary['Standard Deviation'], fmt='o', capsize=5)
plt.title('Mean and Standard Deviation of Distances to Residues')
plt.xlabel('Residue')
plt.ylabel('Mean Distance (angstroms)')
plt.xticks(rotation=45)
plt.grid()
plt.tight_layout()
plt.savefig('mean_std_distance_plot.png')
plt.show()

# Keep only the residues whose mean distance is below this threshold. This is
# local to the summary and deliberately separate from the setup cell's
# distance_threshold.
mean_distance_threshold = 12.0

filtered_distance_data = distance_summary[
    distance_summary['Mean Distance'] < mean_distance_threshold
]

# Save the filtered distance data to a CSV file
filtered_distance_data.to_csv('filtered_distance_data.csv', index=False)


In [None]:
# Filter the combined dataframe based on all the combined metrics
# A model is kept when its confidence score is above confidence_value, its
# predicted affinity value is below affinity_value, and its distance to the
# binding-site residue is below distance_threshold.
filtered_combined_df = combined_df[
    (combined_df['confidence_score'] > confidence_value) &
    (combined_df['affinity_pred_value'] < affinity_value) &
    (combined_df['distance_to_residue'] < distance_threshold)
]

filtered_combined_df.to_csv('Final_Filtered_models.csv', index=False)

filtered_dir = 'filtered_models'
os.makedirs(filtered_dir, exist_ok=True)

# Copy each surviving model file to the output directory. shutil.copy replaces
# the shell `cp` call so paths with spaces or shell metacharacters work and a
# failed copy raises instead of being reported as "Copied".
for model_path in filtered_combined_df['model_path']:
    if os.path.isfile(model_path):
        shutil.copy(model_path, filtered_dir)
        print(f"Copied {model_path} to {filtered_dir}")
    else:
        print(f"Model file does not exist: {model_path}")  # Debugging statement

In [None]:
# Align the filtered models to the reference structure
#
# The first PDB file (in sorted order) of directory_to_parse is the reference;
# every other file is passed to align_structure in parallel together with the
# reference CA coordinates.

directory_to_parse = './filtered_models'  # Change to point to directory with your PDB files
directory_to_save = './filtered_aligned' # Change to point to directory where you want to save aligned files
num_processes = mp.cpu_count()  # Use all available CPU cores, or set to a specific number

# Main execution
print(f"Starting parallel PDB alignment with {num_processes} processes...")
start_time = time.time()

# Ensure the output directory exists
os.makedirs(directory_to_save, exist_ok=True)

# Input validation. `raise SystemExit(1)` is exactly what sys.exit(1) does,
# but does not depend on `sys`, which the notebook's import cell never imports.
if not os.path.exists(directory_to_parse):
    print(f"Error: The directory '{directory_to_parse}' does not exist.")
    raise SystemExit(1)

if not os.listdir(directory_to_parse):
    print(f"Error: The directory '{directory_to_parse}' is empty.")
    raise SystemExit(1)

# Sorted so the choice of reference structure is reproducible; os.listdir
# returns entries in arbitrary order.
pdb_files = sorted(f for f in os.listdir(directory_to_parse) if f.endswith('.pdb'))
if not pdb_files:
    print(f"Error: No PDB files found in the directory '{directory_to_parse}'.")
    raise SystemExit(1)

if len(pdb_files) < 2:
    print("Error: At least two PDB files are required for alignment.")
    raise SystemExit(1)

print(f"Found {len(pdb_files)} PDB files to process")

# Get the reference structure (first file) and extract CA coordinates
# NOTE(review): PDBParser and PDBIO are not imported in the notebook's import
# cell - presumably they arrive via `from filtering_functions import *`; confirm.
parser = PDBParser(QUIET=True)
reference_structure = parser.get_structure("reference", os.path.join(directory_to_parse, pdb_files[0]))
reference_ca_atoms = [atom for atom in reference_structure.get_atoms() if atom.get_id() == 'CA']
reference_ca_coords = [atom.get_coord() for atom in reference_ca_atoms]

print(f"Reference structure: {pdb_files[0]} with {len(reference_ca_coords)} CA atoms")

# Save the reference structure (unchanged)
io = PDBIO()
io.set_structure(reference_structure)
output_file_ref = os.path.join(directory_to_save, f"aligned_{pdb_files[0]}")
io.save(output_file_ref)
print(f"Reference structure saved to {output_file_ref}")

# Prepare the partial function with fixed arguments
align_func = partial(align_structure,
                     reference_ca_coords=reference_ca_coords,
                     directory_to_parse=directory_to_parse,
                     directory_to_save=directory_to_save)

# Process files in parallel (excluding the reference file)
files_to_process = pdb_files[1:]
print(f"Processing {len(files_to_process)} files in parallel...")

with mp.Pool(processes=num_processes) as pool:
    results = pool.map(align_func, files_to_process)

# Print results; a result message containing "Error" counts as a failure
successful_alignments = 0
failed_alignments = 0

for result in results:
    print(result)
    if "Error" in result:
        failed_alignments += 1
    else:
        successful_alignments += 1

end_time = time.time()
print(f"\nAlignment process completed in {end_time - start_time:.2f} seconds")
print(f"Processed {len(pdb_files)} files using {num_processes} parallel processes")
print(f"Successful alignments: {successful_alignments}")
print(f"Failed alignments: {failed_alignments}")

# Optional: Calculate alignment statistics
if successful_alignments > 0:
    print("\nAlignment Statistics:")
    rmsd_values = []
    for result in results:
        if "RMSD:" not in result:
            continue
        try:
            # Extract the number between "RMSD: " and " Å"
            rmsd_str = result.split("RMSD: ")[1].split(" Å")[0]
            rmsd_values.append(float(rmsd_str))
        except (IndexError, ValueError):
            # Message did not have the expected "RMSD: <number> Å" shape;
            # skip it rather than swallowing every possible exception.
            continue

    if rmsd_values:
        print(f"  Mean RMSD: {np.mean(rmsd_values):.2f} Å")
        print(f"  Min RMSD: {np.min(rmsd_values):.2f} Å")
        print(f"  Max RMSD: {np.max(rmsd_values):.2f} Å")
        print(f"  Std RMSD: {np.std(rmsd_values):.2f} Å")

In [None]:
# Save Data and zip results for sharing:
#
# Stage the result CSVs, the model directories and every .png plot in a
# temporary directory, zip that directory, then remove it.

# Base name of the archive to create (shutil.make_archive appends '.zip')
archive_name = f'boltz_results_archive_confidence{confidence_value}_affinity{affinity_value}'

# Temporary staging directory for the archive contents
temp_dir = 'temp_archive'

# Start from a clean staging directory so leftovers from an interrupted run
# cannot end up in the new archive.
shutil.rmtree(temp_dir, ignore_errors=True)
os.makedirs(temp_dir, exist_ok=True)

try:
    # Copy the .csv files to the temporary directory
    for csv_name in (
        'boltz_results_combined_with_hbonds_distances.csv',
        'distance_matrices.csv',
        'Final_Filtered_models.csv',
    ):
        shutil.copy(csv_name, temp_dir)

    # Copy the filtered, combined and aligned model directories
    for results_dir in ('filtered_models', 'combined_models', 'filtered_aligned'):
        shutil.copytree(results_dir, os.path.join(temp_dir, results_dir), dirs_exist_ok=True)

    # Copy all the .png files to the temporary directory
    for png_file in os.listdir('.'):
        if png_file.endswith('.png'):
            shutil.copy(png_file, temp_dir)

    # Create the archive with the staged contents
    shutil.make_archive(archive_name, 'zip', temp_dir)
finally:
    # Clean up the temporary directory even when a copy above fails
    shutil.rmtree(temp_dir, ignore_errors=True)

print("Results Filtered and Collated comrade")