In [1]:
import os
import pandas as pd
import json
import shutil
import numpy as np
import modelcif.reader
import modelcif
import biotite
import biotite.structure as struc
import biotite.structure.io as strucio

def parse_boltz2_results(directory):
    results = []
        
    # Get confidence and affinity files
    confidence_files = [f for f in os.listdir(directory) if f.startswith('confidence_') and f.endswith('.json')]
    affinity_files = [f for f in os.listdir(directory) if f.startswith('affinity_') and f.endswith('.json')]
    
    # Create a dictionary to store affinity data by model key
    affinity_dict = {}
    
    # Process affinity files first to build lookup dictionary
    for aff_file in affinity_files:
        base_name = aff_file.replace('affinity_', '').replace('.json', '')  # Extract base name
        model_key = f"{base_name}"  # Key matches confidence file base name without `_model_0`
        
        try:
            affinity_data = json.load(open(os.path.join(directory, aff_file), 'r'))
            affinity_dict[model_key] = affinity_data
            #print(f"Loaded affinity data for key: '{model_key}' from file: {aff_file}")  # Debugging statement
        except Exception as e:
            print(f"Error loading affinity file {aff_file}: {e}")

    # Process confidence files and merge with affinity data
    for conf_file in confidence_files:
        model_index = conf_file.split('_')[-1].split('.')[0]  # Extract model index from filename
        base_name = conf_file.replace('confidence_', '').replace(f'_model_{model_index}.json', '')  # Extract base name
        model_file = f"{base_name}_model_{model_index}.pdb"  # Assuming the model files are in PDB format
        model_path = os.path.join(directory, model_file)
        model_key = f"{base_name}"  # Key matches affinity file base name

        # Load confidence data
        try:
            confidence_data = json.load(open(os.path.join(directory, conf_file), 'r'))
        except Exception as e:
            print(f"Error loading confidence file {conf_file}: {e}")
            continue
        
        #print(f"Processing confidence file: {conf_file}")
        #print(f"Generated model key: '{model_key}'")
        #print(f"Constructed model path: {model_path}")  # Debugging statement

        if os.path.exists(model_path):
                        
            # Create the result dictionary with confidence data
            result_entry = {
                'model_path': model_path,
                'model_index': model_index,
                'confidence_score': confidence_data['confidence_score'],
                'ptm': confidence_data['ptm'],
                'iptm': confidence_data['iptm'],
                'ligand_iptm': confidence_data['ligand_iptm'],
                'protein_iptm': confidence_data['protein_iptm'],
                'complex_plddt': confidence_data['complex_plddt'],
                'complex_iplddt': confidence_data['complex_iplddt'],
                'complex_pde': confidence_data['complex_pde'],
                'complex_ipde': confidence_data['complex_ipde'],
                'chains_ptm': confidence_data['chains_ptm'],
                'pair_chains_iptm': confidence_data['pair_chains_iptm']
            }
            
            # Add affinity data if available for this model
            if model_key in affinity_dict:
                affinity_data = affinity_dict[model_key]
                result_entry.update({
                    'affinity_pred_value': affinity_data.get('affinity_pred_value', None),
                    'affinity_probability_binary': affinity_data.get('affinity_probability_binary', None),
                    'affinity_pred_value1': affinity_data.get('affinity_pred_value1', None),
                    'affinity_probability_binary1': affinity_data.get('affinity_probability_binary1', None),
                    'affinity_pred_value2': affinity_data.get('affinity_pred_value2', None),
                    'affinity_probability_binary2': affinity_data.get('affinity_probability_binary2', None)
                })
                #print(f"✓ Successfully added affinity data for: {model_key}")  # Debugging statement
            else:
                # Add None values for affinity data if not available
                result_entry.update({
                    'affinity_pred_value': None,
                    'affinity_probability_binary': None,
                    'affinity_pred_value1': None,
                    'affinity_probability_binary1': None,
                    'affinity_pred_value2': None,
                    'affinity_probability_binary2': None
                })
                print(f"✗ No affinity data found for: '{model_key}'")  # Debugging statement
                print(f"  Available keys: {list(affinity_dict.keys())}")
            
            results.append(result_entry)
    
        else:
            print(f"Model file does not exist: {model_path}")  # Debugging statement

    #print(f"\nProcessed {len(results)} models total")
    print(f"Affinity data matched for {sum(1 for r in results if r.get('affinity_pred_value') is not None)} models")
    
    return pd.DataFrame(results)


# Function to combine the CSV results from multiple files:
def combine_csv_results(file_list):
    combined_df = pd.concat([pd.read_csv(f) for f in file_list if f.endswith('_results.csv')])
    return combined_df

In [6]:
folder_list = os.listdir('./') # Define the root folder where the boltz-2 output folders are located

confidence_value = float(0.7) # Define the boltz-2 confidence value to filter results

affinity_value = float(0.1)

##### May want to change these variables #####
# 157 is Histadine, 89 is Tyrosine, 211 is Phe
# Note: the distance array starts at 0, so amino acids numbers from chimerax will be +1

binding_site_residue1 = int(157)
binding_site_residue2 = None  # You can set this to None if you don't want to filter by a second residue
binding_site_residue3 = None  # You can set this to None if you don't want to filter by a third residue
distance_threshold = float(15.0)

In [7]:
# Parsing results and initial confidence filtering

#print(f"Found folders: {folder_list}")  # Debugging statement

for folder in folder_list:
    if folder.startswith('boltz_results_'):
        predictions_folder = os.path.join(folder, 'predictions/', folder.split('_')[2])
        if not os.path.exists(predictions_folder):
            print(f"Predictions folder does not exist: {predictions_folder}")
            continue
        else:
            print(f"working on folder: {predictions_folder}")
        print(f"Processing folder: {predictions_folder}")  # Debugging statement
        results_df = parse_boltz2_results(os.path.join('./', predictions_folder))
        # For example, to save it as a CSV file:
        results_df.to_csv(f"{folder}_results.csv", index=False)
    else:
        continue

# Combine all the CSV files into a single DataFrame

file_list = [f for f in os.listdir('./') if f.endswith('_results.csv')]

combined_df = combine_csv_results(file_list)
combined_df.to_csv('boltz_results_combined.csv', index=False)
print("Combined results saved to 'boltz_results_combined.csv'")  # Debugging statement

# remove the temporary .csv files
for file in file_list:
    if os.path.exists(file):
        os.remove(file)
        #print(f"Removed temporary file: {file}")  # Debugging statement
    else:
        continue

# Filtering the DataFrame to include only rows with a confidence score greater than the specified value 
confidence_filtered_df = combined_df[combined_df['confidence_score'] > confidence_value]
confidence_filtered_df.to_csv('boltz_results_confidence_filtered.csv', index=False)

# Using the filtered DataFrame to copy the corresponding .pdb model files into a new directory
confidence_filtered_dir = 'confidence_filtered_models'
os.makedirs(confidence_filtered_dir, exist_ok=True)
for index, row in confidence_filtered_df.iterrows():
    model_path = row['model_path']
    if os.path.exists(model_path):
        # Copy the model file to the output directory
        os.system(f"cp {model_path} {confidence_filtered_dir}")
        print(f"Copied {model_path} to {confidence_filtered_dir}")
    else:
        print(f"Model file does not exist: {model_path}")  # Debugging statement

affinity_filtered_df = combined_df[combined_df['affinity_pred_value'] > affinity_value]
affinity_filtered_df.to_csv('boltz_results_affinity_filtered.csv', index=False)

# Using the filtered DataFrame to copy the corresponding .pdb model files into a new directory
affinity_filtered_dir = 'affinity_filtered_models'
os.makedirs(affinity_filtered_dir, exist_ok=True)
for index, row in affinity_filtered_df.iterrows():
    model_path = row['model_path']
    if os.path.exists(model_path):
        # Copy the model file to the output directory
        os.system(f"cp {model_path} {affinity_filtered_dir}")
        print(f"Copied {model_path} to {affinity_filtered_dir}")
    else:
        print(f"Model file does not exist: {model_path}")  # Debugging statement

# Copy all the .pdb files from the combined results to a new directory
combined_output_dir = 'combined_models'
os.makedirs(combined_output_dir, exist_ok=True)
for file in combined_df['model_path'].unique():
    if os.path.exists(file):
        # Copy the model file to the output directory
        os.system(f"cp {file} {combined_output_dir}")
        print(f"Copied {file} to {combined_output_dir}")
    else:
        print(f"File does not exist: {file}")  # Debugging statement


working on folder: boltz_results_391/predictions/391
Processing folder: boltz_results_391/predictions/391
Affinity data matched for 1 models
Predictions folder does not exist: boltz_results_combined.csv/predictions/combined.csv
Predictions folder does not exist: boltz_results_confidence_filtered.csv/predictions/confidence
working on folder: boltz_results_4839/predictions/4839
Processing folder: boltz_results_4839/predictions/4839
Affinity data matched for 1 models
working on folder: boltz_results_1945/predictions/1945
Processing folder: boltz_results_1945/predictions/1945
Affinity data matched for 1 models
working on folder: boltz_results_4897/predictions/4897
Processing folder: boltz_results_4897/predictions/4897
Affinity data matched for 1 models
Predictions folder does not exist: boltz_results_filtered.csv/predictions/filtered.csv
working on folder: boltz_results_5789/predictions/5789
Processing folder: boltz_results_5789/predictions/5789
Affinity data matched for 1 models
Combined 