# Utility notebook for extracting confidence scores from boltz2 predictions

In [None]:
import os
import pandas as pd
import json
import shutil

# This script is designed to parse the results from boltz2 predictions, extract relevant data,
# and save it in a structured format for further analysis.

# Data output structure from boltz2
"""
So boltz2 outputs data in a directory structure like this:

~/boltz_results_MIPS-0051357/predictions/MIPS-0051357

Inside this directory there are multiple files, including the confidence scores per model:
confidence_MIPS-0051357_model_0.json
confidence_MIPS-0051357_model_1.json
confidence_MIPS-0051357_model_2.json
confidence_MIPS-0051357_model_3.json
confidence_MIPS-0051357_model_4.json

and the models themselves:
MIPS-0051357_model_0.cif
MIPS-0051357_model_1.cif
MIPS-0051357_model_2.cif
MIPS-0051357_model_3.cif
MIPS-0051357_model_4.cif

The confidence files are in JSON format, and look like this:
{
    "confidence_score": 0.8774623870849609,
    "ptm": 0.7683840990066528,
    "iptm": 0.8961464762687683,
    "ligand_iptm": 0.8961464762687683,
    "protein_iptm": 0.0,
    "complex_plddt": 0.8727914094924927,
    "complex_iplddt": 0.7528989315032959,
    "complex_pde": 0.8251760005950928,
    "complex_ipde": 2.783323049545288,
    "chains_ptm": {
        "0": 0.7612196803092957,
        "1": 0.945591151714325
    },
    "pair_chains_iptm": {
        "0": {
            "0": 0.7612196803092957,
            "1": 0.4030245244503021
        },
        "1": {
            "0": 0.8961464762687683,
            "1": 0.945591151714325
        }
    }
}
"""

def parse_boltz2_results(directory):
    """Collect the boltz2 confidence scores found in ``directory`` into a DataFrame.

    Every ``confidence_<name>_model_<n>.json`` file in ``directory`` is paired
    with the model file ``<name>_model_<n>.cif`` in the same directory.
    Confidence files whose model file is missing are reported and skipped
    (their JSON is not read).

    Args:
        directory: Path of the folder holding the confidence ``.json`` files
            and the ``.cif`` model files.

    Returns:
        pandas.DataFrame with one row per confidence/model pair. Columns are
        ``model_path``, ``model_index`` (a string) and the score fields read
        from the JSON. The frame is empty when no pair is found.

    Raises:
        KeyError: If a confidence file lacks one of the expected score fields.
    """
    score_keys = (
        'confidence_score',
        'ptm',
        'iptm',
        'ligand_iptm',
        'protein_iptm',
        'complex_plddt',
        'complex_iplddt',
        'complex_pde',
        'complex_ipde',
        'chains_ptm',
        'pair_chains_iptm',
    )
    prefix, suffix = 'confidence_', '.json'

    results = []
    print(f"Scanning directory: {directory}")  # Debugging statement
    # Sorted so the row order does not depend on the file system's listing order
    confidence_files = sorted(
        f for f in os.listdir(directory)
        if f.startswith(prefix) and f.endswith(suffix)
    )
    print(f"Found confidence files: {confidence_files}")  # Debugging statement

    for conf_file in confidence_files:
        # 'confidence_<name>_model_<n>.json' -> '<name>_model_<n>'
        stem = conf_file[len(prefix):-len(suffix)]
        model_index = stem.rsplit('_', 1)[-1]  # Text after the last underscore
        model_path = os.path.join(directory, f"{stem}.cif")
        print(f"Constructed model path: {model_path}")  # Debugging statement

        if not os.path.exists(model_path):
            print(f"Model file does not exist: {model_path}")  # Debugging statement
            continue
        print(f"Model file exists: {model_path}")  # Debugging statement

        # Context manager so the file handle is closed promptly
        with open(os.path.join(directory, conf_file), 'r') as handle:
            confidence_data = json.load(handle)

        record = {'model_path': model_path, 'model_index': model_index}
        record.update((key, confidence_data[key]) for key in score_keys)
        results.append(record)

    return pd.DataFrame(results)

# Function to combine the CSV results from multiple files:
def combine_csv_results(file_list):
    """Concatenate every ``*_results.csv`` file in ``file_list`` into one DataFrame.

    Paths that do not end in ``_results.csv`` are ignored. Files that contain
    no parseable columns (for example a zero-byte CSV) are skipped instead of
    aborting the whole run.

    Args:
        file_list: Iterable of CSV file paths.

    Returns:
        pandas.DataFrame holding the rows of all readable files with a fresh
        0..n-1 index, or an empty DataFrame when there is nothing to combine.
    """
    frames = []
    for path in file_list:
        if not path.endswith('_results.csv'):
            continue
        try:
            frames.append(pd.read_csv(path))
        except pd.errors.EmptyDataError:
            print(f"Skipping empty results file: {path}")

    # pd.concat raises ValueError on an empty list, so handle that case explicitly
    if not frames:
        return pd.DataFrame()
    # ignore_index avoids duplicate index labels coming from the individual files
    return pd.concat(frames, ignore_index=True)

folder_list = os.listdir('./')

print(f"Found folders: {folder_list}")  # Debugging statement

for folder in folder_list:
    # Process only directories whose name starts with 'boltz_results_'
    if not (folder.startswith('boltz_results_') and os.path.isdir(folder)):
        print(f"Skipping folder: {folder} (does not start with 'boltz_results_')")
        continue

    # The .json files sit two levels further down in the directory structure.
    # For example, for 'boltz_results_MIPS-0051357' they are in
    # 'boltz_results_MIPS-0051357/predictions/MIPS-0051357'.
    # Strip the prefix rather than splitting on '_' so that prediction names
    # which themselves contain underscores are kept whole.
    prediction_name = folder[len('boltz_results_'):]
    predictions_folder = os.path.join(folder, 'predictions', prediction_name)
    if not os.path.exists(predictions_folder):
        print(f"Predictions folder does not exist: {predictions_folder}")
        continue

    print(f"working on folder: {predictions_folder}")
    print(f"Processing folder: {predictions_folder}")  # Debugging statement
    results_df = parse_boltz2_results(os.path.join('./', predictions_folder))

    if results_df.empty:
        # A frame with no rows and no columns would produce a CSV that may not
        # be readable when the per-folder results are combined, so write nothing.
        print(f"No results found in: {predictions_folder}")
        continue

    # Save the per-folder results as a temporary CSV file
    results_df.to_csv(f"{folder}_results.csv", index=False)

# Merge every per-folder '*_results.csv' file in the working directory into one table
file_list = [name for name in os.listdir('./') if name.endswith('_results.csv')]

combined_df = combine_csv_results(file_list)
combined_df.to_csv('boltz_results_combined.csv', index=False)
print("Combined results saved to 'boltz_results_combined.csv'")  # Debugging statement

# Ask the user for the confidence threshold; float() raises ValueError on non-numeric input
threshold_text = input("Enter the confidence value to filter results: ")
confidence_value = float(threshold_text)

# Keep only the rows whose confidence score is strictly greater than the entered threshold
is_confident = combined_df['confidence_score'] > confidence_value
filtered_df = combined_df[is_confident]
filtered_df.to_csv('boltz_results_filtered.csv', index=False)
print("Filtered results saved to 'boltz_results_filtered.csv'")  # Debugging statement

# Copy the .cif model file of every row in the filtered DataFrame into a new directory
output_dir = 'filtered_models'
os.makedirs(output_dir, exist_ok=True)
for model_path in filtered_df['model_path']:
    if os.path.exists(model_path):
        # shutil.copy instead of a shell 'cp': safe for paths containing spaces
        # or shell metacharacters, portable, and raises if the copy fails
        shutil.copy(model_path, output_dir)
        print(f"Copied {model_path} to {output_dir}")
    else:
        print(f"Model file does not exist: {model_path}")  # Debugging statement

# Delete the intermediate per-folder CSV files listed in file_list
for temp_csv in file_list:
    if not os.path.exists(temp_csv):
        print(f"File does not exist: {temp_csv}")  # Debugging statement
        continue
    os.remove(temp_csv)
    print(f"Removed temporary file: {temp_csv}")  # Debugging statement


# Copy every distinct .cif file referenced in the combined results to a new directory
combined_output_dir = 'combined_models'
os.makedirs(combined_output_dir, exist_ok=True)
for file in combined_df['model_path'].unique():
    if os.path.exists(file):
        # shutil.copy instead of a shell 'cp': safe for paths containing spaces
        # or shell metacharacters, portable, and raises if the copy fails
        shutil.copy(file, combined_output_dir)
        print(f"Copied {file} to {combined_output_dir}")
    else:
        # The earlier print(prediction_id) was removed: that name is not
        # defined at this point and raised NameError on this branch.
        print(f"File does not exist: {file}")  # Debugging statement



In [None]:
# Utility to take the top-confidence result for each prediction and merge it
# with the pharmacology data, writing the result to a .csv file

pharm_data1 = pd.read_csv('Series_1a_-_134_compounds.csv')
pharm_data2 = pd.read_csv('Series_1b_-_500_compounds.csv')


# Maps prediction_id -> record of its highest-confidence model seen so far.
# On equal scores the first model encountered is kept (strict '>' comparison).
top_confidence_results_by_prediction = {}
for index, row in combined_df.iterrows():
    model_path = row['model_path']
    confidence_score = row['confidence_score']

    # Model files are named '<prediction_id>_model_<n>.cif'. Split on the last
    # '_model_' so that prediction IDs containing underscores are kept whole.
    prediction_id = os.path.basename(model_path).rsplit('_model_', 1)[0]

    current_best = top_confidence_results_by_prediction.get(prediction_id)
    if current_best is None or confidence_score > current_best['confidence_score']:
        top_confidence_results_by_prediction[prediction_id] = {
            'model_index': row['model_index'],
            'model_path': model_path,
            'confidence_score': confidence_score,
            'iptm': row['iptm'],
            'plddt_score': row['complex_plddt'],
        }

# Convert the dictionary to a DataFrame with the prediction ID as a regular column
top_confidence_df = pd.DataFrame.from_dict(top_confidence_results_by_prediction, orient='index').reset_index()
top_confidence_df.rename(columns={'index': 'prediction_id'}, inplace=True)

save_path = 'top_confidence_results_all_models.csv'
top_confidence_df.to_csv(save_path, index=False)

# Now merge this with the pharmacology data. Note that the pharmacology data
# stores the prediction ID in the 'Molecule Name' column.
# First stack the two pharmacology datasets
pharm_data = pd.concat([pharm_data1, pharm_data2], ignore_index=True)

# Left merge: every top-confidence row is kept even if it has no pharmacology match
merged_df = pd.merge(top_confidence_df, pharm_data, left_on='prediction_id', right_on='Molecule Name', how='left')
# Save the merged DataFrame to a CSV file
merged_save_path = 'top_confidence_results_with_pharm_data.csv'
merged_df.to_csv(merged_save_path, index=False)
print(f"Top confidence results with pharmacology data saved to {merged_save_path}")  #



In [None]:
# Create a ZIP archive of the important results for sharing
# Base name of the archive to create; the threshold used for filtering is part of the name
archive_name = f'boltz_results_archive_{confidence_value}'

# Staging directory whose contents become the archive
temp_dir = 'temp_archive'

os.makedirs(temp_dir, exist_ok=True)
try:
    # Copy the .csv files to the staging directory
    for csv_name in (
        'boltz_results_combined.csv',
        'boltz_results_filtered.csv',
        'top_confidence_results_all_models.csv',
        'top_confidence_results_with_pharm_data.csv',
    ):
        shutil.copy(csv_name, temp_dir)

    # Copy the filtered_models directory to the staging directory
    filtered_models_path = os.path.join(temp_dir, 'filtered_models')
    shutil.copytree('filtered_models', filtered_models_path, dirs_exist_ok=True)

    # Copy the combined_models directory to the staging directory
    combined_models_path = os.path.join(temp_dir, 'combined_models')
    shutil.copytree('combined_models', combined_models_path, dirs_exist_ok=True)

    # Create the archive from the staged contents
    shutil.make_archive(archive_name, 'zip', temp_dir)
finally:
    # Always clean up the staging directory, even if a copy or the archive step
    # failed, so a later run does not pick up stale files
    shutil.rmtree(temp_dir)


print("Results Filtered and Collated comrade")