# Workup for functions for BoltzAnalysis

In [None]:
# Basic idea for parsing and filtering atoms in a CIF file using modelcif

import modelcif
import modelcif.reader

def filter_atoms_by_id(file_path, target_atom_id=None, target_asym_unit_id=None):
    """
    Print every atom in a CIF file that matches the requested filters.

    A filter left as None is not applied, so calling with both filters unset
    prints every atom of every model.

    Parameters:
        file_path (str): Path to the CIF file.
        target_atom_id (str, optional): Keep only atoms whose `atom_id`
            equals this value (e.g., 'N').
        target_asym_unit_id (str, optional): Keep only atoms whose
            `asym_unit.id` equals this value (e.g., 'A').

    Returns:
        None. All results are written to standard output.
    """
    # The reader consumes the handle while the file is still open.
    with open(file_path, 'r') as cif_handle:
        systems = modelcif.reader.read(cif_handle)

    print(f"Number of systems found: {len(systems)}")
    for system in systems:
        print(f"System ID: {system.id}")
        print("\nFiltering atoms:")

        # `_all_models()` yields pairs; only the second element (the model)
        # is needed here.
        for _model_group, model in system._all_models():
            for atom in model.get_atoms():
                name = atom.atom_id
                unit = atom.asym_unit
                # An atom without a usable asym unit gets an id of None.
                unit_id = getattr(unit, 'id', None) if unit else None

                # Guard clauses: skip the atom as soon as one active filter fails.
                if target_atom_id is not None and name != target_atom_id:
                    continue
                if target_asym_unit_id is not None and unit_id != target_asym_unit_id:
                    continue

                print(f"Atom: atom_id={name}, type_symbol={atom.type_symbol}, x={atom.x}, y={atom.y}, z={atom.z}")
                print(f"Asym Unit: id={unit_id}, details={getattr(unit, 'details', 'N/A')}")

# Example usage
file_path = './2_model_0.cif'  # Replace with the path to your CIF file

# Filter by atom_id (the printed label now names the value actually used)
print("\nFiltering by atom_id='CA':")
filter_atoms_by_id(file_path, target_atom_id='CA')

# Filter by asym_unit.id (the printed label now names the value actually used)
print("\nFiltering by asym_unit.id='B':")
filter_atoms_by_id(file_path, target_asym_unit_id='B')

# Filter by both atom_id and asym_unit.id
#print("\nFiltering by atom_id='CA' and asym_unit.id='A':")
#filter_atoms_by_id(file_path, target_atom_id='CA', target_asym_unit_id='A')

In [None]:
# Extract coordinates into a structured numpy array for analysis

import numpy as np
import modelcif.reader
import modelcif

def extract_coordinates(file_path, target_atom_id=None, target_asym_unit_id=None):
    """
    Collect the x, y, z coordinates of atoms in a CIF file that match the filters.

    A filter left as None is not applied.

    Parameters:
        file_path (str): Path to the CIF file.
        target_atom_id (str, optional): Keep only atoms whose `atom_id`
            equals this value (e.g., 'CA').
        target_asym_unit_id (str, optional): Keep only atoms whose
            `asym_unit.id` equals this value (e.g., 'B').

    Returns:
        np.ndarray or None: Array of shape (N, 3) with one row per matching
        atom, in the order the atoms are read. The array is always
        two-dimensional, including when exactly one atom matches.
        None is returned when no atom matches.
    """
    # The reader consumes the handle while the file is still open.
    with open(file_path, 'r') as fh:
        systems = modelcif.reader.read(fh)

    # Rows are gathered in a plain list and converted once at the end;
    # stacking arrays inside the loop would recopy all previous rows each time.
    rows = []
    for system in systems:
        print("\nFiltering atoms:")

        # `_all_models()` yields pairs; only the second element (the model)
        # is needed here.
        for _model_group, model in system._all_models():
            for atom in model.get_atoms():
                asym_unit = atom.asym_unit
                # An atom without a usable asym unit gets an id of None.
                asym_unit_id = getattr(asym_unit, 'id', None) if asym_unit else None

                if target_atom_id is not None and atom.atom_id != target_atom_id:
                    continue
                if target_asym_unit_id is not None and asym_unit_id != target_asym_unit_id:
                    continue

                rows.append((atom.x, atom.y, atom.z))

    if not rows:
        return None
    return np.array(rows)

# Example usage
file_path = './2_model_0.cif'  # Replace with the path to your CIF file

# C-alpha atoms: show the first five rows, then the array shape.
CA_positions = extract_coordinates(file_path, target_atom_id='CA')
print(CA_positions[:5], CA_positions.shape, sep='\n')

# Atoms of asym unit 'B' (treated as the ligand in the cells below):
# show the first five rows, then the array shape.
ligand_positions = extract_coordinates(file_path, target_asym_unit_id='B')
print(ligand_positions[:5], ligand_positions.shape, sep='\n')

In [None]:
# Utility Functions for calculating Centre of Mass and distances.

def centre_of_mass(coordinates):
    """
    Return the mean position of a set of cartesian coordinates.

    Every point carries equal weight (no atomic masses are used), so the
    result is the arithmetic mean taken along the first axis.

    Parameters:
        coordinates (array-like): Points stacked along the first axis,
            e.g. an (N, 3) array of x, y, z values.

    Returns:
        np.ndarray: The per-column mean of `coordinates`.
    """
    return np.mean(coordinates, axis=0)

# Mean position of the ligand atoms extracted in the previous cell.
value1 = centre_of_mass(ligand_positions)
print(value1)

def calculate_distance_matrix(CA_positions,Centre_of_Mass):
    """
    Calculate the distance from the ligand centre of mass to every
    C-alpha position.

    Parameters:
        CA_positions (array-like): C-alpha positions, one point per row
            (shape (N, 3)). A single point of shape (3,) is treated as one row.
        Centre_of_Mass (array-like): Centre of mass position (shape (3,)).

    Returns:
        np.ndarray: 1D array of length N holding the Euclidean distance
        between the centre of mass and each C-alpha position. An empty
        input gives an empty array.
    """
    positions = np.asarray(CA_positions)
    centre = np.asarray(Centre_of_Mass)

    # Nothing to measure; also avoids broadcasting an empty array against
    # the centre of mass.
    if positions.size == 0:
        return np.empty(0)

    # Promote a lone point to a single row so it yields one distance rather
    # than being iterated coordinate by coordinate.
    positions = np.atleast_2d(positions)

    # One vectorised norm over the coordinate axis replaces the per-row loop.
    return np.linalg.norm(positions - centre, axis=1)

# Distance from the ligand centre of mass to each C-alpha position,
# followed by the shape of the resulting array.
distance1 = calculate_distance_matrix(CA_positions, value1)
print(distance1, distance1.shape, sep='\n')



[ 9.261308   -5.54084325  1.63225775]
[17.0956535  18.94361117 18.25069807 14.43364081 15.21798792 17.56342636
 15.48787408 12.94743312 16.12092171 17.77091518 14.96680557 14.45322102
 18.25737781 19.01030514 16.74652776 18.72306921 21.94530541 21.65365367
 20.31778373 23.16643145 25.65590523 24.37159367 24.47390692 27.91850585
 29.12941646 28.1645911  29.75668851 32.83039216 33.81570914 35.0718028
 35.77296618 32.03262478 31.69053894 30.70272297 28.36959029 25.88689999
 25.96540014 25.67332441 22.31333452 21.33423947 22.46612147 20.34735987
 17.19805496 18.18449461 18.7520029  15.28666846 13.88664815 16.76657869
 16.41427119 12.66480147 13.47520728 16.59465594 15.03351421 12.57169286
 15.77875261 17.81871165 15.54545483 15.61039087 19.35749364 20.78110703
 18.8303567  17.34251893 20.78781743 22.89452253 23.88991247 26.65428772
 24.46460706 22.20337394 25.49435893 27.05107997 24.02860354 24.97947964
 23.88099091 22.72651227 19.69286884 18.70443232 19.36978946 17.11862482
 14.26287457 1

In [None]:
# Function to filter by distance and position. The inputs are the distance array, three amino acid positions, and one distance threshold applied to all three positions.

def filter_by_sites(distances, site1, site2, site3, distance_thresh):
    """
    Check whether three chosen positions all lie closer than a threshold.

    Parameters:
    distances (np.array): distances in a 1D array
    site1, site2, site3 (int): indices into `distances`; each index selects
        the row, and hence the residue, to test
    distance_thresh (float): a site passes when its distance is strictly
        below this value

    Returns:
    tuple: (True, d1, d2, d3) with the three site distances when every site
    passes, otherwise (False, None, None, None)
    """
    chosen_sites = (site1, site2, site3)

    # Sites are tested in order and the scan stops at the first failure.
    if any(not (distances[site] < distance_thresh) for site in chosen_sites):
        return False, None, None, None

    return (True,) + tuple(distances[site] for site in chosen_sites)


# 157 is Histidine, 89 is Tyrosine, 211 is Phenylalanine.
# These numbers are used directly as indices into distance1.
filter_by_sites(distance1, site1=157, site2=89, site3=211, distance_thresh=11)


(True, 7.717457874154522, 10.297696083804116, 7.599926929480285)

In [29]:
# Filter by prediction confidence:
import os
import pandas as pd
import json
import shutil

def parse_boltz2_results(directory):
    """
    Collect the confidence metrics of every prediction found in a directory.

    Each `confidence_<name>_model_<index>.json` file is paired with the model
    file `<name>_model_<index>.cif` in the same directory. Confidence files
    whose model file is missing are reported and skipped.

    Parameters:
        directory (str): Directory holding the confidence JSON files and
            the model CIF files.

    Returns:
        pd.DataFrame: One row per prediction with the columns `model_path`,
        `model_index` and the confidence metrics read from the JSON file.
        The columns are present even when no prediction is found, so the
        frame can be written to CSV and read back.

    Raises:
        KeyError: If a confidence file lacks one of the expected metrics.
    """
    prefix = 'confidence_'
    metric_keys = (
        'confidence_score',
        'ptm',
        'iptm',
        'ligand_iptm',
        'protein_iptm',
        'complex_plddt',
        'complex_iplddt',
        'complex_pde',
        'complex_ipde',
        'chains_ptm',
        'pair_chains_iptm',
    )
    columns = ['model_path', 'model_index', *metric_keys]

    results = []
    print(f"Scanning directory: {directory}")  # Debugging statement
    # Sorted so the row order does not depend on the file system's listing order.
    confidence_files = sorted(
        f for f in os.listdir(directory)
        if f.startswith(prefix) and f.endswith('.json')
    )
    print(f"Found confidence files: {confidence_files}")  # Debugging statement

    for conf_file in confidence_files:
        # The model index is the text between the last '_' and the next '.'.
        model_index = conf_file.split('_')[-1].split('.')[0]

        # The context manager closes the handle even if parsing fails.
        with open(os.path.join(directory, conf_file), 'r') as fh:
            confidence_data = json.load(fh)

        # Strip only the leading prefix and the trailing model suffix, so a
        # name that itself contains 'confidence_' is left intact.
        base_name = conf_file[len(prefix):]
        suffix = f'_model_{model_index}.json'
        if base_name.endswith(suffix):
            base_name = base_name[:-len(suffix)]

        model_file = f"{base_name}_model_{model_index}.cif"
        model_path = os.path.join(directory, model_file)
        print(f"Constructed model path: {model_path}")  # Debugging statement

        if not os.path.exists(model_path):
            print(f"Model file does not exist: {model_path}")  # Debugging statement
            continue

        print(f"Model file exists: {model_path}")  # Debugging statement
        record = {'model_path': model_path, 'model_index': model_index}
        record.update({key: confidence_data[key] for key in metric_keys})
        results.append(record)

    # Passing the columns explicitly keeps the header when `results` is empty.
    return pd.DataFrame(results, columns=columns)

# Function to combine the CSV results from multiple files:
def combine_csv_results(file_list):
    """
    Concatenate the per-folder result CSV files into one DataFrame.

    Only paths ending in '_results.csv' are read; every other entry of
    `file_list` is ignored. Files with no parsable content are reported and
    skipped.

    Parameters:
        file_list (list of str): Candidate file paths.

    Returns:
        pd.DataFrame: The rows of all readable result files, renumbered
        0..N-1 so index labels are unique across files. An empty DataFrame
        is returned when there is nothing to combine.
    """
    frames = []
    for path in file_list:
        if not path.endswith('_results.csv'):
            continue
        try:
            frames.append(pd.read_csv(path))
        except pd.errors.EmptyDataError:
            # A file without a header line cannot be parsed.
            print(f"Skipping empty results file: {path}")

    # pd.concat raises on an empty list, so handle that case explicitly.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

folder_list = os.listdir('./')

print(f"Found folders: {folder_list}")  # Debugging statement

for folder in folder_list:
    if not folder.startswith('boltz_results_'):
        print(f"Skipping folder: {folder} (does not start with 'boltz_results_')")
        continue

    # Process only entries that start with 'boltz_results_'.
    # The .json files are two folders further down in the directory structure.
    # For example, for the folder 'boltz_results_MIPS-0051357' we look in
    # 'boltz_results_MIPS-0051357/predictions/MIPS-0051357'.
    # The target name is everything after the prefix, so names that contain
    # underscores are kept whole.
    target_name = folder[len('boltz_results_'):]
    predictions_folder = os.path.join(folder, 'predictions', target_name)
    if not os.path.exists(predictions_folder):
        print(f"Predictions folder does not exist: {predictions_folder}")
        continue

    print(f"working on folder: {predictions_folder}")
    print(f"Processing folder: {predictions_folder}")  # Debugging statement
    results_df = parse_boltz2_results(os.path.join('./', predictions_folder))
    # Save the per-folder results as a CSV file:
    results_df.to_csv(f"{folder}_results.csv", index=False)

# Combine all the CSV files into a single DataFrame.
# Sorted so the row order of the combined file is reproducible.
file_list = sorted(f for f in os.listdir('./') if f.endswith('_results.csv'))

combined_df = combine_csv_results(file_list)
combined_df.to_csv('boltz_results_combined.csv', index=False)
print("Combined results saved to 'boltz_results_combined.csv'")


Found folders: ['Connections3.py', 'boltz_batch_SLURM.bash', 'boltz_results_2', 'Learning_mmCIF.ipynb', 'move_yaml_files.sh', 'GenerateYAML_pocket.ipynb', 'GenerateYAML_FromCSV.ipynb', 'mmCIF_Filtereing_Tests.ipynb', 'README.md', 'boltz_results_1001', 'boltz_results_4035', '2_model_0.cif', 'Boltz_Analyze.ipynb', 'GenerateTemplate_YAML.ipynb', 'boltz_batch.sh', 'ExtractResults.ipynb', 'PyMOL_SuperAlignLOOP.py', 'boltz_results_2468', '.git']
Skipping folder: Connections3.py (does not start with 'boltz_results_')
Skipping folder: boltz_batch_SLURM.bash (does not start with 'boltz_results_')
working on folder: boltz_results_2/predictions/2
Processing folder: boltz_results_2/predictions/2
Scanning directory: ./boltz_results_2/predictions/2
Found confidence files: ['confidence_2_model_0.json']
Constructed model path: ./boltz_results_2/predictions/2/2_model_0.cif
Model file exists: ./boltz_results_2/predictions/2/2_model_0.cif
Skipping folder: Learning_mmCIF.ipynb (does not start with 'boltz_

In [None]:
# Test main loop
"""
The idea is to read all the predicted CIF models. If a model passes both the distance
threshold and a prediction confidence threshold, it is copied to a new directory, filtered_models.
"""

