Tools for generating Boltz-2 inputs from a filtered ChEMBL subset

In [None]:
# Setup imports
import pandas as pd
import yaml
import re
import os
import sys

In [None]:
# Load Functions:

# Simple FASTA reader (no external dependencies)
def read_fasta(path):
    """Return the sequence stored in a FASTA file as one string.

    Every line is stripped of surrounding whitespace; blank lines and
    header lines (those starting with '>') are dropped, and the remaining
    lines are concatenated in file order. If the file holds several
    records, their sequences are therefore joined together.

    Args:
        path: Location of the FASTA file.

    Returns:
        The concatenated sequence text ('' when no sequence lines exist).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"FASTA file not found: {path}")
    with open(path, 'r') as handle:
        stripped = (raw.strip() for raw in handle)
        return ''.join(
            text for text in stripped
            if text and not text.startswith('>')
        )

def create_yaml_entry(smiles):
    """Assemble the two protein records and the ligand record for one molecule.

    The chain ids, protein sequences and MSA paths are read from the
    module-level settings (chainA/chainB/chainC, protein_sequence_chainA/B,
    msa_file1/msa_file2); only the ligand SMILES varies per call.

    Args:
        smiles: SMILES string stored in the ligand record.

    Returns:
        A dict with keys 'protein_A', 'protein_B' and 'ligand' (in that
        order), each mapping to the record for that chain.
    """
    first_protein = {
        'id': chainA,
        'sequence': protein_sequence_chainA,
        'msa': msa_file1,
    }
    second_protein = {
        'id': chainB,
        'sequence': protein_sequence_chainB,
        'msa': msa_file2,
    }
    ligand_record = {'id': chainC, 'smiles': smiles}

    records = {}
    records['protein_A'] = first_protein
    records['protein_B'] = second_protein
    records['ligand'] = ligand_record
    return records

def _single_quote_value(match):
    """Regex callback: single-quote the value of a ``key: value`` YAML line.

    ``match`` must have the key (including ``': '``) in group 1 and the raw
    value text in group 2. Values that the YAML emitter already quoted are
    returned unchanged so they are not wrapped a second time.
    """
    key, value = match.group(1), match.group(2)
    if value[:1] in ("'", '"'):
        # Already quoted by yaml.dump (e.g. a SMILES starting with '[');
        # adding another pair of quotes would corrupt the scalar.
        return match.group(0)
    # Inside a single-quoted YAML scalar a literal quote is written as ''.
    escaped = value.replace("'", "''")
    return f"{key}'{escaped}'"


def generate_yaml_files_affinity(df):
    """Write one Boltz-2 affinity input YAML file per row of ``df``.

    For every row a file named ``<chembl_id>.yaml`` (spaces in the id are
    replaced by underscores) is written to the current working directory.
    Each file contains ``version: 1``, the two protein records and the
    ligand record built by ``create_yaml_entry``, and an ``affinity``
    property whose binder is the module-level ``chainC``.

    After dumping, the text is post-processed so that the ``smiles`` and
    ``sequence`` values are always single-quoted, and so that a quoted
    ``'[A]'``, ``'[B]'`` or ``'[R]'`` is written without quotes.

    Args:
        df: DataFrame with 'chembl_id' and 'canonical_smiles' columns.
    """
    for _, row in df.iterrows():
        molecule_name = str(row['chembl_id']).replace(" ", "_")
        smiles = row['canonical_smiles']
        entry = create_yaml_entry(smiles)
        output_data = {
            'version': 1,
            'sequences': [
                {'protein': entry['protein_A']},
                {'protein': entry['protein_B']},
                {'ligand': entry['ligand']},
            ],
            'properties': [
                {'affinity': {'binder': chainC}},
            ],
        }

        # Dump to a string so the text can be adjusted before the single
        # write below (no write / re-read / rewrite round trip).
        contents = yaml.dump(output_data, default_flow_style=False, sort_keys=False)

        # Remove single quotes around [A], [B], and [R]. This runs before the
        # quoting step so a SMILES or sequence value that happens to look
        # like one of these ends up quoted again rather than left bare.
        contents = re.sub(r"'(\[[ABR]\])'", r"\1", contents)

        # Force single quotes around the SMILES string and protein sequence,
        # leaving values alone when yaml.dump already quoted them.
        contents = re.sub(r'(smiles: )(.*)', _single_quote_value, contents)
        contents = re.sub(r'(sequence: )(.*)', _single_quote_value, contents)

        filename = f"{molecule_name}.yaml"
        with open(filename, 'w') as file:
            file.write(contents)



Edit the cell below to reflect your input files.

In [None]:
# Read in input .csv file
# Edit as necessary for your .csv input

msa_file1 = 'chainA.a3m'  # change this path if your MSA is elsewhere
msa_file2 = 'chainB.a3m'

df = pd.read_csv('diverse_MW500-900_LogP3-5.csv')
print("Loaded CSV file into dataframe.")

# input FASTA file
# Edit as necessary for your FASTA input
ChainA_fasta_file = 'chainA.fasta'  # change this path if your FASTA is elsewhere
ChainB_fasta_file = 'chainB.fasta'

# Define some chain variables:
chainA = "A"
chainB = "B"
chainC = "C"


def _load_sequence_or_empty(fasta_path):
    """Return the sequence read from ``fasta_path``, or '' if the file is missing.

    A missing file is reported by printing the error instead of raising, so
    the rest of the cell still runs; the caller decides what an empty
    sequence means.
    """
    try:
        sequence = read_fasta(fasta_path)
    except FileNotFoundError as e:
        print(e)
        return ''
    print(f"Loaded protein sequence from {fasta_path} (length={len(sequence)})")
    return sequence


# Load the protein sequences into variables
protein_sequence_chainA = _load_sequence_or_empty(ChainA_fasta_file)
protein_sequence_chainB = _load_sequence_or_empty(ChainB_fasta_file)

# Only report success when both chains actually have a sequence; an empty
# sequence would otherwise be written into every generated YAML file.
if protein_sequence_chainA and protein_sequence_chainB:
    print("Protein sequence loaded successfully.")
else:
    print("WARNING: at least one protein sequence is empty; "
          "fix the FASTA inputs before generating YAML files.")


In [None]:
# Use the function to generate all the YAML files from the dataframe

generate_yaml_files_affinity(df)

# Check how many YAML files were generated.
# os.listdir returns names in arbitrary order, so sort them to make the
# round-robin directory assignment below reproducible between runs.
yaml_files = sorted(f for f in os.listdir('.') if f.endswith('.yaml'))
n_yaml_files = len(yaml_files)
print(f"Generated {n_yaml_files} YAML files")

if n_yaml_files == 0:
    print("No YAML files created, fix the issue!!!")
    sys.exit(1)

# Pick the target directories from the number of files:
#   < 100    -> a single 'yaml' directory
#   100-500  -> yaml1 .. yaml4
#   > 500    -> yaml1 .. yaml8
if n_yaml_files < 100:
    print("Less than 100 YAML files generated, Putting into a single directory.")
    target_dirs = ['yaml']
elif n_yaml_files <= 500:
    print("Between 100-500 YAML files generated, distributing across yaml1, yaml2, yaml3, yaml4 directories")
    target_dirs = [f'yaml{k}' for k in range(1, 5)]
else:
    print("More than 500 YAML files generated, distributing across yaml1, yaml2, yaml3, yaml4, yaml5, yaml6, yaml7 and yaml8 directories")
    target_dirs = [f'yaml{k}' for k in range(1, 9)]

# Create the directories for the YAML files (no error if they already exist)
for directory in target_dirs:
    os.makedirs(directory, exist_ok=True)

# Move the YAML files, cycling through the target directories by file index
for i, filename in enumerate(yaml_files):
    target_dir = target_dirs[i % len(target_dirs)]
    # os.replace overwrites an existing target on every platform, so a
    # re-run does not fail on files left over from a previous run.
    os.replace(filename, os.path.join(target_dir, filename))
    print(f"Moved {filename} to {target_dir}/")

if len(target_dirs) > 1:
    print(f"All YAML files have been distributed across {target_dirs[0]}-{target_dirs[-1]} directories")


# If you want to generate the template YAML files, uncomment the line below
        