In [1]:
# Utility to generate one YAML file for each molecule in a CSV file
# Each YAML file is an input for Boltz-2 and contains all the template information

# Structure of output YAML file
"""
version: 1
sequences:
- protein:
    id: [A]
    sequence: YTMCNAFTPEKLKNAYVLYYLAIVGHSLSIFTLVISLGIFVFFRSLGCQRVTLHKNMFLTYILNSMIIIIHLVEVVPNGELVRRDPVSCKILHFFHQYMMACNYFWMLCEGIYLHTLIVVAVFTEKQRLRWYYLLGWGFPLVPTTIHAITRAVYFNDNCWLSVETHLLYIIHGPVMAALVVNFFFLLNIVRVLVTKMRETHEAESHMYLKAVKATMILVPLLGIQFVVFPWRPSNKMLGKIYDYVMHSLIHFQGFFVATIYCFCNNEVQTTVKRQWAQFKIQWNQ
    msa: ./msa/CTR_noECD.a3m
- ligand:
    id: [B]
    smiles: 'FC1=CC(C(N2C(=O)OC3=C2C=CC(C2=C(C4=NN=C(C)O4)C(CCC4=CC=C(F)C=C4)=NC4=C2C(=O)N2N4CCC2)=C3)C)=CC=C1C#N'
templates:
    - cif: ./tempates/CTR_template.cif
      chain_id: [R] # Chain ID of the template structure
properties:
    - affinity:
        binder: [B]

"""
import pandas as pd
import yaml
import re

# Molecule table; generate_yaml_files reads its 'Molecule Name' and 'SMILES' columns.
df = pd.read_csv('Series_10_SMILES.csv')

# Amino-acid sequence that create_yaml_entry stores under the protein's 'sequence' key.
protein_sequence = "YTMCNAFTPEKLKNAYVLYYLAIVGHSLSIFTLVISLGIFVFFRSLGCQRVTLHKNMFLTYILNSMIIIIHLVEVVPNGELVRRDPVSCKILHFFHQYMMACNYFWMLCEGIYLHTLIVVAVFTEKQRLRWYYLLGWGFPLVPTTIHAITRAVYFNDNCWLSVETHLLYIIHGPVMAALVVNFFFLLNIVRVLVTKMRETHEAESHMYLKAVKATMILVPLLGIQFVVFPWRPSNKMLGKIYDYVMHSLIHFQGFFVATIYCFCNNEVQTTVKRQWAQFKIQWNQRWGRRPSNRSARAAAAAAEAGDIPIYICHQELRNEPANNQGEESAEIIPLNIIEQESSAPAGLEVLFQ"

# Bracketed chain identifiers for the protein entry and the ligand entry.
chainA, chainB = "[A]", "[B]"


def create_yaml_entry(smiles):
    """Build the protein and ligand sections for a single molecule.

    Args:
        smiles: Ligand SMILES, stored unchanged under ``ligand -> smiles``.

    Returns:
        dict: A mapping with two keys, in this order:

        * ``'protein'`` -- ``id`` (module-level ``chainA``), ``sequence``
          (module-level ``protein_sequence``) and ``msa`` (the fixed path
          ``./msa/CTR.a3m``).
        * ``'ligand'`` -- ``id`` (module-level ``chainB``) and ``smiles``.
    """
    # Key order matters: the caller dumps these mappings with sort_keys=False.
    protein_section = dict(
        id=chainA,
        sequence=protein_sequence,
        msa='./msa/CTR.a3m',
    )
    ligand_section = dict(id=chainB, smiles=smiles)

    sections = {}
    sections['protein'] = protein_section
    sections['ligand'] = ligand_section
    return sections

def _build_boltz_config(entry):
    """Return the complete YAML document for one molecule as a nested dict.

    ``entry`` is the mapping produced by ``create_yaml_entry`` (keys
    ``'protein'`` and ``'ligand'``).
    """
    return {
        'version': 1,
        'sequences': [
            {'protein': entry['protein']},
            {'ligand': entry['ligand']},
        ],
        'templates': [
            {
                'cif': './tempates/CTR_template.cif',
                'chain_id': '[R]',  # Chain ID of the template structure
            }
        ],
        'properties': [
            # The affinity binder is the ligand chain: reuse the ligand id so
            # the two can never drift apart.
            {'affinity': {'binder': entry['ligand']['id']}}
        ],
    }


def _fix_yaml_quoting(yaml_text):
    """Adjust the quoting of a dumped document: quote SMILES once, unquote chain ids."""

    def quote_once(match):
        key, value = match.group(1), match.group(2)
        # PyYAML already quotes scalars that cannot be written plain (for
        # example a SMILES that starts with '['); wrapping those again would
        # produce ''...'' and an unparsable file.
        if value[:1] in ("'", '"'):
            return match.group(0)
        # Inside a single-quoted YAML scalar a literal quote is written ''.
        escaped = value.replace("'", "''")
        return f"{key}'{escaped}'"

    yaml_text = re.sub(
        r"^( *smiles: )(.*)$", quote_once, yaml_text, flags=re.MULTILINE
    )
    # Bracketed chain ids must appear bare (id: [A]). Only the keys that hold
    # chain ids are touched, so a ligand whose SMILES is e.g. '[B]' stays quoted.
    return re.sub(
        r"^( *(?:- )?(?:id|chain_id|binder): )'(\[[^\]']*\])'$",
        r"\1\2",
        yaml_text,
        flags=re.MULTILINE,
    )


def generate_yaml_files(df):
    """Write one YAML input file per molecule in ``df``.

    For every row, the 'Molecule Name' value (spaces replaced by underscores)
    becomes the file name ``<name>.yaml`` in the current working directory,
    and the 'SMILES' value becomes the ligand of the document. An existing
    file with the same name is overwritten.

    Args:
        df: pandas DataFrame with 'Molecule Name' and 'SMILES' columns.

    Raises:
        KeyError: If either required column is missing from ``df``.

    Rows whose name or SMILES is missing (NaN/None) are skipped and reported
    on stdout instead of producing a 'nan.yaml' or a '.nan' ligand.
    """
    for name, smiles in zip(df['Molecule Name'], df['SMILES']):
        if pd.isna(name) or pd.isna(smiles):
            print(f"Skipping row with missing name or SMILES: {name!r}, {smiles!r}")
            continue

        molecule_name = str(name).replace(" ", "_")
        # Surrounding whitespace is never part of a SMILES string and would
        # force PyYAML to quote the value.
        entry = create_yaml_entry(str(smiles).strip())

        # Dump to a string (no stream argument) so the document is fixed up
        # in memory and the file is written exactly once.
        yaml_text = yaml.dump(
            _build_boltz_config(entry),
            default_flow_style=False,
            sort_keys=False,
        )

        with open(f"{molecule_name}.yaml", 'w', encoding='utf-8') as file:
            file.write(_fix_yaml_quoting(yaml_text))

# Generate the YAML input files: one <Molecule_Name>.yaml per molecule in df,
# written to the current working directory.
generate_yaml_files(df)

FileNotFoundError: [Errno 2] No such file or directory: 'Series_10_SMILES.csv'