In [None]:
# Utility to generate one YAML file for each molecule in a CSV file.
# Each YAML file is an input for Boltz-2 and contains all the template information.

# Example Structure of output YAML file
"""
version: 1
sequences:
- protein:
    id: [A]
    sequence: YTMCNAFTPEKLKNAYVLYYLAIVGHSLSIFTLVISLGIFVFFRSLGCQRVTLHKNMFLTYILNSMIIIIHLVEVVPNGELVRRDPVSCKILHFFHQYMMACNYFWMLCEGIYLHTLIVVAVFTEKQRLRWYYLLGWGFPLVPTTIHAITRAVYFNDNCWLSVETHLLYIIHGPVMAALVVNFFFLLNIVRVLVTKMRETHEAESHMYLKAVKATMILVPLLGIQFVVFPWRPSNKMLGKIYDYVMHSLIHFQGFFVATIYCFCNNEVQTTVKRQWAQFKIQWNQ
    msa: ./msa/CTR_noECD.a3m
- ligand:
    id: [B]
    smiles: 'SMILESCODE'
templates:
    - cif: ./templates/CTR_template.cif

"""
import pandas as pd
import yaml
import re
import os
import sys

# Load the molecule table. The generator functions in this file read its
# 'index' column (used for the output file name) and its 'SMILES' column.
df = pd.read_csv('00-Unbiased_GenAI_NovelMolecules.csv')


# Protein sequence written to the 'sequence' field of every generated YAML file.
protein_sequence = "AYVLYYLAIVGHSLSIFTLVISLGIFVFFRSLGCQRVTLHKNMFLTYILNSMIIIIHLVEVVPNGELVRRDPVSCKILHFFHQYMMACNYFWMLCEGIYLHTLIVVAVFTEKQRLRWYYLLGWGFPLVPTTIHAITRAVYFNDNCWLSVETHLLYIIHGPVMAALVVNFFFLLNIVRVLVTKMRETHEAESHMYLKAVKATMILVPLLGIQFVVFPWRPSNKMLGKIYDYVMHSLIHFQGFFVATIYCFCNNEVQTTVKRQWAQFKIQWNQRW"
# Chain identifiers: chainA is written as the protein's id and chainB as the
# ligand's id; chainB is also used as the affinity 'binder'.
chainA = "A"
chainB = "B"


def create_yaml_entry(smiles):
    """Build the protein and ligand mappings for a single molecule.

    Args:
        smiles: Value stored under the ligand's ``smiles`` key.

    Returns:
        A dict with two keys, ``'protein'`` and ``'ligand'``. The protein
        mapping carries the module-level chain id, sequence and MSA path;
        the ligand mapping carries the module-level ligand chain id and
        *smiles*. Key insertion order is significant because callers dump
        the result without sorting keys.
    """
    protein_part = {}
    protein_part['id'] = chainA
    protein_part['sequence'] = protein_sequence
    protein_part['msa'] = './msa/CTR_Template_MSA.a3m'

    ligand_part = {}
    ligand_part['id'] = chainB
    ligand_part['smiles'] = smiles

    return {'protein': protein_part, 'ligand': ligand_part}

def generate_yaml_files_template(df):
    """Write one template-guided Boltz-2 input YAML file per row of *df*.

    Each file is written to the current working directory and is named
    ``<index>.yaml``, where ``<index>`` is the row's ``index`` value with
    spaces replaced by underscores.

    Args:
        df: DataFrame with an ``index`` column (molecule name) and a
            ``SMILES`` column. Change the two column lookups below if the
            CSV uses different headers.
    """
    for _, row in df.iterrows():
        # Replace 'index' with whatever the name column is called in the CSV.
        molecule_name = str(row['index']).replace(" ", "_")
        smiles = row['SMILES']
        entry = create_yaml_entry(smiles)
        output_data = {
            'version': 1,
            'sequences': [
                {'protein': entry['protein']},
                {'ligand': entry['ligand']},
            ],
            'templates': [
                {
                    'cif': './CTR_GuideTemplate_56795.cif',  # Update this path as needed
                }
            ],
        }

        # Render to a string so the file is written once, instead of being
        # written, re-read and rewritten. An unbounded width stops PyYAML
        # from folding long scalars across lines.
        contents = yaml.dump(
            output_data,
            default_flow_style=False,
            sort_keys=False,
            width=float("inf"),
        )

        # Remove single quotes around [A], [B], and [R] so bracketed chain
        # ids are emitted unquoted. Done before the SMILES line is rebuilt
        # so a bracketed SMILES is never unquoted by accident.
        contents = re.sub(r"'(\[[ABR]\])'", r"\1", contents)

        # Always emit the SMILES as a single-quoted scalar. The line is
        # rebuilt from the raw value rather than from PyYAML's output,
        # because PyYAML already quotes strings such as '[O-]C...' and
        # wrapping those again would produce invalid YAML.
        quoted_smiles = "'" + str(smiles).replace("'", "''") + "'"
        contents = re.sub(
            r"^(\s*smiles: ).*$",
            lambda match: match.group(1) + quoted_smiles,
            contents,
            count=1,
            flags=re.MULTILINE,
        )

        with open(f"{molecule_name}.yaml", 'w') as file:
            file.write(contents)

def generate_yaml_files_affinity(df):
    """Write one affinity-prediction Boltz-2 input YAML file per row of *df*.

    Each file is written to the current working directory and is named
    ``<index>.yaml``, where ``<index>`` is the row's ``index`` value with
    spaces replaced by underscores. The ligand chain (``chainB``) is
    declared as the affinity binder.

    Args:
        df: DataFrame with an ``index`` column (molecule name) and a
            ``SMILES`` column. Change the two column lookups below if the
            CSV uses different headers.
    """
    for _, row in df.iterrows():
        # Replace 'index' with whatever the name column is called in the CSV.
        molecule_name = str(row['index']).replace(" ", "_")
        smiles = row['SMILES']
        entry = create_yaml_entry(smiles)
        output_data = {
            'version': 1,
            'sequences': [
                {'protein': entry['protein']},
                {'ligand': entry['ligand']},
            ],
            'properties': [
                {
                    'affinity': {
                        'binder': chainB,
                    }
                }
            ],
        }

        # Render to a string so the file is written once, instead of being
        # written, re-read and rewritten. An unbounded width stops PyYAML
        # from folding long scalars across lines.
        contents = yaml.dump(
            output_data,
            default_flow_style=False,
            sort_keys=False,
            width=float("inf"),
        )

        # Remove single quotes around [A], [B], and [R] so bracketed chain
        # ids are emitted unquoted. Done before the SMILES line is rebuilt
        # so a bracketed SMILES is never unquoted by accident.
        contents = re.sub(r"'(\[[ABR]\])'", r"\1", contents)

        # Always emit the SMILES as a single-quoted scalar. The line is
        # rebuilt from the raw value rather than from PyYAML's output,
        # because PyYAML already quotes strings such as '[O-]C...' and
        # wrapping those again would produce invalid YAML.
        quoted_smiles = "'" + str(smiles).replace("'", "''") + "'"
        contents = re.sub(
            r"^(\s*smiles: ).*$",
            lambda match: match.group(1) + quoted_smiles,
            contents,
            count=1,
            flags=re.MULTILINE,
        )

        with open(f"{molecule_name}.yaml", 'w') as file:
            file.write(contents)

# Generate one Boltz-2 input YAML per row of the dataframe. Swap which of the
# two calls is commented out to choose template-guided or affinity inputs.
# generate_yaml_files_template(df)
generate_yaml_files_affinity(df)

# Collect the YAML files in the working directory. NOTE: this also picks up
# .yaml files left over from earlier runs, not only the ones written above.
# os.listdir() returns entries in arbitrary order, so the list is sorted to
# make the round-robin assignment below reproducible.
yaml_files = sorted(
    f for f in os.listdir('.') if f.endswith('.yaml') and os.path.isfile(f)
)
print(f"Generated {len(yaml_files)} YAML files")

# With more than 500 files, spread them round-robin over yaml1..yaml4;
# otherwise collect them all in a single yaml/ directory.
if len(yaml_files) > 500:
    print("More than 500 YAML files generated, distributing across yaml1, yaml2, yaml3, yaml4 directories")

    # Create the target directories if they do not exist yet.
    target_dirs = [f'yaml{n}' for n in range(1, 5)]
    for target_dir in target_dirs:
        os.makedirs(target_dir, exist_ok=True)

    for i, filename in enumerate(yaml_files):
        # Cycle through yaml1, yaml2, yaml3, yaml4 based on the file position.
        target_dir = target_dirs[i % len(target_dirs)]
        os.rename(filename, os.path.join(target_dir, filename))
        print(f"Moved {filename} to {target_dir}/")

    print("All YAML files have been distributed across yaml1-yaml4 directories")
else:
    print("Less than or equal to 500 YAML files generated, no distribution needed")
    # Move the files whether or not yaml/ already exists. Previously the
    # move was skipped whenever the directory was already present.
    os.makedirs('yaml', exist_ok=True)
    for filename in yaml_files:
        os.rename(filename, os.path.join('yaml', filename))
        print(f"Moved {filename} to yaml/")


# To generate the template YAML files instead, uncomment the generate_yaml_files_template(df) call above.
        

Generated 7137 YAML files
More than 500 YAML files generated, distributing across yaml1, yaml2, yaml3, yaml4 directories
Moved 1062.yaml to yaml1/
Moved 46.yaml to yaml2/
Moved 2421.yaml to yaml3/
Moved 2478.yaml to yaml4/
Moved 2707.yaml to yaml1/
Moved 1531.yaml to yaml2/
Moved 4620.yaml to yaml3/
Moved 2957.yaml to yaml4/
Moved 4550.yaml to yaml1/
Moved 4965.yaml to yaml2/
Moved 3745.yaml to yaml3/
Moved 1160.yaml to yaml4/
Moved 761.yaml to yaml1/
Moved 4446.yaml to yaml2/
Moved 4394.yaml to yaml3/
Moved 1498.yaml to yaml4/
Moved 4648.yaml to yaml1/
Moved 3539.yaml to yaml2/
Moved 5063.yaml to yaml3/
Moved 1432.yaml to yaml4/
Moved 6702.yaml to yaml1/
Moved 1000.yaml to yaml2/
Moved 5138.yaml to yaml3/
Moved 935.yaml to yaml4/
Moved 5575.yaml to yaml1/
Moved 5335.yaml to yaml2/
Moved 2113.yaml to yaml3/
Moved 2252.yaml to yaml4/
Moved 5154.yaml to yaml1/
Moved 4232.yaml to yaml2/
Moved 7011.yaml to yaml3/
Moved 1768.yaml to yaml4/
Moved 7135.yaml to yaml1/
Moved 1279.yaml to yaml2/