# In [23]:
# Structure of each output YAML file (illustrative values; one file is written per CSV row)
"""
version: 1  # Optional, defaults to 1
sequences:
  - protein:
      id: [A]
      sequence: MVTPEGNVSLVDESLLVGVTDEDRAVRSAHQFYERLIGLWAPA
      msa: ./msa/CTR.a3m
  - ligand:
      id: [B]
      smiles: 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'

"""
import pandas as pd
import yaml
import re

# Load the ligand table. generate_yaml_files() reads the 'Molecule Name' and
# 'SMILES' columns of this DataFrame; the path is relative to the current
# working directory.
df = pd.read_csv('Series_2_SMILES.csv')


# Sequence string emitted under protein -> sequence in every generated file.
protein_sequence = "GPAAFSNQTYPTIEPKPFLYVVGRKKMMDAQYKCYDRMQQLPAYQGEGPYCNRTWDGWLCWDDTPAGVLSYQFCPDYFPDFDPSEKVTKYCDEKGVWFKHPENNRTWSNYTMCNAFTPEKLKNAYVLYYLAIVGHSLSIFTLVISLGIFVFFRSLGCQRVTLHKNMFLTYILNSMIIIIHLVEVVPNGELVRRDPVSCKILHFFHQYMMACNYFWMLCEGIYLHTLIVVAVFTEKQRLRWYYLLGWGFPLVPTTIHAITRAVYFNDNCWLSVETHLLYIIHGPVMAALVVNFFFLLNIVRVLVTKMRETHEAESHMYLKAVKATMILVPLLGIQFVVFPWRPSNKMLGKIYDYVMHSLIHFQGFFVATIYCFCNNEVQTTVKRQWAQFKIQWNQRWGRRPSNRSARAAAAAAEAGDIPIYICHQELRNEPANNQGEESAEIIPLNIIEQESSAPAGLEVLFQ"
# Chain ids are kept as the literal strings "[A]" / "[B]"; generate_yaml_files()
# strips the quotes from the dumped text so they appear as [A] and [B].
chainA = "[A]"
chainB = "[B]"


def create_yaml_entry(smiles):
    """Assemble the protein and ligand mappings for a single ligand.

    The protein mapping is built from the module-level ``chainA`` and
    ``protein_sequence`` values plus a fixed MSA path; the ligand mapping
    pairs ``chainB`` with the given SMILES.

    Args:
        smiles: Value stored under the ligand's ``'smiles'`` key.

    Returns:
        A dict with two keys, ``'protein'`` and ``'ligand'``, each holding
        the corresponding mapping.
    """
    protein_block = dict(
        id=chainA,
        sequence=protein_sequence,
        msa='./msa/CTR.a3m',
    )
    ligand_block = dict(id=chainB, smiles=smiles)
    return dict(protein=protein_block, ligand=ligand_block)

def generate_yaml_files(df):
    """Write one YAML input file per usable row of ``df``.

    For every row, the ``'Molecule Name'`` value (spaces replaced with
    underscores) names the output file ``<name>.yaml`` in the current working
    directory and the ``'SMILES'`` value becomes the ligand entry; the protein
    entry is supplied by ``create_yaml_entry``.

    The text is produced by ``yaml.dump`` and then adjusted in memory so that
    the SMILES string is single-quoted and the chain ids appear as ``[A]`` /
    ``[B]`` rather than as quoted strings. Each file is written exactly once.

    Rows whose SMILES cell is missing (NaN) or blank are skipped.

    Args:
        df: DataFrame providing ``'Molecule Name'`` and ``'SMILES'`` columns.

    Raises:
        KeyError: If a required column is absent from a row.
        OSError: If an output file cannot be written.
    """
    # Only quote SMILES values that yaml.dump left unquoted; a value it already
    # quoted (e.g. one starting with "[") would otherwise be wrapped twice and
    # produce invalid YAML.
    unquoted_smiles = re.compile(r"(smiles: )(?!['\"])(.*)")
    # Anchor on the "id: " key so only the chain-id fields lose their quotes.
    quoted_chain_id = re.compile(r"(id: )'(\[[AB]\])'")

    def _quote_smiles(match):
        # Single-quoted YAML scalars escape an embedded quote by doubling it.
        value = match.group(2).replace("'", "''")
        return f"{match.group(1)}'{value}'"

    for _, row in df.iterrows():
        smiles = row['SMILES']
        # An empty CSV cell arrives as a float NaN; skip it instead of
        # emitting a ligand entry with a meaningless SMILES.
        if not isinstance(smiles, str) or not smiles.strip():
            continue

        molecule_name = str(row['Molecule Name']).replace(" ", "_")
        entry = create_yaml_entry(smiles)
        output_data = {
            'version': 1,
            'sequences': [
                {'protein': entry['protein']},
                {'ligand': entry['ligand']},
            ],
        }

        # With no stream argument yaml.dump returns the text. An unbounded
        # width keeps every scalar on one line, which the line-based
        # substitutions below rely on.
        contents = yaml.dump(
            output_data,
            default_flow_style=False,
            sort_keys=False,
            width=float("inf"),
        )
        contents = unquoted_smiles.sub(_quote_smiles, contents)
        contents = quoted_chain_id.sub(r"\1\2", contents)

        filename = f"{molecule_name}.yaml"
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(contents)


# Generate the YAML files from the rows of df; each file is written to the
# current working directory as <Molecule_Name>.yaml.
generate_yaml_files(df)

