In [1]:
import sys
# Relative location of the repository root; other cells build data paths by
# concatenating onto this string, so it keeps its trailing slash.
repo_root = "../"

# Put the repository root at the front of the import search path, but only if
# it is not listed yet, so re-running this cell does not add duplicates.
if sys.path.count(repo_root) == 0:
    sys.path.insert(0, repo_root)

In [2]:
%load_ext autoreload
%autoreload 2
from rdkit import Chem
from rdkit.Chem import AllChem
from utils import HELMConverter, draw_mol, get_main_mol, remove_isotopes, is_same_mol

# File names of the monomer libraries; each is looked up under
# data/helm/library/ relative to repo_root.
lib_files = ["chembl_35_monomer_library.xml", "modan.json"]

# All library paths are handed to HELMConverter.load in a single call, in the
# order listed above; `converter` is bound to whatever load() returns.
converter = HELMConverter().load(
    *(repo_root + f"data/helm/library/{name}" for name in lib_files)
)

  from .autonotebook import tqdm as notebook_tqdm


Loading BioT5 models...
Model loading completed.


In [3]:
import re

def split_modan_pep_seq(seq: str) -> list[str]:
    """Tokenize a MODAN-style peptide sequence string.

    A leading "H-" is removed before tokenizing. The remaining text is
    scanned left to right and cut into the following tokens, tried in this
    order at every position:

    * the separator "-";
    * the multi-character residue/terminus codes NH2, X0=, X0, X1, X2, S5, R8;
    * the acyl caps Ac, CH3(CH2)4CO, CH3(CH2)10CO, CH3(CH2)16CO;
    * any single upper-case ASCII letter;
    * any single lower-case ASCII letter.

    Args:
        seq: The sequence string to split.

    Returns:
        The tokens in order of appearance ("-" separators included).
        Characters that match none of the alternatives (digits on their own,
        spaces, ...) are skipped without an error, so joining the tokens does
        not necessarily reproduce the input.
    """
    body = seq[2:] if seq.startswith("H-") else seq

    # Longer codes are listed before the single-letter fallbacks so that, for
    # example, "X0=" is not split into "X" followed by unmatched characters.
    pattern = (
        r"("
        r"\-"
        r"|NH2|X0=|X0|X1|X2|S5|R8"
        r"|Ac|CH3\(CH2\)4CO|CH3\(CH2\)10CO|CH3\(CH2\)16CO"
        r"|[A-Z]"
        r"|[a-z]"
        r")"
    )
    return [match.group(1) for match in re.finditer(pattern, body)]

def helm_from_modan_pep_seq(seq: str) -> str:
    """Convert a MODAN-style peptide sequence string into a HELM string.

    The sequence is tokenized with ``split_modan_pep_seq`` and every token
    except the "-" separator becomes one monomer of a single ``PEPTIDE1``
    polymer:

    * tokens listed in the internal mapping are replaced by their HELM
      monomer symbol (e.g. "O" -> "[Orn]", "NH2" -> "[am]", "Ac" -> "[ac]");
    * any other upper-case letter is emitted unchanged;
    * any other lower-case letter ``x`` is emitted as ``[dX]``.

    The tokens "S5", "R8" and "X0=" mark bridging residues. When exactly two
    of them are present, one connection between their 1-based monomer
    positions is appended in the form
    ``PEPTIDE1,PEPTIDE1,<left>:R3-c-<right>:R3``. Positions count every
    emitted monomer, including terminal caps.

    Args:
        seq: MODAN-style sequence, e.g. "H-GIOOFLKSUOOFVOUFO-NH2".

    Returns:
        The HELM string, ending in "$$$$" when there is no bridge and in
        "$<connection>$$$" when there is one.

    Raises:
        ValueError: If the sequence contains no monomer tokens, or if the
            number of bridging residues is neither 0 nor 2 (a single
            connection cannot describe any other count).
    """
    token_dict = {"g": "G", #G doesn't have D-form
        "B": "[Ac5c]", "O": "[Orn]", "U": "[Aib]", "Z": "[Ac6c]", 
        "X0": "[OAllylHse]", "X0=": "[OAllylHse]", "J": "[Dab]", "X1": "[Dab]", "X2": "[Sar]", 
        "S5": "[S5]", "R8": "[R8]", 
        "NH2": "[am]", "Ac": "[ac]",
        "CH3(CH2)4CO": "[Pentanoyl]", "CH3(CH2)10CO": "[Dodecanoyl]", "CH3(CH2)16CO": "[Octadecanoyl]"}
    bridge_tokens = ("S5", "R8", "X0=")

    monomers = []
    bridge_positions = []
    for t in split_modan_pep_seq(seq):
        if t == "-":
            # Separators carry no monomer and do not advance the position.
            continue
        if t in bridge_tokens:
            # HELM positions are 1-based; this monomer is appended just below.
            bridge_positions.append(len(monomers) + 1)
        if t in token_dict:
            monomers.append(token_dict[t])
        elif t.isupper():
            monomers.append(t)
        else:
            monomers.append("[d" + t.upper() + "]")

    if not monomers:
        # Stripping a trailing "." from "PEPTIDE1{" used to eat the brace and
        # produce "PEPTIDE1}$$$$"; refuse instead of returning broken HELM.
        raise ValueError(f"no monomers found in peptide sequence {seq!r}")
    if len(bridge_positions) not in (0, 2):
        # One bridging residue used to yield the index -1, and three or more
        # silently dropped the inner ones; neither result is meaningful.
        raise ValueError(
            f"expected 0 or 2 bridging residues (S5/R8/X0=) in {seq!r}, "
            f"found {len(bridge_positions)}"
        )

    # Joining avoids the trailing-separator bookkeeping of string appends.
    helm = "PEPTIDE1{" + ".".join(monomers) + "}$"
    if not bridge_positions:
        return helm + "$$$"
    bridge_left_idx, bridge_right_idx = bridge_positions
    return helm + "PEPTIDE1,PEPTIDE1," + str(bridge_left_idx) + ":R3-c-" + str(bridge_right_idx) + ":R3$$$"

In [4]:
import pandas as pd

# Raw MODAN spreadsheet; the first column of "Sheet1" is used as the row index.
modan_raw_path = repo_root + "data/helm/Dataset_MODAN_initial.xlsx"
df = pd.read_excel(modan_raw_path, sheet_name="Sheet1", index_col=0)

# Consistency check: build a molecule from the HELM derived from each peptide
# sequence and compare it with the molecule parsed from the row's SMILES.
# Only rows where the two differ are reported.
for idx, r in df.iterrows():
    pep = r["Peptide sequence"]
    smiles = r["SMILES"]

    helm = helm_from_modan_pep_seq(pep)
    mol_h = converter.convert(helm)
    mol_s = Chem.MolFromSmiles(smiles)

    if is_same_mol(mol_h, mol_s):
        continue
    print("unmatch: ", idx, pep, helm, smiles)

unmatch:  44 H-GIOOFLKSUOOFVOUFO-NH2 PEPTIDE1{G.I.[Orn].[Orn].F.L.K.S.[Aib].[Orn].[Orn].F.V.[Orn].[Aib].F.[Orn].[am]}$$$$ [H]NCC(N[C@]([C@@H](C)CC)([H])C(N[C@@H](CCCN)C(N[C@@H](CCCN)C(N[C@H](C(N[C@@H](CC(C)C)C(N[C@@H](CCCN)C(N[C@@H](CO)C(NC(C(N[C@@H](CCCN)C(N[C@@H](CCCN)C(N[C@H](C(N[C@@H](C(C)C)C(N[C@@H](CCCN)C(NC(C(N[C@H](C(N[C@@H](CCCN)C(N)=O)=O)CC1=CC=CC=C1)=O)(C)C)=O)=O)=O)CC2=CC=CC=C2)=O)=O)=O)(C)C)=O)=O)=O)=O)CC3=CC=CC=C3)=O)=O)=O)=O
unmatch:  57 H-GIKWFLKSUWKFVWUFK-NH2 PEPTIDE1{G.I.K.W.F.L.K.S.[Aib].W.K.F.V.W.[Aib].F.K.[am]}$$$$ [H]NCC(N[C@]([C@@H](C)CC)([H])C(N[C@H](C(N[C@H](C(N[C@H](C(N[C@@H](CC(C)C)C(N[C@H](C(N[C@@H](CO)C(NC(C(N[C@H](C(N[C@H](C(N[C@H](C(N[C@@H](C(C)C)C(N[C@H](C(NC(C(N[C@H](C(N[C@H](C(N)=O)C)=O)CC1=CC=CC=C1)=O)(C)C)=O)CC2=CNC3=C2C=CC=C3)=O)=O)CC4=CC=CC=C4)=O)C)=O)CC5=CNC6=C5C=CC=C6)=O)(C)C)=O)=O)C)=O)=O)CC7=CC=CC=C7)=O)CC8=CNC9=C8C=CC=C9)=O)C)=O)=O
