# Fragmentation

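> Functions for breaking molecules (given as SMILES strings) into ordered fragments along BRICS bonds using the DEFRAGMO fragmentation method.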

In [None]:
#| default_exp fragmentation

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import test_eq, test_fail

In [None]:
#| export
import sys
if '..' not in sys.path:
    sys.path.append('..')
import numpy as np
from rdkit import Chem
from copy import deepcopy
from rdkit.Chem import BRICS
from breadth_first_fragmentation.utilities import mol_from_smiles, mols_from_smiles, mol_to_smiles, root_smiles

In [None]:
#| export
def count_dummies(mol:Chem.rdchem.Mol, # input molecule
                  )->int: # count of dummy atoms
    """Count the dummy atoms in `mol`.

    An atom counts as a dummy when its atomic number is 0; this is how the
    `*` attachment points of a fragment SMILES show up in the molecule.
    """
    return sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 0)

In [None]:
show_doc(count_dummies)

---

[source](https://github.com/panukorn17/breadth-first-fragmentation/blob/main/breadth_first_fragmentation/fragmentation.py#L20){target="_blank" style="float:right; font-size:smaller"}

### count_dummies

>      count_dummies (mol:rdkit.Chem.rdchem.Mol)

*Function to count dummy atoms.*

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| mol | Mol | input molecule |
| **Returns** | **int** | **count of dummy atoms** |

In [None]:
frag = '*c1c(C)cccc1C'
print(count_dummies(mol_from_smiles(frag)))

1


In [None]:
frag = '*NC(*)C'
print(count_dummies(mol_from_smiles(frag)))

2


Unit Tests

In [None]:
test_eq(count_dummies(mol_from_smiles('Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1')),0)
test_eq(count_dummies(mol_from_smiles('*c1c(C)cccc1C')),1)
test_eq(count_dummies(mol_from_smiles('*NC(*)C')),2)

In [None]:
#| export
def get_size(frag:Chem.rdchem.Mol, # input fragment
             )->int: # count of real atoms
    """Count the real atoms of `frag`, i.e. all atoms minus the dummy (`*`) atoms."""
    return frag.GetNumAtoms() - count_dummies(frag)

In [None]:
show_doc(get_size)

---

[source](https://github.com/panukorn17/breadth-first-fragmentation/blob/main/breadth_first_fragmentation/fragmentation.py#L30){target="_blank" style="float:right; font-size:smaller"}

### get_size

>      get_size (frag:rdkit.Chem.rdchem.Mol)

*Function to count real atoms.*

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| frag | Mol | input fragment |
| **Returns** | **int** | **count of real atoms** |

In [None]:
frag = '*c1c(C)cccc1C'
print(get_size(mol_from_smiles(frag)))

8


In [None]:
frag = '*NC(*)C'
print(get_size(mol_from_smiles(frag)))

3


In [None]:
smi = 'Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'
print(get_size(mol_from_smiles(smi)))

27


Unit Tests

In [None]:
test_eq(get_size(mol_from_smiles('*c1c(C)cccc1C')),8)
test_eq(get_size(mol_from_smiles('*NC(*)C')),3)
test_eq(get_size(mol_from_smiles('Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1')),27)


In [None]:
#| export
def replace_last(s:str, # the string (fragment) to which the dummy label * is to be replaced with another fragment
                 old:str, # the string from the fragment s to be replaced
                 new:str, # the string to replace the 'old' string in the fragment s
                 )->str: # the original string s with the replacement
    """Replace the last occurrence of `old` in `s` with `new`.

    Used to splice a fragment into the last dummy label (`*`) of another
    fragment. If `old` does not occur in `s`, `s` is returned unchanged.
    """
    # rfind locates the right-most match, i.e. the last occurrence.
    start = s.rfind(old)
    if start == -1:
        return s
    end = start + len(old)
    return s[:start] + new + s[end:]

In [None]:
show_doc(replace_last)

---

[source](https://github.com/panukorn17/breadth-first-fragmentation/blob/main/breadth_first_fragmentation/fragmentation.py#L39){target="_blank" style="float:right; font-size:smaller"}

### replace_last

>      replace_last (s:str, old:str, new:str)

*Function to replace the last occuring dummy label with a fragment.*

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| s | str | the string (fragment) to which the dummy label * is to be replaced with another fragment |
| old | str | the string from the fragment s to be replaced |
| new | str | the string to replace the 'old' string in the fragment s |
| **Returns** | **str** | **the original string s with the replacement** |

In [None]:
s = 'N(*)C(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'
old = '*'
new = 'c1c(C)cccc1C'
print(replace_last(s, old, new))

N(c1c(C)cccc1C)C(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1


In [None]:
s = 'C(*)(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'
old = '*'
new = 'N*'
print(replace_last(s, old, new))

C(N*)(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1


Unit Tests

In [None]:
test_eq(replace_last('N(*)C(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1', '*', 'c1c(C)cccc1C'), 'N(c1c(C)cccc1C)C(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1')
test_eq(replace_last('C(*)(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1', '*', 'N*'), 'C(N*)(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1')

In [None]:
#| export
def check_reconstruction(frags:list[str], # list of fragments in SMILES format
                         frag_1:str, # head/tail fragment in SMILES format
                         frag_2:str, # head/tail fragment in SMILES format
                         orig_smi:str, # original molecule in SMILES format
                         )->bool: # whether the original molecule was reconstructed
    """Check whether the fragments recombine into the original molecule.

    Starting from `frag_2`, the fragments `frag_1`, `frags[-1]`, ..., `frags[0]`
    are spliced in one after another: the first `*` of the incoming fragment is
    dropped and the rest replaces the last `*` of the molecule built so far,
    which is then re-rooted with `root_smiles`. The result is compared with the
    re-rooted `orig_smi`.

    Any ordinary error raised along the way (e.g. an invalid intermediate
    SMILES) is treated as a failed reconstruction and yields `False`.
    `frags` itself is never modified.
    """
    try:
        # Copy so that the caller's fragment list is left untouched.
        pending = list(frags)
        pending.append(frag_1)

        rebuilt = frag_2
        # Walk backwards: most recently split fragment is re-attached first.
        for frag in reversed(pending):
            joined = replace_last(rebuilt, '*', frag.replace('*', '', 1))
            rebuilt = root_smiles(joined, rootedAtAtom=1)

        return rebuilt == root_smiles(orig_smi, rootedAtAtom=1)
    except Exception:
        # Deliberate best-effort: a failed recombination means "not reconstructed".
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        return False

In [None]:
show_doc(check_reconstruction)

---

[source](https://github.com/panukorn17/breadth-first-fragmentation/blob/main/breadth_first_fragmentation/fragmentation.py#L55){target="_blank" style="float:right; font-size:smaller"}

### check_reconstruction

>      check_reconstruction (frags:list[str], frag_1:str, frag_2:str, orig_smi)

*Function to test whether the original molecule has been reconstructed.*

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| frags | list | list of fragments in SMILES format |
| frag_1 | str | head/tail fragment in SMILES format |
| frag_2 | str | head/tail fragment in SMILES format |
| orig_smi |  | original molecule in SMILES format |
| **Returns** | **bool** | **whether the original molecule was reconstructed** |

In [None]:
frags = []
frag_1 = '*CCC'
frag_2 = 'N(*)(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'
orig_smi = 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'
print(check_reconstruction(frags, frag_1, frag_2, orig_smi))

True


In [None]:
frags = ['*c1c(C)cccc1C', '*N*', '*CC(*)=O']
frag_1 = '*C1CC[NH+](*)CC1'
frag_2 = 'O(*)Cc1ccc(F)cc1'
orig_smi = 'Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'
print(check_reconstruction(frags, frag_1, frag_2, orig_smi))

True


Unit Tests

In [None]:
test_eq(check_reconstruction([], '*CCC', 'N(*)(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'), True)
test_eq(check_reconstruction(['*c1c(C)cccc1C', '*N*', '*CC(*)=O'], '*C1CC[NH+](*)CC1', 'O(*)Cc1ccc(F)cc1', 'Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'), True)
# Negative case: the fragments do not rebuild this (different) original molecule.
# check_reconstruction returns False rather than raising, so compare with test_eq;
# test_fail expects a callable and would pass vacuously when handed a bool.
test_eq(check_reconstruction([], '*CCC', 'N(*)(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', 'C(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'), False)

In [None]:
#| export
def check_bond_no(bonds:list, # the list of BRIC bonds locations
                  frags:list, # the list of fragments
                  frag_list_len:int, # the length of the fragment list
                  smi:str, # the smiles string of the molecule
                  verbose:int=0, # print fragmentation process, set verbose to 1
                  )->tuple: # a tuple containing the fragment list and a boolean value to indicate whether fragmentation is complete
    """Decide whether `smi` is the final fragment, based on its number of BRICS bonds.

    When `bonds` holds more than `frag_list_len` entries, nothing is changed and
    `(frags, False)` is returned. Otherwise the re-rooted `smi` is appended to
    `frags` (in place) and `(frags, True)` is returned.
    """
    # Still more bonds than allowed: fragmentation has to continue.
    if len(bonds) > frag_list_len:
        return frags, False

    if verbose == 1:
        print('Final Fragment: ', smi)
    frags.append(root_smiles(smi, rootedAtAtom=1))
    return frags, True

In [None]:
show_doc(check_bond_no)

---

[source](https://github.com/panukorn17/breadth-first-fragmentation/blob/main/breadth_first_fragmentation/fragmentation.py#L80){target="_blank" style="float:right; font-size:smaller"}

### check_bond_no

>      check_bond_no (bonds:list, frags:list, frag_list_len:int, smi:str,
>                     verbose:int=0)

*This function checks if the molecule has less bonds than the limit of BRIC bonds.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| bonds | list |  | the list of BRIC bonds locations |
| frags | list |  | the list of fragments |
| frag_list_len | int |  | the length of the fragment list |
| smi | str |  | the smiles string of the molecule |
| verbose | int | 0 | print fragmentation process, set verbose to 1 |
| **Returns** | **tuple** |  | **a tuple containing the fragment list and a boolean value to indicate whether fragmentation is complete** |

In [None]:
bonds = []
frags = ['*c1c(C)cccc1C', '*N*', '*CC(*)=O', '*C1CC[NH+](*)CC1', '*O*', '*C*', 'c1(*)ccc(F)cc1']
frags_list_len = 0
smi = 'c1(*)ccc(F)cc1'
print(check_bond_no(bonds, frags, frags_list_len, smi))

(['*c1c(C)cccc1C', '*N*', '*CC(*)=O', '*C1CC[NH+](*)CC1', '*O*', '*C*', 'c1(*)ccc(F)cc1', 'c1(*)ccc(F)cc1'], True)


In [None]:
bonds = [((9, 8), ('1', '5')), ((16, 17), ('3', '4')), ((16, 15), ('3', '15')), ((11, 12), ('4', '5')), ((8, 7), ('5', '16')), ((17, 18), ('8', '16'))]
frags = []
frags_list_len = 0
smi = 'Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'
print(check_bond_no(bonds, frags, frags_list_len, smi))

([], False)


Unit Tests

In [None]:
test_eq(check_bond_no([], ['*c1c(C)cccc1C', '*N*', '*CC(*)=O', '*C1CC[NH+](*)CC1', '*O*', '*C*', 'c1(*)ccc(F)cc1'], 0, 'c1(*)ccc(F)cc1'),(['*c1c(C)cccc1C', '*N*', '*CC(*)=O', '*C1CC[NH+](*)CC1', '*O*', '*C*', 'c1(*)ccc(F)cc1', 'c1(*)ccc(F)cc1'], True))
test_eq(check_bond_no([((9, 8), ('1', '5')), ((16, 17), ('3', '4')), ((16, 15), ('3', '15')), ((11, 12), ('4', '5')), ((8, 7), ('5', '16')), ((17, 18), ('8', '16'))], [], 0, 'c1(*)ccc(F)cc1'),([], False))

In [None]:
#| export
def fragment_recursive(mol_smi_orig:str, # the original smiles string of the molecule
                       mol_smi:str, # the smiles string of the molecule
                       frags:list, # the list of fragments
                       counter:int, # the counter for the recursion
                       frag_list_len:int, # the length of the fragment list
                       min_length:int=0, # the minimum number of atoms in a fragment
                       verbose:int=0, # print fragmentation process, set verbose to 1
                       )->list: # the list of fragments
    """Recursively fragment a molecule using the DEFRAGMO fragmentation method.

    The BRICS bonds of `mol_smi` are tried one at a time in ascending bond-index
    order. A bond is accepted when one side of the cut has at most
    `frag_list_len` BRICS bonds, has at least `min_length` real atoms, and the
    fragments found so far still recombine into `mol_smi_orig`. The accepted
    side is appended to `frags` and the function recurses on the other side.
    If the last bond is reached without an accepted cut, the function recurses
    on the same molecule with `frag_list_len + 1`.

    `frags` is mutated in place. The function returns `frags` once a final
    fragment has been appended; it returns `None` implicitly when the loop
    ends without completing or when any exception is raised (exceptions are
    swallowed), so callers should read the result from `frags`.
    `counter` is incremented and forwarded to recursive calls but is not
    otherwise read here.
    """
    fragComplete = False
    try:
        counter += 1
        mol = mol_from_smiles(mol_smi)
        # each entry is ((atom_idx_1, atom_idx_2), (label_1, label_2))
        bonds = list(BRICS.FindBRICSBonds(mol))

        # Check if the mol has less bonds than the limit of BRIC bonds
        # (if so, check_bond_no has already appended mol_smi as the final fragment)
        frags, fragComplete = check_bond_no(bonds, frags, frag_list_len, mol_smi, verbose)
        if fragComplete:
            return frags

        # split the bond tuples into atom-index pairs and label pairs (labs is unused below)
        idxs, labs = list(zip(*bonds))

        # translate atom-index pairs into RDKit bond indices
        bond_idxs = []
        for a1, a2 in idxs:
            bond = mol.GetBondBetweenAtoms(a1, a2)
            bond_idxs.append(bond.GetIdx())

        # process candidate bonds in ascending bond-index order
        order = np.argsort(bond_idxs).tolist()
        bond_idxs = [bond_idxs[i] for i in order]
        for bond in bond_idxs:
            # cut exactly one bond; both cut ends get an unlabelled dummy atom
            broken = Chem.FragmentOnBonds(mol,
                                        bondIndices=[bond],
                                        dummyLabels=[(0, 0)])
            # unpacking assumes the cut yields exactly two pieces; anything else
            # raises and is swallowed by the except below
            head, tail = Chem.GetMolFrags(broken, asMols=True)
            head_bric_bond_no = len(list(BRICS.FindBRICSBonds(head)))
            tail_bric_bond_no = len(list(BRICS.FindBRICSBonds(tail)))
            
            # head is tried first; the tail branch is only considered when the
            # head has too many BRICS bonds (elif), not when the head is rejected
            if head_bric_bond_no <= frag_list_len:
                head_smi = mol_to_smiles(head)
                tail_smi = mol_to_smiles(tail, rootedAtAtom=1)
                # `&` on two bools: both operands are always evaluated (no short-circuit)
                if check_reconstruction(frags, head_smi, tail_smi, mol_smi_orig) & (get_size(head) >= min_length):
                    if verbose == 1:
                        print('Head fragment: ', head_smi)
                        print('Recurse tail: ', tail_smi)
                    frags.append(head_smi)
                    # the recursive call returns the fragment list or None; it is
                    # stored in fragComplete and used only for its truthiness
                    fragComplete = fragment_recursive(mol_smi_orig, tail_smi, frags, counter, frag_list_len = 0, min_length=min_length, verbose=verbose)  
                    if fragComplete:
                        return frags
                # if reconstruction fails, and there is only one bond, then add the fragment to the fragment list
                elif (len(bond_idxs) == 1) & (get_size(mol_from_smiles(mol_smi)) >= min_length):
                    if verbose == 1:
                        print('Final Fragment: ', mol_smi)
                    frags.append(root_smiles(mol_smi, rootedAtAtom=1))
                    fragComplete = True
                    return frags
                # last candidate bond and no accepted cut: retry the same molecule
                # while allowing one more BRICS bond in the split-off fragment
                elif bond == bond_idxs[-1]:
                    fragComplete = fragment_recursive(mol_smi_orig, root_smiles(mol_smi, rootedAtAtom=1), frags, counter, frag_list_len + 1, min_length=min_length, verbose=verbose)
                    if fragComplete:
                        return frags
            # mirror of the head branch with the roles of head and tail swapped
            elif tail_bric_bond_no <= frag_list_len:
                tail_smi = mol_to_smiles(tail)
                head_smi = mol_to_smiles(head, rootedAtAtom=1)
                if check_reconstruction(frags, tail_smi, head_smi, mol_smi_orig) & (get_size(tail) >= min_length):
                    if verbose == 1:
                        print('Tail: ', tail_smi)
                        print('Recurse Head: ', head_smi)
                    frags.append(tail_smi)
                    fragComplete = fragment_recursive(mol_smi_orig, head_smi, frags, counter, frag_list_len = 0, min_length=min_length, verbose=verbose)  
                    if fragComplete:
                        return frags
                # single remaining bond that cannot be cut: keep the molecule as the final fragment
                elif (len(bond_idxs) == 1) & (get_size(mol_from_smiles(mol_smi)) >= min_length):
                    if verbose == 1:
                        print('Final fragment: ', mol_smi)
                    frags.append(root_smiles(mol_smi, rootedAtAtom=1))
                    fragComplete = True
                    return frags
                # last candidate bond and no accepted cut: relax the bond limit and retry
                elif bond == bond_idxs[-1]:
                    fragComplete = fragment_recursive(mol_smi_orig, root_smiles(mol_smi, rootedAtAtom=1), frags, counter, frag_list_len + 1, min_length=min_length, verbose=verbose)
                    if fragComplete:
                        return frags
    # NOTE(review): every error is silently swallowed and the function then
    # returns None; fragments appended to `frags` before the error remain in it
    except Exception:
        pass

In [None]:
show_doc(fragment_recursive)

---

[source](https://github.com/panukorn17/breadth-first-fragmentation/blob/main/breadth_first_fragmentation/fragmentation.py#L98){target="_blank" style="float:right; font-size:smaller"}

### fragment_recursive

>      fragment_recursive (mol_smi_orig:str, mol_smi:str, frags:list,
>                          counter:int, frag_list_len:int, min_length:int=0,
>                          verbose:int=0)

*This recursive function fragments a molecule using the DEFRAGMO fragmentation method.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| mol_smi_orig | str |  | the original smiles string of the molecule |
| mol_smi | str |  | the smiles string of the molecule |
| frags | list |  | the list of fragments |
| counter | int |  | the counter for the recursion |
| frag_list_len | int |  | the length of the fragment list |
| min_length | int | 0 | the minimum number of atoms in a fragment |
| verbose | int | 0 | print fragmentation process, set verbose to 1 |
| **Returns** | **list** |  | **the list of fragments** |

In [None]:
smi = 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'
frags = []
print(fragment_recursive(smi, smi, frags, 0, 0, min_length=0, verbose=1))

Head fragment:  *CCC
Recurse tail:  N(*)(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Tail:  *CCc1cccc(-c2ccccc2)c1
Recurse Head:  N(*)(*)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *N(*)*
Recurse tail:  C(*)(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *C(*)=O
Recurse tail:  C1(*)OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *NC1C(N)C=C(C(=O)O)OC1*
Recurse tail:  C(*)(C)=O
Final Fragment:  C(*)(C)=O
['*CCC', '*CCc1cccc(-c2ccccc2)c1', '*N(*)*', '*C(*)=O', '*NC1C(N)C=C(C(=O)O)OC1*', 'C(*)(C)=O']


In [None]:
smi = 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'
frags = []
print(fragment_recursive(smi, smi, frags, 0, 0, min_length=3, verbose=1))

Head fragment:  *CCC
Recurse tail:  N(*)(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Tail:  *CCc1cccc(-c2ccccc2)c1
Recurse Head:  N(*)(*)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *C(=O)N(*)*
Recurse tail:  C1(*)OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *NC1C(N)C=C(C(=O)O)OC1*
Recurse tail:  C(*)(C)=O
Final Fragment:  C(*)(C)=O
['*CCC', '*CCc1cccc(-c2ccccc2)c1', '*C(=O)N(*)*', '*NC1C(N)C=C(C(=O)O)OC1*', 'C(*)(C)=O']


In [None]:
smi = 'COc1ccc(C(=O)Nc2cc(C(C)C)c(O)c(S(=O)(=O)c3ccc(OC)cc3)c2C)cc1'
frags = []
print(fragment_recursive(smi, smi, frags, 0, 0, min_length=0, verbose=1))

Head fragment:  *OC
Recurse tail:  c1(*)ccc(C(=O)Nc2cc(C(C)C)c(O)c(S(=O)(=O)c3ccc(OC)cc3)c2C)cc1
Head fragment:  *c1ccc(*)cc1
Recurse tail:  C(*)(=O)Nc1cc(C(C)C)c(O)c(S(=O)(=O)c2ccc(OC)cc2)c1C
Head fragment:  *C(*)=O
Recurse tail:  N(*)c1cc(C(C)C)c(O)c(S(=O)(=O)c2ccc(OC)cc2)c1C
Head fragment:  *N*
Recurse tail:  c1(*)cc(C(C)C)c(O)c(S(=O)(=O)c2ccc(OC)cc2)c1C
Tail:  *C(C)C
Recurse Head:  c1(*)cc(*)c(O)c(S(=O)(=O)c2ccc(OC)cc2)c1C
Head fragment:  *c1ccc(S(=O)(=O)c2c(C)c(*)cc(*)c2O)cc1
Recurse tail:  O(*)C
Final Fragment:  O(*)C
['*OC', '*c1ccc(*)cc1', '*C(*)=O', '*N*', '*C(C)C', '*c1ccc(S(=O)(=O)c2c(C)c(*)cc(*)c2O)cc1', 'O(*)C']


Unit Tests

In [None]:
test_eq(fragment_recursive('CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', [], 0, 0, min_length=0), ['*CCC', '*CCc1cccc(-c2ccccc2)c1', '*N(*)*', '*C(*)=O', '*NC1C(N)C=C(C(=O)O)OC1*', 'C(*)(C)=O'])
test_eq(fragment_recursive('CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', [], 0, 0, min_length=3), ['*CCC', '*CCc1cccc(-c2ccccc2)c1', '*C(=O)N(*)*', '*NC1C(N)C=C(C(=O)O)OC1*', 'C(*)(C)=O'])
test_eq(fragment_recursive('COc1ccc(C(=O)Nc2cc(C(C)C)c(O)c(S(=O)(=O)c3ccc(OC)cc3)c2C)cc1', 'COc1ccc(C(=O)Nc2cc(C(C)C)c(O)c(S(=O)(=O)c3ccc(OC)cc3)c2C)cc1', [], 0, 0, min_length=0), ['*OC', '*c1ccc(*)cc1', '*C(*)=O', '*N*', '*C(C)C', '*c1ccc(S(=O)(=O)c2c(C)c(*)cc(*)c2O)cc1', 'O(*)C'])

In [None]:
#| export
def break_into_fragments_defragmo(mol:Chem.rdchem.Mol, # the molecule object
                                  smi:str, # the smiles string of the molecule
                                  min_length: int=0, # the minimum number of atoms in a fragment
                                  verbose:int=0, # print fragmentation process, set verbose to 1
                                  )->tuple: # a tuple containing the original smiles, the fragmented smiles, and the number of fragments
    """Break a molecule into fragments using the DEFRAGMO fragmentation method.

    Returns `(smi, fragments, n)` where `n` is the number of fragments found and
    `fragments` is `np.nan` when `n == 0`, `smi` itself when `n == 1`, and the
    space-separated fragment SMILES otherwise.

    Only `smi` drives the fragmentation; the `mol` argument is not used here.
    """
    collected = []
    # fragment_recursive fills `collected` in place; its return value is ignored.
    fragment_recursive(smi, smi, collected, 0, 0, min_length=min_length, verbose=verbose)

    n_frags = len(collected)
    if n_frags == 0:
        # no fragments are found
        fragmented = np.nan
    elif n_frags == 1:
        # only one fragment is found: report the molecule itself
        fragmented = smi
    else:
        fragmented = ' '.join(collected)

    return smi, fragmented, n_frags

In [None]:
show_doc(break_into_fragments_defragmo)

---

[source](https://github.com/panukorn17/breadth-first-fragmentation/blob/main/breadth_first_fragmentation/fragmentation.py#L182){target="_blank" style="float:right; font-size:smaller"}

### break_into_fragments_defragmo

>      break_into_fragments_defragmo (mol:rdkit.Chem.rdchem.Mol, smi:str,
>                                     min_length:int=0, verbose:int=0)

*This function breaks a molecule into fragments using the DEFRAGMO fragmentation method.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| mol | Mol |  | the molecule object |
| smi | str |  | the smiles string of the molecule |
| min_length | int | 0 | the minimum number of atoms in a fragment |
| verbose | int | 0 | print fragmentation process, set verbose to 1 |
| **Returns** | **tuple** |  | **a tuple containing the original smiles, the fragmented smiles, and the number of fragments** |

In [None]:
smi = 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'
mol = mol_from_smiles(smi)
break_into_fragments_defragmo(mol, smi, min_length=0, verbose=1)

Head fragment:  *CCC
Recurse tail:  N(*)(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Tail:  *CCc1cccc(-c2ccccc2)c1
Recurse Head:  N(*)(*)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *N(*)*
Recurse tail:  C(*)(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *C(*)=O
Recurse tail:  C1(*)OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *NC1C(N)C=C(C(=O)O)OC1*
Recurse tail:  C(*)(C)=O
Final Fragment:  C(*)(C)=O


('CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O',
 '*CCC *CCc1cccc(-c2ccccc2)c1 *N(*)* *C(*)=O *NC1C(N)C=C(C(=O)O)OC1* C(*)(C)=O',
 6)

In [None]:
smi = 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'
mol = mol_from_smiles(smi)
break_into_fragments_defragmo(mol, smi, min_length=3, verbose=1)

Head fragment:  *CCC
Recurse tail:  N(*)(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Tail:  *CCc1cccc(-c2ccccc2)c1
Recurse Head:  N(*)(*)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *C(=O)N(*)*
Recurse tail:  C1(*)OC(C(=O)O)=CC(N)C1NC(C)=O
Head fragment:  *NC1C(N)C=C(C(=O)O)OC1*
Recurse tail:  C(*)(C)=O
Final Fragment:  C(*)(C)=O


('CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O',
 '*CCC *CCc1cccc(-c2ccccc2)c1 *C(=O)N(*)* *NC1C(N)C=C(C(=O)O)OC1* C(*)(C)=O',
 5)

In [None]:
smi = 'Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'
mol = mol_from_smiles(smi)
break_into_fragments_defragmo(mol, smi, min_length=0, verbose=1)


Head fragment:  *c1c(C)cccc1C
Recurse tail:  N(*)C(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1
Head fragment:  *N*
Recurse tail:  C(*)(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1
Head fragment:  *CC(*)=O
Recurse tail:  [NH+]1(*)CCC(OCc2ccc(F)cc2)CC1
Head fragment:  *C1CC[NH+](*)CC1
Recurse tail:  O(*)Cc1ccc(F)cc1
Head fragment:  *O*
Recurse tail:  C(*)c1ccc(F)cc1
Head fragment:  *C*
Recurse tail:  c1(*)ccc(F)cc1
Final Fragment:  c1(*)ccc(F)cc1


('Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1',
 '*c1c(C)cccc1C *N* *CC(*)=O *C1CC[NH+](*)CC1 *O* *C* c1(*)ccc(F)cc1',
 7)

In [None]:
smi = 'Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'
mol = mol_from_smiles(smi)
break_into_fragments_defragmo(mol, smi, min_length=3, verbose=1)

Head fragment:  *c1c(C)cccc1C
Recurse tail:  N(*)C(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1
Head fragment:  *CC(=O)N*
Recurse tail:  [NH+]1(*)CCC(OCc2ccc(F)cc2)CC1
Head fragment:  *C1CC[NH+](*)CC1
Recurse tail:  O(*)Cc1ccc(F)cc1
Final Fragment:  O(*)Cc1ccc(F)cc1


('Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1',
 '*c1c(C)cccc1C *CC(=O)N* *C1CC[NH+](*)CC1 O(*)Cc1ccc(F)cc1',
 4)

Unit Tests

In [None]:
test_eq(break_into_fragments_defragmo(mol_from_smiles('CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'),'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'), ('CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', '*CCC *CCc1cccc(-c2ccccc2)c1 *N(*)* *C(*)=O *NC1C(N)C=C(C(=O)O)OC1* C(*)(C)=O', 6))
test_eq(break_into_fragments_defragmo(mol_from_smiles('CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O'),'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', min_length=3), ('CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', '*CCC *CCc1cccc(-c2ccccc2)c1 *C(=O)N(*)* *NC1C(N)C=C(C(=O)O)OC1* C(*)(C)=O', 5))
test_eq(break_into_fragments_defragmo(mol_from_smiles('Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'),'Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'), ('Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1', '*c1c(C)cccc1C *N* *CC(*)=O *C1CC[NH+](*)CC1 *O* *C* c1(*)ccc(F)cc1', 7))
test_eq(break_into_fragments_defragmo(mol_from_smiles('Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1'),'Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1', min_length=3), ('Cc1cccc(C)c1NC(=O)C[NH+]1CCC(OCc2ccc(F)cc2)CC1', '*c1c(C)cccc1C *CC(=O)N* *C1CC[NH+](*)CC1 O(*)Cc1ccc(F)cc1', 4))

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()