In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import copy
import os

# Cheminformatics stack
import rdkit
from rdkit import Chem
from rdkit.Chem import rdDistGeom
from rdkit.Chem import AllChem
from rdkit.Chem import FragmentMatcher
from rdkit.Chem import rdFMCS
from glob import glob

# For segfault
import faulthandler
import signal
from subprocess import Popen, PIPE
from rdkit.Chem import rdMMPA

In [2]:
import stk


def _bromo_building_block(smiles, label, attachment_labels=()):
    """
    Build an stk building block with bromo functional groups from a SMILES string.

    Parameters:
    smiles: str, fragment SMILES
    label: str, short name of the fragment, only used in the error message
    attachment_labels: iterable of str, dummy-atom tokens (e.g. "[*:2]") that are
        rewritten to "Br" before the building block is created

    Returns:
    The stk.BuildingBlock, or None if the SMILES could not be processed
    (including a non-string value such as None or NaN).
    """
    try:
        for dummy_atom in attachment_labels:
            smiles = smiles.replace(dummy_atom, "Br")
        return stk.BuildingBlock(
            smiles=smiles, functional_groups=[stk.BromoFactory()]
        )
    except Exception:
        # Best effort: report the offending fragment and let the caller skip it.
        print(f"Error with {label} block")
        print(smiles)
        return None


def build_molecule_from_frag(
    right_block_smiles, photochromic_block_smiles, catalytic_block_smiles,
    inhibitor_block
):
    """
    Assemble one molecule from 4 fragments using the stk library.

    The fragments are joined as a linear chain in the order
    inhibitor - photochromic - catalytic (linker) - right block.

    Parameters:
    right_block_smiles: str, right block SMILES; "[*:2]" marks the attachment point
    photochromic_block_smiles: str, photochromic block SMILES, used as given
        (presumably it already carries Br atoms at its attachment points - confirm)
    catalytic_block_smiles: str, catalytic block SMILES; "[*:1]" and "[*:2]" mark
        the attachment points
    inhibitor_block: str, inhibitor block SMILES; "[*:2]" marks the attachment point

    Returns:
    The stk.ConstructedMolecule, or None if any fragment or the assembly failed
    (an error message is printed in that case).
    """
    bb_right = _bromo_building_block(right_block_smiles, "right", ("[*:2]",))
    if bb_right is None:
        return None

    bb_photo = _bromo_building_block(photochromic_block_smiles, "photo")
    if bb_photo is None:
        return None

    bb_linker = _bromo_building_block(
        catalytic_block_smiles, "linker", ("[*:1]", "[*:2]")
    )
    if bb_linker is None:
        return None

    bb_inhibitor = _bromo_building_block(inhibitor_block, "inhibitor", ("[*:2]",))
    if bb_inhibitor is None:
        return None

    try:
        return stk.ConstructedMolecule(
            stk.polymer.Linear(
                building_blocks=[bb_inhibitor, bb_photo, bb_linker, bb_right],
                repeating_unit="ABCD",
                num_repeating_units=1,
                optimizer=stk.MCHammer(),
            )
        )
    except Exception as error:
        print("Error with constructed molecule")
        print(repr(error))
        return None

In [26]:
# Display the catalyst / right-block fragment table.
# NOTE(review): in the visible notebook this name is only assigned in a later
# cell (pd.read_csv('data/catalyst_right_blocks.csv')), so this cell relies on
# existing kernel state - confirm the intended cell order.
df_catalysts_right_block


Unnamed: 0,smiles,linkers_smiles,catalytic_block,right_block,catalytic_block_smiles,right_block_smiles,number_of_atoms_right_block,number_of_atoms_catalytic_block
0,c1ccc([*:2])cc1,[N]([N][*:2])[*:1],<rdkit.Chem.rdchem.Mol object at 0x704860b61970>,<rdkit.Chem.rdchem.Mol object at 0x704860b63c80>,[N]([N][*:2])[*:1],c1ccc([*:2])cc1,7,4
1,c1n[nH]cc1[*:2],[N]([N][*:2])[*:1],<rdkit.Chem.rdchem.Mol object at 0x704860b619e0>,<rdkit.Chem.rdchem.Mol object at 0x704860b63cf0>,[N]([N][*:2])[*:1],c1n[nH]cc1[*:2],6,4
2,c1cc([*:2])[nH]n1,[N]([N][*:2])[*:1],<rdkit.Chem.rdchem.Mol object at 0x704860b61a50>,<rdkit.Chem.rdchem.Mol object at 0x704860b63d60>,[N]([N][*:2])[*:1],c1cc([*:2])[nH]n1,6,4
3,c1c[nH]c([*:2])n1,[N]([N][*:2])[*:1],<rdkit.Chem.rdchem.Mol object at 0x704860b61ac0>,<rdkit.Chem.rdchem.Mol object at 0x704860b63dd0>,[N]([N][*:2])[*:1],c1c[nH]c([*:2])n1,6,4
4,c1ncc([*:2])[nH]1,[N]([N][*:2])[*:1],<rdkit.Chem.rdchem.Mol object at 0x704860b61b30>,<rdkit.Chem.rdchem.Mol object at 0x704860b63e40>,[N]([N][*:2])[*:1],c1ncc([*:2])[nH]1,6,4
...,...,...,...,...,...,...,...,...
110,O=C(NN=Cc1ncccc1-c1ccc([*:2])cc1)c1ccccc1,O=C1C(=O)[C@@H]([*:1])[C@@H]1NCc1cccc([*:2])c1,<rdkit.Chem.rdchem.Mol object at 0x704860b63a50>,<rdkit.Chem.rdchem.Mol object at 0x704860b66d60>,O=C1C(=O)[C@@H]([*:1])[C@@H]1NCc1cccc([*:2])c1,O=C(NN=Cc1ncccc1-c1ccc([*:2])cc1)c1ccccc1,24,16
111,O=C(NN=Cc1ncccc1-c1ccc(-c2cccc([*:2])c2)cc1)c1...,O=C1C(=O)[C@@H]([*:1])[C@@H]1NC[*:2],<rdkit.Chem.rdchem.Mol object at 0x704860b63ac0>,<rdkit.Chem.rdchem.Mol object at 0x704860b66dd0>,O=C1C(=O)[C@@H]([*:1])[C@@H]1NC[*:2],O=C(NN=Cc1ncccc1-c1ccc(-c2cccc([*:2])c2)cc1)c1...,30,10
112,c1ccc(N=Nc2ccc([*:2])cc2)cc1,O=C(Nc1ccncc1)[C@H](C[*:2])[*:1],<rdkit.Chem.rdchem.Mol object at 0x704860b63b30>,<rdkit.Chem.rdchem.Mol object at 0x704860b66e40>,O=C(Nc1ccncc1)[C@H](C[*:2])[*:1],c1ccc(N=Nc2ccc([*:2])cc2)cc1,15,13
113,c1ccc(N=Nc2ccc(C[*:2])cc2)cc1,O=C(N[C@H]([*:1])[*:2])c1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x704860b63ba0>,<rdkit.Chem.rdchem.Mol object at 0x704860b66eb0>,O=C(N[C@H]([*:1])[*:2])c1ccccc1,c1ccc(N=Nc2ccc(C[*:2])cc2)cc1,16,12


Unnamed: 0,linkers_smiles,catalytic_block
0,O=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f4890>
1,O=C(NC[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f4900>
2,O=C(NC[*:1])NC[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f4970>
3,O=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f49e0>
4,O=C(Nc1ccc([*:1])cc1)N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f4a50>
...,...,...
11455,S=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048605b3ba0>
11456,S=C(Nc1nc([*:1])nc2ccccc12)N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048605b3c10>
11457,S=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048605b3c80>
11458,S=C(Nc1ccccc1[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048605b3cf0>


In [28]:
# Load the catalyst fragment pairs: 'smiles' holds the right block and
# 'linkers_smiles' holds the catalytic block of each row.
df_catalysts_right_block = pd.read_csv('data/catalyst_right_blocks.csv')

# Parse both SMILES columns into RDKit molecules and keep explicitly named copies
# of the SMILES strings.
df_catalysts_right_block['catalytic_block'] = df_catalysts_right_block['linkers_smiles'].apply(Chem.MolFromSmiles)
df_catalysts_right_block['right_block'] = df_catalysts_right_block['smiles'].apply(Chem.MolFromSmiles)
df_catalysts_right_block['catalytic_block_smiles'] = df_catalysts_right_block['linkers_smiles']
df_catalysts_right_block['right_block_smiles'] = df_catalysts_right_block['smiles']
df_catalysts_right_block['number_of_atoms_right_block'] = df_catalysts_right_block['right_block'].apply(lambda x: x.GetNumAtoms())
df_catalysts_right_block['number_of_atoms_catalytic_block'] = df_catalysts_right_block['catalytic_block'].apply(lambda x: x.GetNumAtoms())

# Unique right blocks (only the SMILES column is kept and exported).
df_catalysts_right_block_unique = df_catalysts_right_block.drop_duplicates(subset=['smiles'])
df_catalysts_right_block_unique = df_catalysts_right_block_unique[['smiles']]

# Unique catalytic blocks. df_linkers has no 'smiles' column, so the
# de-duplication key must be 'linkers_smiles' (subset=['smiles'] raises KeyError).
df_linkers = df_catalysts_right_block[['linkers_smiles','catalytic_block']]
df_catalysts_linker_unique = df_linkers.drop_duplicates(subset=['linkers_smiles'])

df_catalysts_right_block_unique.to_csv('data/catalyst_right_blocks_unique.csv', index=False)
df_catalysts_linker_unique.to_csv('data/catalyst_linkers_unique.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'data/catalyst_linkers_unique.csv'

In [29]:
# Display the right-block table after de-duplication on 'smiles'.
df_catalysts_right_block_unique

Unnamed: 0,right_block,smiles,linkers_smiles,catalytic_block,catalytic_block_smiles,right_block_smiles,number_of_atoms_right_block,number_of_atoms_catalytic_block
0,<rdkit.Chem.rdchem.Mol object at 0x7048605b3dd0>,C#CC[*:2],O=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f4890>,O=C(N[*:1])N[*:2],C#CC[*:2],4,6
2,<rdkit.Chem.rdchem.Mol object at 0x7048605b3eb0>,C#C[*:2],O=C(NC[*:1])NC[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f4970>,O=C(NC[*:1])NC[*:2],C#C[*:2],3,8
3,<rdkit.Chem.rdchem.Mol object at 0x7048605b3f20>,O=[N+]([O-])c1ccc([*:2])cc1,O=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f49e0>,O=C(N[*:1])N[*:2],O=[N+]([O-])c1ccc([*:2])cc1,10,6
4,<rdkit.Chem.rdchem.Mol object at 0x7048605b3f90>,C#Cc1ccc([*:2])cc1,O=C(Nc1ccc([*:1])cc1)N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048606f4a50>,O=C(Nc1ccc([*:1])cc1)N[*:2],C#Cc1ccc([*:2])cc1,9,12
6,<rdkit.Chem.rdchem.Mol object at 0x7048605b50b0>,O=[N+]([O-])[*:2],O=C(Nc1ccc([*:1])cc1)Nc1ccc([*:2])cc1,<rdkit.Chem.rdchem.Mol object at 0x7048606f4b30>,O=C(Nc1ccc([*:1])cc1)Nc1ccc([*:2])cc1,O=[N+]([O-])[*:2],4,18
...,...,...,...,...,...,...,...,...
11430,<rdkit.Chem.rdchem.Mol object at 0x704860d51ba0>,c1ccc2c(c1)cc([*:2])c1ccccc12,S=C(Nc1ccc2ccccc2c1-c1c(P(c2ccccc2)c2ccccc2)cc...,<rdkit.Chem.rdchem.Mol object at 0x7048605b30b0>,S=C(Nc1ccc2ccccc2c1-c1c(P(c2ccccc2)c2ccccc2)cc...,c1ccc2c(c1)cc([*:2])c1ccccc12,15,40
11450,<rdkit.Chem.rdchem.Mol object at 0x704860d664a0>,C(=Nc1ccccc1[*:2])c1c2ccccc2cc2ccccc12,S=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048605b3970>,S=C(N[*:1])N[*:2],C(=Nc1ccccc1[*:2])c1c2ccccc2cc2ccccc12,23,6
11451,<rdkit.Chem.rdchem.Mol object at 0x704860d66510>,Clc1ccc(C=Nc2ccccc2[*:2])cc1,S=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048605b39e0>,S=C(N[*:1])N[*:2],Clc1ccc(C=Nc2ccccc2[*:2])cc1,16,6
11453,<rdkit.Chem.rdchem.Mol object at 0x704860d665f0>,S=C(Nc1ccccc1)Nc1ccccc1[*:2],S=C(N[*:1])N[*:2],<rdkit.Chem.rdchem.Mol object at 0x7048605b3ac0>,S=C(N[*:1])N[*:2],S=C(Nc1ccccc1)Nc1ccccc1[*:2],17,6


In [30]:
# Display the photochromic-core table.
# NOTE(review): in the visible notebook this name is only assigned in a later
# cell (pd.read_csv('data/2_murcko_azo_photo_cores_Br_filtered.txt', ...)), so
# this cell relies on existing kernel state - confirm the intended cell order.
df_linker_filtered


Unnamed: 0,Column 1,Structure of smiles [idcode],smiles,photochromic_block,photochromic_block_smiles
0,0,foAHA@FBTXFI[DRYV}ej\Lz@HYjRACb@,[H]c1c([H])c(Br)c([H])c(/N=N\c2nn([H])c([H])c2...,<rdkit.Chem.rdchem.Mol object at 0x704860b8cf20>,[H]c1c([H])c(Br)c([H])c(/N=N\c2nn([H])c([H])c2...
1,1,foAHA@HHYpFI[DRYWVUbTLziY`BHAAb@,[H]c1c([H])c(Br)c([H])c(/N=N\c2nn(Br)c([H])c2[...,<rdkit.Chem.rdchem.Mol object at 0x704860b8cf90>,[H]c1c([H])c(Br)c([H])c(/N=N\c2nn(Br)c([H])c2[...
2,4,foAHA@NBTXFH{DRYWUezLMZ@BYibACb@,[H]c1c([H])c(/N=N\c2nn([H])c(Br)c2[H])c([H])c(...,<rdkit.Chem.rdchem.Mol object at 0x704860b8ceb0>,[H]c1c([H])c(/N=N\c2nn([H])c(Br)c2[H])c([H])c(...
3,5,foAHA@B\TXFH{DRYe_evBMZBAIibAAb@,[H]c1c([H])c([H])c(/N=N\c2nn([H])c(Br)c2[H])c(...,<rdkit.Chem.rdchem.Mol object at 0x704860b8ce40>,[H]c1c([H])c([H])c(/N=N\c2nn([H])c(Br)c2[H])c(...
4,6,foAHA@B\TXFI[DRYe_efBLzBAIjRAAb@,[H]c1c([H])c([H])c(/N=N\c2nn([H])c([H])c2Br)c(...,<rdkit.Chem.rdchem.Mol object at 0x704860b8cdd0>,[H]c1c([H])c([H])c(/N=N\c2nn([H])c([H])c2Br)c(...
...,...,...,...,...,...
406,981,fluHC@DJMxFHxHSbILkZwloQdmAc@@ACSTBBFd,[H]c1sc(/N=N\c2c([H])c([H])c([H])c3nc(Br)c([H]...,<rdkit.Chem.rdchem.Mol object at 0x704860b412e0>,[H]c1sc(/N=N\c2c([H])c([H])c([H])c3nc(Br)c([H]...
407,983,fluHC@NRMxFHxHSbYEEDhhTUDdsbmAc@@DCSTBBFd,[H]c1nc(/N=N\c2c([H])c([H])c([H])c3nc([H])c([H...,<rdkit.Chem.rdchem.Mol object at 0x704860b41350>,[H]c1nc(/N=N\c2c([H])c([H])c([H])c3nc([H])c([H...
408,984,fluHC@BZMxFHxHSbYEEDeLbeDdpTeAc@@@SSTBBFd,[H]c1nc(/N=N\c2c([H])c([H])c(Br)c3nc([H])c([H]...,<rdkit.Chem.rdchem.Mol object at 0x704860b413c0>,[H]c1nc(/N=N\c2c([H])c([H])c(Br)c3nc([H])c([H]...
409,986,fluHC@FBuxFHxHSbILkJk|oSbcAc@@PISTBBBd,[H]c1nc2c([H])c(Br)c([H])c(/N=N\c3nc(Br)c([H])...,<rdkit.Chem.rdchem.Mol object at 0x704860b41430>,[H]c1nc2c([H])c(Br)c([H])c(/N=N\c3nc(Br)c([H])...


In [32]:
# Load the tab-separated photochromic cores, drop exact duplicate rows and add
# the parsed RDKit molecule plus an explicitly named copy of the SMILES.
df_linker_filtered = pd.read_csv('data/2_murcko_azo_photo_cores_Br_filtered.txt',delimiter='\t')
df_linker_filtered = df_linker_filtered.drop_duplicates()
df_linker_filtered['photochromic_block'] = df_linker_filtered['smiles'].apply(Chem.MolFromSmiles)
df_linker_filtered['photochromic_block_smiles'] = df_linker_filtered['smiles']
df_linker_filtered.to_csv('data/2_murcko_azo_photo_cores_Br_filtered.csv', index=False)

# Count each fragment pool once and reuse the counts for the combinatorial size.
n_photochromic_blocks = len(df_linker_filtered)
n_right_blocks = len(set(df_catalysts_right_block_unique['smiles']))
# The unique-linker frame is built from ['linkers_smiles', 'catalytic_block'];
# it has no 'smiles' column, so the catalytic SMILES live in 'linkers_smiles'.
n_catalytic_blocks = len(set(df_catalysts_linker_unique['linkers_smiles']))

print('number of photochromic blocks:', n_photochromic_blocks)
print('number of block on the right of the catalyst:', n_right_blocks)
print('number of catalysts units:', n_catalytic_blocks)
print(f"size of the dataset: {n_photochromic_blocks*n_right_blocks*n_catalytic_blocks:.2e}")

number of photochromic blocks: 411
number of block on the right of the catalyst: 1086
number of catalysts units: 582
size of the dataset: 2.60e+08


In [11]:
# Read the inhibitor fragments, then attach two derived columns in one step:
# the parsed RDKit molecule and an explicitly named copy of the SMILES string.
df_inhibitor = pd.read_csv('data/inhibitor_right_blocks.csv').assign(
    inhibitor_block=lambda frame: frame['smiles'].apply(Chem.MolFromSmiles),
    inhibitor_block_smiles=lambda frame: frame['smiles'],
)
print('number of inhibitors:', len(df_inhibitor))



number of inhibitors: 21


In [1]:
# Build `number_of_molecules` candidate molecules from seeded random fragment
# picks, keeping only the combinations that stk manages to assemble.
number_of_molecules = 100
constructed_molecule_list = []
rb_list, pb_list, cb_list, ib_list = [], [], [], []


def _first_sampled_value(frame, column, seed):
    """Sample 3 rows of `frame` with `seed` and return `column` of the first one."""
    return frame.sample(3, random_state=seed)[column].values[0]


for i in range(number_of_molecules):
    # NOTE(review): rb and cb are drawn from the same frame with the same seed,
    # so they always come from the same row - confirm this pairing is intended.
    rb = _first_sampled_value(df_catalysts_right_block, 'right_block_smiles', i)
    pb = _first_sampled_value(df_linker_filtered, 'photochromic_block_smiles', i)
    cb = _first_sampled_value(df_catalysts_right_block, 'catalytic_block_smiles', i)
    ib = _first_sampled_value(df_inhibitor, 'inhibitor_block_smiles', i)

    constructed_molecule = build_molecule_from_frag(rb, pb, cb, ib)
    if constructed_molecule is None:
        # Assembly failed (already reported by the builder); skip this combination.
        continue

    for collected, fragment in zip(
        (rb_list, pb_list, cb_list, ib_list), (rb, pb, cb, ib)
    ):
        collected.append(fragment)
    constructed_molecule_list.append(constructed_molecule.to_rdkit_mol())

print('number of constructed molecules:', len(constructed_molecule_list))

NameError: name 'df_catalysts_right_block' is not defined

In [39]:
# Tabulate every constructed molecule next to the fragments it was built from.
constructed_molecule_list_smiles = [Chem.MolToSmiles(x) for x in constructed_molecule_list]

# A dict of columns names each column directly and raises if the lists ever get
# out of sync, instead of silently padding the shorter ones.
pd_constructed_molecules = pd.DataFrame({
    'smiles': constructed_molecule_list_smiles,
    'right_block': rb_list,
    'photo_block': pb_list,
    'catalytic_block': cb_list,
    'inhibitor_block': ib_list,
})
pd_constructed_molecules.to_csv('constructed_organo_photocat.csv',index=False)

# Last expression of the cell so the preview is actually displayed.
pd_constructed_molecules.head()