- bb1の付け根はC5でキャッピング
- bb1のbb23側はフルオレンでキャッピング

In [1]:
import os
from pathlib import Path

def is_kaggle_kernel():
    """Return True when the Kaggle working directory exists on this machine."""
    kaggle_working_dir = '/kaggle/working'
    return os.path.exists(kaggle_working_dir)

# Pick the directory layout that matches the current runtime.
if not is_kaggle_kernel():
    # Local run: everything lives one level above the current working directory.
    BASE_DIR = Path(os.getcwd()) / './../'
    DATA_DIR, OUTPUT_DIR = BASE_DIR / "data", BASE_DIR / "output/eda"
else:
    # Kaggle run: inputs and outputs use the fixed /kaggle folders.
    BASE_DIR = Path("/kaggle")
    DATA_DIR, OUTPUT_DIR = BASE_DIR / "input", BASE_DIR / "working"
    print('on kaggle notebook')
    
class paths:
    """Namespace of the filesystem locations used by this notebook."""

    # Root folders, copied from the module-level globals of the same name.
    DATA_DIR = DATA_DIR
    OUTPUT_DIR = OUTPUT_DIR
    # Create the output folder (and any missing parents) while the class body runs.
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

    # "*_ORIG" parquet files, located under shrunken-train-set.
    TRAIN_PATH_ORIG = DATA_DIR / "shrunken-train-set/train.parquet"
    TEST_PATH_ORIG = DATA_DIR / "shrunken-train-set/test.parquet"

    # Folder of the shrunken dataset and the files read from it.
    SHRUNKEN_DATA_DIR = DATA_DIR / "shrunken-data"
    TRAIN_PATH = SHRUNKEN_DATA_DIR / "train.parquet"
    TEST_PATH = SHRUNKEN_DATA_DIR / "test.parquet"
    SUB_PATH = SHRUNKEN_DATA_DIR / "sub.parquet"

In [4]:
import os
import gc
import math
import numpy as np
import pandas as pd
from glob import glob
# import duckdb
# import lightgbm as lgb

from rdkit import Chem
from rdkit.Chem import Draw

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import AllChem

import pickle
from funcs.chemical_func import combine_fragments

In [5]:
# Columns requested from the shrunken parquet files: the three building-block
# columns followed by their scaffold counterparts.
bb_cols = [
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
    'buildingblock1_smiles_scaffold',
    "buildingblock2_smiles_scaffold",
    "buildingblock3_smiles_scaffold",
]

# Label columns; they are only requested from the train file.
TARGETS = ['binds_BRD4', 'binds_HSA', 'binds_sEH']

df_train = pd.read_parquet(paths.TRAIN_PATH, columns=[*bb_cols, *TARGETS])
df_test = pd.read_parquet(paths.TEST_PATH, columns=bb_cols)

# From the "*_ORIG" files only the full-molecule SMILES column is read.
df_train_orig = pd.read_parquet(paths.TRAIN_PATH_ORIG, columns=["molecule_smiles"])
df_test_orig = pd.read_parquet(paths.TEST_PATH_ORIG, columns=["molecule_smiles"])

# SMILES -> index lookup tables stored next to the shrunken data.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles that come from a trusted source.
with open(paths.SHRUNKEN_DATA_DIR / 'bb1_smiles2idx.pickle', 'rb') as file:
    bb1_smiles2idx = pickle.load(file)
with open(paths.SHRUNKEN_DATA_DIR / 'bb23_smiles2idx.pickle', 'rb') as file:
    bb23_smiles2idx = pickle.load(file)

# Inverse lookups (index -> SMILES).
bb1_idx2smiles = {idx: smi for smi, idx in bb1_smiles2idx.items()}
bb23_idx2smiles = {idx: smi for smi, idx in bb23_smiles2idx.items()}


KeyboardInterrupt: 

In [None]:
# Helper that is probably not used anywhere.
def replace_structure(main_mol, pattern_mol, replace_mol, idx=0):
    """Swap the part of ``main_mol`` that matches ``pattern_mol`` for ``replace_mol``.

    When ``main_mol`` does not contain ``pattern_mol`` it is returned unchanged.
    Otherwise the match is first replaced by a ``*`` atom (result number
    ``idx`` of ``AllChem.ReplaceSubstructs`` is used) and that intermediate is
    handed to ``combine_fragments`` together with ``replace_mol``.
    NOTE(review): ``combine_fragments`` is expected to attach ``replace_mol``
    at the ``*`` position -- confirm against funcs.chemical_func.
    """
    if not main_mol.HasSubstructMatch(pattern_mol):
        return main_mol

    # Step 1: cut out the matched structure and leave a '*' placeholder behind.
    wildcard = Chem.MolFromSmiles('*')
    with_placeholder = AllChem.ReplaceSubstructs(main_mol, pattern_mol, wildcard)[idx]

    # Step 2: let combine_fragments deal with the placeholder and replace_mol.
    return combine_fragments(with_placeholder, replace_mol)

# Smoke test: replace the fluorene carbamate part of one bb1 with a triazine.
bb1_smiles = "C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21"
bb1_mol = Chem.MolFromSmiles(bb1_smiles)

pattern = "NC(=O)OCC1c2ccccc2-c2ccccc21"
pattern_mol = Chem.MolFromSmiles(pattern)

bb1_replace = "Nc1nc(N)nc(N)n1"
bb1_replace_mol = Chem.MolFromSmiles(bb1_replace)

# The helper-based variant would be:
#   replace_structure(bb1_mol, pattern_mol, bb1_replace_mol)
# Here the first result of the direct RDKit call is shown as the cell output.
AllChem.ReplaceSubstructs(bb1_mol, pattern_mol, bb1_replace_mol)[0]

In [None]:
def clean_and_capping_bb1_structure(
    main_smiles,
    del_t_butyl_halo_patterns=None,
    del_fluorene_halo_patterns=None,
    tert_butyl_ester_smiles=None,
    ex_dict=None,
):
    """Return the SMILES of a bb1 building block with its reactive ends capped.

    Placeholder atoms mark the capped positions: a carboxyl group ``C(=O)O``
    becomes ``C(=O)O[Fe]``, and removed groups become ``C[Dy]`` (special
    cases) or a triazine carrying two ``[Dy]`` atoms (default path). Every
    replacement uses the first molecule returned by
    ``AllChem.ReplaceSubstructs`` unless stated otherwise.

    The rules are tried in this order; the first one that applies wins:

    1. ``main_smiles`` is a key of ``ex_dict``: the mapped value is returned
       untouched, without parsing anything.
    2. The molecule contains one of ``del_t_butyl_halo_patterns`` (no
       fluorene, does not become a triazine): the ``COC(C)(C)(C)`` fragment,
       then ``Br``, ``I`` and ``Cl`` are each replaced by ``C[Dy]``, then the
       carboxyl group is capped.
    3. The molecule contains one of ``del_fluorene_halo_patterns`` (has a
       fluorene but does not become a triazine): same as rule 2, but the
       ``C(=O)OCC2c3ccccc3-c3ccccc32`` fragment is replaced instead of the
       tert-butoxy fragment.
    4. Otherwise: ``NC(=O)OCC1c2ccccc2-c2ccccc21`` is replaced by
       ``Nc7nc([Dy])nc([Dy])n7`` and the carboxyl group is capped. When
       ``main_smiles`` is listed in ``tert_butyl_ester_smiles`` the second
       carboxyl replacement result (index 1) is used instead of the first.

    Args:
        main_smiles: SMILES string of the bb1 building block.
        del_t_butyl_halo_patterns: iterable of SMILES used as substructure
            queries for rule 2; ``None`` means "no patterns".
        del_fluorene_halo_patterns: iterable of SMILES used as substructure
            queries for rule 3; ``None`` means "no patterns".
        tert_butyl_ester_smiles: container of SMILES strings compared with
            ``main_smiles`` by exact string equality (rule 4); ``None`` means
            "empty".
        ex_dict: mapping ``main_smiles -> result`` of manual overrides
            (rule 1); ``None`` means "no overrides".

    Returns:
        The SMILES string written by ``Chem.MolToSmiles`` (or the ``ex_dict``
        value for rule 1).

    Raises:
        ValueError: if RDKit cannot parse ``main_smiles``.
        IndexError: if ``main_smiles`` is listed in ``tert_butyl_ester_smiles``
            but the carboxyl replacement yields fewer than two results.
    """
    # Manual overrides win over every rule, so answer them before any parsing.
    if ex_dict and main_smiles in ex_dict:
        return ex_dict[main_smiles]

    main_mol = Chem.MolFromSmiles(main_smiles)
    if main_mol is None:
        raise ValueError(f"could not parse SMILES: {main_smiles!r}")

    # Placeholder fragments that mark the capped positions.
    dy_scaffold = Chem.MolFromSmiles("C[Dy]")
    fe_ester = Chem.MolFromSmiles("C(=O)O[Fe]")
    carboxyl = Chem.MolFromSmiles("C(=O)O")
    halogens = [Chem.MolFromSmiles(symbol) for symbol in ("Br", "I", "Cl")]

    # Special cases that must not end up as a triazine. Each entry pairs the
    # caller-supplied trigger patterns with the fragment swapped for C[Dy];
    # the tert-butyl group is checked before the fluorene group.
    special_cases = (
        (del_t_butyl_halo_patterns or (), Chem.MolFromSmiles("COC(C)(C)(C)")),
        (del_fluorene_halo_patterns or (), Chem.MolFromSmiles("C(=O)OCC2c3ccccc3-c3ccccc32")),
    )
    for patterns, removed_fragment in special_cases:
        for pattern in patterns:
            if not main_mol.HasSubstructMatch(Chem.MolFromSmiles(pattern)):
                continue
            # Same order as always: the group fragment first, then Br, I, Cl.
            for query in (removed_fragment, *halogens):
                main_mol = AllChem.ReplaceSubstructs(main_mol, query, dy_scaffold)[0]
            main_mol = AllChem.ReplaceSubstructs(main_mol, carboxyl, fe_ester)[0]
            return Chem.MolToSmiles(main_mol)

    # Default path: fluorene carbamate -> triazine with [Dy] placeholders.
    fluorene = Chem.MolFromSmiles("NC(=O)OCC1c2ccccc2-c2ccccc21")
    triazine_and_dy = Chem.MolFromSmiles("Nc7nc([Dy])nc([Dy])n7")
    main_mol = AllChem.ReplaceSubstructs(main_mol, fluorene, triazine_and_dy)[0]

    # Cap the carboxyl group with the [Fe] ester. For the listed SMILES the
    # second replacement result is used instead of the first.
    carboxyl_result_idx = 1 if main_smiles in (tert_butyl_ester_smiles or ()) else 0
    main_mol = AllChem.ReplaceSubstructs(main_mol, carboxyl, fe_ester)[carboxyl_result_idx]

    return Chem.MolToSmiles(main_mol)

# trainのbb1を一括で変換して確認

In [None]:
## Look at the train bb1 building blocks: one row per distinct bb1 id,
## then cap a single example and show it before/after.
df = df_train.copy().drop_duplicates(["buildingblock1_smiles"])
df['main'] = df_train_orig["molecule_smiles"].iloc[df.index].values

# Turn the stored ids back into SMILES strings.
df['buildingblock1_smiles'] = df['buildingblock1_smiles'].map(bb1_idx2smiles)
for _bb_col in ('buildingblock2_smiles', 'buildingblock3_smiles'):
    df[_bb_col] = df[_bb_col].map(bb23_idx2smiles)

temp = df.iloc[25]

# bb1 SMILES for which the capping function uses the second carboxyl
# replacement result instead of the first.
tert_butyl_ester_smiles = [
    "CC(C)(C)OC(=O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "CC(C)(C)OC(=O)CCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "CC(C)(C)OC(=O)N1CCN(C(=O)OCC2c3ccccc3-c3ccccc32)C1C(=O)O",
    "CC(C)(C)OC(=O)N1C[C@@H](NC(=O)OCC2c3ccccc3-c3ccccc32)[C@H](C(=O)O)C1",
    "COC(=O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "COC(=O)c1ccc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3ccccc32)c1",
]

# 207, 209, 270 (presumably row positions worth checking -- TODO confirm)
smiles = temp['buildingblock1_smiles']
print(smiles)
mol = Chem.MolFromSmiles(smiles)

# Cap the example and display the result as the cell output.
smiles = clean_and_capping_bb1_structure(
    smiles,
    tert_butyl_ester_smiles=tert_butyl_ester_smiles,
)
print(smiles)
Chem.MolFromSmiles(smiles)

In [None]:
## Look at the train bb1 building blocks in bulk: cap every distinct bb1 and
## draw original / capped / full molecule / bb2 / bb3 side by side.
df = df_train.copy()
df = df.drop_duplicates(["buildingblock1_smiles"])
df['main'] = df_train_orig.iloc[df.index]["molecule_smiles"].values
df['buildingblock1_smiles'] = df['buildingblock1_smiles'].map(bb1_idx2smiles)
df['buildingblock2_smiles'] = df['buildingblock2_smiles'].map(bb23_idx2smiles)
df['buildingblock3_smiles'] = df['buildingblock3_smiles'].map(bb23_idx2smiles)

df['replaced'] = df['buildingblock1_smiles'].apply(
    lambda x: clean_and_capping_bb1_structure(x, tert_butyl_ester_smiles=tert_butyl_ester_smiles)
)

# One grid column per entry, in this order.
col = ['buildingblock1_smiles', "replaced", 'main', 'buildingblock2_smiles', 'buildingblock3_smiles']
# Column-wise Series.map instead of DataFrame.applymap, which is deprecated
# in recent pandas releases; the element-wise result is the same.
df[col] = df[col].apply(lambda column: column.map(Chem.MolFromSmiles))

# Show chunk number `i` (0-based) of `chunk` rows.
i = 1
chunk = 100
df_chunk = df.iloc[i*chunk:i*chunk+chunk][col]
mols_list = df_chunk.values.flatten()
# One legend per drawn molecule: the row position in df, repeated for every
# grid column. Derived from the actual slice so it stays in sync with
# `chunk`, `col` and a short last chunk.
label_list = [str(row_pos) for row_pos in range(i*chunk, i*chunk + len(df_chunk)) for _ in col]

img = Draw.MolsToGridImage(
    mols_list,
    molsPerRow=len(col),  # number of molecules per grid row
    subImgSize=(400, 300),
    maxMols=len(mols_list),
    legends=label_list,
)
img

# TEST MOLECULE も

In [None]:
# Debugging aid: pick one test-only bb1 and try fragment replacements by hand.
df = df_test.copy()

# Exclude the bb1 ids that also occur in train, then keep one row per bb1 id.
train_bb1_idx = df_train['buildingblock1_smiles'].unique().tolist()
df = df.loc[~df_test['buildingblock1_smiles'].isin(train_bb1_idx)].drop_duplicates(["buildingblock1_smiles"])
print(len(df))

df['main'] = df_test_orig["molecule_smiles"].iloc[df.index].values
# Turn the stored ids back into SMILES strings.
df['buildingblock1_smiles'] = df['buildingblock1_smiles'].map(bb1_idx2smiles)
for _bb_col in ('buildingblock2_smiles', 'buildingblock3_smiles'):
    df[_bb_col] = df[_bb_col].map(bb23_idx2smiles)

i = 69
temp = df.iloc[i]
smiles = temp['buildingblock1_smiles']
print(smiles)
mol = Chem.MolFromSmiles(smiles)

# Replace Br first, then the fluorenylmethyl ester fragment, each by a single
# carbon; the first replacement result is kept every time.
for _fragment in ("Br", "C(=O)OCC2c3ccccc3-c3ccccc32"):
    mol = AllChem.ReplaceSubstructs(mol, Chem.MolFromSmiles(_fragment), Chem.MolFromSmiles("C"))[0]
mol

In [None]:
# Test bb1 building blocks that never occur in train: cap each of them and
# draw original / capped / full molecule / bb2 / bb3 side by side.
df = df_test.copy()

# Exclude the bb1 ids that also occur in train.
train_bb1_idx = df_train['buildingblock1_smiles'].unique().tolist()
df = df.loc[~df_test['buildingblock1_smiles'].isin(train_bb1_idx)]
df = df.drop_duplicates(["buildingblock1_smiles"])
print(len(df))

df['main'] = df_test_orig.iloc[df.index]["molecule_smiles"].values
df['buildingblock1_smiles'] = df['buildingblock1_smiles'].map(bb1_idx2smiles)
df['buildingblock2_smiles'] = df['buildingblock2_smiles'].map(bb23_idx2smiles)
df['buildingblock3_smiles'] = df['buildingblock3_smiles'].map(bb23_idx2smiles)

# Substructure queries for the tert-butyl/halogen special case of
# clean_and_capping_bb1_structure (no fluorene, does not become a triazine).
del_t_butyl_halo_patterns = [
    "CC(C)(C)OC(=O)N1CC(c2ccccc2)=C[C@H]1C(=O)O",
    "CC(C)(C)OC(=O)N1CCC(C(=O)O)(c2ccccc2)CC1",
    "CC(C)(C)OC(=O)N1CCC(COc2ccccc2C(=O)O)CC1",
    "CC(C)(C)OC(=O)N1CCC(COc2ccc(C(=O)O)cc2)CC1",
    "CC(C)(C)OC(=O)N1CCC(Oc2cc(C(=O)O)ccc2I)CC1",
    "CC(C)(C)OC(=O)N1CCC[C@@H](c2ccccc2)[C@@H]1C(=O)O",
    "CC(C)(C)OC(=O)N1CCC[C@@H](n2cccc2C(=O)O)C1",
    "CC(C)(C)OC(=O)N1CCC[C@H](n2cc(C(=O)O)c3ccccc32)C1",
    "CC(C)(C)OC(=O)N1CC[C@@](Cc2cccs2)(C(=O)O)C1",
    "CC(C)(C)OC(=O)N1CC[C@H](Oc2ccccc2C(=O)O)C1",
    "CC(C)(C)OC(=O)N1CC[C@](Cc2ccccc2)(C(=O)O)C1",
    "CC(C)(C)OC(=O)N1C[C@@H](C(=O)O)[C@H](c2ccccc2)C1",
    "CC(C)(C)OC(=O)N1C[C@@H](Oc2ccccn2)C[C@@H]1C(=O)O",
    "CC(C)(C)OC(=O)N1C[C@@H](n2cncc2)C[C@H]1C(=O)O",
    "CC(C)(C)OC(=O)N1C[C@H](Oc2ccccc2)C[C@@H]1C(=O)O",
    "CC(C)(C)OC(=O)N[C@@H]1CCCN(c2ncccc2C(=O)O)C1",
    "CN(c1cc(C(=O)O)ccn1)C1CCN(C(=O)OC(C)(C)C)C1",
]
# Substructure queries for the fluorene/halogen special case
# (has a fluorene but does not become a triazine).
del_fluorene_halo_patterns = [
    "CN(c1ncccc1C(=O)O)C1CCN(C(=O)OCC2c3ccccc3-c3ccccc32)C1",
    "O=C(NC1CN(c2cc(C(=O)O)ccn2)C1)OCC1c2ccccc2-c2ccccc21",
    "O=C(N[C@@H]1CCN(c2cc(C(=O)O)ccn2)C1)OCC1c2ccccc2-c2ccccc21",
    "O=C(N[C@@H]1CCN(c2ncccc2C(=O)O)C1)OCC1c2ccccc2-c2ccccc21",
    "O=C(O)[C@@H]1C=C(c2cccnc2)CN1C(=O)OCC1c2ccccc2-c2ccccc21",
    "O=C(O)[C@@H]1C=C(c2ccncc2)CN1C(=O)OCC1c2ccccc2-c2ccccc21",
    "O=C(O)[C@@H]1C=C(c2cncnc2)CN1C(=O)OCC1c2ccccc2-c2ccccc21",
    "O=C(O)[C@@H]1CN(C(=O)OCC2c3ccccc3-c3ccccc32)C[C@H]1c1cccnc1",
    "O=C(O)[C@@H]1CN(C(=O)OCC2c3ccccc3-c3ccccc32)C[C@H]1c1ccncc1",
    "O=C(O)c1cccnc1N1CCCN(C(=O)OCC2c3ccccc3-c3ccccc32)CC1",
    "O=C(O)c1cccnc1N1CCN(C(=O)OCC2c3ccccc3-c3ccccc32)CC1",
    "O=C(O)c1ccnc(N2CCCN(C(=O)OCC3c4ccccc4-c4ccccc43)CC2)c1",
    "O=C(O)c1ccnc(N2CCN(C(=O)OCC3c4ccccc4-c4ccccc43)CC2)c1",
    "O=C(OCC1c2ccccc2-c2ccccc21)N1CCC(Cc2ccncc2)(C(=O)O)C1",
    "O=C(OCC1c2ccccc2-c2ccccc21)N1CCC(Cc2ccncc2)(C(=O)O)CC1",
]
# bb1 SMILES for which the capping function uses the second carboxyl
# replacement result instead of the first.
tert_butyl_ester_smiles = [
    "CC(C)(C)OC(=O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "CC(C)(C)OC(=O)CCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "CC(C)(C)OC(=O)N1CCN(C(=O)OCC2c3ccccc3-c3ccccc32)C1C(=O)O",
    "CC(C)(C)OC(=O)N1C[C@@H](NC(=O)OCC2c3ccccc3-c3ccccc32)[C@H](C(=O)O)C1",
    "COC(=O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "COC(=O)c1ccc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3ccccc32)c1",
]

# Manual overrides (input SMILES -> output SMILES); none at the moment.
ex_dict = {}
df['replaced'] = df['buildingblock1_smiles'].apply(
    lambda x: clean_and_capping_bb1_structure(
        x,
        del_t_butyl_halo_patterns=del_t_butyl_halo_patterns,
        del_fluorene_halo_patterns=del_fluorene_halo_patterns,
        tert_butyl_ester_smiles=tert_butyl_ester_smiles,
        ex_dict=ex_dict,
    )
)

# One grid column per entry, in this order.
col = ['buildingblock1_smiles', "replaced", 'main', 'buildingblock2_smiles', 'buildingblock3_smiles']
# Column-wise Series.map instead of DataFrame.applymap, which is deprecated
# in recent pandas releases; the element-wise result is the same.
df[col] = df[col].apply(lambda column: column.map(Chem.MolFromSmiles))

# Show chunk number `i` (0-based) of `chunk` rows.
i = 0
chunk = 100
df_chunk = df.iloc[i*chunk:i*chunk+chunk][col]
mols_list = df_chunk.values.flatten()
# One legend per drawn molecule: the row position in df, repeated for every
# grid column. Derived from the actual slice so it stays in sync with
# `chunk`, `col` and a short last chunk.
label_list = [str(row_pos) for row_pos in range(i*chunk, i*chunk + len(df_chunk)) for _ in col]

img = Draw.MolsToGridImage(
    mols_list,
    molsPerRow=len(col),  # number of molecules per grid row
    subImgSize=(400, 300),
    maxMols=len(mols_list),
    legends=label_list,
)
img


# 一括チェック 

In [None]:
# Quick lookup: the bb1 SMILES at position 145 among the distinct test bb1 ids.
df = df_test.copy().drop_duplicates(["buildingblock1_smiles"])
print(len(df))

df['main'] = df_test_orig["molecule_smiles"].iloc[df.index].values
# Turn the stored ids back into SMILES strings.
df['buildingblock1_smiles'] = df['buildingblock1_smiles'].map(bb1_idx2smiles)
for _bb_col in ('buildingblock2_smiles', 'buildingblock3_smiles'):
    df[_bb_col] = df[_bb_col].map(bb23_idx2smiles)
df['buildingblock1_smiles'].iloc[145]

In [None]:
# All distinct test bb1 building blocks: cap each of them, keep both the
# SMILES and the parsed molecules, and draw one chunk for visual review.
df = df_test.copy()
df = df.drop_duplicates(["buildingblock1_smiles"])
print(len(df))

df['main'] = df_test_orig.iloc[df.index]["molecule_smiles"].values
df['buildingblock1_smiles'] = df['buildingblock1_smiles'].map(bb1_idx2smiles)
df['buildingblock2_smiles'] = df['buildingblock2_smiles'].map(bb23_idx2smiles)
df['buildingblock3_smiles'] = df['buildingblock3_smiles'].map(bb23_idx2smiles)

# Substructure queries for the tert-butyl/halogen special case of
# clean_and_capping_bb1_structure (no fluorene, does not become a triazine).
del_t_butyl_halo_patterns = [
    "CC(C)(C)OC(=O)N1CC(c2ccccc2)=C[C@H]1C(=O)O",
    "CC(C)(C)OC(=O)N1CCC(C(=O)O)(c2ccccc2)CC1",
    "CC(C)(C)OC(=O)N1CCC(COc2ccccc2C(=O)O)CC1",
    "CC(C)(C)OC(=O)N1CCC(COc2ccc(C(=O)O)cc2)CC1",
    "CC(C)(C)OC(=O)N1CCC(Oc2cc(C(=O)O)ccc2I)CC1",
    "CC(C)(C)OC(=O)N1CCC[C@@H](c2ccccc2)[C@@H]1C(=O)O",
    "CC(C)(C)OC(=O)N1CCC[C@@H](n2cccc2C(=O)O)C1",
    "CC(C)(C)OC(=O)N1CCC[C@H](n2cc(C(=O)O)c3ccccc32)C1",
    "CC(C)(C)OC(=O)N1CC[C@@](Cc2cccs2)(C(=O)O)C1",
    "CC(C)(C)OC(=O)N1CC[C@H](Oc2ccccc2C(=O)O)C1",
    "CC(C)(C)OC(=O)N1CC[C@](Cc2ccccc2)(C(=O)O)C1",
    "CC(C)(C)OC(=O)N1C[C@@H](C(=O)O)[C@H](c2ccccc2)C1",
    "CC(C)(C)OC(=O)N1C[C@@H](Oc2ccccn2)C[C@@H]1C(=O)O",
    "CC(C)(C)OC(=O)N1C[C@@H](n2cncc2)C[C@H]1C(=O)O",
    "CC(C)(C)OC(=O)N1C[C@H](Oc2ccccc2)C[C@@H]1C(=O)O",
    "CC(C)(C)OC(=O)N[C@@H]1CCCN(c2ncccc2C(=O)O)C1",
    "CN(c1cc(C(=O)O)ccn1)C1CCN(C(=O)OC(C)(C)C)C1",
]
# Substructure queries for the fluorene/halogen special case
# (has a fluorene but does not become a triazine).
del_fluorene_halo_patterns = [
    "CN(c1ncccc1C(=O)O)C1CCN(C(=O)OCC2c3ccccc3-c3ccccc32)C1",
    "O=C(NC1CN(c2cc(C(=O)O)ccn2)C1)OCC1c2ccccc2-c2ccccc21",
    "O=C(N[C@@H]1CCN(c2cc(C(=O)O)ccn2)C1)OCC1c2ccccc2-c2ccccc21",
    "O=C(N[C@@H]1CCN(c2ncccc2C(=O)O)C1)OCC1c2ccccc2-c2ccccc21",
    "O=C(O)[C@@H]1C=C(c2cccnc2)CN1C(=O)OCC1c2ccccc2-c2ccccc21",
    "O=C(O)[C@@H]1C=C(c2ccncc2)CN1C(=O)OCC1c2ccccc2-c2ccccc21",
    "O=C(O)[C@@H]1C=C(c2cncnc2)CN1C(=O)OCC1c2ccccc2-c2ccccc21",
    "O=C(O)[C@@H]1CN(C(=O)OCC2c3ccccc3-c3ccccc32)C[C@H]1c1cccnc1",
    "O=C(O)[C@@H]1CN(C(=O)OCC2c3ccccc3-c3ccccc32)C[C@H]1c1ccncc1",
    "O=C(O)c1cccnc1N1CCCN(C(=O)OCC2c3ccccc3-c3ccccc32)CC1",
    "O=C(O)c1cccnc1N1CCN(C(=O)OCC2c3ccccc3-c3ccccc32)CC1",
    "O=C(O)c1ccnc(N2CCCN(C(=O)OCC3c4ccccc4-c4ccccc43)CC2)c1",
    "O=C(O)c1ccnc(N2CCN(C(=O)OCC3c4ccccc4-c4ccccc43)CC2)c1",
    "O=C(OCC1c2ccccc2-c2ccccc21)N1CCC(Cc2ccncc2)(C(=O)O)C1",
    "O=C(OCC1c2ccccc2-c2ccccc21)N1CCC(Cc2ccncc2)(C(=O)O)CC1",
]
# bb1 SMILES for which the capping function uses the second carboxyl
# replacement result instead of the first.
tert_butyl_ester_smiles = [
    "CC(C)(C)OC(=O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "CC(C)(C)OC(=O)CCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "CC(C)(C)OC(=O)N1CCN(C(=O)OCC2c3ccccc3-c3ccccc32)C1C(=O)O",
    "CC(C)(C)OC(=O)N1C[C@@H](NC(=O)OCC2c3ccccc3-c3ccccc32)[C@H](C(=O)O)C1",
    "COC(=O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O",
    "COC(=O)c1ccc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3ccccc32)c1",
]

# Manual overrides (input SMILES -> output SMILES); none at the moment.
ex_dict = {}
df['replaced'] = df['buildingblock1_smiles'].apply(
    lambda x: clean_and_capping_bb1_structure(
        x,
        del_t_butyl_halo_patterns=del_t_butyl_halo_patterns,
        del_fluorene_halo_patterns=del_fluorene_halo_patterns,
        tert_butyl_ester_smiles=tert_butyl_ester_smiles,
        ex_dict=ex_dict,
    )
)

# SMILES columns and the matching "<name>_mol" columns holding parsed molecules.
col = ['buildingblock1_smiles', 'replaced', 'buildingblock2_smiles', 'buildingblock3_smiles', 'main']
col_mols = [f'{c}_mol' for c in col]

# Column-wise Series.map instead of DataFrame.applymap, which is deprecated
# in recent pandas releases; the element-wise result is the same.
df[col_mols] = df[col].apply(lambda column: column.map(Chem.MolFromSmiles))

# Show chunk number `i` (0-based) of `chunk` rows.
i = 2
chunk = 10
df_chunk = df.iloc[i*chunk:i*chunk+chunk][col_mols]
mols_list = df_chunk.values.flatten()
# One legend per drawn molecule: the absolute row position in df (usable
# with df.iloc[...], like the other grid cells), repeated for every grid column.
label_list = [str(row_pos) for row_pos in range(i*chunk, i*chunk + len(df_chunk)) for _ in col_mols]

img = Draw.MolsToGridImage(
    mols_list,
    molsPerRow=len(col_mols),  # number of molecules per grid row
    subImgSize=(400, 300),
    maxMols=len(mols_list),
    legends=label_list,
)
img

In [None]:
# Descriptor calculation
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors 
from rdkit.Chem.Fingerprints import FingerprintMols

# Capped bb1 molecules produced by the previous cell.
mols_list = df['replaced_mol'].values.tolist()

# Every descriptor name listed in Descriptors.descList, in list order.
descriptor_names = [entry[0] for entry in Descriptors.descList]
descriptor_calculation = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# One tuple of descriptor values per molecule, collected into a frame whose
# columns are the descriptor names.
RDkit = list(map(descriptor_calculation.CalcDescriptors, mols_list))
df_RDkit = pd.DataFrame(RDkit, columns=descriptor_names)
df_RDkit