# 概要
- trainとtestのbbをまとめる（indexずれを防ぐため）
- BB1のscaffoldの情報を使って、FOLDを設定する
- bb2~3と同様にbb1もindex化して列を足し、scaffold用の辞書も準備

In [1]:
import os
from pathlib import Path

def is_kaggle_kernel():
    """Return True when the Kaggle working directory ``/kaggle/working`` exists."""
    kaggle_working_dir = '/kaggle/working'
    return os.path.exists(kaggle_working_dir)

# Choose the base, data and output directories for the current environment.
if not is_kaggle_kernel():
    # Local run: everything lives one level above the current working directory.
    BASE_DIR = Path(os.getcwd()) / './../'
    DATA_DIR = BASE_DIR / "data"
    OUTPUT_DIR = BASE_DIR / "output/eda"
else:
    # Kaggle run: use the fixed /kaggle layout.
    BASE_DIR = Path("/kaggle")
    DATA_DIR = BASE_DIR / "input"
    OUTPUT_DIR = BASE_DIR / "working"
    print('on kaggle notebook')
    
class paths:
    """Namespace of the filesystem locations used by this notebook.

    Evaluating the class body also creates ``OUTPUT_DIR`` (with parents) when
    it does not exist yet.
    """

    # Re-export the module-level directories selected for this environment.
    OUTPUT_DIR = OUTPUT_DIR
    DATA_DIR = DATA_DIR

    # Inputs located directly under DATA_DIR.
    SHRUNKEN_DATA_DIR = DATA_DIR / "shrunken-train-set"
    TEST_PATH = DATA_DIR / "test.parquet"
    TRAIN_PATH = DATA_DIR / "train.parquet"

    # Runs once, while the class body is being evaluated.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# 必要なデータ
- smiles to indexの辞書, index to smilesの辞書
    - bb1
    - bb1 scaffold
    - bb2, 3
    - bb2, 3 scaffold
    - main moleculeの scaffoldの辞書

- shrunkenしたデータ
    - train
    - test（pretrain用）
- non shrunken test（submit用）

In [2]:
# !pip install rdkit

In [3]:
import os
import gc
import math
import numpy as np
import pandas as pd
from glob import glob
# import duckdb
# import lightgbm as lgb
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Draw

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from rdkit.Chem.Scaffolds import MurckoScaffold
from funcs.chemical_func import clean_and_capping_bb1_structure, clean_and_capping_bb23_structure, clean_and_capping_metal_bb1_structure, clean_and_capping_metal_bb23_structure

import pickle

In [4]:
# Building-block columns to load ('molecule_smiles' is intentionally left out).
bb_cols = [
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
]
# Target columns; they are only read for the train set.
TARGETS = ['binds_BRD4', 'binds_HSA', 'binds_sEH']

# Shrunken train/test sets. The locations come from the `paths` constants
# instead of repeating the directory name, so they cannot drift apart from the
# pickle-loading cell, which already uses paths.SHRUNKEN_DATA_DIR.
df_train = pd.read_parquet(paths.SHRUNKEN_DATA_DIR / 'train.parquet', columns=bb_cols + TARGETS)
df_test = pd.read_parquet(paths.SHRUNKEN_DATA_DIR / 'test.parquet', columns=bb_cols)

# Test data used for the submission (the non-shrunken test file).
df_sub = pd.read_parquet(paths.TEST_PATH, columns=['id', 'protein_name'] + bb_cols)

In [5]:
# Building-block lookup tables shipped with the shrunken dataset; a later cell
# maps the building-block columns through them to get SMILES.
# NOTE: the index <-> SMILES assignment of train and test does not appear to
# match, so each split is decoded with its own tables.
# NOTE(review): pickle runs arbitrary code on load - only use these with
# trusted dataset files.
with (paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_1.p').open('rb') as fh:
    train_dicts_bb1 = pickle.load(fh)
with (paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_2.p').open('rb') as fh:
    train_dicts_bb2 = pickle.load(fh)
with (paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_3.p').open('rb') as fh:
    train_dicts_bb3 = pickle.load(fh)

with (paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_1_test.p').open('rb') as fh:
    test_dicts_bb1 = pickle.load(fh)
with (paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_2_test.p').open('rb') as fh:
    test_dicts_bb2 = pickle.load(fh)
with (paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_3_test.p').open('rb') as fh:
    test_dicts_bb3 = pickle.load(fh)

一旦smilesに直しておく

In [6]:
# Convert the building-block columns back to SMILES for now, each split with
# its own lookup tables.
for _frame, _column, _lookup in (
    (df_train, 'buildingblock1_smiles', train_dicts_bb1),
    (df_train, 'buildingblock2_smiles', train_dicts_bb2),
    (df_train, 'buildingblock3_smiles', train_dicts_bb3),
    (df_test, 'buildingblock1_smiles', test_dicts_bb1),
    (df_test, 'buildingblock2_smiles', test_dicts_bb2),
    (df_test, 'buildingblock3_smiles', test_dicts_bb3),
):
    _frame[_column] = _frame[_column].map(_lookup)
# Drop the loop names so no extra reference to a frame or table lingers.
del _frame, _column, _lookup

In [7]:
# Convert the building blocks to their cleaned structures.
# First collect every distinct SMILES that occurs in train or test.
bb1 = list(
    set(df_train['buildingblock1_smiles'].unique()).union(
        df_test['buildingblock1_smiles'].unique()
    )
)
# bb2 and bb3 share one vocabulary.
bb23 = list(
    set().union(
        df_train['buildingblock2_smiles'].unique(),
        df_test['buildingblock2_smiles'].unique(),
        df_train['buildingblock3_smiles'].unique(),
        df_test['buildingblock3_smiles'].unique(),
    )
)

# Original SMILES -> cleaned structure, plus the metal-capped variant.
bb1_clean_dict = {raw: clean_and_capping_bb1_structure(raw) for raw in bb1}
bb1_clean_dict_metal = {raw: clean_and_capping_metal_bb1_structure(raw) for raw in bb1}
bb23_clean_dict = {raw: clean_and_capping_bb23_structure(raw) for raw in bb23}
bb23_clean_dict_metal = {raw: clean_and_capping_metal_bb23_structure(raw) for raw in bb23}

# Replace the building-block columns of all three frames by the cleaned form.
# NOTE(review): df_sub is mapped with dictionaries built from df_train/df_test
# only; assumes every df_sub building block is a key there, otherwise .map
# leaves NaN - TODO confirm.
for _frame in (df_train, df_test, df_sub):
    _frame['buildingblock1_smiles'] = _frame['buildingblock1_smiles'].map(bb1_clean_dict)
    _frame['buildingblock2_smiles'] = _frame['buildingblock2_smiles'].map(bb23_clean_dict)
    _frame['buildingblock3_smiles'] = _frame['buildingblock3_smiles'].map(bb23_clean_dict)
# Do not keep an extra reference to the last frame alive.
del _frame


## smiles to idxの辞書を作成
trainとtestを合わせた辞書を作成する

In [8]:
# Sorted, de-duplicated cleaned SMILES; the sort fixes the index assignment.
bb1_smiles = sorted(set(bb1_clean_dict.values()))
bb23_smiles = sorted(set(bb23_clean_dict.values()))

print(len(bb1_smiles), len(bb23_smiles))

# Dictionaries converting a cleaned SMILES to its position in the sorted list.
bb1_smiles2idx = dict(zip(bb1_smiles, range(len(bb1_smiles))))
bb23_smiles2idx = dict(zip(bb23_smiles, range(len(bb23_smiles))))


341 1769


In [9]:
# Build the metal-capped counterparts.
# cleaned structure -> metal-capped cleaned structure
bb1_clean2cleanmetal = {}
for _raw in bb1:
    bb1_clean2cleanmetal[bb1_clean_dict[_raw]] = bb1_clean_dict_metal[_raw]

bb23_clean2cleanmetal = {}
for _raw in bb23:
    bb23_clean2cleanmetal[bb23_clean_dict[_raw]] = bb23_clean_dict_metal[_raw]

# metal-capped SMILES -> the index of the corresponding cleaned SMILES
bb1_smilesmetal2idx = {
    bb1_clean2cleanmetal[_clean]: bb1_smiles2idx[_clean] for _clean in bb1_smiles2idx
}
bb23_smilesmetal2idx = {
    bb23_clean2cleanmetal[_clean]: bb23_smiles2idx[_clean] for _clean in bb23_smiles2idx
}

## scaffold smilesを作成

In [10]:
def get_scaffold_smiles(smiles):
    """Return the Murcko scaffold of ``smiles`` as a SMILES string.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The scaffold SMILES produced by RDKit, or ``'C'`` when RDKit yields
        an empty scaffold string, so every molecule gets a non-empty key.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of inside GetScaffoldForMol.
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    scaffold_smiles = Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))

    # Use 'C' as the placeholder for an empty scaffold.
    return scaffold_smiles or 'C'

# Scaffold of every cleaned bb1 structure, then of every bb2/bb3 structure.
bb1_scafold_dict = {bb: get_scaffold_smiles(bb) for bb in tqdm(bb1_smiles)}
bb23_scafold_dict = {bb: get_scaffold_smiles(bb) for bb in tqdm(bb23_smiles)}

# Sorted lists of the distinct scaffolds. TODO: (left open in the original)
bb1_scaffold_smiles = sorted(set(bb1_scafold_dict.values()))
bb23_scaffold_smiles = sorted(set(bb23_scafold_dict.values()))

# Dictionaries converting a scaffold SMILES to its position in the sorted list.
bb1_scaffold_smiles2idx = dict(zip(bb1_scaffold_smiles, range(len(bb1_scaffold_smiles))))
bb23_scaffold_smiles2idx = dict(zip(bb23_scaffold_smiles, range(len(bb23_scaffold_smiles))))

100%|████████████████████████████████████████████████████████████████████████████████████| 341/341 [00:00<00:00, 1518.97it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1769/1769 [00:00<00:00, 4082.97it/s]


In [11]:
# Add one scaffold column per building block to the train, test and
# submission frames.
for _frame in (df_train, df_test, df_sub):
    _frame["buildingblock1_smiles_scaffold"] = _frame["buildingblock1_smiles"].map(bb1_scafold_dict)
    _frame["buildingblock2_smiles_scaffold"] = _frame["buildingblock2_smiles"].map(bb23_scafold_dict)
    _frame["buildingblock3_smiles_scaffold"] = _frame["buildingblock3_smiles"].map(bb23_scafold_dict)
# Do not keep an extra reference to the last frame alive.
del _frame

## FOLDは旧版のものを受け継ぐ

In [12]:
# Take the fold assignment over from the earlier version of the data.
df_fold = pd.read_parquet(paths.DATA_DIR / 'shrunken-train-set/train_fold_only.parquet', columns=['fold'])
# Positional assignment via .values: relies on the fold file having its rows
# in the same order as df_train.
df_train["fold"] = df_fold['fold'].values

# Number of rows per fold.
display(df_train['fold'].value_counts())

# Mean of each target per fold.
# >>> only fold 2 has an abnormally high binds_sEH rate
for fold in range(5):
    print(f'fold{fold}')
    print(df_train.loc[df_train['fold'].eq(fold), TARGETS].mean())
    print('---')

1    31600668
4    27966077
0    19967499
2    10163121
3     8718245
Name: fold, dtype: int64

fold0
binds_BRD4    0.006064
binds_HSA     0.004038
binds_sEH     0.002274
dtype: float64
---
fold1
binds_BRD4    0.003272
binds_HSA     0.004994
binds_sEH     0.002603
dtype: float64
---
fold2
binds_BRD4    0.004564
binds_HSA     0.002907
binds_sEH     0.044195
dtype: float64
---
fold3
binds_BRD4    0.002707
binds_HSA     0.003055
binds_sEH     0.001804
dtype: float64
---
fold4
binds_BRD4    0.005811
binds_HSA     0.004069
binds_sEH     0.004720
dtype: float64
---


## 全てIDXに変換して、変換用の辞書を保存する

In [13]:
# Final column layout: building blocks, their scaffolds, fold, then targets.
col_order = [
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
    'buildingblock1_smiles_scaffold',
    'buildingblock2_smiles_scaffold',
    'buildingblock3_smiles_scaffold',
    'fold',
    'binds_BRD4',
    'binds_HSA',
    'binds_sEH',
]
df_train = df_train[col_order]
# The test frame has neither fold nor targets: keep the columns before 'fold'.
df_test = df_test[col_order[:col_order.index('fold')]]

In [14]:
# Convert every SMILES column to its index.
def smiles2idx(df):
    """Return a copy of ``df`` whose SMILES columns hold indices.

    The three building-block columns and the three scaffold columns are
    mapped through the module-level ``*_smiles2idx`` dictionaries; the input
    frame itself is not modified. Per ``pandas.Series.map``, a value that is
    missing from its dictionary becomes NaN.

    Args:
        df: DataFrame containing the six SMILES columns.

    Returns:
        The encoded copy of ``df``.
    """
    column_lookups = (
        ('buildingblock1_smiles', bb1_smiles2idx),
        ('buildingblock2_smiles', bb23_smiles2idx),
        ('buildingblock3_smiles', bb23_smiles2idx),
        ('buildingblock1_smiles_scaffold', bb1_scaffold_smiles2idx),
        ('buildingblock2_smiles_scaffold', bb23_scaffold_smiles2idx),
        ('buildingblock3_smiles_scaffold', bb23_scaffold_smiles2idx),
    )
    encoded = df.copy()
    for column, lookup in column_lookups:
        encoded[column] = encoded[column].map(lookup)
    return encoded

# Encode the three frames one after another; each name is rebound to the
# encoded copy before the next call runs.
df_train = smiles2idx(df_train)
df_test = smiles2idx(df_test)
df_sub = smiles2idx(df_sub)


In [15]:
# Display the cleaned-bb1-SMILES -> index dictionary for a visual check.
bb1_smiles2idx

{'C#CCCC[C@H](Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)OCCCCCCC': 0,
 'C#CC[C@@H](CC(=O)OCCCCCCC)Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1': 1,
 'C#CC[C@@H](Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)OCCCCCCC': 2,
 'C#CC[C@@](C)(Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)OCCCCCCC': 3,
 'C#CC[C@H](CC(=O)OCCCCCCC)Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1': 4,
 'C#CC[C@H](Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)OCCCCCCC': 5,
 'C1CCCCC1.CCCCCCCOC(=O)CC[C@@H](Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)O': 6,
 'C=CCC(CC=C)(Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)OCCCCCCC': 7,
 'C=CCC(Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)OCCCCCCC': 8,
 'C=CCC[C@@H](Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)OCCCCCCC': 9,
 'C=CC[C@@H](Nc1nc(NC2c3ccccc3-c3ccccc32)nc(NC2c3ccccc3-c3ccccc32)n1)C(=O)OCCCCCCC': 10,
 'C=CC[C@H](Nc1nc(

In [16]:
## Save
SAVE_DIR = paths.DATA_DIR / 'shrunken-data-capping'
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Index-encoded frames.
for _filename, _frame in (
    ('train.parquet', df_train),
    ('test.parquet', df_test),
    ('sub.parquet', df_sub),
):
    _frame.to_parquet(SAVE_DIR / _filename)
# Do not keep an extra reference to the last frame alive.
del _frame

# Conversion dictionaries: cleaned SMILES, scaffolds, then the metal-capped
# variants.
for _filename, _mapping in (
    ('bb1_smiles2idx.pickle', bb1_smiles2idx),
    ('bb23_smiles2idx.pickle', bb23_smiles2idx),
    ('bb1_scaffold_smiles2idx.pickle', bb1_scaffold_smiles2idx),
    ('bb23_scaffold_smiles2idx.pickle', bb23_scaffold_smiles2idx),
    ('bb1_smiles2idx_metalcapping.pickle', bb1_smilesmetal2idx),
    ('bb23_smiles2idx_metalcapping.pickle', bb23_smilesmetal2idx),
):
    with open(SAVE_DIR / _filename, mode='wb') as _out:
        pickle.dump(_mapping, _out)