# 概要
- trainとtestのbbをまとめる（indexずれを防ぐため）
- BB1のscaffoldの情報を使って、FOLDを設定する
- bb1~3と同様にbb1もindex化して列を足し、scaffold用の辞書も準備

In [2]:
import os
from pathlib import Path

def is_kaggle_kernel():
    """Report whether the notebook is running inside a Kaggle kernel.

    The check is whether the path ``/kaggle/working`` exists on this machine.
    """
    kaggle_working_dir = '/kaggle/working'
    return os.path.exists(kaggle_working_dir)

# Choose the base/data/output directories for the current runtime.
if not is_kaggle_kernel():
    # Local run: everything lives one level above the current working directory.
    BASE_DIR = Path(os.getcwd()) / './../'
    DATA_DIR = BASE_DIR / "data"
    OUTPUT_DIR = BASE_DIR / "output/eda"
else:
    # Kaggle run: use the fixed /kaggle directory layout.
    BASE_DIR = Path("/kaggle")
    DATA_DIR = BASE_DIR / "input"
    OUTPUT_DIR = BASE_DIR / "working"
    print('on kaggle notebook')
    
class paths:
    """Namespace that groups the dataset and output locations used by the notebook."""

    DATA_DIR = DATA_DIR
    OUTPUT_DIR = OUTPUT_DIR
    TRAIN_PATH = DATA_DIR / "train.parquet"
    TEST_PATH = DATA_DIR / "test.parquet"
    SHRUNKEN_DATA_DIR = DATA_DIR / "shrunken-train-set"


# Create the output directory (and any missing parents) up front.
paths.OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# 必要なデータ
- smiles to indexの辞書, index to smilesの辞書
    - bb1
    - bb1 scaffold
    - bb2, 3
    - bb2, 3 scaffold
    - main moleculeの scaffoldの辞書

- shrunkenしたデータ
    - train
    - test（pretrain用）
- non shrunken test（submit用）

In [3]:
# !pip install rdkit

In [4]:
import os
import gc
import math
import numpy as np
import pandas as pd
from glob import glob
# import duckdb
# import lightgbm as lgb
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Draw

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from rdkit.Chem.Scaffolds import MurckoScaffold
from funcs.chemical_func import clean_bb1_structure

import pickle

In [5]:
# Building-block columns read from every frame (molecule_smiles is not loaded).
bb_cols = [
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
]
# Binary binding labels, one column per protein target.
TARGETS = ['binds_BRD4', 'binds_HSA', 'binds_sEH']

# Use the shared path constants instead of re-spelling the directory names,
# so these reads stay in sync with the dictionary loading further down.
df_train = pd.read_parquet(
    paths.SHRUNKEN_DATA_DIR / 'train.parquet',
    columns=bb_cols + TARGETS,
)
df_test = pd.read_parquet(
    paths.SHRUNKEN_DATA_DIR / 'test.parquet',
    columns=bb_cols,
)

# Test data used for the submission.
df_sub = pd.read_parquet(
    paths.TEST_PATH,
    columns=['id', 'protein_name'] + bb_cols,
)

In [6]:
# Building-block SMILES dictionaries.
# NOTE: the index <-> SMILES correspondence does not seem to match between train and test.
def _load_pickle(path):
    """Unpickle and return the object stored at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_dicts_bb1 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_1.p')
train_dicts_bb2 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_2.p')
train_dicts_bb3 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_3.p')

test_dicts_bb1 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_1_test.p')
test_dicts_bb2 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_2_test.p')
test_dicts_bb3 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_3_test.p')

一旦smilesに直しておく

In [7]:
# Convert the stored building-block values back to SMILES for now,
# using each split's own reverse dictionaries.
_reverse_dicts_per_frame = (
    (df_train, (
        ('buildingblock1_smiles', train_dicts_bb1),
        ('buildingblock2_smiles', train_dicts_bb2),
        ('buildingblock3_smiles', train_dicts_bb3),
    )),
    (df_test, (
        ('buildingblock1_smiles', test_dicts_bb1),
        ('buildingblock2_smiles', test_dicts_bb2),
        ('buildingblock3_smiles', test_dicts_bb3),
    )),
)
for _frame, _column_dicts in _reverse_dicts_per_frame:
    for _column, _reverse_dict in _column_dicts:
        _frame[_column] = _frame[_column].map(_reverse_dict)

## smiles to idxの辞書を作成
trainとtestを合わせた辞書を作成する

In [8]:
# Collect the SMILES strings known to each split's dictionaries.
bb1_smiles_train = list(train_dicts_bb1.values())
bb2_smiles_train = list(train_dicts_bb2.values())
bb3_smiles_train = list(train_dicts_bb3.values())
bb1_smiles_test = list(test_dicts_bb1.values())
bb2_smiles_test = list(test_dicts_bb2.values())
bb3_smiles_test = list(test_dicts_bb3.values())

# Union of train and test, sorted so the index assignment below is stable.
bb1_smiles = sorted(set(bb1_smiles_train).union(bb1_smiles_test))
# Building blocks 2 and 3 share a single vocabulary.
bb23_smiles = sorted(
    set(bb2_smiles_train).union(bb2_smiles_test, bb3_smiles_train, bb3_smiles_test)
)

print(len(bb1_smiles), len(bb23_smiles))

# Dictionaries that convert a SMILES string to its index.
bb1_smiles2idx = dict(zip(bb1_smiles, range(len(bb1_smiles))))
bb23_smiles2idx = dict(zip(bb23_smiles, range(len(bb23_smiles))))


341 1769


## scaffold smilesを作成

In [9]:
def _murcko_scaffold_smiles(smiles):
    """Return the Murcko scaffold of *smiles* as a SMILES string.

    When the scaffold serialises to an empty string, the placeholder 'C'
    is returned instead so that every building block gets a non-empty
    scaffold key.
    """
    mol = Chem.MolFromSmiles(smiles)
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    scaffold_smiles = Chem.MolToSmiles(scaffold)
    return scaffold_smiles if scaffold_smiles != '' else 'C'


# Scaffold of every bb1 SMILES.
bb1_scafold_dict = {
    smiles: _murcko_scaffold_smiles(smiles) for smiles in tqdm(bb1_smiles)
}

# Scaffold of every bb2/bb3 SMILES (shared vocabulary).
bb23_scafold_dict = {
    smiles: _murcko_scaffold_smiles(smiles) for smiles in tqdm(bb23_smiles)
}

# Unique scaffolds. They are sorted because the iteration order of a set of
# strings changes between interpreter runs (string hash randomization), which
# would otherwise give the same scaffold a different index on every run.
bb1_scaffold_smiles = sorted(set(bb1_scafold_dict.values()))
bb23_scaffold_smiles = sorted(set(bb23_scafold_dict.values()))

# Dictionaries that convert a scaffold SMILES string to its index.
bb1_scaffold_smiles2idx = {smiles: idx for idx, smiles in enumerate(bb1_scaffold_smiles)}
bb23_scaffold_smiles2idx = {smiles: idx for idx, smiles in enumerate(bb23_scaffold_smiles)}

100%|██████████| 341/341 [00:00<00:00, 6217.30it/s]
100%|██████████| 1769/1769 [00:00<00:00, 18402.23it/s]


In [10]:
# (source SMILES column, new scaffold column, SMILES -> scaffold lookup)
_scaffold_columns = (
    ("buildingblock1_smiles", "buildingblock1_smiles_scaffold", bb1_scafold_dict),
    ("buildingblock2_smiles", "buildingblock2_smiles_scaffold", bb23_scafold_dict),
    ("buildingblock3_smiles", "buildingblock3_smiles_scaffold", bb23_scafold_dict),
)

# Add the three scaffold columns to the train, test and submission frames.
for _frame in (df_train, df_test, df_sub):
    for _smiles_col, _scaffold_col, _lookup in _scaffold_columns:
        _frame[_scaffold_col] = _frame[_smiles_col].map(_lookup)

## FOLDは旧版のものを受け継ぐ

In [11]:
# Reuse the fold assignment stored in train_fold_only.parquet.
df_fold = pd.read_parquet(
    paths.DATA_DIR / 'shrunken-train-set/train_fold_only.parquet',
    columns=['fold'],
)
# Assigned positionally (raw values, no index alignment).
df_train["fold"] = df_fold['fold'].values

# Number of rows in each fold.
display(df_train['fold'].value_counts())

# Mean of each target per fold.
# >>> only fold2 has an abnormally high binds_sEH rate
for fold in range(5):
    in_fold = df_train['fold'] == fold
    print(f'fold{fold}')
    print(df_train.loc[in_fold, TARGETS].mean())
    print('---')

fold
1    31600668
4    27966077
0    19967499
2    10163121
3     8718245
Name: count, dtype: int64

fold0
binds_BRD4    0.006064
binds_HSA     0.004038
binds_sEH     0.002274
dtype: float64
---
fold1
binds_BRD4    0.003272
binds_HSA     0.004994
binds_sEH     0.002603
dtype: float64
---
fold2
binds_BRD4    0.004564
binds_HSA     0.002907
binds_sEH     0.044195
dtype: float64
---
fold3
binds_BRD4    0.002707
binds_HSA     0.003055
binds_sEH     0.001804
dtype: float64
---
fold4
binds_BRD4    0.005811
binds_HSA     0.004069
binds_sEH     0.004720
dtype: float64
---


## 全てIDXに変換して、変換用の辞書を保存する

In [12]:
# Final column layout: raw building blocks, their scaffolds, fold, then targets.
col_order = [
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
    'buildingblock1_smiles_scaffold',
    'buildingblock2_smiles_scaffold',
    'buildingblock3_smiles_scaffold',
    'fold',
    'binds_BRD4',
    'binds_HSA',
    'binds_sEH',
]
df_train = df_train[col_order]
# The test frame keeps only the columns listed before 'fold' (no fold, no targets).
df_test = df_test[col_order[:col_order.index('fold')]]

In [13]:
# Convert every SMILES column to its index.
def smiles2idx(df):
    """Return a copy of *df* with the six SMILES columns replaced by indices.

    Each building-block and scaffold column is mapped through the matching
    module-level ``*_smiles2idx`` dictionary; the input frame is not modified.
    """
    encoders = {
        'buildingblock1_smiles': bb1_smiles2idx,
        'buildingblock2_smiles': bb23_smiles2idx,
        'buildingblock3_smiles': bb23_smiles2idx,
        'buildingblock1_smiles_scaffold': bb1_scaffold_smiles2idx,
        'buildingblock2_smiles_scaffold': bb23_scaffold_smiles2idx,
        'buildingblock3_smiles_scaffold': bb23_scaffold_smiles2idx,
    }
    encoded = df.copy()
    for column, encoder in encoders.items():
        encoded[column] = encoded[column].map(encoder)
    return encoded


# Rebind one frame at a time so each old frame can be released before the next copy.
df_train = smiles2idx(df_train)
df_test = smiles2idx(df_test)
df_sub = smiles2idx(df_sub)


In [14]:
# Preview the first rows of the index-encoded submission frame.
df_sub.head()

Unnamed: 0,id,protein_name,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,buildingblock1_smiles_scaffold,buildingblock2_smiles_scaffold,buildingblock3_smiles_scaffold
0,295246830,BRD4,0,20,20,17,116,116
1,295246831,HSA,0,20,20,17,116,116
2,295246832,sEH,0,20,20,17,116,116
3,295246833,BRD4,0,20,119,17,116,438
4,295246834,HSA,0,20,119,17,116,438


In [15]:
# Build a dictionary that converts a cleaned bb1 SMILES to its index.
# NOTE: if two bb1 SMILES clean to the same string, the later index overwrites
# the earlier one, so this dictionary can have fewer entries than bb1_smiles2idx.
cleaned_bb1_smiles2idx = {}
for raw_smiles, bb1_idx in bb1_smiles2idx.items():
    cleaned_bb1_smiles2idx[clean_bb1_structure(raw_smiles)] = bb1_idx
cleaned_bb1_smiles2idx

{'C#CCCCC(Nc1nc(N)nc(N)n1)C(=O)NCC': 0,
 'C#CCC(CC(=O)NCC)Nc1nc(N)nc(N)n1': 4,
 'C#CCC(Nc1nc(N)nc(N)n1)C(=O)NCC': 5,
 'C#CCC(C)(Nc1nc(N)nc(N)n1)C(=O)NCC': 3,
 'C=CCC(CC=C)(Nc1nc(N)nc(N)n1)C(=O)NCC': 6,
 'C=CCC(Nc1nc(N)nc(N)n1)C(=O)NCC': 10,
 'C=CCCC(Nc1nc(N)nc(N)n1)C(=O)NCC': 8,
 'CCNC(=O)C(Cc1ccc(C(C)=O)cc1)Nc1nc(N)nc(N)n1': 11,
 'CCNC(=O)C(CC(=O)OC(C)(C)C)Nc1nc(N)nc(N)n1': 12,
 'CCNC(=O)C(CCC(=O)OC(C)(C)C)Nc1nc(N)nc(N)n1': 13,
 'CCNC(=O)C1C=C(c2ccccc2C)CN1C=O': 14,
 'CCNC(=O)C1(c2ccc(C)cc2)CCN(C=O)CC1': 15,
 'CCNC(=O)C1(c2cccc(C)c2)CCN(C=O)CC1': 16,
 'CCNC(=O)c1ccc(C)cc1OCC1CCN(C=O)CC1': 17,
 'CCNC(=O)c1ccc(OCC2CCN(C=O)CC2)c(C)c1': 18,
 'CCNC(=O)c1ccc(C)c(OC2CCN(C=O)CC2)c1': 19,
 'CCNC(=O)C1C(c2cccc(C)c2)CCCN1C=O': 20,
 'CCNC(=O)c1cc(C)cn1C1CCCN(C=O)C1': 21,
 'CCNC(=O)c1cn(C2CCCN(C=O)C2)c2ccc(C)cc12': 22,
 'CCNC(=O)C1N(C(=O)OC(C)(C)C)CCN1c1nc(N)nc(N)n1': 23,
 'CCNC(=O)C1(Cc2ccc(C)s2)CCN(C=O)C1': 24,
 'CCNC(=O)c1ccc(C)cc1OC1CCN(C=O)C1': 25,
 'CCNC(=O)C1(Cc2ccc(C)cc2)CCN(C=O)C1': 26,
 

In [16]:
## Save
SAVE_DIR = paths.DATA_DIR / 'shrunken-data'
SAVE_DIR.mkdir(exist_ok=True, parents=True)

# Index-encoded frames.
for _filename, _frame in (
    ('train.parquet', df_train),
    ('test.parquet', df_test),
    ('sub.parquet', df_sub),
):
    _frame.to_parquet(SAVE_DIR / _filename)

# Conversion dictionaries, saved alongside the frames.
for _filename, _mapping in (
    ('bb1_smiles2idx.pickle', bb1_smiles2idx),
    ('bb23_smiles2idx.pickle', bb23_smiles2idx),
    ('bb1_scaffold_smiles2idx.pickle', bb1_scaffold_smiles2idx),
    ('bb23_scaffold_smiles2idx.pickle', bb23_scaffold_smiles2idx),
    ('cleaned_bb1_smiles2idx.pickle', cleaned_bb1_smiles2idx),
):
    with open(SAVE_DIR / _filename, mode='wb') as f:
        pickle.dump(_mapping, f)