# 概要
- trainとtestのbbをまとめる（indexずれを防ぐため）
- BB1のscaffoldの情報を使って、FOLDを設定する
- bb2~3と同様にbb1もindex化して列を足し、scaffold用の辞書も準備

In [1]:
import os
from pathlib import Path

def is_kaggle_kernel():
    """Return True when the `/kaggle/working` path exists on this machine.

    The check is purely filesystem-based; no environment variables are read.
    """
    kaggle_workdir = '/kaggle/working'
    return os.path.exists(kaggle_workdir)

# Choose the base / data / output directories for the current environment.
if not is_kaggle_kernel():
    # Local run: the base directory is the parent of the current working
    # directory (left unresolved, exactly as "<cwd>/..").
    BASE_DIR = Path(os.getcwd()).joinpath('./../')
    DATA_DIR = BASE_DIR.joinpath("data")
    OUTPUT_DIR = BASE_DIR.joinpath("output/eda")
else:
    # Kaggle run: use the fixed /kaggle layout.
    BASE_DIR = Path("/kaggle")
    DATA_DIR = BASE_DIR.joinpath("input")
    OUTPUT_DIR = BASE_DIR.joinpath("working")
    print('on kaggle notebook')
    
class paths:
    """Namespace collecting the data and output locations used by the notebook.

    Evaluating the class body also creates ``OUTPUT_DIR`` (including missing
    parents) if it does not exist yet.
    """

    # Re-export the module-level directories as ``paths.<NAME>``.
    DATA_DIR = DATA_DIR
    OUTPUT_DIR = OUTPUT_DIR

    # Files and sub-directories located under the data directory.
    TRAIN_PATH = DATA_DIR.joinpath("train.parquet")
    TEST_PATH = DATA_DIR.joinpath("test.parquet")
    SHRUNKEN_DATA_DIR = DATA_DIR.joinpath("shrunken-train-set")

    # Side effect at class-definition time: make sure the output dir exists.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# 必要なデータ
- smiles to indexの辞書, index to smilesの辞書
    - bb1
    - bb1 scaffold
    - bb2, 3
    - bb2, 3 scaffold
    - main moleculeの scaffoldの辞書

- shrunkenしたデータ
    - train
    - test（pretrain用）
- non shrunken test（submit用）

In [2]:
# !pip install rdkit

In [3]:
import os
import gc
import math
import numpy as np
import pandas as pd
from glob import glob
# import duckdb
# import lightgbm as lgb
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Draw

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from rdkit.Chem.Scaffolds import MurckoScaffold

import pickle

# mainの骨格のscaffoldから

In [4]:
# Columns to read from the shrunken tables, and the binding-target column names.
bb_cols = ['molecule_smiles']
TARGETS = ['binds_BRD4', 'binds_HSA','binds_sEH']

# Shrunken train / test tables: only the columns listed in bb_cols are read.
df_train_main = pd.read_parquet(
    paths.DATA_DIR / 'shrunken-train-set/train.parquet',
    columns=bb_cols,
)
df_test_main = pd.read_parquet(
    paths.DATA_DIR / 'shrunken-train-set/test.parquet',
    columns=bb_cols,
)

# Test data used for the submission: keeps `id` alongside the bb_cols columns.
df_sub_main = pd.read_parquet(
    paths.DATA_DIR / 'test.parquet',
    columns=['id', *bb_cols],
)

In [5]:
from joblib import Parallel, delayed

def convert_scaffold(smiles):
    """Convert a SMILES string into the SMILES of its Murcko scaffold.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The string produced by ``Chem.MolToSmiles`` for the scaffold that
        ``MurckoScaffold.GetScaffoldForMol`` extracts from the parsed molecule.
    """
    # NOTE(review): there is no guard for unparsable SMILES; RDKit's
    # MolFromSmiles presumably yields None in that case — confirm inputs are
    # always valid.
    molecule = Chem.MolFromSmiles(smiles)
    core = MurckoScaffold.GetScaffoldForMol(molecule)
    return Chem.MolToSmiles(core)

In [10]:
# Compute the scaffold SMILES of every train molecule with joblib (n_jobs=16)
# and overwrite the original SMILES column with the result.
smiles_list = df_train_main['molecule_smiles'].values.tolist()
scaffold_task = delayed(convert_scaffold)
train_scaffold = Parallel(n_jobs=16)(
    scaffold_task(smiles) for smiles in tqdm(smiles_list)
)
df_train_main['molecule_smiles'] = train_scaffold

  0%|          | 0/98415610 [00:00<?, ?it/s]

100%|██████████| 98415610/98415610 [6:28:38<00:00, 4220.50it/s]   


In [6]:
# Compute the scaffold SMILES of every shrunken-test molecule with joblib
# (n_jobs=8) and overwrite the original SMILES column with the result.
smiles_list = df_test_main['molecule_smiles'].values.tolist()
scaffold_task = delayed(convert_scaffold)
test_scaffold = Parallel(n_jobs=8)(
    scaffold_task(smiles) for smiles in tqdm(smiles_list)
)
df_test_main['molecule_smiles'] = test_scaffold

100%|██████████| 878022/878022 [00:34<00:00, 25577.02it/s]


In [11]:
# Compute the scaffold SMILES of every submission-test molecule with joblib
# (n_jobs=8) and write it straight back into the SMILES column.
smiles_list = df_sub_main['molecule_smiles'].values.tolist()
scaffold_task = delayed(convert_scaffold)
df_sub_main['molecule_smiles'] = Parallel(n_jobs=8)(
    scaffold_task(smiles) for smiles in tqdm(smiles_list)
)

100%|██████████| 1674896/1674896 [01:13<00:00, 22643.25it/s]


In [12]:
# Build the scaffold vocabulary from BOTH the train and the test scaffolds.
# train_scaffold must be part of the union: Series.map turns every value that
# is missing from the dictionary into NaN, so a vocabulary built from
# test_scaffold alone would lose all train-only scaffolds.
# NOTE(review): df_sub_main's scaffolds are not added here; presumably they
# are all covered by the shrunken test set — verify.
scaffold_set = sorted(set(train_scaffold) | set(test_scaffold))

# The index is the position in the sorted vocabulary, so the numbering is
# deterministic for a given set of scaffolds.
scaffold_smiles2idx = {scaffold: i for i, scaffold in enumerate(scaffold_set)}

In [13]:
# Replace each scaffold SMILES by its integer index in all three tables.
for frame in (df_train_main, df_test_main, df_sub_main):
    frame['molecule_smiles'] = frame['molecule_smiles'].map(scaffold_smiles2idx)

In [18]:
# Destination directory for the index-encoded scaffold tables.
shrunken_out_dir = paths.DATA_DIR / 'shrunken-data'
# Neither DataFrame.to_parquet nor open() creates missing parent directories,
# so make sure the directory exists before writing anything into it.
shrunken_out_dir.mkdir(parents=True, exist_ok=True)

df_train_main.to_parquet(shrunken_out_dir / 'train_main_scaffold.parquet')
df_test_main.to_parquet(shrunken_out_dir / 'test_main_scaffold.parquet')
df_sub_main.to_parquet(shrunken_out_dir / 'sub_main_scaffold.parquet')

# Save the scaffold SMILES -> index dictionary next to the tables.
with open(shrunken_out_dir / 'scaffold_smiles2idx.pickle', 'wb') as f:
    pickle.dump(scaffold_smiles2idx, f)