In [1]:
# Mount Google Drive at /content/drive so the files stored under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import random
import os

# Directory on the mounted Drive that holds train.csv, test.csv and sample_submission.csv.
DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'
# Seed constant (not consumed by any of the cells shown in this part of the notebook).
SEED = 42

In [21]:
from sklearn.feature_selection import VarianceThreshold


In [22]:
# Read the three competition tables (train, test, sample submission) from DATA_PATH.
train, test, submission = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# 데이터 확인
- SMILES : 화합물 분자구조
- MLM / HLM : 화합물의 대사안정성 지표 (MLM: 쥐, HLM: 인간 - 대사되지 않고 남아있는 화합물의 양을 측정한 것) : 값이 높을수록(많이 남아 있을수록) 대사안정성이 좋은 것
- AlogP : 화합물이 물-유기용매 사이에서 분배되는 정도 (로그파티션 계수)
- Molecular Weight: 분자량, 분자의 총 무게
- Num_H_Acceptors : 화합물의 수소결합 수용체(acceptor) 개수. 수소 결합에서 수소를 받아들일 수 있는 원자를 의미
- Num_H_Donors : 화합물의 수소결합 공여체(donor) 개수. 수소 결합에 수소를 제공할 수 있는 원자를 의미
- Num_RotatableBonds: 분자 내에서 회전이 가능한 결합 개수
- LogD : 화합물의 분포 계수(distribution coefficient). 로그 파티션 계수(LogP)와 유사하나, 특정 pH에서 이온화된 형태까지 포함하여 분배 정도를 나타냄
- Molecular_PolarSurfaceArea: 분자의 극성 표면 면적. 분자 내에서 극성 원자들이 차지하는 면적

In [None]:
# Inspect the column names of the training table.
train.columns

Index(['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea'],
      dtype='object')

In [None]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43


In [None]:
# Preview the first rows of the test table (same columns as train minus the MLM/HLM targets).
test.head()

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15


In [None]:
# 3498 training rows in total (very small) -> the goal is to predict the 483 test rows.
# NOTE(review): the original note calls this "10%", but 483/3498 is about 14% — confirm what the 10% refers to.
train.shape, test.shape

((3498, 11), (483, 9))

In [None]:
# Train-to-test size ratio, computed from the loaded frames instead of hard-coded row counts.
len(train) / len(test)

7.24223602484472

# * 이전 대회 자료 확인 (feature)

# 1) 사전학습 모델
- https://dacon.io/competitions/official/235953/codeshare/6033?page=1&dtype=recent



# 2) Mordred Package
- https://dacon.io/en/codeshare/3243

In [2]:

#### Widen the notebook cell container to 90% of the page so wide tables fit.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

notebook_config = (
    "<style>"
    "  .container { width:90% !important; }"
    "</style>"
)
HTML(notebook_config)

In [3]:
# Set the seed and the program version.
# NOTE(review): IS_COLAB is False although the drive mount and DATA_PATH used
# elsewhere in this notebook point at /content/drive — confirm the intended value.
IS_COLAB = False
SEED = 42  # same value as the SEED assigned next to DATA_PATH
PROGRAM_VERSION = "13"

In [4]:
# Mount Google Drive only when the notebook is flagged as running on Colab.
if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [6]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.1-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.1


In [10]:
# NOTE: PyPI no longer supports `pip search` (see the error output of this cell);
# look packages up at https://pypi.org/search in a browser instead.
!pip3 search bayes_opt

[31mERROR: XMLRPC request failed [code: -32500]
RuntimeError: PyPI no longer supports 'pip search' (or XML-RPC search). Please use https://pypi.org/search (via a browser) instead. See https://warehouse.pypa.io/api-reference/xml-rpc.html#deprecated-methods for more information.[0m[31m
[0m

In [13]:
!pip install bayes_opt


[31mERROR: Could not find a version that satisfies the requirement bayes_opt (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bayes_opt[0m[31m
[0m

In [14]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings("ignore")
import shutil
import time
from scipy.stats import gmean
from itertools import combinations
from tqdm import tqdm
tqdm.pandas()

import tensorflow as tf
from tensorflow import keras


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import TimeDistributed, LayerNormalization
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import PassiveAggressiveRegressor


from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor

# AdaBoost
from sklearn.ensemble import AdaBoostRegressor

# XGBoost
from xgboost import plot_importance
from xgboost import XGBRegressor

# lightgbm
from lightgbm import LGBMRegressor

# catboost
from catboost import CatBoostRegressor

# metric
from sklearn.metrics import mean_absolute_error

from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

# from sklearn.model_selection import RandomizedSearchCV
# from bayes_opt import BayesianOptimization
# from skopt.space import Real, Categorical, Integer
# from skopt import BayesSearchCV


# Report the TensorFlow and Keras versions, one per line.
print(tf.__version__, keras.__version__, sep="\n")

2.12.0
2.12.0


In [16]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [18]:
!pip install mordred

Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176721 sha256=848baf578bb05f1630d46a42d709dfba95f5483e9d279a8159beebce8a0f1150
  Stored in directory: /root/.cache/pip/wheels/a7/4f/b8/d4c6591f6ac944aaced7865b349477695f662388ad958743c7
Successfully built mordred
Installing collected packages: networkx, mordred
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstall

In [19]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from mordred import Calculator, descriptors

# Notebook rendering options for RDKit molecules: enable atom-index labels
# and SVG output.
IPythonConsole.drawOptions.addAtomIndices = True
IPythonConsole.ipython_useSVG = True
# Optional fixed drawing size (currently disabled).
# IPythonConsole.molSize = 500, 500

In [23]:
# Build a Mordred calculator with every registered descriptor, 3D ones included.
# NOTE(review): the molecules fed to this calculator are created with
# Chem.MolFromSmiles, which presumably carry no 3D conformer, so the 3D
# descriptors likely come back as missing values — confirm whether
# ignore_3D=True was intended.
calc = Calculator(descriptors, ignore_3D=False)
# Number of descriptors the calculator will compute per molecule.
len(calc.descriptors)

1826

In [24]:
# Compute the Mordred descriptor table for every training molecule.
# This is slow: the recorded run took about 14 minutes for 3498 molecules.
train_mols = [Chem.MolFromSmiles(smiles) for smiles in train.SMILES]

# Chem.MolFromSmiles returns None for a SMILES string it cannot parse; stop here
# with the offending row positions rather than failing inside the descriptor run.
unparsed_rows = [pos for pos, mol in enumerate(train_mols) if mol is None]
if unparsed_rows:
    raise ValueError(
        f"{len(unparsed_rows)} SMILES in train could not be parsed; "
        f"first row positions: {unparsed_rows[:10]}"
    )

x_train = calc.pandas(train_mols)
x_train

100%|██████████| 3498/3498 [14:22<00:00,  4.06it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,21.379612,17.449011,0,0,35.689316,2.421199,4.745523,35.689316,1.274618,4.249154,...,10.081676,78.761075,400.156912,7.695325,2380,40,142.0,165.0,9.500000,6.361111
1,16.539255,14.049653,0,0,26.575899,2.426398,4.757199,26.575899,1.265519,3.984419,...,9.907828,69.149596,301.124883,7.528122,870,35,112.0,132.0,7.138889,4.527778
2,17.475469,13.660693,2,1,29.802128,2.510668,4.982923,29.802128,1.354642,4.049690,...,10.144510,70.158066,297.170194,7.248054,1028,36,120.0,145.0,5.277778,4.888889
3,27.857311,20.034364,0,1,45.884166,2.532483,4.973440,45.884166,1.310976,4.500758,...,10.613467,86.199585,494.246395,7.162991,4170,61,192.0,231.0,10.784722,7.500000
4,15.722758,12.817176,0,0,26.308663,2.452930,4.905860,26.308663,1.315433,3.935574,...,9.978363,53.872357,268.121178,7.447810,762,32,106.0,125.0,6.277778,4.361111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,19.701948,17.330258,0,0,30.902711,2.515832,4.792938,30.902711,1.236108,4.158311,...,10.208580,77.363487,395.052750,10.129558,1615,38,136.0,162.0,10.062500,5.250000
3494,21.148902,17.245220,0,0,35.887372,2.494190,4.985907,35.887372,1.329162,4.237338,...,10.291162,75.955433,359.138225,8.162232,1765,45,144.0,173.0,7.750000,6.027778
3495,14.348985,12.232287,0,0,23.546531,2.450594,4.723643,23.546531,1.239291,3.855798,...,9.677277,66.189153,261.147727,6.872309,795,26,94.0,108.0,6.916667,4.333333
3496,14.458374,12.578196,0,0,23.936088,2.314213,4.623104,23.936088,1.259794,3.855415,...,9.604475,65.335399,284.056385,8.876762,812,25,94.0,107.0,6.916667,4.250000



---

