In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm.auto import tqdm
import random
import os


import matplotlib.pyplot as plt
import seaborn as sns


DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'
SEED = 42


def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

In [4]:
train = pd.read_csv(f'{DATA_PATH}train_mordred_1835.csv')
test = pd.read_csv(f'{DATA_PATH}test_mordred_1835.csv')


  train = pd.read_csv(f'{DATA_PATH}train_mordred_1835.csv')


### -> 이상치 제거 (총 18개)

In [5]:
# 1. 극단치만 판단해서 18개 제거
outliers = train[(train['MLM'] > 100.0) | (train['HLM'] > 100.0) | (train['AlogP'] < -3) | (train['Molecular_Weight'] > 800)
     | (train['Num_H_Acceptors'] > 14) | (train['Num_H_Donors'] > 9) | (train['Num_RotatableBonds'] > 20)
     | (train['LogD'] < -4) | (train['Molecular_PolarSurfaceArea'] > 250)]


In [6]:
outliers.index

Int64Index([ 179,  662,  834,  983, 1092, 1172, 1239, 1584, 2159, 2258, 2367,
            2410, 2586, 2711, 2948, 3157, 3247, 3403],
           dtype='int64')

In [7]:
train = train.drop(index=outliers.index, axis=0)

In [8]:
train.shape # 3498 -> 3480

(3480, 1837)

In [9]:
duplicates = train[train.duplicated(subset=['SMILES'], keep=False)]

duplicates.sort_values(by='SMILES')

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
2276,TRAIN_2276,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,1.535,31.453,3.556,262.309,3,0,4,3.556,...,9.530611,66.421505,262.121846,7.709466,969,23,100.0,112.0,4.444444,4.5
451,TRAIN_0451,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.31,24.67,3.556,262.309,3,0,4,3.556,...,9.530611,66.421505,262.121846,7.709466,969,23,100.0,112.0,4.444444,4.5
2891,TRAIN_2891,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,55.95,69.95,2.172,337.372,4,2,3,2.169,...,10.10704,59.750412,337.142641,7.662333,1584,39,130.0,151.0,8.75,5.444444
543,TRAIN_0543,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,68.485,85.872,2.172,337.372,4,2,3,2.169,...,10.10704,59.750412,337.142641,7.662333,1584,39,130.0,151.0,8.75,5.444444
837,TRAIN_0837,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,63.522,62.488,2.293,367.428,5,2,3,2.307,...,10.293467,77.575687,367.121529,8.53771,1673,42,144.0,173.0,8.722222,5.5
366,TRAIN_0366,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,73.74,66.85,2.293,367.428,5,2,3,2.307,...,10.293467,77.575687,367.121529,8.53771,1673,42,144.0,173.0,8.722222,5.5
1085,TRAIN_1085,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,23.63,71.47,1.684,381.45,7,1,4,1.684,...,10.273222,75.316682,381.147075,7.778512,1839,40,142.0,165.0,10.402778,5.319444
2848,TRAIN_2848,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,40.657,99.9,1.684,381.45,7,1,4,1.684,...,10.273222,75.316682,381.147075,7.778512,1839,40,142.0,165.0,10.402778,5.319444
2096,TRAIN_2096,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,1.88,61.28,2.843,360.49,3,0,4,2.843,...,10.154519,68.711453,360.241293,6.211057,1561,45,134.0,157.0,9.0,5.75
1666,TRAIN_1666,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,5.494,25.397,2.843,360.49,3,0,4,2.843,...,10.154519,68.711453,360.241293,6.211057,1561,45,134.0,157.0,9.0,5.75


In [10]:
processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()

  processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()


In [11]:
tmp = pd.merge(processed_duplicate,duplicates,how='left',on='SMILES')

### -> 중복치 제거 (총 26개)

In [12]:
tmp = tmp.iloc[0::2]  # 첫 번째 행부터 시작하여 2씩 증가하여 행 선택
tmp.drop(['MLM_y','HLM_y'],axis =1,inplace=True)

In [13]:
# tmp.columns = ['SMILES', 'MLM', 'HLM','id',  'AlogP', 'Molecular_Weight',
#        'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
#        'Molecular_PolarSurfaceArea']

In [14]:
# processed_duplicate = tmp[['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
#        'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
#        'Molecular_PolarSurfaceArea']]

In [15]:
train = train.drop_duplicates(subset=['SMILES'], keep=False) #기존 train에서 중복치들을 우선 모두 제거
train.shape

(3428, 1837)

In [16]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,...,10.081676,78.761075,400.156912,7.695325,2380,40,142.0,165.0,9.500000,6.361111
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,...,9.907828,69.149596,301.124883,7.528122,870,35,112.0,132.0,7.138889,4.527778
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,...,10.144510,70.158066,297.170194,7.248054,1028,36,120.0,145.0,5.277778,4.888889
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,...,10.613467,86.199585,494.246395,7.162991,4170,61,192.0,231.0,10.784722,7.500000
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,...,9.978363,53.872357,268.121178,7.447810,762,32,106.0,125.0,6.277778,4.361111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,...,10.208580,77.363487,395.052750,10.129558,1615,38,136.0,162.0,10.062500,5.250000
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,...,10.291162,75.955433,359.138225,8.162232,1765,45,144.0,173.0,7.750000,6.027778
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,...,9.677277,66.189153,261.147727,6.872309,795,26,94.0,108.0,6.916667,4.333333
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,...,9.604475,65.335399,284.056385,8.876762,812,25,94.0,107.0,6.916667,4.250000


In [17]:
tmp.shape

(26, 1837)

In [18]:
tmp.rename(columns={'HLM_x': 'HLM', 'MLM_x': 'MLM'}, inplace=True)

In [19]:
train = pd.concat([train, tmp], axis=0, ignore_index=True)
train.shape

(3454, 1837)

In [20]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.0100,50.6800,3.259,400.495,5,2,8,3.259,...,10.081676,78.761075,400.156912,7.695325,2380,40,142.0,165.0,9.500000,6.361111
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.2700,50.5900,2.169,301.407,2,1,2,2.172,...,9.907828,69.149596,301.124883,7.528122,870,35,112.0,132.0,7.138889,4.527778
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.5860,80.8920,1.593,297.358,5,0,3,1.585,...,10.144510,70.158066,297.170194,7.248054,1028,36,120.0,145.0,5.277778,4.888889
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.7100,2.0000,4.771,494.652,6,0,5,3.475,...,10.613467,86.199585,494.246395,7.162991,4170,61,192.0,231.0,10.784722,7.500000
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.2700,99.9900,2.335,268.310,3,0,1,2.337,...,9.978363,53.872357,268.121178,7.447810,762,32,106.0,125.0,6.277778,4.361111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,TRAIN_1833,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,28.4630,24.1955,4.609,340.381,4,1,2,4.736,...,10.344674,77.430819,340.143645,8.098658,1730,43,148.0,180.0,6.611111,5.500000
3450,TRAIN_0898,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,56.0435,66.1260,4.282,553.480,5,0,7,4.009,...,10.593756,88.164476,552.169496,8.120140,4874,63,204.0,242.0,12.055556,8.222222
3451,TRAIN_2575,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,56.3130,63.7320,4.304,459.520,6,1,4,4.304,...,10.677177,84.226168,459.136511,8.502528,2888,61,186.0,227.0,10.895833,6.944444
3452,TRAIN_0216,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,70.2150,93.8400,-1.133,385.417,7,0,4,-1.133,...,10.177818,79.094663,385.175004,7.552451,2262,42,148.0,175.0,8.000000,6.222222


# Mordred Columns확인

-  Object Columns (총 575개)
- train, test 하나라도

In [21]:
train_prop = train.iloc[:, 4:]
test_prop = test.iloc[:, 2:]

target_1 = train['MLM']
target_2 = train['HLM']

In [22]:
object_columns_train = []

for column in train_prop.columns:
    dtype = train_prop[column].dtype
    # print(f"Column: {column}, Data Type: {dtype}")
    if dtype == object :
        object_columns_train.append(column)


In [23]:
object_columns_test = []

for column in test_prop.columns:
    dtype = test_prop[column].dtype
    # print(f"Column: {column}, Data Type: {dtype}")
    if dtype == object :
        object_columns_test.append(column)


In [24]:
object_columns = object_columns_train + object_columns_test

object_columns = set(object_columns)
object_columns = list(object_columns)

len(object_columns_train), len(object_columns_test), len(object_columns)

(575, 403, 575)

In [25]:
pure_object_cols_train = []

for i in range(len(object_columns)):
    numeric_values = pd.to_numeric(train_prop[object_columns[i]], errors='coerce')
    column_sum = numeric_values.sum()
    # print('column',i,':', column_sum)
    if column_sum == 0.0:
        pure_object_cols_train.append(object_columns[i])


In [26]:
pure_object_cols_test = []

for i in range(len(object_columns)):
    numeric_values = pd.to_numeric(test_prop[object_columns[i]], errors='coerce')
    column_sum = numeric_values.sum()
    # print('column',i,':', column_sum)
    if column_sum == 0.0:
        pure_object_cols_test.append(object_columns[i])


In [27]:
pure_object_cols = pure_object_cols_train + pure_object_cols_test

pure_object_cols = set(pure_object_cols)
pure_object_cols = list(pure_object_cols)


len(pure_object_cols_train), len(pure_object_cols_test), len(pure_object_cols)

(313, 317, 319)

## PURE OBJECT COLS 버렷을 때 : LGBM 31.426668504334383

In [28]:
train_prop.drop(columns =pure_object_cols,inplace=True)
test_prop.drop(columns =pure_object_cols, inplace=True)

# TOTAL OBJECT COLS 버렸을 때 : LGBM 31.566453305994656

In [29]:
# train_prop.drop(columns =object_columns,inplace=True)
# test_prop.drop(columns =object_columns, inplace=True)

# PURE OBJECT COLS + 범주화 안했을 때 : 31.3721918205426

In [30]:
import pandas as pd

# train_prop_object의 모든 열을 처리
for column_name in train_prop.columns:
    train_prop[column_name] = pd.to_numeric(train_prop[column_name], errors='coerce')

# test_prop_object의 모든 열을 처리
for column_name in test_prop.columns:
    test_prop[column_name] = pd.to_numeric(test_prop[column_name], errors='coerce')


In [31]:
train_prop.isna().sum().sum(), test_prop.isna().sum().sum()

(196434, 27481)

In [32]:
train_prop.fillna(0, inplace=True)
test_prop.fillna(0, inplace=True)

In [33]:
train_prop = train_prop.astype(float)
test_prop = test_prop.astype(float)

- pure object columns(str) -> 삭제 (319개)
    - object칼럼 중 합계 0 인 칼럼
    - 행별로 동일해서 의미없는 object column으로 판단함

In [34]:
# # 열(column_name)의 숫자 데이터만 추출하여 합계 계산
# numeric_values = pd.to_numeric(train_prop[object_columns[0]], errors='coerce')
# column_sum = numeric_values.sum()
# print(i, column_sum)

In [35]:
train_prop.head(10)

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,ABC,ABCGG,nAcid,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,3.259,400.495,5.0,2.0,8.0,3.259,117.37,21.379612,17.449011,0.0,...,10.081676,78.761075,400.156912,7.695325,2380.0,40.0,142.0,165.0,9.5,6.361111
1,2.169,301.407,2.0,1.0,2.0,2.172,73.47,16.539255,14.049653,0.0,...,9.907828,69.149596,301.124883,7.528122,870.0,35.0,112.0,132.0,7.138889,4.527778
2,1.593,297.358,5.0,0.0,3.0,1.585,62.45,17.475469,13.660693,2.0,...,10.14451,70.158066,297.170194,7.248054,1028.0,36.0,120.0,145.0,5.277778,4.888889
3,4.771,494.652,6.0,0.0,5.0,3.475,92.6,27.857311,20.034364,0.0,...,10.613467,86.199585,494.246395,7.162991,4170.0,61.0,192.0,231.0,10.784722,7.5
4,2.335,268.31,3.0,0.0,1.0,2.337,42.43,15.722758,12.817176,0.0,...,9.978363,53.872357,268.121178,7.44781,762.0,32.0,106.0,125.0,6.277778,4.361111
5,1.335,419.422,4.0,1.0,7.0,1.335,78.95,23.477072,19.253271,0.0,...,10.481701,85.464816,419.165663,7.908786,2578.0,52.0,162.0,197.0,10.333333,6.611111
6,1.954,547.707,7.0,4.0,9.0,0.464,144.42,19.095142,16.597191,2.0,...,10.200736,59.865252,366.088557,8.513687,1442.0,41.0,128.0,149.0,10.951389,5.430556
7,2.967,293.343,3.0,0.0,2.0,2.967,68.18,16.849242,13.229959,0.0,...,9.877605,70.704035,293.062283,9.158196,957.0,32.0,114.0,135.0,4.416667,4.583333
8,-0.946,347.366,7.0,2.0,3.0,-0.723,104.89,14.906262,12.419742,0.0,...,9.925396,66.940188,257.152812,6.767179,686.0,30.0,102.0,122.0,6.027778,4.194444
9,1.259,357.453,5.0,2.0,5.0,-0.373,90.04,20.413286,16.66526,0.0,...,10.137729,76.595367,357.227708,6.740145,1798.0,39.0,138.0,164.0,6.888889,5.805556


In [36]:
# len(object_columns), len(pure_object_cols)

# 575개 중에 pure object cols 313개는 object(catboost로) / 나머지는 머신러닝으로 -> 앙상블?

In [37]:
df_corr = pd.concat([train[['HLM']], train_prop],axis=1)

In [38]:
hlm_corr = df_corr.corr().sort_values(by='HLM')[:300].index.tolist()

In [39]:
df_corr2 = pd.concat([train[['MLM']], train_prop],axis=1)

In [40]:
mlm_corr = df_corr2.corr().sort_values(by='MLM')[:300].index.tolist()

In [41]:
corr_list = list(set(mlm_corr + hlm_corr))
len(corr_list)

366

# PURE OBJECT COLUMNS + 100개만 범주화 했을 때 : 31.450351634625314

In [42]:

import pandas as pd

# 각 열을 0~10 범주로 나누기
num_bins = 10  # 원하는 범주 수

for col in corr_list:
    bin_edges = pd.qcut(train_prop[col], q=num_bins, labels=False, duplicates='drop')
    bin_labels = [str(i) for i in range(num_bins)]
    train_prop[f'{col}_Category'] = pd.Categorical(bin_edges, categories=range(num_bins), ordered=True)
    train_prop[f'{col}_Category'] = train_prop[f'{col}_Category'].cat.rename_categories(bin_labels)

# 결과 출력
print(train_prop)




      AlogP  Molecular_Weight  Num_H_Acceptors  Num_H_Donors  \
0     3.259           400.495              5.0           2.0   
1     2.169           301.407              2.0           1.0   
2     1.593           297.358              5.0           0.0   
3     4.771           494.652              6.0           0.0   
4     2.335           268.310              3.0           0.0   
...     ...               ...              ...           ...   
3449  4.609           340.381              4.0           1.0   
3450  4.282           553.480              5.0           0.0   
3451  4.304           459.520              6.0           1.0   
3452 -1.133           385.417              7.0           0.0   
3453  5.630           466.572              4.0           0.0   

      Num_RotatableBonds   LogD  Molecular_PolarSurfaceArea        ABC  \
0                    8.0  3.259                      117.37  21.379612   
1                    2.0  2.172                       73.47  16.539255   
2        

In [43]:
num_bins = 10  # 원하는 범주 수

for col in corr_list:
    bin_edges = pd.qcut(test_prop[col], q=num_bins, labels=False, duplicates='drop')
    bin_labels = [str(i) for i in range(num_bins)]
    test_prop[f'{col}_Category'] = pd.Categorical(bin_edges, categories=range(num_bins), ordered=True)
    test_prop[f'{col}_Category'] = test_prop[f'{col}_Category'].cat.rename_categories(bin_labels)

# 결과 출력
print(test_prop)


     AlogP  Molecular_Weight  Num_H_Acceptors  Num_H_Donors  \
0    2.641           361.505              4.0           2.0   
1    0.585           370.399              5.0           0.0   
2    4.276           347.414              4.0           4.0   
3    1.795           345.358              5.0           0.0   
4    1.219           353.418              4.0           0.0   
..     ...               ...              ...           ...   
478  4.207           306.443              2.0           1.0   
479 -0.608           335.398              5.0           0.0   
480  1.792           349.383              3.0           1.0   
481  0.790           341.132              3.0           2.0   
482  2.782           250.380              2.0           0.0   

     Num_RotatableBonds   LogD  Molecular_PolarSurfaceArea        ABC  \
0                   7.0  2.635                       92.76  19.229782   
1                   3.0  0.585                       68.31  21.229782   
2                   5.0 

# Mol, 일부원소 추가

In [44]:
!pip install Chem

Collecting Chem
  Downloading chem-1.2.0-py3-none-any.whl (24 kB)
Installing collected packages: Chem
Successfully installed Chem-1.2.0


In [45]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [46]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole     # 화면에 출력하기 위한 옵션
IPythonConsole.ipython_useSVG=True

In [47]:
train_prop['mol'] = train['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
train_prop['mol'] = train_prop['mol'].apply(lambda x: Chem.AddHs(x))
train_prop['num_of_atoms'] = train_prop['mol'].apply(lambda x: x.GetNumAtoms())
train_prop['num_of_heavy_atoms'] = train_prop['mol'].apply(lambda x: x.GetNumHeavyAtoms())
train_prop.head()

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,ABC,ABCGG,nAcid,...,LogEE_Dzv_Category,ATS5i_Category,nBondsKS_Category,MPC3_Category,SRW02_Category,ATS2are_Category,VE1_Dzpe_Category,mol,num_of_atoms,num_of_heavy_atoms
0,3.259,400.495,5.0,2.0,8.0,3.259,117.37,21.379612,17.449011,0.0,...,7,5,7,5,6,6,7,<rdkit.Chem.rdchem.Mol object at 0x7a66c3bbbb50>,52,28
1,2.169,301.407,2.0,1.0,2.0,2.172,73.47,16.539255,14.049653,0.0,...,1,4,3,2,2,3,2,<rdkit.Chem.rdchem.Mol object at 0x7a66c3bbbbc0>,40,21
2,1.593,297.358,5.0,0.0,3.0,1.585,62.45,17.475469,13.660693,2.0,...,3,5,4,4,3,4,2,<rdkit.Chem.rdchem.Mol object at 0x7a66c3bbbc30>,41,22
3,4.771,494.652,6.0,0.0,5.0,3.475,92.6,27.857311,20.034364,0.0,...,9,9,9,9,9,9,9,<rdkit.Chem.rdchem.Mol object at 0x7a66c3bbbca0>,69,35
4,2.335,268.31,3.0,0.0,1.0,2.337,42.43,15.722758,12.817176,0.0,...,1,2,1,2,1,1,1,<rdkit.Chem.rdchem.Mol object at 0x7a66c3bbbd10>,36,20


In [48]:
# 탄소 패턴을 지정한다
c_patt = Chem.MolFromSmiles('C')
# 이 패턴이 들어있는 곳을 찾는다. 패턴의 수를 세면 탄소 원자가 몇개 들어있는지 알 수 있다
print(train_prop['mol'][0].GetSubstructMatches(c_patt))

((0,), (1,), (3,), (4,), (5,), (6,), (7,), (9,), (11,), (12,), (13,), (14,), (16,), (17,), (19,), (20,), (23,), (24,), (26,), (27,))


- num_of_{}_atoms 추가

In [49]:
# 임의의 패턴(원자)를 몇개 포함하고 있는지를 얻는 함수
def number_of_atoms(atom_list, df):
    for i in atom_list:
        df['num_of_{}_atoms'.format(i)] = train_prop['mol'].apply(lambda x: len(x.GetSubstructMatches(Chem.MolFromSmiles(i))))

number_of_atoms(['C', 'O', 'N', 'Cl'], train_prop) # 탄소, 산소, 질소, 염소
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,...,10.081676,78.761075,400.156912,7.695325,2380,40,142.0,165.0,9.5,6.361111
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,...,9.907828,69.149596,301.124883,7.528122,870,35,112.0,132.0,7.138889,4.527778
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,...,10.14451,70.158066,297.170194,7.248054,1028,36,120.0,145.0,5.277778,4.888889
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,...,10.613467,86.199585,494.246395,7.162991,4170,61,192.0,231.0,10.784722,7.5
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,...,9.978363,53.872357,268.121178,7.44781,762,32,106.0,125.0,6.277778,4.361111


- 테스트 데이터에도 추가

In [50]:
test_prop['mol'] = test['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
test_prop['mol'] = test_prop['mol'].apply(lambda x: Chem.AddHs(x))
test_prop['num_of_atoms'] = test_prop['mol'].apply(lambda x: x.GetNumAtoms())
test_prop['num_of_heavy_atoms'] = test_prop['mol'].apply(lambda x: x.GetNumHeavyAtoms())

number_of_atoms(['C','O', 'N', 'Cl'], test_prop)
test.head()

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,ABC,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,19.229782,...,9.935713,72.422574,361.193631,6.946031,1607,34,126.0,144.0,7.527778,5.638889
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,21.229782,...,10.330421,76.111616,370.152872,7.55414,1853,45,146.0,175.0,7.840278,5.972222
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86,20.155669,...,10.008163,60.614034,347.17461,7.386694,1905,37,132.0,150.0,8.388889,5.722222
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,21.079953,...,10.288444,77.464779,345.133808,8.417898,1673,42,146.0,177.0,6.0,5.583333
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,20.620135,...,10.380312,75.874629,353.185175,7.207861,1656,47,144.0,176.0,8.722222,5.638889


In [51]:
import networkx as nx
import matplotlib.pyplot as plt

In [52]:
# from rdkit import Chem
# import pandas as pd

# # SMILES 문자열 리스트 정의
# smiles_list = train['SMILES']

# num_atoms_list = []
# num_bonds_list = []

# for smiles in smiles_list:
#     # SMILES 문자열 파싱
#     mol = Chem.MolFromSmiles(smiles)

#     if mol is not None:
#         # 원자 및 결합 정보 추출
#         num_atoms = mol.GetNumAtoms()
#         num_bonds = mol.GetNumBonds()

#         # 리스트에 정보 추가
#         num_atoms_list.append(num_atoms)
#         num_bonds_list.append(num_bonds)
#     # else:
#     #     # SMILES 문자열을 파싱할 수 없는 경우
#     #     num_atoms_list.append(None)
#     #     num_bonds_list.append(None)

# # 데이터프레임에 새로운 열로 추가
# # train['num_atoms'] = num_atoms_list
# train['num_bonds'] = num_bonds_list

In [53]:
# from rdkit import Chem
# import pandas as pd

# # SMILES 문자열 리스트 정의
# smiles_list = test['SMILES']

# num_atoms_list = []
# num_bonds_list = []

# for smiles in smiles_list:
#     # SMILES 문자열 파싱
#     mol = Chem.MolFromSmiles(smiles)

#     if mol is not None:
#         # 원자 및 결합 정보 추출
#         num_atoms = mol.GetNumAtoms()
#         num_bonds = mol.GetNumBonds()

#         # 리스트에 정보 추가
#         num_atoms_list.append(num_atoms)
#         num_bonds_list.append(num_bonds)
#     # else:
#     #     # SMILES 문자열을 파싱할 수 없는 경우
#     #     num_atoms_list.append(None)
#     #     num_bonds_list.append(None)

# # 데이터프레임에 새로운 열로 추가
# # test['num_atoms'] = num_atoms_list
# test['num_bonds'] = num_bonds_list

# GNN : 31.265770026916055

In [54]:
# !pip install torch


In [55]:
# !pip install torch-geometric


In [56]:
# import torch
# from rdkit import Chem
# from rdkit.Chem import AllChem
# from rdkit.Chem import MolFromSmiles
# from torch_geometric.data import Data

# # SMILES 문자열을 입력으로 받아 RDKit Mol 객체로 변환
# def smiles_to_mol(smiles):
#     mol = MolFromSmiles(smiles)
#     mol = Chem.AddHs(mol)  # 수소 원자 추가
#     mol = Chem.MolToSmiles(mol)  # 정규화된 SMILES로 변환
#     mol = Chem.MolFromSmiles(mol)
#     return mol

# # RDKit Mol 객체를 PyTorch Geometric Data로 변환
# def mol_to_geometric_data(mol):
#     num_atoms = mol.GetNumAtoms()
#     edge_indices = []
#     edge_attr = []

#     for bond in mol.GetBonds():
#         start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
#         edge_indices.append((start, end))
#         edge_indices.append((end, start))  # 무방향 그래프이므로 역방향 엣지 추가
#         edge_attr.extend([bond.GetBondTypeAsDouble()] * 2)

#     x = torch.eye(num_atoms)  # 노드 피처는 항등 행렬로 초기화

#     data = Data(x=x, edge_index=torch.tensor(edge_indices).t().contiguous(), edge_attr=torch.tensor(edge_attr))
#     return data

# # SMILES 문자열을 입력으로 받아 그래프 데이터로 변환
# def smiles_to_graph(smiles):
#     mol = smiles_to_mol(smiles)
#     data = mol_to_geometric_data(mol)
#     return data



In [57]:

# # 예제 SMILES 문자열
# smiles_string = "CCO"  # 여기에 원하는 SMILES 문자열을 넣으세요

# # SMILES를 그래프로 변환
# graph_data = smiles_to_graph(smiles_string)

# # 그래프 데이터 확인
# print(graph_data)

# 그래프 데이터에서 필요한 정보를 추출하여 사용하세요

In [58]:
# graph_data.x.numpy().sum()

In [59]:
# graph_data.edge_index.numpy().sum()

In [60]:
graph_data.edge_attr.numpy().sum()

NameError: ignored

In [None]:

# # 예제 SMILES 문자열
# smiles_string = train['SMILES'][0]  # 여기에 원하는 SMILES 문자열을 넣으세요

# # SMILES를 그래프로 변환
# graph_data = smiles_to_graph(smiles_string)

# # 그래프 데이터 확인
# print(graph_data)

In [None]:
# graph_data.x.numpy().sum()

In [None]:
# graph_data.edge_index.numpy().sum()

In [None]:
# graph_data.edge_attr.numpy().sum()

In [None]:
# data.x : 노드 특징 행렬
# [num_nodes, num_node_features]
# data.edge_index : 그래프의 연결성
# [2, num_edges]
# data.edge_attr : 엣지 특징 행렬
# [num_edges, num_edge_features]

In [None]:
# # 예제 SMILES 문자열
# smiles_string = train['SMILES'][i]  # 여기에 원하는 SMILES 문자열을 넣으세요

# # SMILES를 그래프로 변환
# graph_data = smiles_to_graph(smiles_string)

# # 그래프 데이터 확인
# # print(graph_data)

# train['graph_x'][i] = graph_data.x
# train['graph_index'][i] = graph_data.edge_index
# train['graph_attr'][i] = graph_data.edge_attr




# # 그래프 데이터에서 필요한 정보를 추출하여 사용하세요

In [None]:
# df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])
# df['SMILES'] = train['SMILES']

In [None]:
# df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])

# # 예제 SMILES 문자열


# smiles_strings = train['SMILES']  # 여기에 원하는 SMILES 문자열을 넣으세요

# # SMILES를 그래프로 변환하고 데이터프레임에 추가
# for smiles_string in smiles_strings:
#     graph_data = smiles_to_graph(smiles_string)  # 앞서 정의한 함수를 사용하여 SMILES를 그래프로 변환

#     # 데이터프레임에 추가
#     df = df.append({'SMILES': smiles_string,
#                           'graph_x': graph_data.x.numpy().sum(),
#                           'graph_index': graph_data.edge_index.numpy().sum(),
#                           'graph_attr': graph_data.edge_attr.numpy().sum()}, ignore_index=True)

In [None]:
df

In [None]:
# df_train = df[['graph_x','graph_index','graph_attr']]
# df_train

# train_prop = pd.concat([train_prop,df_train],axis=1)

In [None]:
# df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])

# # 예제 SMILES 문자열


# smiles_strings = test['SMILES']  # 여기에 원하는 SMILES 문자열을 넣으세요

# # SMILES를 그래프로 변환하고 데이터프레임에 추가
# for smiles_string in smiles_strings:
#     graph_data = smiles_to_graph(smiles_string)  # 앞서 정의한 함수를 사용하여 SMILES를 그래프로 변환

#     # 데이터프레임에 추가
#     df = df.append({'SMILES': smiles_string,
#                           'graph_x': graph_data.x.numpy().sum(),
#                           'graph_index': graph_data.edge_index.numpy().sum(),
#                           'graph_attr': graph_data.edge_attr.numpy().sum()}, ignore_index=True)

In [None]:
# df_test = df[['graph_x','graph_index','graph_attr']]
# df_test

# test_prop = pd.concat([test_prop,df_test],axis=1)

In [None]:
# test_prop

In [None]:
train_prop.drop(columns='mol', inplace=True)
test_prop.drop(columns='mol', inplace=True)

In [None]:
train_prop = train_prop.astype(float)
test_prop = test_prop.astype(float)

# 모델학습, 검증, 제출

# BIO DL

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(y_valid, pred):
    mse = mean_squared_error(y_valid, pred)
    return np.sqrt(mse)

- smiles모델

In [None]:
# !pip install transformers

In [None]:
# model_name = "seyonec/PubChem10M_SMILES_BPE_450k"

In [None]:
# from transformers import AutoTokenizer
# from transformers import AutoModel

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
# model

In [None]:
# train_token = tokenizer(train_smiles.tolist(), padding=True)

In [None]:
# input_ids = np.array(train_token['input_ids'])
# attention_mask = np.array(train_token['attention_mask'])

# input_ids.shape, attention_mask.shape

In [None]:
# train_prop = pd.DataFrame(train_prop)

In [None]:
# test_prop = pd.DataFrame(test_prop)

In [None]:
# attention_mask

# 모델 학습 및 예측

In [None]:
train_prop.iloc[:.:-306]

In [None]:
train_prop =train_prop.iloc[:, : -307]
test_prop = test_prop.iloc[:, : -307]

In [None]:
train_prop

In [None]:
train_cat = train_prop.iloc[:, -306:]
test_cat = test_prop.iloc[:, -306:]

# STANDARD SCALER : 31.252652342708398 *****

In [None]:
from sklearn.preprocessing import StandardScaler

# 샘플 데이터 생성 (2차원 배열 형태로 가정)

# StandardScaler 객체 생성
scaler = StandardScaler()

# 데이터를 표준 스케일링 (평균이 0, 표준편차가 1로 변환)
train_prop = scaler.fit_transform(train_prop)
test_prop = scaler.transform(test_prop)

train_prop = pd.DataFrame(train_prop)
test_prop = pd.DataFrame(test_prop)

train_prop

# MINMAX SCALER 31.388637080253783

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# # 샘플 데이터 생성 (2차원 배열 형태로 가정)

# # StandardScaler 객체 생성
# scaler = MinMaxScaler()

# # 데이터를 표준 스케일링 (평균이 0, 표준편차가 1로 변환)
# train_prop = scaler.fit_transform(train_prop)
# test_prop = scaler.transform(test_prop)

# train_prop = pd.DataFrame(train_prop)
# test_prop = pd.DataFrame(test_prop)

# train_prop

- power transformer

In [None]:
# from sklearn.preprocessing import PowerTransformer


# # StandardScaler 객체 생성
# scaler = PowerTransformer()

# # 데이터를 표준 스케일링 (평균이 0, 표준편차가 1로 변환)
# train_prop = scaler.fit_transform(train_prop)
# test_prop = scaler.transform(test_prop)

# train_prop = pd.DataFrame(train_prop)
# test_prop = pd.DataFrame(test_prop)

# train_prop

# ROBUST SCALER : 31.44811691605782

In [None]:

# from sklearn.preprocessing import RobustScaler


# # StandardScaler 객체 생성
# scaler_robust = RobustScaler()

# # 데이터를 표준 스케일링 (평균이 0, 표준편차가 1로 변환)
# train_prop = scaler_robust.fit_transform(train_prop)
# test_prop = scaler_robust.transform(test_prop)

# train_prop = pd.DataFrame(train_prop)
# test_prop = pd.DataFrame(test_prop)

# train_prop

# MAXABS SCALER :31.377457615856507



In [None]:


# from sklearn.preprocessing import MaxAbsScaler


# # StandardScaler 객체 생성
# scaler_maxabs = MaxAbsScaler()

# # 데이터를 표준 스케일링 (평균이 0, 표준편차가 1로 변환)
# train_prop = scaler_maxabs.fit_transform(train_prop)
# test_prop = scaler_maxabs.transform(test_prop)

# train_prop = pd.DataFrame(train_prop)
# test_prop = pd.DataFrame(test_prop)

# train_prop

# QuantileTransformer : 31.388637080253783

In [None]:


# from sklearn.preprocessing import QuantileTransformer


# # StandardScaler 객체 생성
# scaler = QuantileTransformer()

# # 데이터를 표준 스케일링 (평균이 0, 표준편차가 1로 변환)
# train_prop = scaler_maxabs.fit_transform(train_prop)
# test_prop = scaler_maxabs.transform(test_prop)

# train_prop = pd.DataFrame(train_prop)
# test_prop = pd.DataFrame(test_prop)

# train_prop

In [None]:
# train_prop = train.iloc[:, 4:]
# test_prop = test.iloc[:, 2:]

# import pandas as pd

# # train_prop_object의 모든 열을 처리
# for column_name in train_prop.columns:
#     train_prop[column_name] = pd.to_numeric(train_prop[column_name], errors='coerce')
# # test_prop_object의 모든 열을 처리
# for column_name in train_prop.columns:
#     train_prop[column_name] = pd.to_numeric(train_prop[column_name], errors='coerce')

# import pandas as pd

# # train_prop_object의 모든 열을 처리
# for column_name in test_prop.columns:
#     test_prop[column_name] = pd.to_numeric(test_prop[column_name], errors='coerce')
# # test_prop_object의 모든 열을 처리
# for column_name in test_prop.columns:
#     test_prop[column_name] = pd.to_numeric(test_prop[column_name], errors='coerce')



# train_prop = train_prop.astype(float)
# test_prop = test_prop.astype(float)

# train_prop.fillna(0, inplace=True)
# test_prop.fillna(0, inplace=True)


In [None]:
from lightgbm import LGBMRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in tqdm(cv.split(train_prop, target_1)):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_1, eval_set=[(x_valid, y_valid_1)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    # HLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_2, eval_set=[(x_valid, y_valid_2)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)


    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

In [None]:
# model.fit(train_prop, target_1)
# lgbm_pred_1 = model.predict(test_prop)

# model.fit(train_prop, target_2)
# lgbm_pred_2 = model.predict(test_prop)

In [None]:
model.fit(train_prop, target_1)
lgbm_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
lgbm_pred_2 = model.predict(test_prop)

In [None]:
submission

In [None]:
ensemble = pd.read_csv(f"{DATA_PATH}ensemble_test_2.csv")
cat_quant = pd.read_csv(f"{DATA_PATH}submission_ML23(cat_quantile) (2).csv")

In [None]:
mlm_pred = (lgbm_pred_1)*0.2 + ensemble['MLM']*0.5 + cat_quant['MLM']*0.3
hlm_pred = (lgbm_pred_2)*0.2 + ensemble['HLM']* 0.5 + cat_quant['HLM']*0.3

In [None]:
submission['MLM'] = mlm_pred
submission['HLM'] = hlm_pred

In [None]:
submission.to_csv(f'{DATA_PATH}submission_cat_lgbm_0923_.csv', index=False)