In [40]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)
  Downloading matplotlib-3.9.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading contourpy-1.3.0-cp311-cp311-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading fonttools-4.53.1-cp311-cp311-win_amd64.whl.metadata (165 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading kiwisolver-1.4.7-cp311-cp311-win_amd64.whl.metadata (6.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Downloading matplotlib-3.9.2-cp311-cp311-win_amd64.whl (7.8 MB)
   ---------------------------------------- 0.0/7.8 MB ? eta -:--:--
   ---------------------------------------- 7.8/7.

### 1. csv 파일 불러오기

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from deepchem.feat.molecule_featurizers import RDKitDescriptors
from rdkit import Chem
from rdkit.Chem import Descriptors
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [4]:
warnings.filterwarnings('ignore')           # 출력 창 경고 메시지 무시
plt.rcParams['axes.unicode_minus'] = False  # 음수부호 깨짐 관련
plt.rcParams['font.family'] = 'Malgun Gothic' # window 전용 한글 폰트 설정

### 2. RDKit Descriptors로 feature 탐색

In [5]:
# RDKit Descriptors의 이름을 가져오기
descriptor_names = [desc[0] for desc in Descriptors._descList]
print("RDKit Descriptors:", descriptor_names)

RDKit Descriptors: ['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1'

In [6]:
def rdkit_descriptors(df):
    descriptor_names = [desc[0] for desc in Descriptors._descList]

    features = []
    for smiles in tqdm(df.Smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            feature = [desc[1](mol) for desc in Descriptors._descList]
            features.append(feature)
        else:
            features.append([np.nan] * len(descriptor_names))  # SMILES 변환 실패 시 NaN 추가
    
    features_df = pd.DataFrame(features, columns=descriptor_names)
    return features_df

In [7]:
train_features = rdkit_descriptors(train)
test_features = rdkit_descriptors(test)

100%|██████████████████████████████████████████████████████████████████████████████| 1952/1952 [00:22<00:00, 88.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 113/113 [00:01<00:00, 79.98it/s]


In [8]:
train_features

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,14.635817,14.635817,0.021978,-0.828882,0.059187,20.597222,995.188,924.628,994.516405,388,...,0,0,0,0,0,0,0,0,0,0
1,14.673248,14.673248,0.205025,-1.585128,0.479279,19.333333,535.624,501.352,535.270716,206,...,0,0,0,0,0,0,0,0,0,0
2,14.663674,14.663674,0.195978,-1.596212,0.466884,19.333333,537.596,505.340,537.249981,206,...,0,0,0,0,0,0,0,0,0,0
3,14.610375,14.610375,0.119300,-2.432700,0.448013,17.256410,545.566,515.326,545.236222,208,...,0,0,0,0,0,0,0,0,0,0
4,14.011531,14.011531,0.000644,-0.859426,0.045219,17.373134,936.189,870.669,935.461533,358,...,0,0,0,0,0,1,0,0,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1947,12.044186,12.044186,0.147374,-0.555627,0.563801,10.476190,283.247,274.175,283.070539,104,...,0,0,0,0,0,0,0,0,0,0
1948,12.446045,12.446045,0.207820,-0.207820,0.735215,10.869565,327.815,309.671,327.113840,118,...,0,0,0,0,0,0,0,0,1,0
1949,13.131519,13.131519,0.026383,-1.055036,0.572289,10.695652,318.239,310.175,318.056447,116,...,0,0,0,0,0,0,0,0,0,0
1950,10.504955,10.504955,0.188770,-1.017718,0.304395,22.843750,449.536,426.352,449.152161,164,...,0,0,0,0,0,1,0,0,0,0


In [9]:
test_features

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,13.121525,13.121525,0.208147,-0.236941,0.481693,17.558824,477.634,446.386,477.219846,178,...,0,0,0,0,0,1,0,0,0,0
1,9.236981,9.236981,0.420655,0.420655,0.437543,17.457143,466.593,436.353,466.259343,178,...,0,0,0,0,0,0,0,0,0,0
2,9.421735,9.421735,0.332235,0.332235,0.484605,20.483871,415.505,390.305,415.223292,158,...,0,0,0,0,0,0,0,0,0,0
3,11.749881,11.749881,0.097965,-0.150181,0.398314,19.648649,498.591,468.351,498.249172,190,...,0,0,0,0,0,0,0,0,0,0
4,9.007353,9.007353,0.222502,0.222502,0.526311,20.969697,444.543,416.319,444.238608,170,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,9.218043,9.218043,0.422749,0.422749,0.506391,17.205882,454.538,428.330,454.222957,172,...,0,0,0,0,0,0,0,0,0,0
109,13.397071,13.397071,0.009681,-0.228398,0.361473,17.150000,561.756,522.444,561.288594,212,...,0,0,0,0,0,1,0,0,0,0
110,9.435997,9.435997,0.332104,0.332104,0.472711,18.906250,429.532,402.316,429.238942,164,...,0,0,0,0,0,0,0,0,0,0
111,11.520042,11.520042,0.136868,0.136868,0.515486,14.645161,411.469,390.301,411.180758,154,...,0,0,0,0,0,0,0,0,0,0


### 3. RDKit Descriptors로 feature 정리 후 prototype csv파일 저장

In [10]:
train_column_means = train_features.mean()
test_column_means = test_features.mean()

In [11]:
train_column_means

MaxAbsEStateIndex    12.677679
MaxEStateIndex       12.677679
MinAbsEStateIndex     0.138557
MinEStateIndex       -1.022795
qed                   0.508707
                       ...    
fr_thiazole           0.224385
fr_thiocyan           0.000000
fr_thiophene          0.053279
fr_unbrch_alkane      0.022541
fr_urea               0.005635
Length: 210, dtype: float64

In [12]:
test_column_means

MaxAbsEStateIndex    10.261540
MaxEStateIndex       10.261540
MinAbsEStateIndex     0.279113
MinEStateIndex        0.132020
qed                   0.435577
                       ...    
fr_thiazole           0.132743
fr_thiocyan           0.000000
fr_thiophene          0.000000
fr_unbrch_alkane      0.000000
fr_urea               0.000000
Length: 210, dtype: float64

In [13]:
train_df = train[['Smiles', 'pIC50']]
test_df = test[['Smiles']]

In [14]:
train_non_zero_mean_columns = train_column_means[train_column_means != 0].index
test_non_zero_mean_columns = test_column_means[test_column_means != 0].index

In [15]:
train_features_df = train_features[train_non_zero_mean_columns]
test_features_df = test_features[test_non_zero_mean_columns]

In [16]:
train_df_proto = pd.concat([train_df, train_features_df], axis = 1)
test_df_proto = pd.concat([test_df, test_features_df], axis = 1)

In [17]:
train_df_proto

Unnamed: 0,Smiles,pIC50,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,fr_priamide,fr_pyridine,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...,10.66,14.635817,14.635817,0.021978,-0.828882,0.059187,20.597222,995.188,924.628,...,1,1,0,0,0,0,0,0,0,0
1,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.59,14.673248,14.673248,0.205025,-1.585128,0.479279,19.333333,535.624,501.352,...,0,0,0,0,0,0,0,0,0,0
2,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.11,14.663674,14.663674,0.195978,-1.596212,0.466884,19.333333,537.596,505.340,...,0,0,0,0,0,0,0,0,0,0
3,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.09,14.610375,14.610375,0.119300,-2.432700,0.448013,17.256410,545.566,515.326,...,0,0,0,0,0,0,0,0,0,0
4,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...,10.00,14.011531,14.011531,0.000644,-0.859426,0.045219,17.373134,936.189,870.669,...,1,1,0,0,0,0,1,0,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1947,O=C(Nc1nc2cc[nH]cc-2n1)c1cccc([N+](=O)[O-])c1,4.52,12.044186,12.044186,0.147374,-0.555627,0.563801,10.476190,283.247,274.175,...,0,1,0,0,0,0,0,0,0,0
1948,CCCCn1c(NC(=O)c2cccc(Cl)c2)nc2ccccc21,4.52,12.446045,12.446045,0.207820,-0.207820,0.735215,10.869565,327.815,309.671,...,0,0,0,0,0,0,0,0,1,0
1949,O=C(Nc1nc2cc(F)c(F)cc2[nH]1)c1cccc([N+](=O)[O-...,4.52,13.131519,13.131519,0.026383,-1.055036,0.572289,10.695652,318.239,310.175,...,0,0,0,0,0,0,0,0,0,0
1950,OC[C@H]1C[C@@H](Nc2nc(Nc3ccccc3)ncc2-c2nc3cccc...,4.38,10.504955,10.504955,0.188770,-1.017718,0.304395,22.843750,449.536,426.352,...,0,0,0,0,0,0,1,0,0,0


In [18]:
test_df_proto

Unnamed: 0,Smiles,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_morpholine,fr_nitrile,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_piperdine,fr_piperzine,fr_priamide,fr_pyridine,fr_thiazole
0,O=C(C1=CSC(C2=CC=CN=C2)=N1)NC3=CC(NC4CCN(C)CC4...,13.121525,13.121525,0.208147,-0.236941,0.481693,17.558824,477.634,446.386,477.219846,...,0,0,0,0,0,1,0,0,1,1
1,N#CC1=CC(C=C2)=C(C=C1)N2C(N=C3)=NC(NC4CCCCC4)=...,9.236981,9.236981,0.420655,0.420655,0.437543,17.457143,466.593,436.353,466.259343,...,0,1,0,0,0,1,0,0,0,0
2,N#CC(C=C1)=C(N[C@@H]2CCNC2)C=C1NC(N=C3)=NC=C3C...,9.421735,9.421735,0.332235,0.332235,0.484605,20.483871,415.505,390.305,415.223292,...,0,1,0,0,0,0,0,0,0,0
3,N#CC(C=C1)=CC=C1NC(N=C2)=NC(NC3CC(NC(C=C)=O)CC...,11.749881,11.749881,0.097965,-0.150181,0.398314,19.648649,498.591,468.351,498.249172,...,0,1,0,0,0,0,0,0,0,0
4,N#CC(C=C1)=CC=C1NC(N=C2)=NC(NC3CC(N)CC3)=C2C(C...,9.007353,9.007353,0.222502,0.222502,0.526311,20.969697,444.543,416.319,444.238608,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,N#CC1=CC(C=C2)=C(C=C1)N2C(N=C3)=NC(N4CCOCC4)=C...,9.218043,9.218043,0.422749,0.422749,0.506391,17.205882,454.538,428.330,454.222957,...,1,1,0,0,0,1,0,0,0,0
109,O=C(C1=CSC(C2=CC=NC=C2)=N1)NC3=CC(NC4CCN(C(C)C...,13.397071,13.397071,0.009681,-0.228398,0.361473,17.150000,561.756,522.444,561.288594,...,0,0,0,0,0,2,0,0,1,1
110,N#Cc1ccc(Nc2ncc(cn2)c3cnn(c3)C4CCNCC4)cc1N[C@@...,9.435997,9.435997,0.332104,0.332104,0.472711,18.906250,429.532,402.316,429.238942,...,0,1,0,0,0,1,0,0,0,0
111,O=C(C)N(CC1)CCC1N2N=CC(C3=CN=C(N4C(C=CC(C#N)=C...,11.520042,11.520042,0.136868,0.136868,0.515486,14.645161,411.469,390.301,411.180758,...,0,1,0,0,0,1,0,0,0,0


In [19]:
train_df_proto.to_csv('rdkit_train.csv', index=False)
test_df_proto.to_csv('rdkit_test.csv', index=False)