In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm.auto import tqdm
import random
import os


import matplotlib.pyplot as plt
import seaborn as sns


DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'
SEED = 42


def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [3]:
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

In [4]:
train.shape, test.shape, submission.shape

((3498, 11), (483, 9), (483, 3))

### -> 이상치 제거 (총 18개)

In [5]:
# 1. 극단치만 판단해서 18개 제거
outliers = train[(train['MLM'] > 100.0) | (train['HLM'] > 100.0) | (train['AlogP'] < -3) | (train['Molecular_Weight'] > 800)
     | (train['Num_H_Acceptors'] > 14) | (train['Num_H_Donors'] > 9) | (train['Num_RotatableBonds'] > 20)
     | (train['LogD'] < -4) | (train['Molecular_PolarSurfaceArea'] > 250)]


In [6]:
outliers.index

Int64Index([ 179,  662,  834,  983, 1092, 1172, 1239, 1584, 2159, 2258, 2367,
            2410, 2586, 2711, 2948, 3157, 3247, 3403],
           dtype='int64')

In [7]:
len(outliers.index)

18

In [8]:
train = train.drop(index=outliers.index, axis=0)


In [9]:
train.shape # 3498 -> 3480

(3480, 11)

In [10]:
duplicates = train[train.duplicated(subset=['SMILES'], keep=False)]

duplicates.sort_values(by='SMILES')


Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
2276,TRAIN_2276,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,1.535,31.453,3.556,262.309,3,0,4,3.556,43.6
451,TRAIN_0451,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.31,24.67,3.556,262.309,3,0,4,3.556,43.6
2891,TRAIN_2891,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,55.95,69.95,2.172,337.372,4,2,3,2.169,82.0
543,TRAIN_0543,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,68.485,85.872,2.172,337.372,4,2,3,2.169,82.0
837,TRAIN_0837,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,63.522,62.488,2.293,367.428,5,2,3,2.307,139.85
366,TRAIN_0366,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,73.74,66.85,2.293,367.428,5,2,3,2.307,139.85
1085,TRAIN_1085,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,23.63,71.47,1.684,381.45,7,1,4,1.684,126.52
2848,TRAIN_2848,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,40.657,99.9,1.684,381.45,7,1,4,1.684,126.52
2096,TRAIN_2096,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,1.88,61.28,2.843,360.49,3,0,4,2.843,49.85
1666,TRAIN_1666,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,5.494,25.397,2.843,360.49,3,0,4,2.843,49.85


In [11]:
duplicates.shape # 이상치 제거로 27 -> 26쌍

(52, 11)

In [12]:
processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()
processed_duplicate

  processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()


Unnamed: 0,SMILES,MLM,HLM
0,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.9225,28.0615
1,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,62.2175,77.911
2,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,68.631,64.669
3,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,32.1435,85.685
4,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,3.687,43.3385
5,CC(C)NC(=O)c1c(Cl)nn(C)c1NC(=O)c1cc(Br)nn1-c1n...,62.1085,68.1015
6,CC1CC(=O)N(c2ccc(-c3cccc(C#N)c3)cc2)N=C1c1ccc(...,43.17,31.13
7,CCCCC/N=c1\n(C)c(=O)nc2sccn12,2.3395,36.8145
8,CCOC(=O)CC1(NC(=O)N2Cc3c(sc4c3CCCC4)-n3cccc3C2...,3.442,3.6015
9,CCc1nc2cc(Br)c(C(=O)OC)nc2n1CC(=O)c1ccccc1,73.545,1.1345


In [13]:
tmp = pd.merge(processed_duplicate,duplicates,how='left',on='SMILES')
tmp

Unnamed: 0,SMILES,MLM_x,HLM_x,id,MLM_y,HLM_y,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.9225,28.0615,TRAIN_0451,0.31,24.67,3.556,262.309,3,0,4,3.556,43.6
1,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.9225,28.0615,TRAIN_2276,1.535,31.453,3.556,262.309,3,0,4,3.556,43.6
2,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,62.2175,77.911,TRAIN_0543,68.485,85.872,2.172,337.372,4,2,3,2.169,82.0
3,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,62.2175,77.911,TRAIN_2891,55.95,69.95,2.172,337.372,4,2,3,2.169,82.0
4,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,68.631,64.669,TRAIN_0366,73.74,66.85,2.293,367.428,5,2,3,2.307,139.85
5,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,68.631,64.669,TRAIN_0837,63.522,62.488,2.293,367.428,5,2,3,2.307,139.85
6,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,32.1435,85.685,TRAIN_1085,23.63,71.47,1.684,381.45,7,1,4,1.684,126.52
7,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,32.1435,85.685,TRAIN_2848,40.657,99.9,1.684,381.45,7,1,4,1.684,126.52
8,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,3.687,43.3385,TRAIN_1666,5.494,25.397,2.843,360.49,3,0,4,2.843,49.85
9,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,3.687,43.3385,TRAIN_2096,1.88,61.28,2.843,360.49,3,0,4,2.843,49.85


### -> 중복치 제거 (총 26개)

In [14]:
tmp = tmp.iloc[0::2]  # 첫 번째 행부터 시작하여 2씩 증가하여 행 선택
tmp.drop(['MLM_y','HLM_y'],axis =1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp.drop(['MLM_y','HLM_y'],axis =1,inplace=True)


In [15]:
tmp.columns = ['SMILES', 'MLM', 'HLM','id',  'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea']

In [16]:
processed_duplicate = tmp[['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea']]

In [17]:
train = train.drop_duplicates(subset=['SMILES'], keep=False) #기존 train에서 중복치들을 우선 모두 제거
train.shape

(3428, 11)

In [18]:
processed_duplicate.shape

(26, 11)

In [19]:
train = pd.concat([train, processed_duplicate], axis=0, ignore_index=True)
train.shape

(3454, 11)

# 피처 추가

In [20]:
!pip install Chem

Collecting Chem
  Downloading chem-1.2.0-py3-none-any.whl (24 kB)
Installing collected packages: Chem
Successfully installed Chem-1.2.0


In [21]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [22]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole     # 화면에 출력하기 위한 옵션
IPythonConsole.ipython_useSVG=True

In [23]:
train['mol'] = train['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
train['mol'] = train['mol'].apply(lambda x: Chem.AddHs(x))
train['num_of_atoms'] = train['mol'].apply(lambda x: x.GetNumAtoms())
train['num_of_heavy_atoms'] = train['mol'].apply(lambda x: x.GetNumHeavyAtoms())
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0cf0>,52,28
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0d60>,40,21
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0dd0>,41,22
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0e40>,69,35
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0eb0>,36,20


In [24]:
# 탄소 패턴을 지정한다
c_patt = Chem.MolFromSmiles('C')
# 이 패턴이 들어있는 곳을 찾는다. 패턴의 수를 세면 탄소 원자가 몇개 들어있는지 알 수 있다
print(train['mol'][0].GetSubstructMatches(c_patt))

((0,), (1,), (3,), (4,), (5,), (6,), (7,), (9,), (11,), (12,), (13,), (14,), (16,), (17,), (19,), (20,), (23,), (24,), (26,), (27,))


- num_of_{}_atoms 추가

In [25]:
# 임의의 패턴(원자)를 몇개 포함하고 있는지를 얻는 함수
def number_of_atoms(atom_list, df):
    for i in atom_list:
        df['num_of_{}_atoms'.format(i)] = df['mol'].apply(lambda x: len(x.GetSubstructMatches(Chem.MolFromSmiles(i))))

number_of_atoms(['C', 'O', 'N', 'Cl'], train) # 탄소, 산소, 질소, 염소
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0cf0>,52,28,20,3,4,0
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0d60>,40,21,16,1,3,0
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0dd0>,41,22,15,0,7,0
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0e40>,69,35,26,2,6,0
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43,<rdkit.Chem.rdchem.Mol object at 0x7eea6c2a0eb0>,36,20,16,2,2,0


- 테스트 데이터에도 추가

In [26]:
test['mol'] = test['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
test['mol'] = test['mol'].apply(lambda x: Chem.AddHs(x))
test['num_of_atoms'] = test['mol'].apply(lambda x: x.GetNumAtoms())
test['num_of_heavy_atoms'] = test['mol'].apply(lambda x: x.GetNumHeavyAtoms())

number_of_atoms(['C','O', 'N', 'Cl'], test)
test.head()

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,<rdkit.Chem.rdchem.Mol object at 0x7eea6c302880>,52,25,18,1,5,0
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,<rdkit.Chem.rdchem.Mol object at 0x7eea6c3028f0>,49,27,20,5,2,0
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86,<rdkit.Chem.rdchem.Mol object at 0x7eea6c302960>,47,26,20,1,5,0
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,<rdkit.Chem.rdchem.Mol object at 0x7eea6c3029d0>,41,26,18,1,7,0
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,<rdkit.Chem.rdchem.Mol object at 0x7eea6c302a40>,49,26,19,2,5,0


In [27]:
import networkx as nx
import matplotlib.pyplot as plt

In [28]:
# from rdkit import Chem
# import pandas as pd

# # SMILES 문자열 리스트 정의
# smiles_list = train['SMILES']

# num_atoms_list = []
# num_bonds_list = []

# for smiles in smiles_list:
#     # SMILES 문자열 파싱
#     mol = Chem.MolFromSmiles(smiles)

#     if mol is not None:
#         # 원자 및 결합 정보 추출
#         num_atoms = mol.GetNumAtoms()
#         num_bonds = mol.GetNumBonds()

#         # 리스트에 정보 추가
#         num_atoms_list.append(num_atoms)
#         num_bonds_list.append(num_bonds)
#     # else:
#     #     # SMILES 문자열을 파싱할 수 없는 경우
#     #     num_atoms_list.append(None)
#     #     num_bonds_list.append(None)

# # 데이터프레임에 새로운 열로 추가
# # train['num_atoms'] = num_atoms_list
# train['num_bonds'] = num_bonds_list

In [29]:
# from rdkit import Chem
# import pandas as pd

# # SMILES 문자열 리스트 정의
# smiles_list = test['SMILES']

# num_atoms_list = []
# num_bonds_list = []

# for smiles in smiles_list:
#     # SMILES 문자열 파싱
#     mol = Chem.MolFromSmiles(smiles)

#     if mol is not None:
#         # 원자 및 결합 정보 추출
#         num_atoms = mol.GetNumAtoms()
#         num_bonds = mol.GetNumBonds()

#         # 리스트에 정보 추가
#         num_atoms_list.append(num_atoms)
#         num_bonds_list.append(num_bonds)
#     # else:
#     #     # SMILES 문자열을 파싱할 수 없는 경우
#     #     num_atoms_list.append(None)
#     #     num_bonds_list.append(None)

# # 데이터프레임에 새로운 열로 추가
# # test['num_atoms'] = num_atoms_list
# test['num_bonds'] = num_bonds_list

# GNN

In [30]:
!pip install torch




In [31]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910454 sha256=e933d7bbeb8d964259a0690eb51561532c21870148898ab84dfdec7dcc3929e0
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch-geometric
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.3.1


In [32]:
import torch
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MolFromSmiles
from torch_geometric.data import Data

# SMILES 문자열을 입력으로 받아 RDKit Mol 객체로 변환
def smiles_to_mol(smiles):
    mol = MolFromSmiles(smiles)
    mol = Chem.AddHs(mol)  # 수소 원자 추가
    mol = Chem.MolToSmiles(mol)  # 정규화된 SMILES로 변환
    mol = Chem.MolFromSmiles(mol)
    return mol

# RDKit Mol 객체를 PyTorch Geometric Data로 변환
def mol_to_geometric_data(mol):
    num_atoms = mol.GetNumAtoms()
    edge_indices = []
    edge_attr = []

    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_indices.append((start, end))
        edge_indices.append((end, start))  # 무방향 그래프이므로 역방향 엣지 추가
        edge_attr.extend([bond.GetBondTypeAsDouble()] * 2)

    x = torch.eye(num_atoms)  # 노드 피처는 항등 행렬로 초기화

    data = Data(x=x, edge_index=torch.tensor(edge_indices).t().contiguous(), edge_attr=torch.tensor(edge_attr))
    return data

# SMILES 문자열을 입력으로 받아 그래프 데이터로 변환
def smiles_to_graph(smiles):
    mol = smiles_to_mol(smiles)
    data = mol_to_geometric_data(mol)
    return data



In [33]:

# 예제 SMILES 문자열
smiles_string = "CCO"  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환
graph_data = smiles_to_graph(smiles_string)

# 그래프 데이터 확인
print(graph_data)

# 그래프 데이터에서 필요한 정보를 추출하여 사용하세요

Data(x=[3, 3], edge_index=[2, 4], edge_attr=[4])


In [34]:
graph_data.x.numpy().sum()

3.0

In [35]:
graph_data.edge_index.numpy().sum()

8

In [36]:
graph_data.edge_attr.numpy().sum()

4.0

In [37]:

# 예제 SMILES 문자열
smiles_string = train['SMILES'][0]  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환
graph_data = smiles_to_graph(smiles_string)

# 그래프 데이터 확인
print(graph_data)

Data(x=[28, 28], edge_index=[2, 60], edge_attr=[60])


In [38]:
graph_data.x.numpy().sum()

28.0

In [39]:
graph_data.edge_index.numpy().sum()

1550

In [40]:
graph_data.edge_attr.numpy().sum()

78.0

In [41]:
# data.x : 노드 특징 행렬
# [num_nodes, num_node_features]
# data.edge_index : 그래프의 연결성
# [2, num_edges]
# data.edge_attr : 엣지 특징 행렬
# [num_edges, num_edge_features]

In [42]:
# # 예제 SMILES 문자열
# smiles_string = train['SMILES'][i]  # 여기에 원하는 SMILES 문자열을 넣으세요

# # SMILES를 그래프로 변환
# graph_data = smiles_to_graph(smiles_string)

# # 그래프 데이터 확인
# # print(graph_data)

# train['graph_x'][i] = graph_data.x
# train['graph_index'][i] = graph_data.edge_index
# train['graph_attr'][i] = graph_data.edge_attr




# # 그래프 데이터에서 필요한 정보를 추출하여 사용하세요

In [43]:
df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])
df['SMILES'] = train['SMILES']

In [44]:
df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])

# 예제 SMILES 문자열


smiles_strings = train['SMILES']  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환하고 데이터프레임에 추가
for smiles_string in smiles_strings:
    graph_data = smiles_to_graph(smiles_string)  # 앞서 정의한 함수를 사용하여 SMILES를 그래프로 변환

    # 데이터프레임에 추가
    df = df.append({'SMILES': smiles_string,
                          'graph_x': graph_data.x.numpy().sum(),
                          'graph_index': graph_data.edge_index.numpy().sum(),
                          'graph_attr': graph_data.edge_attr.numpy().sum()}, ignore_index=True)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df

In [45]:
df

Unnamed: 0,SMILES,graph_x,graph_index,graph_attr
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,28.0,1550,78.0
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,21.0,898,59.0
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,22.0,1026,65.0
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,35.0,2586,99.0
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,20.0,820,58.0
...,...,...,...,...
3449,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,26.0,1516,86.0
3450,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,38.0,3028,106.0
3451,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,33.0,2284,107.0
3452,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,28.0,1636,76.0


In [46]:
df_train = df[['graph_x','graph_index','graph_attr']]
df_train

train = pd.concat([train,df_train],axis=1)

In [47]:
df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])

# 예제 SMILES 문자열


smiles_strings = test['SMILES']  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환하고 데이터프레임에 추가
for smiles_string in smiles_strings:
    graph_data = smiles_to_graph(smiles_string)  # 앞서 정의한 함수를 사용하여 SMILES를 그래프로 변환

    # 데이터프레임에 추가
    df = df.append({'SMILES': smiles_string,
                          'graph_x': graph_data.x.numpy().sum(),
                          'graph_index': graph_data.edge_index.numpy().sum(),
                          'graph_attr': graph_data.edge_attr.numpy().sum()}, ignore_index=True)

  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.a

In [48]:
df_test = df[['graph_x','graph_index','graph_attr']]
df_test

test = pd.concat([test,df_test],axis=1)

In [49]:
train['graph_index'] = train['graph_index'].astype(float)
test['graph_index'] = test['graph_index'].astype(float)

# 모델학습, 검증, 제출

In [50]:
cols = ['mol']

train = train.drop(columns = cols,axis=1)
test = test.drop(columns = cols,axis=1)

In [51]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [52]:
from sklearn.metrics import mean_squared_error

def rmse(y_valid, pred):
    mse = mean_squared_error(y_valid, pred)
    return np.sqrt(mse)

In [53]:
from sklearn.metrics import make_scorer

rmse_score = make_scorer(rmse, greater_is_better=False)

In [54]:
# 결측치 채우기
train["AlogP"] = np.where(pd.isna(train["AlogP"]), train["LogD"], train["AlogP"])
test["AlogP"] = np.where(pd.isna(test["AlogP"]), test["LogD"], test["AlogP"])

In [55]:
train_prop = train.iloc[:, 4:]
test_prop = test.iloc[:, 2:]

target_1 = train['MLM']
target_2 = train['HLM']

In [56]:
train_prop

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,num_bonds,graph_x,graph_index,graph_attr
0,3.259,400.495,5,2,8,3.259,117.37,52,28,20,3,4,0,30,28.0,1550.0,78.0
1,2.169,301.407,2,1,2,2.172,73.47,40,21,16,1,3,0,23,21.0,898.0,59.0
2,1.593,297.358,5,0,3,1.585,62.45,41,22,15,0,7,0,25,22.0,1026.0,65.0
3,4.771,494.652,6,0,5,3.475,92.60,69,35,26,2,6,0,39,35.0,2586.0,99.0
4,2.335,268.310,3,0,1,2.337,42.43,36,20,16,2,2,0,22,20.0,820.0,58.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,4.609,340.381,4,1,2,4.736,71.75,42,26,20,0,6,0,30,26.0,1516.0,86.0
3450,4.282,553.480,5,0,7,4.009,65.98,68,38,29,3,4,2,42,38.0,3028.0,106.0
3451,4.304,459.520,6,1,4,4.304,113.53,54,33,24,3,5,0,37,33.0,2284.0,107.0
3452,-1.133,385.417,7,0,4,-1.133,108.65,51,28,19,4,5,0,31,28.0,1636.0,76.0


In [57]:
test_prop

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,num_bonds,graph_x,graph_index,graph_attr
0,2.641,361.505,4,2,7,2.635,92.76,52,25,18,1,5,0,27,25.0,1270.0,65.0
1,0.585,370.399,5,0,3,0.585,68.31,49,27,20,5,2,0,30,27.0,1488.0,76.0
2,4.276,347.414,4,4,5,4.290,92.86,47,26,20,1,5,0,28,26.0,1372.0,78.0
3,1.795,345.358,5,0,2,1.795,81.21,41,26,18,1,7,0,30,26.0,1464.0,83.0
4,1.219,353.418,4,0,2,0.169,61.15,49,26,19,2,5,0,29,26.0,1444.0,77.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,4.207,306.443,2,1,7,4.207,55.13,52,22,18,2,2,0,23,22.0,946.0,53.0
479,-0.608,335.398,5,0,1,-1.736,70.16,49,24,17,4,3,0,26,24.0,1178.0,60.0
480,1.792,349.383,3,1,3,1.792,69.72,45,26,20,3,3,0,29,26.0,1452.0,76.0
481,0.790,341.132,3,2,2,0.423,69.64,30,20,13,3,2,0,21,20.0,776.0,58.0


#### randomforest

In [58]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = RandomForestRegressor(random_state=SEED)
    model.fit(x_train, y_train_1)
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    #HLM
    model = RandomForestRegressor(random_state=SEED)
    model.fit(x_train, y_train_2)
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

[31.484707540217013, 34.342484795703086, 32.47949588088221, 31.75112629637141, 32.18897879789753]


32.44935866221425

In [59]:
model.fit(train_prop, target_1)
rf_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
rf_pred_2 = model.predict(test_prop)

#### XGBRegressor

## SVM

In [60]:
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error
# import numpy as np
# from sklearn.model_selection import KFold

# # KFold 교차 검증을 설정합니다.
# cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

# scores = []
# for tri, val in cv.split(train_prop, target_1):
#     # 학습데이터
#     x_train = train_prop.iloc[tri]
#     y_train_1 = target_1.iloc[tri]
#     y_train_2 = target_2.iloc[tri]

#     # 검증데이터
#     x_valid = train_prop.iloc[val]
#     y_valid_1 = target_1.iloc[val]
#     y_valid_2 = target_2.iloc[val]

#     # MLM (Multi-Level Model)
#     model = SVR(kernel='linear')  # 선형 SVM 사용 (커널 선택 가능)
#     model.fit(x_train, y_train_1)
#     pred = model.predict(x_valid)
#     score_1 = np.sqrt(mean_squared_error(y_valid_1, pred))

#     # HLM (Hierarchical Linear Model)
#     model = SVR(kernel='linear')  # 선형 SVM 사용 (커널 선택 가능)
#     model.fit(x_train, y_train_2)
#     pred = model.predict(x_valid)
#     score_2 = np.sqrt(mean_squared_error(y_valid_2, pred))

#     score = 0.5 * score_1 + 0.5 * score_2
#     scores.append(score)

# print(scores)
# print(np.mean(scores))


[32.65045959291161, 34.553675974636576, 33.31609615991323, 32.58166946937624, 33.28650202217723]
33.27768064380298


In [61]:
scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = XGBRegressor(random_state=SEED)
    model.fit(x_train, y_train_1, eval_set=[(x_valid, y_valid_1)], eval_metric='rmse')
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    # HLM
    model = XGBRegressor(random_state=SEED)
    model.fit(x_train, y_train_2, eval_set=[(x_valid, y_valid_2)], eval_metric='rmse')
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

In [62]:
model.fit(train_prop, target_1)
xgb_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
xgb_pred_2 = model.predict(test_prop)

#### LGBMRegressor

In [63]:
scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_1, eval_set=[(x_valid, y_valid_1)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    # HLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_2, eval_set=[(x_valid, y_valid_2)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)


    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1643
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 17
[LightGBM] [Info] Start training from score 37.549743
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1643
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 17
[LightGBM] [Info] Start training from score 53.023585
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 17
[LightGBM] [Info] Start training from score 37.414670
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1634
[L

32.89339126366317

In [64]:
model.fit(train_prop, target_1)
lgbm_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
lgbm_pred_2 = model.predict(test_prop)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 3454, number of used features: 17
[LightGBM] [Info] Start training from score 37.276169
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 3454, number of used features: 17
[LightGBM] [Info] Start training from score 53.008790


In [65]:
mlm_pred = (rf_pred_1 + xgb_pred_1 + lgbm_pred_1) / 3
hlm_pred = (rf_pred_2 + xgb_pred_2 + lgbm_pred_2) / 3

In [66]:
submission['MLM'] = mlm_pred
submission['HLM'] = hlm_pred

In [67]:
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,24.281795,49.546547
1,TEST_001,55.670106,68.437368
2,TEST_002,25.825089,53.024026
3,TEST_003,54.686489,71.077074
4,TEST_004,60.646106,80.137465
...,...,...,...
478,TEST_478,1.432164,20.753056
479,TEST_479,80.800460,91.850237
480,TEST_480,46.751381,67.357644
481,TEST_481,62.149209,73.390913


In [70]:
submission.to_csv("submission_ML15(preprocess).csv", index=False)

In [71]:
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,24.281795,49.546547
1,TEST_001,55.670106,68.437368
2,TEST_002,25.825089,53.024026
3,TEST_003,54.686489,71.077074
4,TEST_004,60.646106,80.137465
...,...,...,...
478,TEST_478,1.432164,20.753056
479,TEST_479,80.800460,91.850237
480,TEST_480,46.751381,67.357644
481,TEST_481,62.149209,73.390913
