In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm.auto import tqdm
import random
import os


DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'
SEED = 42


def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [31]:
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

# 데이터

In [4]:
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43


In [5]:
test.head()

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15


# Feature Engineering

## SMILES 이용

### 기본 통계기법

- 분자식 길이

In [32]:
train['SMILES_len'] = train['SMILES'].apply(len)
test['SMILES_len'] = test['SMILES'].apply(len)

- 수소 포함 개수

In [33]:
smiles_list = train['SMILES'].tolist()
# 수소 갯수 계산
hydrogen_counts = [smiles.count('H') for smiles in smiles_list]
train['Hydrogen_Counts'] = hydrogen_counts

smiles_list = test['SMILES'].tolist()
# 수소 갯수 계산
hydrogen_counts = [smiles.count('H') for smiles in smiles_list]
test['Hydrogen_Counts'] = hydrogen_counts

- 빈도분석

In [34]:
from collections import Counter

all_smiles = ''.join(train['SMILES'])
character_frequencies = Counter(all_smiles)

for char, freq in character_frequencies.items():
    print(f"'{char}': {freq}")


'C': 26001
'O': 8501
'c': 39802
'1': 10300
'(': 14796
'N': 4798
'=': 5753
')': 14796
'2': 8238
'-': 1310
'3': 3930
's': 581
'n': 6846
'[': 1312
'H': 1178
']': 1312
'S': 836
'4': 1010
'5': 170
'F': 695
'/': 292
'l': 780
'B': 284
'r': 284
'#': 153
'o': 489
'@': 739
'+': 18
'\': 80
'6': 16
'e': 2
'I': 7


### Molecular Fingerprint

In [35]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA

# 기존 데이터프레임에서 SMILES 열 추출
smiles_list = train['SMILES'].tolist()

# 분자 지문 생성 및 저장
fingerprints = [generate_fingerprint(smiles) for smiles in smiles_list]

# 기존 데이터프레임에 새로운 열로 분자 지문 추가
train['Fingerprints'] = fingerprints

# 분자 지문을 이진 벡터로 변환
binary_fingerprints = [list(fp) for fp in fingerprints]

# PCA를 사용하여 차원 축소
n_components = 50  # 축소할 차원 수
pca = PCA(n_components=n_components)
reduced_fingerprints = pca.fit_transform(binary_fingerprints)

# 축소된 피처로 데이터프레임 업데이트
for i in range(n_components):
    train[f'PC_{i + 1}'] = reduced_fingerprints[:, i]

# 결과 출력
print(train)



              id                                             SMILES     MLM  \
0     TRAIN_0000    CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC  26.010   
1     TRAIN_0001               Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1  29.270   
2     TRAIN_0002                   CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1   5.586   
3     TRAIN_0003  Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...   5.710   
4     TRAIN_0004                Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2  93.270   
...          ...                                                ...     ...   
3493  TRAIN_3493     Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl   1.556   
3494  TRAIN_3494  CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...  35.560   
3495  TRAIN_3495                       CCOC(=O)CCCc1nc2cc(N)ccc2n1C  56.150   
3496  TRAIN_3496                     Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl   0.030   
3497  TRAIN_3497                   COc1ccc(-c2nc(Cc3ccccc3)sc2C)cc1   0.450   

         HLM  AlogP  Molecular_Weight  Num_H_Accept

In [36]:
# 기존 데이터프레임에서 SMILES 열 추출
smiles_list = test['SMILES'].tolist()

# 분자 지문 생성 및 저장
fingerprints = [generate_fingerprint(smiles) for smiles in smiles_list]

# 기존 데이터프레임에 새로운 열로 분자 지문 추가
test['Fingerprints'] = fingerprints

# 분자 지문을 이진 벡터로 변환
binary_fingerprints = [list(fp) for fp in fingerprints]

# PCA를 사용하여 차원 축소
n_components = 50  # 축소할 차원 수
pca = PCA(n_components=n_components)
reduced_fingerprints = pca.fit_transform(binary_fingerprints)

# 축소된 피처로 데이터프레임 업데이트
for i in range(n_components):
    test[f'PC_{i + 1}'] = reduced_fingerprints[:, i]

# 결과 출력
test.head()

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,SMILES_len,...,PC_41,PC_42,PC_43,PC_44,PC_45,PC_46,PC_47,PC_48,PC_49,PC_50
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,39,...,-0.527368,0.144101,0.44729,0.319501,-0.419446,0.645945,0.007578,-0.618481,0.122102,0.535547
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,46,...,-0.214816,0.303061,-0.398966,-0.21994,0.1088,0.130951,0.127768,0.778246,-0.951289,0.246275
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86,49,...,-0.479896,-0.268357,-0.673097,-0.407284,-0.019356,-0.039498,0.400446,-0.943895,0.076092,-0.755738
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,42,...,0.105465,0.24349,1.007909,-0.280794,0.470036,0.150462,0.204643,-0.096116,0.111812,-0.065553
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,44,...,0.299462,0.549956,0.933419,0.415184,0.469491,0.13312,0.602756,0.412167,-0.97022,-0.534081


### TFIDF
- 이 방법 자체로는 분자를 원소단위로 쪼개기에 별 의미없음. 다른 분자식구조랑 함께 활용해야함

In [9]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas as pd

# # SMILES 문자열을 리스트로 변환
# smiles_list = train['SMILES'].tolist()

# # TF-IDF 벡터화
# vectorizer = TfidfVectorizer(analyzer=lambda x: x)  # 각 문자를 토큰으로 취급
# tfidf_matrix = vectorizer.fit_transform(smiles_list)

# # print("TF-IDF Matrix:")
# # print(tfidf_matrix.toarray())  # TF-IDF 행렬 출력
# # print("Feature Names:")
# # print(vectorizer.get_feature_names_out())  # 토큰(문자) 목록 출력

In [10]:
# print(tfidf_matrix)

  (0, 24)	0.1303530628961888
  (0, 17)	0.1317257616179909
  (0, 22)	0.1303530628961888
  (0, 28)	0.21491971581093552
  (0, 31)	0.1574091063632859
  (0, 8)	0.19084826041409203
  (0, 4)	0.12034855351432185
  (0, 7)	0.12270848564187077
  (0, 2)	0.22143993564199127
  (0, 12)	0.06806038133744849
  (0, 19)	0.06732827254735825
  (0, 1)	0.22143993564199127
  (0, 6)	0.11040398545946326
  (0, 25)	0.6748454273710598
  (0, 20)	0.18507117327126107
  (0, 15)	0.4501542329104055
  (1, 28)	0.09029090788712975
  (1, 31)	0.19838958565921888
  (1, 8)	0.2405344149527897
  (1, 7)	0.15465487471339182
  (1, 2)	0.2790904420677487
  (1, 12)	0.0857794772189505
  (1, 19)	0.16971353694104707
  (1, 1)	0.2790904420677487
  (1, 6)	0.13914697463486775
  :	:
  (3495, 25)	0.617210576661879
  (3495, 20)	0.1934460094886089
  (3495, 15)	0.617563539173205
  (3496, 27)	0.23065329097603632
  (3496, 28)	0.10723655380844502
  (3496, 7)	0.18368024180987302
  (3496, 2)	0.16573483370902586
  (3496, 12)	0.20375685082155365
  (3496,

In [14]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [15]:
# from rdkit import Chem
# from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas as pd

# # SMILES를 화학 구조로 변환하여 리스트로 저장
# mol_list = [Chem.MolToSmiles(Chem.MolFromSmiles(smiles)) for smiles in train['SMILES']]

# # TF-IDF 벡터화
# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(mol_list)

# # TF-IDF 결과를 데이터프레임에 추가
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# train_with_tfidf = pd.concat([train, tfidf_df], axis=1)

# print(train_with_tfidf)


### Word2Vec (Graph Embedding)

In [38]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstalling networkx-3.1:
      Successfully uninstalled networkx-3.1
Successfully installed networkx-2.8.8 node2vec-0.4.6


In [44]:
import pandas as pd
from rdkit import Chem
import numpy as np
import networkx as nx
from node2vec import Node2Vec
from sklearn.preprocessing import MinMaxScaler

# 기존 데이터프레임에서 SMILES 열 추출
smiles_list = train['SMILES'].tolist()

# 분자 그래프 생성
def create_molecule_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    G = nx.Graph()

    for atom in mol.GetAtoms():
        G.add_node(atom.GetIdx(), atom=atom.GetSymbol())

    for bond in mol.GetBonds():
        G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond_type=bond.GetBondTypeAsDouble())

    return G

# 모든 분자 그래프를 하나의 그래프로 통합
all_graphs = [create_molecule_graph(smiles) for smiles in smiles_list]
combined_graph = nx.compose_all(all_graphs)

# Node2Vec 모델 학습
node2vec = Node2Vec(combined_graph, dimensions=128, walk_length=30, num_walks=200, workers=4)
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# 그래프 임베딩 벡터 추출
embedding_vectors = np.array([model.wv[str(node)] for node in combined_graph.nodes()])

# 임베딩 벡터 스케일링
scaler = MinMaxScaler()
scaled_embedding_vectors = scaler.fit_transform(embedding_vectors)

# 스케일링된 벡터를 피처로 추가
for i in range(scaled_embedding_vectors.shape[1]):
    train[f'Embedding_{i + 1}'] = scaled_embedding_vectors[:, i]

# 결과 출력
print(train)


Computing transition probabilities:   0%|          | 0/96 [00:00<?, ?it/s]

ValueError: ignored

In [45]:
from node2vec import Node2Vec

# 모델 학습을 위한 그래프 생성 및 학습
# ...

# 노드 인덱스 매핑 확인
print("Node Index Mapping:")
for node_idx in model.wv.index2entity[:10]:  # 처음 10개의 노드 인덱스만 확인
    print(f"Node Index: {node_idx}, Node Embedding: {model.wv[node_idx]}")

# 실제 데이터의 노드 인덱스 확인
real_node_indices = combined_graph.nodes()  # 실제 데이터의 노드 인덱스

# 노드 인덱스의 일치 여부 확인
matching = all(node_idx in model.wv.index2entity for node_idx in real_node_indices)
print(f"Node Indices Match: {matching}")


Node Index Mapping:


AttributeError: ignored

### Molecular GNN
- Molecular Graph Neural Networks

## Log P, Volume 3d
- Log P :  (파티션 계수): 분자가 친수성(수성 용매) 및 친지성(지방성 용매) 상에서 어떻게 분배되는지를 나타내는 지표. 분자가 생물학적 시스템에서 어떻게 분포하고 투과성을 가질지를 예측
- Volume 3d : 분자가 공간에서 차지하는 세 개의 차원적 부피를 의미합니다. 분자의 크기와 모양에 대한 정보를 제공

In [17]:
!pip install rdkit



In [18]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

def calculate_2d_3d_combined_descriptor(smiles):
    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        raise ValueError("Invalid SMILES string")

    # Calculate 2D descriptors
    mw = Descriptors.MolWt(mol)  # Molecular weight
    logp = Descriptors.MolLogP(mol)  # LogP

    # Generate 3D coordinates
    mol = Chem.AddHs(mol)  # Add hydrogens for accurate 3D calculation
    AllChem.EmbedMolecule(mol, randomSeed=42)  # Embed the molecule in 3D space

    # Calculate 3D descriptor: Predicted molecular volume
    volume_3d = AllChem.ComputeMolVolume(mol)

    # Combine 2D and 3D descriptors into a dictionary
    combined_descriptor = {
        'MolecularWeight': mw,
        'LogP': logp,
        'Volume3D': volume_3d
    }

    return combined_descriptor

In [19]:
# Example SMILES string
example_smiles = train['SMILES'][0] # Replace this with your SMILES string

# Calculate combined 2D/3D descriptor
combined_descriptor = calculate_2d_3d_combined_descriptor(example_smiles)

print(combined_descriptor)

{'MolecularWeight': 400.50400000000013, 'LogP': 3.8774400000000018, 'Volume3D': 361.6400000000001}


In [20]:
combined_descriptor['MolecularWeight']

400.50400000000013

In [50]:
# 기존 데이터프레임에서 SMILES 열 추출
smiles_list = train['SMILES'].tolist()

# LogP 계산 및 tqdm 적용
logP_values = []
for smiles in tqdm(smiles_list, desc="Calculating LogP"):
    descriptor = calculate_2d_3d_combined_descriptor(smiles)
    logP_values.append(descriptor['LogP'])

# 계산 결과를 데이터프레임에 추가
train['LogP'] = logP_values


Calculating LogP:   0%|          | 0/3498 [00:00<?, ?it/s]

[21:18:18] UFFTYPER: Unrecognized atom type: Se2+2 (8)
[21:18:18] UFFTYPER: Unrecognized atom type: Se2+2 (8)
[21:20:21] UFFTYPER: Unrecognized atom type: Se2+2 (6)


In [51]:
# 기존 데이터프레임에서 SMILES 열 추출
smiles_list = test['SMILES'].tolist()

# LogP 계산 및 tqdm 적용
logP_values = []
for smiles in tqdm(smiles_list, desc="Calculating LogP"):
    descriptor = calculate_2d_3d_combined_descriptor(smiles)
    logP_values.append(descriptor['LogP'])

# 계산 결과를 데이터프레임에 추가
test['LogP'] = logP_values

Calculating LogP:   0%|          | 0/483 [00:00<?, ?it/s]

In [None]:
# train['LogP'] = train['SMILES'].apply(lambda x: calculate_2d_3d_combined_descriptor(x)['LogP'])
# train['Volume3d'] = train['SMILES'].apply(lambda x: calculate_2d_3d_combined_descriptor(x)['Volume3d'])

In [None]:
# 기존 데이터프레임에서 SMILES 열 추출
smiles_list = train['SMILES'].tolist()

# Volume 3d 계산 및 tqdm 적용
Volume3d_values = []
for smiles in tqdm(smiles_list, desc="Calculating Volume3d"):
    descriptor = calculate_2d_3d_combined_descriptor(smiles)
    Volume3d_values.append(descriptor['Volume3d'])

# 계산 결과를 데이터프레임에 추가
train['Volume3d'] = Volume3d_values


In [None]:
# 기존 데이터프레임에서 SMILES 열 추출
smiles_list = test['SMILES'].tolist()

# Volume 3d 계산 및 tqdm 적용
Volume3d_values = []
for smiles in tqdm(smiles_list, desc="Calculating Volume3d"):
    descriptor = calculate_2d_3d_combined_descriptor(smiles)
    Volume3d_values.append(descriptor['Volume3d'])

# 계산 결과를 데이터프레임에 추가
test['Volume3d'] = Volume3d_values

In [None]:
# logp_list = []
# volume_3d_list = []

# for i in train['SMILES']:
#     logp = calculate_2d_3d_combined_descriptor(i)['LogP']
#     volume_3d = calculate_2d_3d_combined_descriptor(i)['Volume3D']
#     logp_list.append(logp)
#     volume_3d_list.append(volume_3d)

## Lipinski 5가지 법칙

In [47]:
train.columns

Index(['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea', 'SMILES_len', 'Hydrogen_Counts',
       'Fingerprints', 'PC_1', 'PC_2', 'PC_3', 'PC_4', 'PC_5', 'PC_6', 'PC_7',
       'PC_8', 'PC_9', 'PC_10', 'PC_11', 'PC_12', 'PC_13', 'PC_14', 'PC_15',
       'PC_16', 'PC_17', 'PC_18', 'PC_19', 'PC_20', 'PC_21', 'PC_22', 'PC_23',
       'PC_24', 'PC_25', 'PC_26', 'PC_27', 'PC_28', 'PC_29', 'PC_30', 'PC_31',
       'PC_32', 'PC_33', 'PC_34', 'PC_35', 'PC_36', 'PC_37', 'PC_38', 'PC_39',
       'PC_40', 'PC_41', 'PC_42', 'PC_43', 'PC_44', 'PC_45', 'PC_46', 'PC_47',
       'PC_48', 'PC_49', 'PC_50'],
      dtype='object')

In [54]:
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,PC_48,PC_49,PC_50,LogP,lip_MolWeight,lip_LogP,lip_H_Acc,lip_H_Don,lip_RB,lip_pass
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,...,-0.364824,-0.735401,-0.366571,3.87744,1,1,1,1,0,0
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,...,0.419956,0.05008,-0.090533,3.35474,1,1,1,1,1,1
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,...,-0.570556,0.648748,-0.106735,1.2045,1,1,1,1,1,1
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,...,0.000243,0.195782,-0.386533,3.89356,1,1,1,1,1,1
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,...,-0.314686,0.625947,-0.151936,2.81772,1,1,1,1,1,1


In [53]:
#분자량 500이하
train['lip_MolWeight'] = (train['Molecular_Weight'] <= 500).astype(int)
train['lip_LogP'] = (train['LogP'] <= 5).astype(int)
train['lip_H_Acc'] = (train['Num_H_Acceptors'] <= 10).astype(int)
train['lip_H_Don'] = (train['Num_H_Donors'] <= 5).astype(int)
train['lip_RB'] = (train['Num_RotatableBonds'] <= 5).astype(int)
train['lip_pass'] = train['lip_MolWeight']*train['lip_LogP']*train['lip_H_Acc']*train['lip_H_Don']*train['lip_RB']

test['lip_MolWeight'] = (test['Molecular_Weight'] <= 500).astype(int)
test['lip_LogP'] = (test['LogP'] <= 5).astype(int)
test['lip_H_Acc'] = (test['Num_H_Acceptors'] <= 10).astype(int)
test['lip_H_Don'] = (test['Num_H_Donors'] <= 5).astype(int)
test['lip_RB'] = (test['Num_RotatableBonds'] <= 5).astype(int)
test['lip_pass'] = test['lip_MolWeight']*test['lip_LogP']*test['lip_H_Acc']*test['lip_H_Don']*test['lip_RB']


# 모델학습 및 추론

## Machine Learning

In [55]:
from sklearn.metrics import mean_squared_error

def rmse(y_valid, pred):
    mse = mean_squared_error(y_valid, pred)
    return np.sqrt(mse)

In [56]:
from sklearn.metrics import make_scorer

rmse_score = make_scorer(rmse, greater_is_better=False)

In [57]:
# 결측치 채우기
train["AlogP"] = np.where(pd.isna(train["AlogP"]), train["LogD"], train["AlogP"])
test["AlogP"] = np.where(pd.isna(test["AlogP"]), test["LogD"], test["AlogP"])

In [70]:
train_prop.columns

Index(['AlogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors',
       'Num_RotatableBonds', 'LogD', 'Molecular_PolarSurfaceArea',
       'SMILES_len', 'Hydrogen_Counts', 'Fingerprints', 'PC_1', 'PC_2', 'PC_3',
       'PC_4', 'PC_5', 'PC_6', 'PC_7', 'PC_8', 'PC_9', 'PC_10', 'PC_11',
       'PC_12', 'PC_13', 'PC_14', 'PC_15', 'PC_16', 'PC_17', 'PC_18', 'PC_19',
       'PC_20', 'PC_21', 'PC_22', 'PC_23', 'PC_24', 'PC_25', 'PC_26', 'PC_27',
       'PC_28', 'PC_29', 'PC_30', 'PC_31', 'PC_32', 'PC_33', 'PC_34', 'PC_35',
       'PC_36', 'PC_37', 'PC_38', 'PC_39', 'PC_40', 'PC_41', 'PC_42', 'PC_43',
       'PC_44', 'PC_45', 'PC_46', 'PC_47', 'PC_48', 'PC_49', 'PC_50', 'LogP',
       'lip_MolWeight', 'lip_LogP', 'lip_H_Acc', 'lip_H_Don', 'lip_RB',
       'lip_pass'],
      dtype='object')

In [71]:
train_prop['Fingerprints']

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
3493    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3494    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3495    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3496    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
3497    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: Fingerprints, Length: 3498, dtype: object

In [79]:
train_prop = train.iloc[:, 4:]
test_prop = test.iloc[:, 2:]

train_prop = train_prop.drop(columns=['Fingerprints'])  # inplace=True로 설정하면 원본 데이터프레임이 변경됩니다
test_prop = test_prop.drop(columns=['Fingerprints'])


target_1 = train['MLM']
target_2 = train['HLM']

In [80]:
train_prop

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,SMILES_len,Hydrogen_Counts,PC_1,...,PC_48,PC_49,PC_50,LogP,lip_MolWeight,lip_LogP,lip_H_Acc,lip_H_Don,lip_RB,lip_pass
0,3.259,400.495,5,2,8,3.259,117.37,47,1,0.528629,...,-0.364824,-0.735401,-0.366571,3.87744,1,1,1,1,0,0
1,2.169,301.407,2,1,2,2.172,73.47,36,0,-0.687794,...,0.419956,0.050080,-0.090533,3.35474,1,1,1,1,1,1
2,1.593,297.358,5,0,3,1.585,62.45,32,0,-0.977596,...,-0.570556,0.648748,-0.106735,1.20450,1,1,1,1,1,1
3,4.771,494.652,6,0,5,3.475,92.60,63,0,-0.149620,...,0.000243,0.195782,-0.386533,3.89356,1,1,1,1,1,1
4,2.335,268.310,3,0,1,2.337,42.43,35,0,0.189263,...,-0.314686,0.625947,-0.151936,2.81772,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,3.409,396.195,3,1,5,3.409,64.74,46,0,1.009402,...,-0.032295,0.156641,0.133530,2.74730,1,1,1,1,1,1
3494,1.912,359.381,4,1,3,1.844,77.37,50,1,-0.476505,...,0.071984,-0.278225,-0.071696,2.27630,1,1,1,1,1,1
3495,1.941,261.320,3,1,6,2.124,70.14,28,0,0.044358,...,-0.477669,0.124260,1.076611,2.04130,1,1,1,1,0,0
3496,0.989,284.696,5,1,5,0.989,91.51,30,0,0.635548,...,-0.018120,0.641135,0.297128,1.42720,1,1,1,1,1,1


In [81]:
test_prop

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,SMILES_len,Hydrogen_Counts,PC_1,...,PC_48,PC_49,PC_50,LogP,lip_MolWeight,lip_LogP,lip_H_Acc,lip_H_Don,lip_RB,lip_pass
0,2.641,361.505,4,2,7,2.635,92.76,39,0,0.163080,...,-0.618481,0.122102,0.535547,2.43160,1,1,1,1,0,0
1,0.585,370.399,5,0,3,0.585,68.31,46,0,1.278969,...,0.778246,-0.951289,0.246275,1.82520,1,1,1,1,1,1
2,4.276,347.414,4,4,5,4.290,92.86,49,1,-1.583589,...,-0.943895,0.076092,-0.755738,3.27051,1,1,1,1,1,1
3,1.795,345.358,5,0,2,1.795,81.21,42,0,-0.382394,...,-0.096116,0.111812,-0.065553,2.03830,1,1,1,1,1,1
4,1.219,353.418,4,0,2,0.169,61.15,44,0,0.492952,...,0.412167,-0.970220,-0.534081,1.27232,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,4.207,306.443,2,1,7,4.207,55.13,33,0,0.994901,...,-0.318738,0.032358,-0.118354,3.81860,1,1,1,1,0,0
479,-0.608,335.398,5,0,1,-1.736,70.16,46,0,1.228119,...,0.598038,-0.180673,-0.005163,0.01480,1,1,1,1,1,1
480,1.792,349.383,3,1,3,1.792,69.72,45,0,0.376178,...,-0.180209,-0.524285,-0.196469,2.32600,1,1,1,1,1,1
481,0.790,341.132,3,2,2,0.423,69.64,37,0,0.460326,...,-0.455332,0.141004,-0.184703,2.24480,1,1,1,1,1,1


In [82]:
train_prop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3498 entries, 0 to 3497
Data columns (total 66 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   AlogP                       3498 non-null   float64
 1   Molecular_Weight            3498 non-null   float64
 2   Num_H_Acceptors             3498 non-null   int64  
 3   Num_H_Donors                3498 non-null   int64  
 4   Num_RotatableBonds          3498 non-null   int64  
 5   LogD                        3498 non-null   float64
 6   Molecular_PolarSurfaceArea  3498 non-null   float64
 7   SMILES_len                  3498 non-null   int64  
 8   Hydrogen_Counts             3498 non-null   int64  
 9   PC_1                        3498 non-null   float64
 10  PC_2                        3498 non-null   float64
 11  PC_3                        3498 non-null   float64
 12  PC_4                        3498 non-null   float64
 13  PC_5                        3498 

#### randomforest

In [83]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [84]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = RandomForestRegressor(random_state=SEED)
    model.fit(x_train, y_train_1)
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    #HLM
    model = RandomForestRegressor(random_state=SEED)
    model.fit(x_train, y_train_2)
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

[32.60447693731417, 32.665977963852285, 31.421803845781795, 30.942368917234617, 31.38619139695375]


31.804163812227323

In [85]:
model.fit(train_prop, target_1)
rf_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
rf_pred_2 = model.predict(test_prop)

#### LGBMRegressor

In [87]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

In [88]:
scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_1, eval_set=[(x_valid, y_valid_1)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    # HLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_2, eval_set=[(x_valid, y_valid_2)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)


    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14140
[LightGBM] [Info] Number of data points in the train set: 2798, number of used features: 64
[LightGBM] [Info] Start training from score 37.337043
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14140
[LightGBM] [Info] Number of data points in the train set: 2798, number of used features: 64
[LightGBM] [Info] Start training from score 53.327330
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14141
[LightGBM] [Info] Number of data points in the train set: 2798, number of used features: 64
[LightGBM] [Info] Start training from score 36.596965
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14141
[LightGBM] [Info] Number of data points in the train set: 2798, number of used features: 64
[LightGBM] [Info] Start training from score 52.459990
You can set `force_col_wise=true` to remove 

32.31361325493158

In [89]:
model.fit(train_prop, target_1)
lgbm_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
lgbm_pred_2 = model.predict(test_prop)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14145
[LightGBM] [Info] Number of data points in the train set: 3498, number of used features: 64
[LightGBM] [Info] Start training from score 37.384742
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14145
[LightGBM] [Info] Number of data points in the train set: 3498, number of used features: 64
[LightGBM] [Info] Start training from score 53.090206


## Catboost

In [90]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.1-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.1


In [91]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [92]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = CatBoostRegressor(random_seed=SEED, verbose=0)
    model.fit(x_train, y_train_1)
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    #HLM
    model = CatBoostRegressor(random_seed=SEED, verbose=0)
    model.fit(x_train, y_train_2)
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

[32.792690109801114, 32.92348085191327, 31.281040626138598, 30.53663346178582, 30.964349276614385]


31.69963886525063

In [93]:
model.fit(train_prop, target_1)
cat_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
cat_pred_2 = model.predict(test_prop)

In [94]:
mlm_pred = (rf_pred_1 + cat_pred_1 + lgbm_pred_1) / 3
hlm_pred = (rf_pred_2 + cat_pred_2 + lgbm_pred_2) / 3

In [95]:
submission['MLM'] = mlm_pred
submission['HLM'] = hlm_pred

In [96]:
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,32.694185,48.626736
1,TEST_001,65.871112,83.160080
2,TEST_002,45.181088,62.995566
3,TEST_003,43.477763,59.423919
4,TEST_004,70.509179,82.059421
...,...,...,...
478,TEST_478,15.776795,22.541206
479,TEST_479,71.215631,84.660760
480,TEST_480,43.896069,67.820756
481,TEST_481,58.648135,71.670097


In [97]:
submission.to_csv(f"{DATA_PATH}submission_ml1.csv", index=False)

In [98]:
submission.to_csv("submission_ml1.csv", index=False)

## Deep Learning

In [99]:
class CustomDataset(Dataset):
    def __init__(self, df, target, transform, is_test=False):
        self.df = df
        self.target = target # HLM or MLM
        self.is_test = is_test # train,valid / test

        self.feature_select = transform
        if not self.is_test:
            self.fp = self.feature_select.fit_transform(np.stack(df['FPs']))
        else: # valid or test
            self.fp = self.feature_select.transform(np.stack(df['FPs']))

    def __getitem__(self, index):
        fp = self.fp[index]
        if not self.is_test: # test가 아닌 경우(label 존재)
            label = self.df[self.target][index]
            return torch.tensor(fp).float(), torch.tensor(label).float().unsqueeze(dim=-1) # feature, label

        else: # test인 경우
            return torch.tensor(fp).float() # feature

    def __len__(self):
        return len(self.df)

NameError: ignored

In [None]:
transform = VarianceThreshold(threshold=0.05)

train_MLM = CustomDataset(df=train, target='MLM', transform=transform, is_test=False)
train_HLM = CustomDataset(df=train, target='HLM', transform=transform, is_test=False)

input_size = train_MLM.fp.shape[1]
input_size

In [None]:
# Hyperparameter
CFG = {'BATCH_SIZE': 64,
       'EPOCHS': 10,
       'INPUT_SIZE': input_size,
       'HIDDEN_SIZE': 1024,
       'OUTPUT_SIZE': 1,
       'DROPOUT_RATE': 0.8,
       'LEARNING_RATE': 0.001}

In [None]:
# train,valid split
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42)

In [None]:
train_MLM_loader = DataLoader(dataset=train_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)


train_HLM_loader = DataLoader(dataset=train_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

In [None]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(Net, self).__init__()

        # fc 레이어 3개와 출력 레이어
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)

        # 정규화
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.ln3 = nn.LayerNorm(hidden_size)

        # 활성화 함수
        self.activation = nn.LeakyReLU()

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        out = self.fc1(x)
        out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = self.fc2(out)
        out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = self.fc3(out)
        out = self.ln3(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = self.fc_out(out)
        return out

In [None]:
rmse = lambda x,y : np.mean((x-y)**2)**0.5