In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm.auto import tqdm
import random
import os


import matplotlib.pyplot as plt
import seaborn as sns


DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'
SEED = 42


def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [3]:
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

In [4]:
train.shape, test.shape, submission.shape

((3498, 11), (483, 9), (483, 3))

# 전처리

### -> 이상치 제거 (총 18개)

In [5]:
# 1. 극단치만 판단해서 18개 제거
outliers = train[(train['MLM'] > 100.0) | (train['HLM'] > 100.0) | (train['AlogP'] < -3) | (train['Molecular_Weight'] > 800)
     | (train['Num_H_Acceptors'] > 14) | (train['Num_H_Donors'] > 9) | (train['Num_RotatableBonds'] > 20)
     | (train['LogD'] < -4) | (train['Molecular_PolarSurfaceArea'] > 250)]


In [6]:
outliers.index

Int64Index([ 179,  662,  834,  983, 1092, 1172, 1239, 1584, 2159, 2258, 2367,
            2410, 2586, 2711, 2948, 3157, 3247, 3403],
           dtype='int64')

In [7]:
train = train.drop(index=outliers.index, axis=0)


In [8]:
duplicates = train[train.duplicated(subset=['SMILES'], keep=False)]

duplicates.sort_values(by='SMILES')

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
2276,TRAIN_2276,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,1.535,31.453,3.556,262.309,3,0,4,3.556,43.6
451,TRAIN_0451,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.31,24.67,3.556,262.309,3,0,4,3.556,43.6
2891,TRAIN_2891,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,55.95,69.95,2.172,337.372,4,2,3,2.169,82.0
543,TRAIN_0543,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,68.485,85.872,2.172,337.372,4,2,3,2.169,82.0
837,TRAIN_0837,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,63.522,62.488,2.293,367.428,5,2,3,2.307,139.85
366,TRAIN_0366,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,73.74,66.85,2.293,367.428,5,2,3,2.307,139.85
1085,TRAIN_1085,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,23.63,71.47,1.684,381.45,7,1,4,1.684,126.52
2848,TRAIN_2848,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,40.657,99.9,1.684,381.45,7,1,4,1.684,126.52
2096,TRAIN_2096,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,1.88,61.28,2.843,360.49,3,0,4,2.843,49.85
1666,TRAIN_1666,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,5.494,25.397,2.843,360.49,3,0,4,2.843,49.85


In [9]:
processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()

  processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()


In [10]:
tmp = pd.merge(processed_duplicate,duplicates,how='left',on='SMILES')

### -> 중복치 제거 (총 26개)

In [11]:
tmp = tmp.iloc[0::2]  # 첫 번째 행부터 시작하여 2씩 증가하여 행 선택
tmp.drop(['MLM_y','HLM_y'],axis =1,inplace=True)

In [12]:
train = train.drop_duplicates(subset=['SMILES'], keep=False) #기존 train에서 중복치들을 우선 모두 제거
train.shape

(3428, 11)

In [13]:
tmp.rename(columns={'HLM_x': 'HLM', 'MLM_x': 'MLM'}, inplace=True)

In [14]:
train = pd.concat([train, tmp], axis=0, ignore_index=True)
train.shape

(3454, 11)

#  CMMS-GCL

 Sim2Vec 모델

In [15]:
!pip install gensim




In [34]:
from gensim.models import Word2Vec
import logging

# 로그를 표시하기 위한 설정
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

# 간단한 SMILES 데이터 예시
smiles_data = train['SMILES'].tolist()

#  [
#     "CCO",   # 에탄올
#     "CC(C)C(=O)OC1=CC=CC=C1C(=O)O",  # 아스피린
#     "CCN(CC)C(=O)OC1=CC=CC=C1C(=O)OC",  # 아세트아미노펜
# ]

# SMILES 데이터를 토큰화
tokenized_data = [smiles.split() for smiles in smiles_data]

# Sim2Vec 모델 학습
model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, sg=1)

# 모델 저장
model.save("sim2vec_model.model")

# 모델 불러오기
loaded_model = Word2Vec.load("sim2vec_model.model")

# 벡터 검색 예제
vector = loaded_model.wv[train['SMILES'][0]]  # 에탄올에 대한 벡터 검색
print("Vector for test 0 ", vector)



Vector for test 0  [-4.0072431e-03  2.0588385e-03  6.9160699e-03  8.1697822e-04
 -6.3379989e-03 -4.5507159e-03  6.8248142e-03 -7.5713219e-03
  3.7052000e-03 -3.2713306e-03  2.6448930e-03  7.0784390e-03
  3.3049190e-03 -4.7795512e-03 -1.6886246e-03  1.6950726e-03
  1.7469061e-03 -9.9024270e-03  6.2662913e-03 -6.7065642e-03
 -6.0968841e-03  8.1657972e-03  6.1959303e-03 -5.6781771e-04
  2.6139843e-03 -9.1163991e-03 -6.8144486e-03  1.2473214e-03
  6.5295543e-03  5.5278791e-03  6.2426887e-03 -2.7214622e-03
  5.7745110e-03  6.0774205e-04  3.1013167e-03  4.8031854e-03
  3.9978026e-05 -9.9254474e-03  3.5853506e-04  6.9635902e-03
  5.3212284e-03  2.2247934e-03 -9.8300911e-03  8.3675636e-03
 -2.0900404e-03  9.4376446e-04  1.9513452e-03 -2.2907353e-03
 -2.7469003e-03 -4.6844040e-03  7.9237949e-03  5.2906084e-03
  8.3519015e-03 -2.2896540e-03  9.7606396e-03 -4.9393214e-03
  5.0979648e-03  2.3796726e-03 -6.7516090e-03  9.6298931e-03
 -6.2089493e-03  7.0640800e-04 -7.7012302e-03 -7.7694808e-03
 -1.9

In [35]:
import torch
import torch.nn as nn

class Sim2Vec(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Sim2Vec, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        gru_output, _ = self.gru(embedded)
        mean_pooling = torch.mean(gru_output, dim=1)  # 평균 풀링
        output = self.fc(mean_pooling)
        return output

# 모델 인스턴스 생성
input_dim = 100  # 입력 차원 (예: SMILES 데이터의 단어 수)
hidden_dim = 64  # 임베딩 및 GRU hidden state 차원
output_dim = 32  # Sim2Vec 모델의 출력 차원

sim2vec_model = Sim2Vec(input_dim, hidden_dim, output_dim)

# 예제 입력 데이터
# 예제 입력 데이터는 단어 시퀀스를 나타내는 정수로 가정합니다.
# 이 데이터는 실제 SMILES 데이터에 맞게 수정해야 합니다.
sample_input = torch.randint(input_dim, (1, 10))  # 10개의 단어로 구성된 시퀀스 (배치 크기 1)

# 모델 적용
output = sim2vec_model(sample_input)
print(output.shape)  # 출력 차원 확인


torch.Size([1, 32])


In [42]:
import torch
import torch.nn as nn

class PretrainedSim2Vec(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PretrainedSim2Vec, self).__init__()
        self.embedding = nn.Embedding(input_dim, output_dim)  # 임베딩 레이어 예시
        self.gru = nn.GRU(output_dim, output_dim, batch_first=True)  # GRU 레이어 예시
        self.fc = nn.Linear(output_dim, output_dim)  # 밀집 레이어 예시

    def forward(self, x):
        # 입력을 임베딩 레이어를 통과시킵니다.
        embedded = self.embedding(x)

        # 임베딩을 GRU 레이어를 통해 처리합니다.
        gru_output, _ = self.gru(embedded)

        # GRU 출력을 밀집 레이어를 통해 최종 초기 특성을 계산합니다.
        initial_features = self.fc(gru_output)

        return initial_features


Vector for test 0  [-4.0072431e-03  2.0588385e-03  6.9160699e-03  8.1697822e-04
 -6.3379989e-03 -4.5507159e-03  6.8248142e-03 -7.5713219e-03
  3.7052000e-03 -3.2713306e-03  2.6448930e-03  7.0784390e-03
  3.3049190e-03 -4.7795512e-03 -1.6886246e-03  1.6950726e-03
  1.7469061e-03 -9.9024270e-03  6.2662913e-03 -6.7065642e-03
 -6.0968841e-03  8.1657972e-03  6.1959303e-03 -5.6781771e-04
  2.6139843e-03 -9.1163991e-03 -6.8144486e-03  1.2473214e-03
  6.5295543e-03  5.5278791e-03  6.2426887e-03 -2.7214622e-03
  5.7745110e-03  6.0774205e-04  3.1013167e-03  4.8031854e-03
  3.9978026e-05 -9.9254474e-03  3.5853506e-04  6.9635902e-03
  5.3212284e-03  2.2247934e-03 -9.8300911e-03  8.3675636e-03
 -2.0900404e-03  9.4376446e-04  1.9513452e-03 -2.2907353e-03
 -2.7469003e-03 -4.6844040e-03  7.9237949e-03  5.2906084e-03
  8.3519015e-03 -2.2896540e-03  9.7606396e-03 -4.9393214e-03
  5.0979648e-03  2.3796726e-03 -6.7516090e-03  9.6298931e-03
 -6.2089493e-03  7.0640800e-04 -7.7012302e-03 -7.7694808e-03
 -1.9

Atomic Similarity-Based Sequence Encoder:
- Multihead BiGRU를 사용하여 분자 시퀀스 임베딩 생성.
- Sim2Vec 모델을 사용하여 초기 특성 얻기.


In [37]:
import torch
import torch.nn as nn

# class AtomicSimilarityEncoder(nn.Module):
#     def __init__(self, input_dim, hidden_dim, num_heads):
#         super(AtomicSimilarityEncoder, self).__init__()
#         self.gru = nn.GRU(input_dim, hidden_dim, num_layers=2, bidirectional=True)
#         self.sim2vec = PretrainedSim2Vec()  # Pretrained Sim2Vec model

#     def forward(self, sequence_input):
#         initial_features = self.sim2vec(sequence_input)
#         sequence_output, _ = self.gru(initial_features)
#         return sequence_output



In [48]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Multihead BiGRU 모델 정의
class MultiheadBiGRU(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads):
        super(MultiheadBiGRU, self).__init__()
        self.num_heads = num_heads
        self.gru_layers = nn.ModuleList([nn.GRU(input_dim, hidden_dim, bidirectional=True) for _ in range(num_heads)])

    def forward(self, x):
        outputs = [gru(x)[0] for gru in self.gru_layers]
        return torch.cat(outputs, dim=-1)  # 각 head의 출력을 연결하여 반환



# AtomicSimilarityEncoder 클래스 내의 forward 메서드 수정
class AtomicSimilarityEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, sim2vec_output_dim):
        super(AtomicSimilarityEncoder, self).__init__()
        self.multihead_bigru = MultiheadBiGRU(input_dim, hidden_dim, num_heads)
        self.sim2vec = PretrainedSim2Vec(input_dim, sim2vec_output_dim)  # PretrainedSim2Vec 모델을 초기화할 때 필요한 인자로 수정해야 합니다.

    def forward(self, sequence_input):
        # Multihead BiGRU를 통해 시퀀스 임베딩 생성
        sequence_output = self.multihead_bigru(sequence_input)

        sequence_input = sequence_input.to(torch.float32)

        # Sim2Vec 모델을 통해 초기 특성 얻기
        initial_features = self.sim2vec(sequence_input)  # PretrainedSim2Vec 모델을 호출하여 초기 특성을 계산합니다.

        # 시퀀스 임베딩과 초기 특성을 결합하여 최종 임베딩 생성
        final_embedding = torch.cat([sequence_output, initial_features], dim=-1)

        return final_embedding


# # Atomic Similarity-Based Sequence Encoder 정의
# class AtomicSimilarityEncoder(nn.Module):
#     def __init__(self, input_dim, hidden_dim, num_heads, sim2vec_output_dim):
#         super(AtomicSimilarityEncoder, self).__init__()
#         self.multihead_bigru = MultiheadBiGRU(input_dim, hidden_dim, num_heads)
#         self.sim2vec = PretrainedSim2Vec(input_dim, sim2vec_output_dim)

#     def forward(self, sequence_input):
#         # Multihead BiGRU를 통해 시퀀스 임베딩 생성
#         sequence_output = self.multihead_bigru(sequence_input)

#         # Sim2Vec 모델을 통해 초기 특성 얻기
#         initial_features = self.sim2vec(sequence_input)

#         # 시퀀스 임베딩과 초기 특성을 결합하여 최종 임베딩 생성
#         final_embedding = torch.cat([sequence_output, initial_features], dim=-1)

#         return final_embedding

# 모델 인스턴스 생성
input_dim = 100  # 입력 차원
hidden_dim = 64  # BiGRU hidden state 차원
num_heads = 4  # Multihead BiGRU 헤드 개수
sim2vec_output_dim = 32  # Sim2Vec 모델의 출력 차원

encoder = AtomicSimilarityEncoder(input_dim, hidden_dim, num_heads, sim2vec_output_dim)

# 예제 입력 데이터
sequence_input = torch.randn(10, 20, input_dim)  # (sequence_length, batch_size, input_dim)
# sequence_input = sequence_input.to(torch.float32) # torch.LongTensor로 변환


# 모델 적용
output_embedding = encoder(sequence_input)
print(output_embedding.shape)  # 출력 임베딩의 형태 확인


RuntimeError: ignored

Molecular Graph Structure Encoder:
- GIN (Graph Isomorphism Network) 레이어를 사용하여 분자 그래프 임베딩 생성.

In [25]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/661.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/661.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m430.1/661.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910454 sh

In [26]:
from torch_geometric.nn import GINConv

class MolecularGraphEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(MolecularGraphEncoder, self).__init__()
        self.gin1 = GINConv(nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        ))
        self.gin2 = GINConv(nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        ))

    def forward(self, graph_input):
        x, edge_index = graph_input
        x = self.gin1(x, edge_index)
        x = self.gin2(x, edge_index)
        return x


Inter-View Graph Contrastive Learning:
- 원래 분자 그래프와 보강된 분자 그래프 간의 대비 학습을 수행.

In [27]:
class InterViewGraphContrastiveLearning(nn.Module):
    def __init__(self):
        super(InterViewGraphContrastiveLearning, self).__init__()
        # 구현해야 할 대비 학습 로직

    def forward(self, original_graph_embedding, augmented_graph_embedding):
        # 대비 학습을 수행하는 로직을 구현
        # 두 그래프 임베딩 간의 비교를 통해 학습 손실을 계산
        contrastive_loss = compute_contrastive_loss(original_graph_embedding, augmented_graph_embedding)
        return contrastive_loss


Stability Predictor:
- 학습된 표현을 기반으로 대사안정성을 예측하는 모델.

In [28]:
class StabilityPredictor(nn.Module):
    def __init__(self, input_dim):
        super(StabilityPredictor, self).__init__()
        self.fc = nn.Linear(input_dim, 1)  # 입력 차원과 1개의 출력 뉴런을 가지는 선형 레이어

    def forward(self, combined_embedding):
        prediction = torch.sigmoid(self.fc(combined_embedding))  # 시그모이드 함수를 사용하여 예측
        return prediction
