In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm.auto import tqdm
import random
import os


# Directory on the mounted Drive that holds the competition CSV files.
# Keeps its trailing '/' because file names are appended to it directly.
DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'
# Seed value; presumably meant to be passed to reset_seeds() - confirm at call site.
SEED = 42


def reset_seeds(seed):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch
    (CPU plus every CUDA device), and switches cuDNN to deterministic
    behavior.

    Args:
        seed: Integer seed applied to all generators.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    # Exported for completeness; hash randomization of the already-running
    # interpreter is not changed by setting this at runtime.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one; a no-op when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuned kernel selection can differ between runs, so turn it off.
    torch.backends.cudnn.benchmark = False


# Pick the compute device: use the GPU when CUDA is usable, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [4]:
from sklearn.feature_selection import VarianceThreshold


In [5]:
# Read the three competition files from DATA_PATH into DataFrames.
train = pd.read_csv(DATA_PATH + "train.csv")
test = pd.read_csv(DATA_PATH + "test.csv")
submission = pd.read_csv(DATA_PATH + "sample_submission.csv")

# 데이터 확인
- SMILES : 화합물 분자구조
- MLM / HLM : 화합물의 대사안정성 지표 (MLM: 쥐(Mouse), HLM: 인간(Human) - 대사되지 않고 남아있는 화합물의 양을 측정한 것) : 값이 높을수록 대사안정성이 좋은 것
- AlogP : 화합물이 물-유기용매 사이에서 분배되는 정도 (로그파티션 계수)
- Molecular Weight: 분자량, 분자의 총 무게
- Num_H_Acceptors: 화합물의 수소 결합 수용체 개수. 수소 결합에서 수소를 받아들일 수 있는 원자의 수를 의미
- Num_H_Donors : 화합물의 수소 결합 공여체 개수. 수소 결합에 수소를 제공할 수 있는 원자의 수를 의미
- Num_RotatableBonds: 분자 내에서 회전이 가능한 결합 개수
- LogD : 화합물의 분배 계수(distribution coefficient). 로그 파티션 계수(AlogP)와 유사하나, 특정 pH에서 이온화된 형태까지 포함하여 분배되는 정도를 나타냄
- Molecular_PolarSurfaceArea: 분자의 극성 표면 면적. 분자 내에서 극성 원자들이 차지하는 면적

In [6]:
# List the column names of the training frame.
train.columns

Index(['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea'],
      dtype='object')

In [7]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43


In [8]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15


In [None]:
# 3498 training rows in total (a very small dataset) -> 483 test rows to
# predict (the original note describes this as about 10%).
train.shape, test.shape

((3498, 11), (483, 9))

# 모델 1 : seyonec/PubChem10M_SMILES_BPE_180k
- https://huggingface.co/seyonec/PubChem10M_SMILES_BPE_180k
- SMILES 문자열을 입력으로 받아서 다양한 화학적 특성을 예측하거나 화학 분자 간의 유사도를 계산하는 등의 작업

In [9]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.2 MB/s[0m eta [36m0:00:

In [23]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
# The checkpoint is loaded mainly for its tokenizer; the model itself is only
# run once below to inspect what it returns.

model_name = "seyonec/PubChem10M_SMILES_BPE_180k"
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # example SMILES string

# Tokenize the SMILES string into PyTorch tensors for the model.
inputs = tokenizer(smiles, return_tensors="pt")

# Forward pass through the masked-LM head. The result is a MaskedLMOutput
# carrying token logits, not chemical property predictions. Gradients are
# not needed for this inspection, so autograd tracking is disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Inspect the raw output.
print(outputs)


Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_180k were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MaskedLMOutput(loss=None, logits=tensor([[[ -5.8007, -15.0892,  -5.7983,  ..., -16.4021, -15.6930, -15.6507],
         [ -7.3518, -11.9924,  -6.8966,  ..., -12.2687, -11.3771, -11.5915],
         [ -6.9796,  -6.8501,  -2.3481,  ...,  -7.0034,  -6.8259,  -6.5439],
         ...,
         [ -6.6865,  -7.4691,  -4.4906,  ...,  -8.5996,  -7.9603,  -7.9470],
         [ -5.1531, -10.0733,  -5.5403,  ..., -10.0474, -10.2344, -10.0318],
         [ -5.7025,  -9.5430,  -3.6594,  ..., -10.1341,  -9.7906,  -9.8548]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


In [22]:
# Number of populated fields in the model output (only `logits` is set here).
len(outputs)

1