In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm.auto import tqdm
import random
import os


import matplotlib.pyplot as plt
import seaborn as sns


DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'
SEED = 42


def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [28]:
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

In [5]:
train.shape, test.shape, submission.shape

((3498, 11), (483, 9), (483, 3))

In [6]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43
...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51


In [7]:
train['SMILES']

0         CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC
1                    Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1
2                        CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1
3       Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...
4                     Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2
                              ...                        
3493       Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl
3494    CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...
3495                         CCOC(=O)CCCc1nc2cc(N)ccc2n1C
3496                       Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl
3497                     COc1ccc(-c2nc(Cc3ccccc3)sc2C)cc1
Name: SMILES, Length: 3498, dtype: object

In [8]:
pd.DataFrame(train['SMILES'].apply(len)).min(), pd.DataFrame(train['SMILES'].apply(len)).max()

(SMILES    18
 dtype: int64,
 SMILES    174
 dtype: int64)

In [9]:
# train.corr()

  train.corr()


Unnamed: 0,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
MLM,1.0,0.706725,-0.330083,-0.081239,0.16451,0.210836,-0.092563,-0.350146,0.18485
HLM,0.706725,1.0,-0.346022,-0.175117,0.092313,0.176549,-0.132263,-0.357456,0.094323
AlogP,-0.330083,-0.346022,1.0,0.38976,-0.284415,-0.172222,0.111844,0.957611,-0.298194
Molecular_Weight,-0.081239,-0.175117,0.38976,1.0,0.471814,0.116186,0.583711,0.369462,0.439114
Num_H_Acceptors,0.16451,0.092313,-0.284415,0.471814,1.0,0.208433,0.474012,-0.305506,0.714315
Num_H_Donors,0.210836,0.176549,-0.172222,0.116186,0.208433,1.0,0.176871,-0.212082,0.474614
Num_RotatableBonds,-0.092563,-0.132263,0.111844,0.583711,0.474012,0.176871,1.0,0.071659,0.371574
LogD,-0.350146,-0.357456,0.957611,0.369462,-0.305506,-0.212082,0.071659,1.0,-0.29467
Molecular_PolarSurfaceArea,0.18485,0.094323,-0.298194,0.439114,0.714315,0.474614,0.371574,-0.29467,1.0


In [10]:
# df = pd.DataFrame(train['SMILES'].apply(len))
# train = pd.concat([train, df],axis=1)
# train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,SMILES.1
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37,47
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47,36
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,32
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60,63
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43,35
...,...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74,46
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37,50
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14,28
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51,30


In [11]:
# df = pd.DataFrame(test['SMILES'].apply(len))
# test = pd.concat([test, df],axis=1)
# test

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,SMILES.1
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,39
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,46
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.290,92.86,49
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,42
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,44
...,...,...,...,...,...,...,...,...,...,...
478,TEST_478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2,1,7,4.207,55.13,33
479,TEST_479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5,0,1,-1.736,70.16,46
480,TEST_480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3,1,3,1.792,69.72,45
481,TEST_481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3,2,2,0.423,69.64,37


In [12]:
# train.corr()

  train.corr()


Unnamed: 0,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,SMILES
MLM,1.0,0.706725,-0.330083,-0.081239,0.16451,0.210836,-0.092563,-0.350146,0.18485,-0.010193
HLM,0.706725,1.0,-0.346022,-0.175117,0.092313,0.176549,-0.132263,-0.357456,0.094323,-0.112194
AlogP,-0.330083,-0.346022,1.0,0.38976,-0.284415,-0.172222,0.111844,0.957611,-0.298194,0.32589
Molecular_Weight,-0.081239,-0.175117,0.38976,1.0,0.471814,0.116186,0.583711,0.369462,0.439114,0.867837
Num_H_Acceptors,0.16451,0.092313,-0.284415,0.471814,1.0,0.208433,0.474012,-0.305506,0.714315,0.448337
Num_H_Donors,0.210836,0.176549,-0.172222,0.116186,0.208433,1.0,0.176871,-0.212082,0.474614,0.153282
Num_RotatableBonds,-0.092563,-0.132263,0.111844,0.583711,0.474012,0.176871,1.0,0.071659,0.371574,0.451338
LogD,-0.350146,-0.357456,0.957611,0.369462,-0.305506,-0.212082,0.071659,1.0,-0.29467,0.30965
Molecular_PolarSurfaceArea,0.18485,0.094323,-0.298194,0.439114,0.714315,0.474614,0.371574,-0.29467,1.0,0.430259
SMILES,-0.010193,-0.112194,0.32589,0.867837,0.448337,0.153282,0.451338,0.30965,0.430259,1.0


In [13]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,SMILES.1
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37,47
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47,36
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,32
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60,63
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43,35
...,...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74,46
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37,50
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14,28
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51,30


In [14]:
train.columns

Index(['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea', 'SMILES'],
      dtype='object')

In [15]:
# train.columns = ['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
#        'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
#        'Molecular_PolarSurfaceArea', 'SMILES_len']

# ChemBERTa
- https://huggingface.co/seyonec/ChemBERTa-zinc-base-v1?text=Paris+is+the+%3Cmask%3E+of+France.

In [16]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.1 MB/s[0m eta [36m0:00:0

In [17]:
!pip install transformers



In [18]:
# import torch
# from transformers import AutoTokenizer, AutoModelForMaskedLM

# # 모델 이름 설정
# model_name = "seyonec/ChemBERTa-zinc-base-v1"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForMaskedLM.from_pretrained(model_name)

# # 추출할 SMILES 문자열 정의
# smiles_string = "CCO"  # 예시 SMILES 문자열

# # SMILES 문자열을 인코딩하고 모델에 전달
# inputs = tokenizer(smiles_string, return_tensors="pt", padding=True, truncation=True)
# with torch.no_grad():
#     outputs = model(**inputs)

# # 분자 임베딩 추출
# embeddings = outputs.last_hidden_state.mean(dim=1)  # 평균 풀링을 사용하여 임베딩을 추출

# # 추출된 임베딩 확인
# print("Molecule Embeddings Shape:", embeddings.shape)
# print("Molecule Embeddings:", embeddings)


In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# 모델 이름과 토크나이저 이름 설정
model_name = "seyonec/ChemBERTa-zinc-base-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# 추출할 SMILES 문자열 정의
smiles_string = "CCO"  # 예시 SMILES 문자열

# SMILES 문자열을 인코딩하고 모델에 전달
inputs = tokenizer(smiles_string, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)

# Masked Language Modeling 작업에서 예측값인 "logits"를 추출
logits = outputs.logits

# 특성 추출
features = logits.mean(dim=1)  # 특성 추출 방법은 평균을 사용했습니다. 다른 방법도 가능합니다.

# 추출된 특성 확인
print("Extracted Features Shape:", features.shape)
print("Extracted Features:", features)



Downloading (…)okenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracted Features Shape: torch.Size([1, 767])
Extracted Features: tensor([[-1.1679e+00, -1.4977e+00, -2.0888e+00, -1.8878e+00, -3.2611e+00,
         -1.9664e+00, -2.4960e+00, -1.0562e+00, -3.9395e+00, -1.9483e+00,
         -3.1591e+00, -2.8859e+00,  5.4996e-01,  6.6431e-01, -1.4691e+00,
         -2.2078e+00, -1.2367e+00, -5.7110e-01, -3.2908e+00, -3.2101e-01,
         -2.2854e+00,  3.2204e+00,  5.8153e-02, -9.9245e-01,  5.5396e-01,
         -4.2534e-02, -1.3327e+00, -2.2100e+00, -2.3747e+00, -2.3461e+00,
         -1.2137e+00, -1.8763e+00, -1.8194e+00,  7.3734e-01, -2.6740e+00,
         -1.9882e+00,  3.3700e-01, -1.5732e+00, -2.3090e+00,  5.4768e+00,
         -2.7789e+00, -1.4473e+00,  2.8847e+00, -1.9212e+00, -5.3010e-01,
          6.7692e-01, -2.5265e+00, -2.1813e+00, -1.5182e+00, -2.2657e+00,
          3.0781e+00,  3.0608e+00, -1.2594e-01, -2.1752e+00, -1.6446e+00,
          1.3499e+00, -2.1438e+00, -2.3219e+00, -3.1273e+00, -2.0850e+00,
         -2.5570e+00, -2.5757e+00, -2.9587e+0

In [20]:
features_list = []

for smiles in tqdm(train['SMILES'].tolist()):
        # SMILES 문자열을 인코딩하고 모델에 전달
    inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Masked Language Modeling 작업에서 예측값인 "logits"를 추출
    logits = outputs.logits

    # 특성 추출
    features = logits.mean(dim=1)  # 특성 추출 방법은 평균을 사용했습니다. 다른 방법도 가능합니다.
    features_list.append(features)
    # # 추출된 특성 확인
    # print("Extracted Features Shape:", features.shape)
    # print("Extracted Features:", features)


  0%|          | 0/3498 [00:00<?, ?it/s]

In [21]:
train['Molecule_Embeddings'] = features_list

In [22]:
torch.cat(features_list).numpy()

array([[-2.6021812, -2.416166 , -2.331112 , ..., -1.4262753, -1.3408945,
        -2.887336 ],
       [-2.6957455, -2.3544765, -2.4149096, ..., -1.1744883, -1.2499803,
        -2.406683 ],
       [-2.6537948, -2.5313401, -2.3464794, ..., -1.4779081, -1.5680116,
        -2.1839519],
       ...,
       [-2.6111557, -2.130465 , -2.4433079, ..., -1.3749459, -1.3725666,
        -2.9375608],
       [-2.7023823, -2.175494 , -2.347566 , ..., -1.508014 , -1.0166966,
        -2.7035282],
       [-2.485038 , -2.1909103, -2.259136 , ..., -1.6437249, -1.6194086,
        -2.6226118]], dtype=float32)

In [23]:
X = np.concatenate(features_list, axis=0)

# 전치 수행
X_transposed = X.T
df = pd.DataFrame(X)
df #767개의 열. embedding 정보 포함.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,757,758,759,760,761,762,763,764,765,766
0,-2.602181,-2.416166,-2.331112,-2.174333,-2.822411,-2.385678,-2.322312,0.561042,-2.073987,-2.227908,...,-1.991607,-2.373958,-1.335646,-1.965763,-2.064113,-1.121240,-1.696131,-1.426275,-1.340894,-2.887336
1,-2.695745,-2.354476,-2.414910,-1.989317,-2.639670,-2.185341,-2.173566,0.673285,-2.034645,-2.337377,...,-2.068422,-2.138847,-1.377813,-2.052618,-2.012119,-1.122919,-1.606128,-1.174488,-1.249980,-2.406683
2,-2.653795,-2.531340,-2.346479,-1.918797,-2.537491,-2.759861,-1.968515,0.792431,-2.102950,-2.255583,...,-1.888332,-2.102875,-1.464725,-1.870678,-1.862562,-1.089160,-1.834340,-1.477908,-1.568012,-2.183952
3,-2.669900,-2.521494,-2.461554,-1.999143,-2.775923,-2.366805,-2.332054,0.622149,-2.126953,-2.440044,...,-2.056242,-2.363769,-1.080201,-2.008929,-2.099958,-1.080875,-1.639401,-1.374620,-1.332347,-2.770781
4,-2.408910,-2.244952,-2.493243,-1.964388,-2.863209,-2.118309,-2.101366,0.514143,-2.077693,-2.617118,...,-2.373341,-2.469370,-1.234159,-2.271900,-2.082072,-1.147488,-1.611428,-1.303321,-1.607484,-2.613079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,-2.812243,-2.362064,-2.392483,-2.286793,-2.696639,-2.621839,-2.499537,0.426544,-1.965696,-2.342347,...,-2.134776,-2.190667,-1.483879,-1.770519,-1.896272,-1.650752,-1.621845,-1.248276,-1.392809,-2.582400
3494,-2.511561,-2.340928,-2.470116,-2.358044,-2.794412,-2.667998,-2.299248,0.498471,-2.131320,-2.242376,...,-2.248943,-2.436692,-1.374239,-2.197369,-1.843468,-1.567010,-1.939841,-1.533525,-1.516160,-2.950501
3495,-2.611156,-2.130465,-2.443308,-2.409866,-3.105262,-2.331446,-2.334735,0.576573,-2.176611,-2.412583,...,-2.224009,-2.554606,-1.244632,-2.056181,-2.069504,-1.236695,-1.755226,-1.374946,-1.372567,-2.937561
3496,-2.702382,-2.175494,-2.347566,-2.310582,-2.948305,-2.436399,-2.148951,0.472372,-1.990057,-2.330903,...,-2.143824,-2.641134,-1.501472,-2.200529,-1.695927,-1.434264,-1.777464,-1.508014,-1.016697,-2.703528


In [29]:
train = pd.concat([train,df], axis = 1)

In [30]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,757,758,759,760,761,762,763,764,765,766
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,...,-1.991607,-2.373958,-1.335646,-1.965763,-2.064113,-1.121240,-1.696131,-1.426275,-1.340894,-2.887336
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,...,-2.068422,-2.138847,-1.377813,-2.052618,-2.012119,-1.122919,-1.606128,-1.174488,-1.249980,-2.406683
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,...,-1.888332,-2.102875,-1.464725,-1.870678,-1.862562,-1.089160,-1.834340,-1.477908,-1.568012,-2.183952
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,...,-2.056242,-2.363769,-1.080201,-2.008929,-2.099958,-1.080875,-1.639401,-1.374620,-1.332347,-2.770781
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,...,-2.373341,-2.469370,-1.234159,-2.271900,-2.082072,-1.147488,-1.611428,-1.303321,-1.607484,-2.613079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,...,-2.134776,-2.190667,-1.483879,-1.770519,-1.896272,-1.650752,-1.621845,-1.248276,-1.392809,-2.582400
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,...,-2.248943,-2.436692,-1.374239,-2.197369,-1.843468,-1.567010,-1.939841,-1.533525,-1.516160,-2.950501
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,...,-2.224009,-2.554606,-1.244632,-2.056181,-2.069504,-1.236695,-1.755226,-1.374946,-1.372567,-2.937561
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,...,-2.143824,-2.641134,-1.501472,-2.200529,-1.695927,-1.434264,-1.777464,-1.508014,-1.016697,-2.703528


In [27]:
test['SMILES']

Unnamed: 0,SMILES,SMILES.1
0,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,39
1,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,46
2,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,49
3,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,42
4,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,44
...,...,...
478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,33
479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,46
480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,45
481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,37


In [31]:
features_list_test = []

for smiles in tqdm(test['SMILES'].tolist()):
        # SMILES 문자열을 인코딩하고 모델에 전달
    inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Masked Language Modeling 작업에서 예측값인 "logits"를 추출
    logits = outputs.logits

    # 특성 추출
    features = logits.mean(dim=1)  # 특성 추출 방법은 평균을 사용했습니다. 다른 방법도 가능합니다.
    features_list_test.append(features)
    # # 추출된 특성 확인
    # print("Extracted Features Shape:", features.shape)
    # print("Extracted Features:", features)

  0%|          | 0/483 [00:00<?, ?it/s]

In [32]:
features_list_test

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# test['Molecule_Embeddings'] = 0

# test['Molecule_Embeddings'] = features_list_test

In [33]:
X = np.concatenate(features_list_test, axis=0)

# 전치 수행
X_transposed = X.T
df_test = pd.DataFrame(X)
df_test #767개의 열. embedding 정보 포함.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,757,758,759,760,761,762,763,764,765,766
0,-2.911492,-2.429448,-2.290789,-1.885554,-2.582995,-2.265126,-2.102386,0.739251,-1.881882,-2.519799,...,-2.017083,-2.284410,-1.434447,-2.051644,-2.081381,-1.225608,-1.474840,-0.949214,-1.302934,-2.309808
1,-2.679250,-2.312216,-2.471833,-2.179717,-3.076213,-2.453680,-2.148226,0.606782,-1.942386,-2.290443,...,-2.240098,-2.424101,-1.432492,-2.081653,-1.994273,-1.329621,-1.749228,-1.627684,-1.439588,-2.820971
2,-2.567198,-2.438649,-2.524597,-2.073301,-2.758177,-2.442147,-2.174640,0.646876,-1.920426,-2.339089,...,-2.096072,-2.407464,-1.481097,-2.192657,-1.846037,-1.310697,-1.921161,-1.096406,-1.345349,-2.932505
3,-2.619594,-2.140731,-2.174192,-2.299927,-2.760957,-2.632819,-2.174229,0.484088,-1.974229,-2.217012,...,-2.182104,-2.419306,-1.450977,-2.009630,-1.779920,-1.141035,-1.752328,-1.529193,-1.397301,-2.745353
4,-2.762625,-2.538386,-2.125707,-2.099150,-2.849681,-2.586657,-2.268958,0.570860,-2.093197,-2.494037,...,-2.083350,-2.233258,-1.200382,-2.050077,-2.039183,-1.174748,-1.718197,-1.384192,-1.354418,-2.636139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,-2.527565,-2.196282,-2.387540,-1.839070,-3.013028,-2.386143,-2.261006,0.644945,-2.040053,-2.443506,...,-2.274590,-2.431700,-1.346033,-1.974325,-1.983659,-1.226110,-1.784125,-1.289819,-1.525416,-2.374346
479,-2.719265,-2.329174,-2.680459,-2.191716,-2.959354,-2.092572,-2.101208,0.670284,-1.816799,-2.481618,...,-2.545435,-2.571863,-1.436994,-2.085016,-2.097533,-1.404426,-1.696949,-1.087436,-1.045423,-2.596229
480,-3.056160,-2.073884,-2.580395,-2.249451,-2.813703,-2.529265,-2.130109,0.735102,-1.896687,-2.346993,...,-2.006593,-2.375355,-1.517317,-2.064786,-1.763549,-1.347560,-1.772093,-1.291123,-1.099251,-2.687248
481,-2.434457,-2.532714,-2.485792,-2.135011,-2.544180,-2.306498,-2.336752,0.508938,-2.372232,-2.542052,...,-2.088274,-2.406288,-1.374177,-2.185571,-1.876330,-1.485279,-1.799034,-1.498245,-1.445723,-2.682884


In [None]:
# torch.cat(features_list).numpy()

In [34]:
test = pd.concat([test,df_test], axis = 1)

In [35]:
train.shape, test.shape


((3498, 778), (483, 776))

In [36]:
train.columns

Index([                'id',             'SMILES',                'MLM',
                      'HLM',              'AlogP',   'Molecular_Weight',
          'Num_H_Acceptors',       'Num_H_Donors', 'Num_RotatableBonds',
                     'LogD',
       ...
                        757,                  758,                  759,
                        760,                  761,                  762,
                        763,                  764,                  765,
                        766],
      dtype='object', length=778)

In [37]:
test.columns

Index([                        'id',                     'SMILES',
                            'AlogP',           'Molecular_Weight',
                  'Num_H_Acceptors',               'Num_H_Donors',
               'Num_RotatableBonds',                       'LogD',
       'Molecular_PolarSurfaceArea',                            0,
       ...
                                757,                          758,
                                759,                          760,
                                761,                          762,
                                763,                          764,
                                765,                          766],
      dtype='object', length=776)

In [38]:
train.to_csv(f'{DATA_PATH}train_molecule_embeddings.csv', index = False)
test.to_csv(f'{DATA_PATH}test_molecule_embeddings.csv', index = False)

# 전처리

### -> 이상치 제거 (총 18개)

In [39]:
# 1. 극단치만 판단해서 18개 제거
outliers = train[(train['MLM'] > 100.0) | (train['HLM'] > 100.0) | (train['AlogP'] < -3) | (train['Molecular_Weight'] > 800)
     | (train['Num_H_Acceptors'] > 14) | (train['Num_H_Donors'] > 9) | (train['Num_RotatableBonds'] > 20)
     | (train['LogD'] < -4) | (train['Molecular_PolarSurfaceArea'] > 250)]


In [40]:
outliers.index

Int64Index([ 179,  662,  834,  983, 1092, 1172, 1239, 1584, 2159, 2258, 2367,
            2410, 2586, 2711, 2948, 3157, 3247, 3403],
           dtype='int64')

In [41]:
train = train.drop(index=outliers.index, axis=0)


In [42]:
duplicates = train[train.duplicated(subset=['SMILES'], keep=False)]

duplicates.sort_values(by='SMILES')

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,757,758,759,760,761,762,763,764,765,766
2276,TRAIN_2276,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,1.535,31.453,3.556,262.309,3,0,4,3.556,...,-2.42867,-2.348599,-1.413202,-2.231666,-1.643445,-1.553527,-1.916796,-1.550059,-1.539189,-2.717372
451,TRAIN_0451,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.31,24.67,3.556,262.309,3,0,4,3.556,...,-2.42867,-2.348599,-1.413202,-2.231666,-1.643445,-1.553527,-1.916796,-1.550059,-1.539189,-2.717372
2891,TRAIN_2891,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,55.95,69.95,2.172,337.372,4,2,3,2.169,...,-2.179101,-2.809379,-1.444064,-2.073145,-1.718139,-1.383798,-1.86393,-1.235812,-1.162968,-2.82214
543,TRAIN_0543,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,68.485,85.872,2.172,337.372,4,2,3,2.169,...,-2.179101,-2.809379,-1.444064,-2.073145,-1.718139,-1.383798,-1.86393,-1.235812,-1.162968,-2.82214
837,TRAIN_0837,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,63.522,62.488,2.293,367.428,5,2,3,2.307,...,-2.076351,-2.348424,-1.13802,-1.897986,-2.062215,-1.187602,-1.764906,-1.341969,-1.324379,-2.897856
366,TRAIN_0366,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,73.74,66.85,2.293,367.428,5,2,3,2.307,...,-2.076351,-2.348424,-1.13802,-1.897986,-2.062215,-1.187602,-1.764906,-1.341969,-1.324379,-2.897856
1085,TRAIN_1085,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,23.63,71.47,1.684,381.45,7,1,4,1.684,...,-2.088642,-2.343824,-1.290519,-1.987467,-2.250722,-1.325639,-1.466831,-1.0712,-1.257508,-2.798274
2848,TRAIN_2848,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,40.657,99.9,1.684,381.45,7,1,4,1.684,...,-2.088642,-2.343824,-1.290519,-1.987467,-2.250722,-1.325639,-1.466831,-1.0712,-1.257508,-2.798274
2096,TRAIN_2096,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,1.88,61.28,2.843,360.49,3,0,4,2.843,...,-2.371149,-2.590324,-1.230046,-2.253333,-2.149612,-1.311024,-1.630403,-1.235411,-1.431738,-2.639368
1666,TRAIN_1666,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,5.494,25.397,2.843,360.49,3,0,4,2.843,...,-2.371149,-2.590324,-1.230046,-2.253333,-2.149612,-1.311024,-1.630403,-1.235411,-1.431738,-2.639368


In [43]:
processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()
processed_duplicate

  processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()


Unnamed: 0,SMILES,MLM,HLM
0,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.9225,28.0615
1,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,62.2175,77.911
2,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,68.631,64.669
3,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,32.1435,85.685
4,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,3.687,43.3385
5,CC(C)NC(=O)c1c(Cl)nn(C)c1NC(=O)c1cc(Br)nn1-c1n...,62.1085,68.1015
6,CC1CC(=O)N(c2ccc(-c3cccc(C#N)c3)cc2)N=C1c1ccc(...,43.17,31.13
7,CCCCC/N=c1\n(C)c(=O)nc2sccn12,2.3395,36.8145
8,CCOC(=O)CC1(NC(=O)N2Cc3c(sc4c3CCCC4)-n3cccc3C2...,3.442,3.6015
9,CCc1nc2cc(Br)c(C(=O)OC)nc2n1CC(=O)c1ccccc1,73.545,1.1345


In [44]:
tmp = pd.merge(processed_duplicate,duplicates,how='left',on='SMILES')

### -> 중복치 제거 (총 26개)

In [45]:
tmp = tmp.iloc[0::2]  # 첫 번째 행부터 시작하여 2씩 증가하여 행 선택
tmp.drop(['MLM_y','HLM_y'],axis =1,inplace=True)

In [46]:
train = train.drop_duplicates(subset=['SMILES'], keep=False) #기존 train에서 중복치들을 우선 모두 제거
train.shape

(3428, 778)

In [47]:
tmp.rename(columns={'HLM_x': 'HLM', 'MLM_x': 'MLM'}, inplace=True)

In [48]:
train = pd.concat([train, tmp], axis=0, ignore_index=True)
train.shape

(3454, 778)

## 학습 검증 추론

In [49]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [50]:
from sklearn.metrics import mean_squared_error

def rmse(y_valid, pred):
    mse = mean_squared_error(y_valid, pred)
    return np.sqrt(mse)

In [51]:
from sklearn.metrics import make_scorer

rmse_score = make_scorer(rmse, greater_is_better=False)

In [52]:
# 결측치 채우기
train["AlogP"] = np.where(pd.isna(train["AlogP"]), train["LogD"], train["AlogP"])
test["AlogP"] = np.where(pd.isna(test["AlogP"]), test["LogD"], test["AlogP"])

In [53]:
train_prop = train.iloc[:, 4:]
test_prop = test.iloc[:, 2:]

target_1 = train['MLM']
target_2 = train['HLM']

In [54]:
train_prop

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,0,1,2,...,757,758,759,760,761,762,763,764,765,766
0,3.259,400.495,5,2,8,3.259,117.37,-2.602181,-2.416166,-2.331112,...,-1.991607,-2.373958,-1.335646,-1.965763,-2.064113,-1.121240,-1.696131,-1.426275,-1.340894,-2.887336
1,2.169,301.407,2,1,2,2.172,73.47,-2.695745,-2.354476,-2.414910,...,-2.068422,-2.138847,-1.377813,-2.052618,-2.012119,-1.122919,-1.606128,-1.174488,-1.249980,-2.406683
2,1.593,297.358,5,0,3,1.585,62.45,-2.653795,-2.531340,-2.346479,...,-1.888332,-2.102875,-1.464725,-1.870678,-1.862562,-1.089160,-1.834340,-1.477908,-1.568012,-2.183952
3,4.771,494.652,6,0,5,3.475,92.60,-2.669900,-2.521494,-2.461554,...,-2.056242,-2.363769,-1.080201,-2.008929,-2.099958,-1.080875,-1.639401,-1.374620,-1.332347,-2.770781
4,2.335,268.310,3,0,1,2.337,42.43,-2.408910,-2.244952,-2.493243,...,-2.373341,-2.469370,-1.234159,-2.271900,-2.082072,-1.147488,-1.611428,-1.303321,-1.607484,-2.613079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,4.609,340.381,4,1,2,4.736,71.75,-2.569803,-2.460406,-2.307648,...,-1.897803,-2.244878,-1.145704,-1.966325,-1.927710,-1.243472,-1.953197,-1.679651,-1.594018,-2.746540
3450,4.282,553.480,5,0,7,4.009,65.98,-2.521334,-2.366596,-2.532322,...,-1.963544,-2.406016,-1.321722,-2.236139,-2.043725,-1.143327,-1.601323,-1.432468,-1.342690,-2.837662
3451,4.304,459.520,6,1,4,4.304,113.53,-2.646402,-2.371946,-2.526056,...,-2.308575,-2.393866,-1.374097,-1.956857,-2.063557,-1.320436,-1.718614,-1.335498,-1.451332,-2.911163
3452,-1.133,385.417,7,0,4,-1.133,108.65,-2.824234,-2.312467,-2.399506,...,-2.210922,-2.376919,-1.329937,-2.236022,-2.120409,-1.124050,-1.531582,-1.318419,-1.289543,-2.470385


In [55]:
test_prop

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,0,1,2,...,757,758,759,760,761,762,763,764,765,766
0,2.641,361.505,4,2,7,2.635,92.76,-2.911492,-2.429448,-2.290789,...,-2.017083,-2.284410,-1.434447,-2.051644,-2.081381,-1.225608,-1.474840,-0.949214,-1.302934,-2.309808
1,0.585,370.399,5,0,3,0.585,68.31,-2.679250,-2.312216,-2.471833,...,-2.240098,-2.424101,-1.432492,-2.081653,-1.994273,-1.329621,-1.749228,-1.627684,-1.439588,-2.820971
2,4.276,347.414,4,4,5,4.290,92.86,-2.567198,-2.438649,-2.524597,...,-2.096072,-2.407464,-1.481097,-2.192657,-1.846037,-1.310697,-1.921161,-1.096406,-1.345349,-2.932505
3,1.795,345.358,5,0,2,1.795,81.21,-2.619594,-2.140731,-2.174192,...,-2.182104,-2.419306,-1.450977,-2.009630,-1.779920,-1.141035,-1.752328,-1.529193,-1.397301,-2.745353
4,1.219,353.418,4,0,2,0.169,61.15,-2.762625,-2.538386,-2.125707,...,-2.083350,-2.233258,-1.200382,-2.050077,-2.039183,-1.174748,-1.718197,-1.384192,-1.354418,-2.636139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,4.207,306.443,2,1,7,4.207,55.13,-2.527565,-2.196282,-2.387540,...,-2.274590,-2.431700,-1.346033,-1.974325,-1.983659,-1.226110,-1.784125,-1.289819,-1.525416,-2.374346
479,-0.608,335.398,5,0,1,-1.736,70.16,-2.719265,-2.329174,-2.680459,...,-2.545435,-2.571863,-1.436994,-2.085016,-2.097533,-1.404426,-1.696949,-1.087436,-1.045423,-2.596229
480,1.792,349.383,3,1,3,1.792,69.72,-3.056160,-2.073884,-2.580395,...,-2.006593,-2.375355,-1.517317,-2.064786,-1.763549,-1.347560,-1.772093,-1.291123,-1.099251,-2.687248
481,0.790,341.132,3,2,2,0.423,69.64,-2.434457,-2.532714,-2.485792,...,-2.088274,-2.406288,-1.374177,-2.185571,-1.876330,-1.485279,-1.799034,-1.498245,-1.445723,-2.682884


In [56]:
train_prop = train_prop.astype(float)  # 모든 열을 float64로 변환
test_prop = test_prop.astype(float)  # 모든 열을 float64로 변환


In [57]:
object_columns_train = train_prop.select_dtypes(include=['object']).columns.tolist()
print(object_columns_train)

[]


In [58]:
object_columns_test = test_prop.select_dtypes(include=['object']).columns.tolist()
print(object_columns_test)

[]


In [59]:
train_prop.columns = train_prop.columns.astype(str)
test_prop.columns = test_prop.columns.astype(str)


In [60]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max 스케일링을 위한 객체 생성
scaler = MinMaxScaler()

# Train 데이터를 스케일링하고 변환
train_prop = scaler.fit_transform(train_prop)

# Test 데이터를 스케일링하고 변환
test_prop = scaler.transform(test_prop)


In [61]:
train_prop = pd.DataFrame(train_prop)
train_prop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,764,765,766,767,768,769,770,771,772,773
0,0.488883,0.408759,0.416667,0.250,0.470588,0.525975,0.546835,0.413118,0.371946,0.690973,...,0.699634,0.596252,0.487355,0.715031,0.310936,0.717066,0.521505,0.553874,0.567793,0.414553
1,0.394950,0.231359,0.166667,0.125,0.117647,0.439098,0.336496,0.340423,0.428284,0.624756,...,0.637765,0.804378,0.452152,0.628800,0.348608,0.715490,0.588881,0.727300,0.623673,0.758518
2,0.345312,0.224110,0.416667,0.000,0.176471,0.392184,0.283695,0.373017,0.266764,0.678830,...,0.782814,0.836221,0.379593,0.809433,0.456969,0.747174,0.418043,0.518311,0.428196,0.917908
3,0.619183,0.577331,0.500000,0.000,0.294118,0.543238,0.428154,0.360504,0.275756,0.587898,...,0.647575,0.605272,0.700611,0.672175,0.284965,0.754950,0.563973,0.589453,0.573046,0.497963
4,0.409255,0.172104,0.250000,0.000,0.058824,0.452286,0.187773,0.563281,0.528305,0.562858,...,0.392177,0.511791,0.572080,0.411093,0.297924,0.692431,0.584914,0.638563,0.403935,0.610817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,0.605222,0.301135,0.333333,0.125,0.117647,0.644022,0.328255,0.438275,0.331545,0.709515,...,0.775185,0.710517,0.645927,0.714473,0.409766,0.602349,0.329067,0.379354,0.412211,0.515309
3450,0.577042,0.682652,0.416667,0.000,0.411765,0.585918,0.300608,0.475932,0.417216,0.531977,...,0.722236,0.567873,0.498978,0.446598,0.325708,0.696337,0.592478,0.549609,0.566689,0.450101
3451,0.578938,0.514433,0.500000,0.125,0.235294,0.609495,0.528437,0.378761,0.412330,0.536928,...,0.444341,0.578629,0.455253,0.723873,0.311339,0.530115,0.504675,0.616400,0.499913,0.397503
3452,0.110393,0.381764,0.583333,0.000,0.235294,0.174952,0.505055,0.240594,0.466648,0.636929,...,0.522993,0.593631,0.492120,0.446713,0.270147,0.714429,0.644686,0.628164,0.599356,0.712932


In [62]:
test_prop = pd.DataFrame(test_prop)
test_prop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,764,765,766,767,768,769,770,771,772,773
0,0.435626,0.338954,0.333333,0.250,0.411765,0.476103,0.428921,0.172799,0.359816,0.722836,...,0.679114,0.675522,0.404871,0.629767,0.298425,0.619114,0.687163,0.882465,0.591125,0.827843
1,0.258445,0.354877,0.416667,0.000,0.176471,0.312260,0.311772,0.353239,0.466878,0.579776,...,0.499493,0.551865,0.406503,0.599973,0.361539,0.521495,0.481757,0.415148,0.507131,0.462045
2,0.576525,0.313726,0.333333,0.500,0.294118,0.608376,0.429400,0.440299,0.351414,0.538081,...,0.615495,0.566592,0.365926,0.489767,0.468942,0.539256,0.353049,0.781082,0.565055,0.382230
3,0.362720,0.310045,0.416667,0.000,0.117647,0.408967,0.373581,0.399589,0.623484,0.814972,...,0.546203,0.556109,0.391071,0.671479,0.516847,0.698488,0.479436,0.482987,0.533123,0.516159
4,0.313082,0.324475,0.333333,0.000,0.117647,0.279012,0.277466,0.288461,0.260330,0.853284,...,0.625741,0.720803,0.600278,0.631322,0.328999,0.666847,0.504987,0.582860,0.559481,0.594315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,0.570579,0.240375,0.166667,0.125,0.411765,0.601742,0.248622,0.471091,0.572753,0.646384,...,0.471713,0.545138,0.478683,0.706531,0.369229,0.618643,0.455633,0.647863,0.454378,0.781659
479,0.155636,0.292214,0.416667,0.000,0.058824,0.126758,0.320636,0.322150,0.451391,0.414919,...,0.253568,0.421062,0.402745,0.596634,0.286722,0.451288,0.520893,0.787261,0.749403,0.622875
480,0.362461,0.317251,0.250000,0.125,0.176471,0.408728,0.318528,0.060399,0.684532,0.493990,...,0.687563,0.595016,0.335687,0.616719,0.528708,0.504658,0.464641,0.646965,0.716318,0.557740
481,0.276112,0.302479,0.250000,0.250,0.117647,0.299313,0.318145,0.543432,0.265509,0.568745,...,0.621776,0.567632,0.455187,0.496802,0.446993,0.375406,0.444472,0.504303,0.503360,0.560863


#### CatBoost

In [63]:
!pip install CatBoost

Collecting CatBoost
  Downloading catboost-1.2.1.1-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: CatBoost
Successfully installed CatBoost-1.2.1.1


In [64]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor


cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in tqdm(cv.split(train_prop, target_1)):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = CatBoostRegressor(random_state=SEED, verbose=0)
    model.fit(x_train, y_train_1)
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    #HLM
    model = CatBoostRegressor(random_state=SEED, verbose=0)
    model.fit(x_train, y_train_2)
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

0it [00:00, ?it/s]

[31.802803651884705, 33.19507553318288, 32.177202439615506, 31.123517988050615, 32.27843634294656]


32.11540719113606

In [65]:
model.fit(train_prop, target_1)
cat_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
cat_pred_2 = model.predict(test_prop)

In [None]:
# import joblib

# # 모델 저장
# joblib.dump(mlm_model, 'mlm_model.pkl')
# joblib.dump(hlm_model, 'hlm_model.pkl')

#### AdaBoost

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold

# cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

# scores = []
# for tri, val in tqdm(cv.split(train_prop, target_1)):
#     # 학습데이터
#     x_train = train_prop.iloc[tri]
#     y_train_1 = target_1.iloc[tri]
#     y_train_2 = target_2.iloc[tri]

#     # 검증데이터
#     x_valid = train_prop.iloc[val]
#     y_valid_1 = target_1.iloc[val]
#     y_valid_2 = target_2.iloc[val]

#     # MLM
#     model = AdaBoostRegressor(random_state=SEED)
#     model.fit(x_train, y_train_1)
#     pred = model.predict(x_valid)
#     score_1 = rmse(y_valid_1, pred)

#     #HLM
#     model = AdaBoostRegressor(random_state=SEED)
#     model.fit(x_train, y_train_2)
#     pred = model.predict(x_valid)
#     score_2 = rmse(y_valid_2, pred)

#     score = 0.5*score_1 + 0.5*score_2
#     scores.append(score)

# print(scores)
# np.mean(scores)

5it [02:54, 34.82s/it]

[32.36238469218258, 33.36197027802679, 33.24936591196395, 32.46851128996565, 32.980572138938186]





32.88456086221543

In [None]:
# model.fit(train_prop, target_1)
# ada_pred_1 = model.predict(test_prop)

# model.fit(train_prop, target_2)
# ada_pred_2 = model.predict(test_prop)

#### randomforest

In [66]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in tqdm(cv.split(train_prop, target_1)):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = RandomForestRegressor(random_state=SEED)
    model.fit(x_train, y_train_1)
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    #HLM
    model = RandomForestRegressor(random_state=SEED)
    model.fit(x_train, y_train_2)
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

0it [00:00, ?it/s]

[32.12753751800642, 33.69555781025497, 32.54389039122165, 31.629482312819015, 32.4144383440483]


32.48218127527007

In [67]:
model.fit(train_prop, target_1)
rf_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
rf_pred_2 = model.predict(test_prop)

#### XGBRegressor

In [73]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in tqdm(cv.split(train_prop, target_1)):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = XGBRegressor(random_state=SEED)
    model.fit(x_train, y_train_1, eval_set=[(x_valid, y_valid_1)], eval_metric='rmse')
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    # HLM
    model = XGBRegressor(random_state=SEED)
    model.fit(x_train, y_train_2, eval_set=[(x_valid, y_valid_2)], eval_metric='rmse')
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

0it [00:00, ?it/s]



[0]	validation_0-rmse:41.31157
[1]	validation_0-rmse:36.27654
[2]	validation_0-rmse:33.64459
[3]	validation_0-rmse:32.55245
[4]	validation_0-rmse:31.94340
[5]	validation_0-rmse:31.87027
[6]	validation_0-rmse:31.84022
[7]	validation_0-rmse:31.70661
[8]	validation_0-rmse:31.75064
[9]	validation_0-rmse:31.87502
[10]	validation_0-rmse:32.01014
[11]	validation_0-rmse:31.95646
[12]	validation_0-rmse:32.17320
[13]	validation_0-rmse:32.18563
[14]	validation_0-rmse:32.20507
[15]	validation_0-rmse:32.25809
[16]	validation_0-rmse:32.35511
[17]	validation_0-rmse:32.34016
[18]	validation_0-rmse:32.39601
[19]	validation_0-rmse:32.35373
[20]	validation_0-rmse:32.32135
[21]	validation_0-rmse:32.34892
[22]	validation_0-rmse:32.40157
[23]	validation_0-rmse:32.39396
[24]	validation_0-rmse:32.42632
[25]	validation_0-rmse:32.45448
[26]	validation_0-rmse:32.47249
[27]	validation_0-rmse:32.42618
[28]	validation_0-rmse:32.49055
[29]	validation_0-rmse:32.55503
[30]	validation_0-rmse:32.56847
[31]	validation_0-



[0]	validation_0-rmse:50.40639
[1]	validation_0-rmse:42.66418
[2]	validation_0-rmse:38.26563
[3]	validation_0-rmse:35.91813
[4]	validation_0-rmse:34.75305
[5]	validation_0-rmse:34.17028
[6]	validation_0-rmse:34.00774
[7]	validation_0-rmse:34.09969
[8]	validation_0-rmse:33.98475
[9]	validation_0-rmse:33.95651
[10]	validation_0-rmse:33.86038
[11]	validation_0-rmse:33.83134
[12]	validation_0-rmse:33.90231
[13]	validation_0-rmse:33.90872
[14]	validation_0-rmse:33.88084
[15]	validation_0-rmse:33.95145
[16]	validation_0-rmse:33.96738
[17]	validation_0-rmse:33.98689
[18]	validation_0-rmse:34.01672
[19]	validation_0-rmse:34.07890
[20]	validation_0-rmse:34.19708
[21]	validation_0-rmse:34.27475
[22]	validation_0-rmse:34.31802
[23]	validation_0-rmse:34.28744
[24]	validation_0-rmse:34.36098
[25]	validation_0-rmse:34.32804
[26]	validation_0-rmse:34.35403
[27]	validation_0-rmse:34.38031
[28]	validation_0-rmse:34.43417
[29]	validation_0-rmse:34.46545
[30]	validation_0-rmse:34.48900
[31]	validation_0-



[0]	validation_0-rmse:42.66612
[1]	validation_0-rmse:37.92284
[2]	validation_0-rmse:35.72278
[3]	validation_0-rmse:34.49579
[4]	validation_0-rmse:34.08059
[5]	validation_0-rmse:34.02589
[6]	validation_0-rmse:34.06222
[7]	validation_0-rmse:34.12660
[8]	validation_0-rmse:34.15291
[9]	validation_0-rmse:34.24494
[10]	validation_0-rmse:34.24214
[11]	validation_0-rmse:34.18349
[12]	validation_0-rmse:34.28836
[13]	validation_0-rmse:34.45754
[14]	validation_0-rmse:34.34613
[15]	validation_0-rmse:34.43328
[16]	validation_0-rmse:34.46866
[17]	validation_0-rmse:34.49982
[18]	validation_0-rmse:34.52589
[19]	validation_0-rmse:34.63353
[20]	validation_0-rmse:34.62887
[21]	validation_0-rmse:34.66869
[22]	validation_0-rmse:34.74560
[23]	validation_0-rmse:34.79666
[24]	validation_0-rmse:34.75990
[25]	validation_0-rmse:34.74943
[26]	validation_0-rmse:34.75856
[27]	validation_0-rmse:34.83382
[28]	validation_0-rmse:34.87295
[29]	validation_0-rmse:34.87016
[30]	validation_0-rmse:34.94648
[31]	validation_0-



[0]	validation_0-rmse:49.75379
[1]	validation_0-rmse:42.34665
[2]	validation_0-rmse:38.39438
[3]	validation_0-rmse:36.14224
[4]	validation_0-rmse:35.22530
[5]	validation_0-rmse:34.90382
[6]	validation_0-rmse:34.78561
[7]	validation_0-rmse:34.87095
[8]	validation_0-rmse:34.87155
[9]	validation_0-rmse:34.90661
[10]	validation_0-rmse:35.10233
[11]	validation_0-rmse:35.03248
[12]	validation_0-rmse:35.16068
[13]	validation_0-rmse:35.13148
[14]	validation_0-rmse:35.20661
[15]	validation_0-rmse:35.26873
[16]	validation_0-rmse:35.31309
[17]	validation_0-rmse:35.25242
[18]	validation_0-rmse:35.24454
[19]	validation_0-rmse:35.23863
[20]	validation_0-rmse:35.21870
[21]	validation_0-rmse:35.25816
[22]	validation_0-rmse:35.28029
[23]	validation_0-rmse:35.27425
[24]	validation_0-rmse:35.28123
[25]	validation_0-rmse:35.36648
[26]	validation_0-rmse:35.42744
[27]	validation_0-rmse:35.48154
[28]	validation_0-rmse:35.52761
[29]	validation_0-rmse:35.45996
[30]	validation_0-rmse:35.44530
[31]	validation_0-



[0]	validation_0-rmse:44.40057
[1]	validation_0-rmse:39.28803
[2]	validation_0-rmse:36.60263
[3]	validation_0-rmse:34.91148
[4]	validation_0-rmse:34.03846
[5]	validation_0-rmse:33.77365
[6]	validation_0-rmse:33.59888
[7]	validation_0-rmse:33.39497
[8]	validation_0-rmse:33.45153
[9]	validation_0-rmse:33.44430
[10]	validation_0-rmse:33.37921
[11]	validation_0-rmse:33.46392
[12]	validation_0-rmse:33.62041
[13]	validation_0-rmse:33.56804
[14]	validation_0-rmse:33.50060
[15]	validation_0-rmse:33.50285
[16]	validation_0-rmse:33.54956
[17]	validation_0-rmse:33.50524
[18]	validation_0-rmse:33.56642
[19]	validation_0-rmse:33.65640
[20]	validation_0-rmse:33.77286
[21]	validation_0-rmse:33.79966
[22]	validation_0-rmse:33.81238
[23]	validation_0-rmse:33.81736
[24]	validation_0-rmse:33.81029
[25]	validation_0-rmse:33.85518
[26]	validation_0-rmse:33.95539
[27]	validation_0-rmse:33.98323
[28]	validation_0-rmse:34.12429
[29]	validation_0-rmse:34.14642
[30]	validation_0-rmse:34.19905
[31]	validation_0-



[0]	validation_0-rmse:50.83989
[1]	validation_0-rmse:42.90380
[2]	validation_0-rmse:38.26798
[3]	validation_0-rmse:35.76923
[4]	validation_0-rmse:34.41519
[5]	validation_0-rmse:33.99913
[6]	validation_0-rmse:33.74404
[7]	validation_0-rmse:33.74717
[8]	validation_0-rmse:33.84319
[9]	validation_0-rmse:33.75991
[10]	validation_0-rmse:33.72445
[11]	validation_0-rmse:33.78877
[12]	validation_0-rmse:33.91417
[13]	validation_0-rmse:33.94190
[14]	validation_0-rmse:33.88076
[15]	validation_0-rmse:33.95278
[16]	validation_0-rmse:33.97950
[17]	validation_0-rmse:34.09531
[18]	validation_0-rmse:34.22142
[19]	validation_0-rmse:34.26534
[20]	validation_0-rmse:34.24207
[21]	validation_0-rmse:34.25710
[22]	validation_0-rmse:34.29416
[23]	validation_0-rmse:34.29711
[24]	validation_0-rmse:34.33853
[25]	validation_0-rmse:34.36822
[26]	validation_0-rmse:34.36646
[27]	validation_0-rmse:34.41836
[28]	validation_0-rmse:34.41197
[29]	validation_0-rmse:34.38923
[30]	validation_0-rmse:34.40123
[31]	validation_0-



[0]	validation_0-rmse:42.73943
[1]	validation_0-rmse:38.03611
[2]	validation_0-rmse:35.29391
[3]	validation_0-rmse:33.79337
[4]	validation_0-rmse:32.99996
[5]	validation_0-rmse:32.68755
[6]	validation_0-rmse:32.55074
[7]	validation_0-rmse:32.28685
[8]	validation_0-rmse:32.22408
[9]	validation_0-rmse:32.19858
[10]	validation_0-rmse:32.16196
[11]	validation_0-rmse:32.13016
[12]	validation_0-rmse:32.06644
[13]	validation_0-rmse:31.99285
[14]	validation_0-rmse:31.91684
[15]	validation_0-rmse:31.95100
[16]	validation_0-rmse:31.92286
[17]	validation_0-rmse:31.87939
[18]	validation_0-rmse:31.87061
[19]	validation_0-rmse:31.89201
[20]	validation_0-rmse:31.98645
[21]	validation_0-rmse:32.03288
[22]	validation_0-rmse:32.07429
[23]	validation_0-rmse:32.09901
[24]	validation_0-rmse:32.07081
[25]	validation_0-rmse:31.94043
[26]	validation_0-rmse:32.02827
[27]	validation_0-rmse:32.01869
[28]	validation_0-rmse:31.94441
[29]	validation_0-rmse:31.95631
[30]	validation_0-rmse:31.97228
[31]	validation_0-



[0]	validation_0-rmse:50.36023
[1]	validation_0-rmse:42.36663
[2]	validation_0-rmse:37.84455
[3]	validation_0-rmse:35.26012
[4]	validation_0-rmse:33.85732
[5]	validation_0-rmse:33.08991
[6]	validation_0-rmse:32.85887
[7]	validation_0-rmse:32.88608
[8]	validation_0-rmse:32.69949
[9]	validation_0-rmse:32.83146
[10]	validation_0-rmse:32.87382
[11]	validation_0-rmse:32.97654
[12]	validation_0-rmse:32.91585
[13]	validation_0-rmse:32.79539
[14]	validation_0-rmse:32.90294
[15]	validation_0-rmse:32.96274
[16]	validation_0-rmse:32.99705
[17]	validation_0-rmse:33.07173
[18]	validation_0-rmse:33.09198
[19]	validation_0-rmse:33.13374
[20]	validation_0-rmse:33.09538
[21]	validation_0-rmse:33.18646
[22]	validation_0-rmse:33.15758
[23]	validation_0-rmse:33.27233
[24]	validation_0-rmse:33.32843
[25]	validation_0-rmse:33.33023
[26]	validation_0-rmse:33.35343
[27]	validation_0-rmse:33.33426
[28]	validation_0-rmse:33.38896
[29]	validation_0-rmse:33.51184
[30]	validation_0-rmse:33.52745
[31]	validation_0-



[0]	validation_0-rmse:42.22787
[1]	validation_0-rmse:37.17081
[2]	validation_0-rmse:34.29105
[3]	validation_0-rmse:32.98835
[4]	validation_0-rmse:32.42606
[5]	validation_0-rmse:32.41339
[6]	validation_0-rmse:32.32893
[7]	validation_0-rmse:32.28518
[8]	validation_0-rmse:32.23419
[9]	validation_0-rmse:32.38956
[10]	validation_0-rmse:32.45046
[11]	validation_0-rmse:32.38966
[12]	validation_0-rmse:32.47613
[13]	validation_0-rmse:32.55607
[14]	validation_0-rmse:32.64398
[15]	validation_0-rmse:32.65156
[16]	validation_0-rmse:32.68945
[17]	validation_0-rmse:32.89368
[18]	validation_0-rmse:32.89140
[19]	validation_0-rmse:32.96627
[20]	validation_0-rmse:33.00123
[21]	validation_0-rmse:33.09896
[22]	validation_0-rmse:33.14718
[23]	validation_0-rmse:33.14637
[24]	validation_0-rmse:33.11701
[25]	validation_0-rmse:33.09494
[26]	validation_0-rmse:33.06334
[27]	validation_0-rmse:33.08748
[28]	validation_0-rmse:33.08322
[29]	validation_0-rmse:33.09784
[30]	validation_0-rmse:33.08327
[31]	validation_0-



[0]	validation_0-rmse:51.31418
[1]	validation_0-rmse:43.25502
[2]	validation_0-rmse:38.78043
[3]	validation_0-rmse:36.15170
[4]	validation_0-rmse:34.71032
[5]	validation_0-rmse:34.16897
[6]	validation_0-rmse:34.07559
[7]	validation_0-rmse:34.25752
[8]	validation_0-rmse:34.11941
[9]	validation_0-rmse:34.30556
[10]	validation_0-rmse:34.30698
[11]	validation_0-rmse:34.34871
[12]	validation_0-rmse:34.37722
[13]	validation_0-rmse:34.43217
[14]	validation_0-rmse:34.52776
[15]	validation_0-rmse:34.65497
[16]	validation_0-rmse:34.78296
[17]	validation_0-rmse:34.80627
[18]	validation_0-rmse:34.82832
[19]	validation_0-rmse:34.81720
[20]	validation_0-rmse:34.85507
[21]	validation_0-rmse:34.98268
[22]	validation_0-rmse:35.09844
[23]	validation_0-rmse:35.04843
[24]	validation_0-rmse:35.22147
[25]	validation_0-rmse:35.22868
[26]	validation_0-rmse:35.24332
[27]	validation_0-rmse:35.18232
[28]	validation_0-rmse:35.15092
[29]	validation_0-rmse:35.18717
[30]	validation_0-rmse:35.21714
[31]	validation_0-

34.19991226677554

In [None]:
model.fit(train_prop, target_1)
xgb_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
xgb_pred_2 = model.predict(test_prop)

#### LGBMRegressor

In [68]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in tqdm(cv.split(train_prop, target_1)):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_1, eval_set=[(x_valid, y_valid_1)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    # HLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_2, eval_set=[(x_valid, y_valid_2)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)


    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

0it [00:00, ?it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196641
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 774
[LightGBM] [Info] Start training from score 37.549743
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196641
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 774
[LightGBM] [Info] Start training from score 53.023585
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196642
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 774
[LightGBM] [Info] Start training from score 37.414670
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196642
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 774
[LightGBM] [Info] Start training from score 53.194288
You can set `force_col_wise=true` to

32.58561886687449

In [69]:
model.fit(train_prop, target_1)
lgbm_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
lgbm_pred_2 = model.predict(test_prop)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196642
[LightGBM] [Info] Number of data points in the train set: 3454, number of used features: 774
[LightGBM] [Info] Start training from score 37.276169
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196642
[LightGBM] [Info] Number of data points in the train set: 3454, number of used features: 774
[LightGBM] [Info] Start training from score 53.008790


In [None]:
# mlm_pred = (cat_pred_1 + ada_pred_1 + rf_pred_1 + xgb_pred_1 + lgbm_pred_1) / 5
# hlm_pred = (cat_pred_2 + ada_pred_2 +rf_pred_2 + xgb_pred_2 + lgbm_pred_2) / 5

In [70]:
mlm_pred = (cat_pred_1 + rf_pred_1 + lgbm_pred_1) / 3
hlm_pred = (cat_pred_2 + rf_pred_2 + lgbm_pred_2) / 3

In [71]:
submission['MLM'] = mlm_pred
submission['HLM'] = hlm_pred

In [None]:
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,23.685651,47.797643
1,TEST_001,56.479102,77.945344
2,TEST_002,39.223625,53.439247
3,TEST_003,45.507721,73.584156
4,TEST_004,66.290823,78.858035
...,...,...,...
478,TEST_478,17.544773,36.862662
479,TEST_479,75.631404,88.271733
480,TEST_480,54.898030,72.452000
481,TEST_481,63.882227,69.706789


In [72]:
submission.to_csv("submission_ML18(chemberta).csv", index=False)

In [None]:
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,23.685651,47.797643
1,TEST_001,56.479102,77.945344
2,TEST_002,39.223625,53.439247
3,TEST_003,45.507721,73.584156
4,TEST_004,66.290823,78.858035
...,...,...,...
478,TEST_478,17.544773,36.862662
479,TEST_479,75.631404,88.271733
480,TEST_480,54.898030,72.452000
481,TEST_481,63.882227,69.706789


In [None]:
sub_16 = pd.read_csv('submission_ML16(preprocess).csv')

In [None]:
sub_16

Unnamed: 0,id,MLM,HLM
0,TEST_000,34.270580,55.729152
1,TEST_001,60.907718,63.722749
2,TEST_002,30.194104,51.532413
3,TEST_003,57.818966,74.373008
4,TEST_004,63.833841,80.345176
...,...,...,...
478,TEST_478,7.256792,19.426390
479,TEST_479,78.237783,86.651548
480,TEST_480,48.114328,86.437328
481,TEST_481,59.538181,60.353697


# ChemBERTa-77M-MLM (DeepChem)
- https://huggingface.co/DeepChem/ChemBERTa-77M-MLM