In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm.auto import tqdm
import random
import os


import matplotlib.pyplot as plt
import seaborn as sns


DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'
SEED = 42


def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

In [4]:
train.shape, test.shape, submission.shape

((3498, 11), (483, 9), (483, 3))

# 전처리

### -> 이상치 제거 (총 18개)

In [None]:
# 1. 극단치만 판단해서 18개 제거
outliers = train[(train['MLM'] > 100.0) | (train['HLM'] > 100.0) | (train['AlogP'] < -3) | (train['Molecular_Weight'] > 800)
     | (train['Num_H_Acceptors'] > 14) | (train['Num_H_Donors'] > 9) | (train['Num_RotatableBonds'] > 20)
     | (train['LogD'] < -4) | (train['Molecular_PolarSurfaceArea'] > 250)]


In [None]:
outliers.index

Int64Index([ 179,  662,  834,  983, 1092, 1172, 1239, 1584, 2159, 2258, 2367,
            2410, 2586, 2711, 2948, 3157, 3247, 3403],
           dtype='int64')

In [None]:
len(outliers.index)

18

In [None]:
train = train.drop(index=outliers.index, axis=0)


In [None]:
train.shape # 3498 -> 3480

(3480, 11)

In [None]:
duplicates = train[train.duplicated(subset=['SMILES'], keep=False)]

duplicates.sort_values(by='SMILES')


Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
2276,TRAIN_2276,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,1.535,31.453,3.556,262.309,3,0,4,3.556,43.6
451,TRAIN_0451,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.31,24.67,3.556,262.309,3,0,4,3.556,43.6
2891,TRAIN_2891,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,55.95,69.95,2.172,337.372,4,2,3,2.169,82.0
543,TRAIN_0543,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,68.485,85.872,2.172,337.372,4,2,3,2.169,82.0
837,TRAIN_0837,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,63.522,62.488,2.293,367.428,5,2,3,2.307,139.85
366,TRAIN_0366,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,73.74,66.85,2.293,367.428,5,2,3,2.307,139.85
1085,TRAIN_1085,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,23.63,71.47,1.684,381.45,7,1,4,1.684,126.52
2848,TRAIN_2848,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,40.657,99.9,1.684,381.45,7,1,4,1.684,126.52
2096,TRAIN_2096,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,1.88,61.28,2.843,360.49,3,0,4,2.843,49.85
1666,TRAIN_1666,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,5.494,25.397,2.843,360.49,3,0,4,2.843,49.85


In [None]:
duplicates.shape # 이상치 제거로 27 -> 26쌍

(52, 11)

In [None]:
processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()
processed_duplicate

  processed_duplicate = duplicates.groupby('SMILES')['MLM','HLM'].mean().reset_index()


Unnamed: 0,SMILES,MLM,HLM
0,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.9225,28.0615
1,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,62.2175,77.911
2,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,68.631,64.669
3,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,32.1435,85.685
4,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,3.687,43.3385
5,CC(C)NC(=O)c1c(Cl)nn(C)c1NC(=O)c1cc(Br)nn1-c1n...,62.1085,68.1015
6,CC1CC(=O)N(c2ccc(-c3cccc(C#N)c3)cc2)N=C1c1ccc(...,43.17,31.13
7,CCCCC/N=c1\n(C)c(=O)nc2sccn12,2.3395,36.8145
8,CCOC(=O)CC1(NC(=O)N2Cc3c(sc4c3CCCC4)-n3cccc3C2...,3.442,3.6015
9,CCc1nc2cc(Br)c(C(=O)OC)nc2n1CC(=O)c1ccccc1,73.545,1.1345


In [None]:
tmp = pd.merge(processed_duplicate,duplicates,how='left',on='SMILES')
tmp

Unnamed: 0,SMILES,MLM_x,HLM_x,id,MLM_y,HLM_y,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.9225,28.0615,TRAIN_0451,0.31,24.67,3.556,262.309,3,0,4,3.556,43.6
1,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,0.9225,28.0615,TRAIN_2276,1.535,31.453,3.556,262.309,3,0,4,3.556,43.6
2,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,62.2175,77.911,TRAIN_0543,68.485,85.872,2.172,337.372,4,2,3,2.169,82.0
3,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,62.2175,77.911,TRAIN_2891,55.95,69.95,2.172,337.372,4,2,3,2.169,82.0
4,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,68.631,64.669,TRAIN_0366,73.74,66.85,2.293,367.428,5,2,3,2.307,139.85
5,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,68.631,64.669,TRAIN_0837,63.522,62.488,2.293,367.428,5,2,3,2.307,139.85
6,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,32.1435,85.685,TRAIN_1085,23.63,71.47,1.684,381.45,7,1,4,1.684,126.52
7,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,32.1435,85.685,TRAIN_2848,40.657,99.9,1.684,381.45,7,1,4,1.684,126.52
8,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,3.687,43.3385,TRAIN_1666,5.494,25.397,2.843,360.49,3,0,4,2.843,49.85
9,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,3.687,43.3385,TRAIN_2096,1.88,61.28,2.843,360.49,3,0,4,2.843,49.85


### -> 중복치 제거 (총 26개)

In [None]:
tmp = tmp.iloc[0::2]  # 첫 번째 행부터 시작하여 2씩 증가하여 행 선택
tmp.drop(['MLM_y','HLM_y'],axis =1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp.drop(['MLM_y','HLM_y'],axis =1,inplace=True)


In [None]:
tmp.columns = ['SMILES', 'MLM', 'HLM','id',  'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea']

In [None]:
processed_duplicate = tmp[['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea']]

In [None]:
train = train.drop_duplicates(subset=['SMILES'], keep=False) #기존 train에서 중복치들을 우선 모두 제거
train.shape

(3428, 11)

In [None]:
processed_duplicate.shape

(26, 11)

In [None]:
train = pd.concat([train, processed_duplicate], axis=0, ignore_index=True)
train.shape

(3454, 11)

# 피처 추가

In [None]:
!pip install Chem

Collecting Chem
  Downloading chem-1.2.0-py3-none-any.whl (24 kB)
Installing collected packages: Chem
Successfully installed Chem-1.2.0


In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [None]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole     # 화면에 출력하기 위한 옵션
IPythonConsole.ipython_useSVG=True

In [None]:
train['mol'] = train['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
train['mol'] = train['mol'].apply(lambda x: Chem.AddHs(x))
train['num_of_atoms'] = train['mol'].apply(lambda x: x.GetNumAtoms())
train['num_of_heavy_atoms'] = train['mol'].apply(lambda x: x.GetNumHeavyAtoms())
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37,<rdkit.Chem.rdchem.Mol object at 0x7a078519c7b0>,52,28
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47,<rdkit.Chem.rdchem.Mol object at 0x7a078519c820>,40,21
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,<rdkit.Chem.rdchem.Mol object at 0x7a078519c890>,41,22
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6,<rdkit.Chem.rdchem.Mol object at 0x7a078519c900>,69,35
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43,<rdkit.Chem.rdchem.Mol object at 0x7a078519c970>,36,20


In [None]:
# 탄소 패턴을 지정한다
c_patt = Chem.MolFromSmiles('C')
# 이 패턴이 들어있는 곳을 찾는다. 패턴의 수를 세면 탄소 원자가 몇개 들어있는지 알 수 있다
print(train['mol'][0].GetSubstructMatches(c_patt))

((0,), (1,), (3,), (4,), (5,), (6,), (7,), (9,), (11,), (12,), (13,), (14,), (16,), (17,), (19,), (20,), (23,), (24,), (26,), (27,))


- num_of_{}_atoms 추가

In [None]:
# 임의의 패턴(원자)를 몇개 포함하고 있는지를 얻는 함수
def number_of_atoms(atom_list, df):
    for i in atom_list:
        df['num_of_{}_atoms'.format(i)] = df['mol'].apply(lambda x: len(x.GetSubstructMatches(Chem.MolFromSmiles(i))))

number_of_atoms(['C', 'O', 'N', 'Cl'], train) # 탄소, 산소, 질소, 염소
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37,<rdkit.Chem.rdchem.Mol object at 0x7a078519c7b0>,52,28,20,3,4,0
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47,<rdkit.Chem.rdchem.Mol object at 0x7a078519c820>,40,21,16,1,3,0
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,<rdkit.Chem.rdchem.Mol object at 0x7a078519c890>,41,22,15,0,7,0
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6,<rdkit.Chem.rdchem.Mol object at 0x7a078519c900>,69,35,26,2,6,0
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43,<rdkit.Chem.rdchem.Mol object at 0x7a078519c970>,36,20,16,2,2,0


- 테스트 데이터에도 추가

In [None]:
test['mol'] = test['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
test['mol'] = test['mol'].apply(lambda x: Chem.AddHs(x))
test['num_of_atoms'] = test['mol'].apply(lambda x: x.GetNumAtoms())
test['num_of_heavy_atoms'] = test['mol'].apply(lambda x: x.GetNumHeavyAtoms())

number_of_atoms(['C','O', 'N', 'Cl'], test)
test.head()

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,<rdkit.Chem.rdchem.Mol object at 0x7a0785106880>,52,25,18,1,5,0
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,<rdkit.Chem.rdchem.Mol object at 0x7a0785106810>,49,27,20,5,2,0
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86,<rdkit.Chem.rdchem.Mol object at 0x7a07851067a0>,47,26,20,1,5,0
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,<rdkit.Chem.rdchem.Mol object at 0x7a0785106730>,41,26,18,1,7,0
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,<rdkit.Chem.rdchem.Mol object at 0x7a07851066c0>,49,26,19,2,5,0


# GNN

In [None]:
!pip install torch




In [None]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/661.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/661.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m655.4/661.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910454 sha256=9eef

In [None]:
import torch
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MolFromSmiles
from torch_geometric.data import Data

# SMILES 문자열을 입력으로 받아 RDKit Mol 객체로 변환
def smiles_to_mol(smiles):
    mol = MolFromSmiles(smiles)
    mol = Chem.AddHs(mol)  # 수소 원자 추가
    mol = Chem.MolToSmiles(mol)  # 정규화된 SMILES로 변환
    mol = Chem.MolFromSmiles(mol)
    return mol

# RDKit Mol 객체를 PyTorch Geometric Data로 변환
def mol_to_geometric_data(mol):
    num_atoms = mol.GetNumAtoms()
    edge_indices = []
    edge_attr = []

    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_indices.append((start, end))
        edge_indices.append((end, start))  # 무방향 그래프이므로 역방향 엣지 추가
        edge_attr.extend([bond.GetBondTypeAsDouble()] * 2)

    x = torch.eye(num_atoms)  # 노드 피처는 항등 행렬로 초기화

    data = Data(x=x, edge_index=torch.tensor(edge_indices).t().contiguous(), edge_attr=torch.tensor(edge_attr))
    return data

# SMILES 문자열을 입력으로 받아 그래프 데이터로 변환
def smiles_to_graph(smiles):
    mol = smiles_to_mol(smiles)
    data = mol_to_geometric_data(mol)
    return data



In [None]:

# 예제 SMILES 문자열
smiles_string = "CCO"  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환
graph_data = smiles_to_graph(smiles_string)

# 그래프 데이터 확인
print(graph_data)

# 그래프 데이터에서 필요한 정보를 추출하여 사용하세요

Data(x=[3, 3], edge_index=[2, 4], edge_attr=[4])


In [None]:
graph_data.x.numpy().sum()

3.0

In [None]:
graph_data.edge_index.numpy().sum()

8

In [None]:
graph_data.edge_attr.numpy().sum()

4.0

In [None]:

# 예제 SMILES 문자열
smiles_string = train['SMILES'][0]  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환
graph_data = smiles_to_graph(smiles_string)

# 그래프 데이터 확인
print(graph_data)

Data(x=[28, 28], edge_index=[2, 60], edge_attr=[60])


In [None]:
graph_data.x.numpy().sum()

28.0

In [None]:
graph_data.edge_index.numpy().sum()

1550

In [None]:
graph_data.edge_attr.numpy().sum()

78.0

In [None]:
# data.x : 노드 특징 행렬
# [num_nodes, num_node_features]
# data.edge_index : 그래프의 연결성
# [2, num_edges]
# data.edge_attr : 엣지 특징 행렬
# [num_edges, num_edge_features]

In [None]:
# # 예제 SMILES 문자열
# smiles_string = train['SMILES'][i]  # 여기에 원하는 SMILES 문자열을 넣으세요

# # SMILES를 그래프로 변환
# graph_data = smiles_to_graph(smiles_string)

# # 그래프 데이터 확인
# # print(graph_data)

# train['graph_x'][i] = graph_data.x
# train['graph_index'][i] = graph_data.edge_index
# train['graph_attr'][i] = graph_data.edge_attr




# # 그래프 데이터에서 필요한 정보를 추출하여 사용하세요

In [None]:
df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])
df['SMILES'] = train['SMILES']

In [None]:
df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])

# 예제 SMILES 문자열


smiles_strings = train['SMILES']  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환하고 데이터프레임에 추가
for smiles_string in smiles_strings:
    graph_data = smiles_to_graph(smiles_string)  # 앞서 정의한 함수를 사용하여 SMILES를 그래프로 변환

    # 데이터프레임에 추가
    df = df.append({'SMILES': smiles_string,
                          'graph_x': graph_data.x.numpy().sum(),
                          'graph_index': graph_data.edge_index.numpy().sum(),
                          'graph_attr': graph_data.edge_attr.numpy().sum()}, ignore_index=True)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df

In [None]:
df

Unnamed: 0,SMILES,graph_x,graph_index,graph_attr
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,28.0,1550,78.0
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,21.0,898,59.0
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,22.0,1026,65.0
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,35.0,2586,99.0
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,20.0,820,58.0
...,...,...,...,...
3449,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,26.0,1516,86.0
3450,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,38.0,3028,106.0
3451,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,33.0,2284,107.0
3452,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,28.0,1636,76.0


In [None]:
df_train = df[['graph_x','graph_index','graph_attr']]
df_train

train = pd.concat([train,df_train],axis=1)

In [None]:
df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])

# 예제 SMILES 문자열


smiles_strings = test['SMILES']  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환하고 데이터프레임에 추가
for smiles_string in smiles_strings:
    graph_data = smiles_to_graph(smiles_string)  # 앞서 정의한 함수를 사용하여 SMILES를 그래프로 변환

    # 데이터프레임에 추가
    df = df.append({'SMILES': smiles_string,
                          'graph_x': graph_data.x.numpy().sum(),
                          'graph_index': graph_data.edge_index.numpy().sum(),
                          'graph_attr': graph_data.edge_attr.numpy().sum()}, ignore_index=True)

  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.append({'SMILES': smiles_string,
  df = df.a

In [None]:
df_test = df[['graph_x','graph_index','graph_attr']]
df_test

test = pd.concat([test,df_test],axis=1)

In [None]:
train['graph_index'] = train['graph_index'].astype(float)
test['graph_index'] = test['graph_index'].astype(float)

# 이미지 분석

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.7 MB/s[0m eta [36m0:00:0

In [None]:
# from PIL import Image
# PIL_image = Image.fromarray(ndarray_image)

In [None]:
# from transformers import ViTImageProcessor, ViTModel
# from PIL import Image
# import requests

# # url = "https:///content/molecule_0.png"
# # image = Image.open(requests.get(url, stream=True).raw)
# image = '/content/molecule_0.png'

# processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
# model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
# inputs = processor(images=image, return_tensors="pt")

# outputs = model(**inputs)
# last_hidden_states = outputs.last_hidden_state
#  bvb

In [None]:

# from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
# import torch
# from PIL import Image

# model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)


In [None]:

# from transformers import pipeline

# image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# image_to_text("https://ankur3107.github.io/assets/images/image-captioning-example.png")

# # [{'generated_text': 'a soccer game with a player jumping to catch the ball '}]


In [None]:
# image_to_text("/content/molecule_0.png")

In [None]:
# img_to_text = []
# for n in tqdm(range(train.shape[0])):
#     img_to_text.append(image_to_text(f"/content/molecule_{n}.png"))

In [None]:
# train['img_to_text'] = img_to_text

## Graph Convolutional Network (GCN)

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [None]:
!pip install torch




In [None]:
!pip install dgl


Collecting dgl
  Downloading dgl-1.1.2-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-1.1.2


In [None]:
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.DataStructs import BulkTanimotoSimilarity
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt


In [None]:
from PIL import Image

import torch
from transformers import AutoImageProcessor, ResNetForImageClassification
from PIL import Image
import torch.nn.functional as F

In [None]:
# SMILES 문자열을 RDKit Mol 객체로 변환
def smiles_to_mol(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol

# 분자 그림 생성
def draw_molecule(mol, file_name):
    img = Draw.MolToImage(mol)
    img.save(file_name)

In [None]:
# PandasTools.AddMoleculeColumnToFrame(train, 'SMILES', 'Molecule')
# train['Molecule'] = train['Molecule'].apply(smiles_to_mol)


In [None]:
train['mol']

0       <rdkit.Chem.rdchem.Mol object at 0x7f2f3ca101b0>
1       <rdkit.Chem.rdchem.Mol object at 0x7f2f3ca10210>
2       <rdkit.Chem.rdchem.Mol object at 0x7f2f3ca10150>
3       <rdkit.Chem.rdchem.Mol object at 0x7f2f3ca10390>
4       <rdkit.Chem.rdchem.Mol object at 0x7f2f3ca10450>
                              ...                       
3449    <rdkit.Chem.rdchem.Mol object at 0x7f2f3c9624b0>
3450    <rdkit.Chem.rdchem.Mol object at 0x7f2f3c962510>
3451    <rdkit.Chem.rdchem.Mol object at 0x7f2f3c962570>
3452    <rdkit.Chem.rdchem.Mol object at 0x7f2f3c9625d0>
3453    <rdkit.Chem.rdchem.Mol object at 0x7f2f3c962630>
Name: mol, Length: 3454, dtype: object

In [None]:
train['mol'] = 0

In [None]:
# train 데이터프레임에 있는 SMILES 문자열을 분자 객체로 변환하고 그리기
for i, smiles in tqdm(enumerate(train['SMILES']), desc="Processing", ncols=100):
    mol = Chem.MolFromSmiles(smiles)  # SMILES 문자열을 RDKit 분자 객체로 변환
    if mol is not None:
        img = Draw.MolToImage(mol, size=(300, 300))  # 분자 객체를 이미지로 그리기
        img.save(f'{DATA_PATH}molecule_{i}.png')  # 이미지 저장

Processing: 0it [00:00, ?it/s]

In [None]:
# train 데이터프레임에 있는 SMILES 문자열을 분자 객체로 변환하고 그리기
for i, smiles in tqdm(enumerate(test['SMILES']), desc="Processing", ncols=100):
    mol = Chem.MolFromSmiles(smiles)  # SMILES 문자열을 RDKit 분자 객체로 변환
    if mol is not None:
        img = Draw.MolToImage(mol, size=(300, 300))  # 분자 객체를 이미지로 그리기
        img.save(f'{DATA_PATH}molecule_test_{i}.png')  # 이미지 저장

Processing: 0it [00:00, ?it/s]

In [None]:
for i, mol in tqdm(enumerate(train['mol'])):
    draw_molecule(mol, f'molecule_{i}.png')

0it [00:00, ?it/s]

AttributeError: ignored

In [None]:
# def get_image_label(i):
#     image_path = f'molecule_{i}.png'
#     image = Image.open(image_path)

#     processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
#     model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

#     inputs = processor(images=image, return_tensors="pt")

#     with torch.no_grad():
#         logits = model(**inputs).logits

#     # 로짓을 확률로 변환
#     probs = F.softmax(logits, dim=-1)

#     # 모든 라벨의 확률을 추출
#     all_label_probs = probs[0].tolist()

#     return all_label_probs


In [None]:
def get_honeycomb_probability(i):
    image_path = f'molecule_{i}.png'
    image = Image.open(image_path)

    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    # 로짓을 확률로 변환
    probs = F.softmax(logits, dim=-1)

    # 'honeycomb' 라벨의 확률 추출
    honeycomb_label_index = model.config.label2id['honeycomb']
    honeycomb_probability = probs[0][honeycomb_label_index].item()

    return honeycomb_probability


In [None]:
train['mode_label_proba'] = 0

In [None]:
from tqdm import tqdm

# train 데이터프레임의 처음 10개 행에 대해서만 작업을 수행
for i in tqdm(range(10), desc="Processing", ncols=100):
    train['mode_label_proba'][i] = get_honeycomb_probability(i)


Processing:   0%|                                                            | 0/10 [00:00<?, ?it/s]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['mode_label_proba'][i] = get_honeycomb_probability(i)
Processing:  10%|█████▏                                              | 1/10 [00:01<00:15,  1.74s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Processing:  20%|██████████▍                                         | 2/10 [00:03<00:13,  1.69s/it]Could not find image processor class in the image processor config or the model config. 

In [None]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,graph_x,graph_index,graph_attr,mode_label_proba
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.0100,50.6800,3.259,400.495,5,2,8,3.259,...,52,28,20,3,4,0,28.0,1550.0,78.0,0.999686
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.2700,50.5900,2.169,301.407,2,1,2,2.172,...,40,21,16,1,3,0,21.0,898.0,59.0,0.493459
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.5860,80.8920,1.593,297.358,5,0,3,1.585,...,41,22,15,0,7,0,22.0,1026.0,65.0,0.998970
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.7100,2.0000,4.771,494.652,6,0,5,3.475,...,69,35,26,2,6,0,35.0,2586.0,99.0,0.999404
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.2700,99.9900,2.335,268.310,3,0,1,2.337,...,36,20,16,2,2,0,20.0,820.0,58.0,0.999424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,TRAIN_1833,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,28.4630,24.1955,4.609,340.381,4,1,2,4.736,...,42,26,20,0,6,0,26.0,1516.0,86.0,0.000000
3450,TRAIN_0898,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,56.0435,66.1260,4.282,553.480,5,0,7,4.009,...,68,38,29,3,4,2,38.0,3028.0,106.0,0.000000
3451,TRAIN_2575,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,56.3130,63.7320,4.304,459.520,6,1,4,4.304,...,54,33,24,3,5,0,33.0,2284.0,107.0,0.000000
3452,TRAIN_0216,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,70.2150,93.8400,-1.133,385.417,7,0,4,-1.133,...,51,28,19,4,5,0,28.0,1636.0,76.0,0.000000


In [None]:
train['mode_label_proba'] = train.index.map(get_honeycomb_probability)


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's f

KeyboardInterrupt: ignored

- 더 정확한


In [None]:
def get_image_label(i):
    image_path = f'molecule_{i}.png'
    image = Image.open(image_path)

    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_label = logits.argmax(-1).item()
    return model.config.id2label[predicted_label]


- 간소화 함수

In [None]:
def get_image_label(i):
    image_path = f'molecule_{i}.png'
    image = Image.open(image_path)

    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_label = logits.argmax().item()
    return predicted_label


In [None]:
# # train 데이터프레임의 각 행에 대해 이미지 라벨을 예측하고 열에 추가
# train['predicted_label'] = train.index.map(get_image_label)

# # 결과 확인
# print(train)

In [None]:
# # train 데이터프레임의 각 행에 대해 이미지 라벨을 예측하고 열에 추가
# test['predicted_label'] = test.index.map(get_image_label)

# # 결과 확인
# print(train)

In [None]:
!pip install transformers



In [None]:
# tqdm로 진행 상황을 보면서 이미지 라벨을 얻습니다.
predicted_labels = []
for i in tqdm(range(len(train))):  # train 데이터프레임의 크기에 따라 수정하세요.
    label = get_image_label(i)
    predicted_labels.append(label)

# predicted_labels을 train 데이터프레임에 추가합니다.
train['predicted_label'] = predicted_labels

  0%|          | 0/3454 [00:00<?, ?it/s]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 1/3454 [00:01<1:20:55,  1.41s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 2/3454 [00:02<1:17:32,  1.35s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 3/3454 [00:04<1:21:37,  1.42s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 4/3454 [00:05<1:18:33,  1.37s/it]Could not find image processor class in the image processor config or th

In [None]:
train.to_csv(f'{DATA_PATH}image_label_train.csv',index= False)

In [None]:
predicted_labels = []
for i in tqdm(range(len(test))):  # train 데이터프레임의 크기에 따라 수정하세요.
    label = get_image_label(i)
    predicted_labels.append(label)

# predicted_labels을 train 데이터프레임에 추가합니다.
test['predicted_label'] = predicted_labels

In [None]:
test.to_csv(f'{DATA_PATH}image_label_test.csv',index= False)

In [None]:
most_common_label = train['predicted_label'].mode().iloc[0]


In [None]:
train['predicted_label'].unique()

array(['honeycomb', 'matchstick', 'nail', 'wall clock', 'analog clock',
       'digital clock', 'syringe'], dtype=object)

In [None]:
probs = F.softmax(logits, dim=-1)  # logits는 이미 이전 코드에서 계산되었다고 가정합니다.
for i in range(len(train)):
    label_id = train['predicted_label'].iloc[i]
    probability = probs[i][label_id].item()
    train.at[i, 'predicted_label_probability'] = probability

NameError: ignored

In [None]:
# most_common_label = train['predicted_label'].mode().iloc[0]
# # 가장 최빈 라벨을 찾습니다.
# most_common_label = train['predicted_label'].mode().iloc[0]

# # 예측된 라벨과 가장 최빈 라벨에 대한 확률을 계산하여 새로운 칼럼으로 추가합니다.
# train['most_common_label'] = most_common_label
train['most_common_label_probability'] = (train['predicted_label'] == most_common_label).astype(int)

test['most_common_label_probability'] = (test['predicted_label'] == most_common_label).astype(int)


In [None]:
# 각각의 행에 대한 예측된 라벨에 대한 확률을 계산하여 새로운 칼럼으로 추가합니다.
probs = F.softmax(logits, dim=-1)  # logits는 이미 이전 코드에서 계산되었다고 가정합니다.
for i in range(len(train)):
    label_id = train['predicted_label'].iloc[i]
    probability = probs[i][label_id].item()
    train.at[i, 'predicted_label_probability'] = probability

In [None]:
# 각각의 행에 대한 예측된 라벨에 대한 확률을 계산하여 새로운 칼럼으로 추가합니다.
probs = F.softmax(logits, dim=-1)  # logits는 이미 이전 코드에서 계산되었다고 가정합니다.
for i in range(len(test)):
    label_id = train['predicted_label'].iloc[i]
    probability = probs[i][label_id].item()
    test.at[i, 'predicted_label_probability'] = probability

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['image_label'][i] = get_image_label(i)


In [None]:
label_list =

for i in tqdm(range(train.shape[0])):
    get_image_label(i)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


matchstick


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


matchstick


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


nail


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


matchstick


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


honeycomb


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


KeyboardInterrupt: ignored

In [None]:
# 로짓을 확률로 변환
probs = F.softmax(logits, dim=-1)

# 상위 5개 클래스와 각각의 확률 얻기
top5_probs, top5_labels = torch.topk(probs, 5, dim=-1)

# 클래스 라벨 및 스코어 출력
for i in range(5):
    label_id = top5_labels[0][i].item()
    score = top5_probs[0][i].item()
    label = model.config.id2label[label_id]
    print(f"Label: {label}, Score: {score}")


Label: honeycomb, Score: 0.9996862411499023
Label: matchstick, Score: 5.994642560835928e-05
Label: nail, Score: 2.1261417714413255e-05
Label: lampshade, lamp shade, Score: 5.675171905750176e-06
Label: hook, claw, Score: 4.653947144106496e-06


In [None]:
# fingerprints = [AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048) for m in train['mol']]
# similarity_matrix = np.zeros((len(train), len(train)))

In [None]:
# def mol_to_fingerprint(mol):
#     fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)  # ECFP4
#     return fp

# # 분자 간의 Tanimoto 유사도 계산 함수
# def calculate_tanimoto_similarity(fp1, fp2):
#     similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
#     return similarity

In [None]:
# train['Fingerprint'] = train['mol'].apply(mol_to_fingerprint)


In [None]:
# # 분자 간의 유사도 계산
# similarity_matrix = []
# for i in range(len(train)):
#     row = []
#     for j in tqdm(range(len(train))):
#         similarity = calculate_tanimoto_similarity(train['Fingerprint'][i], train['Fingerprint'][j])
#         row.append(similarity)
#     similarity_matrix.append(row)

# # 결과 출력
# similarity_df = pd.DataFrame(similarity_matrix, columns=train['SMILES'], index=train['SMILES'])
# print(similarity_df)


In [None]:
# 군집화
clustering = AgglomerativeClustering(n_clusters=2, affinity='precomputed', linkage='average')
labels = clustering.fit_predict(similarity_matrix)




In [None]:
train['Cluster'] = labels
print(train[['SMILES', 'Cluster']])

                                                 SMILES  Cluster
0       CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC        0
1                  Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1        0
2                      CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1        0
3     Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...        0
4                   Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2        0
...                                                 ...      ...
3449    Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1        0
3450  Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...        0
3451  Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...        0
3452    N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1        0
3453  O=S(=O)(c1ccccc1)n1ccc(C/C=C/Cc2ccn(S(=O)(=O)c...        1

[3454 rows x 2 columns]


In [None]:
import gradio as gr
from transformers import AutoModel

def predict_smiles(name):
    device = 'cpu'
    smiles = name
    assert isinstance(smiles, str), 'smiles must be str'

    smiles = smiles.strip()
    if ';' in smiles:
        smiles = smiles.split(";")
    elif ' ' in smiles:
        smiles = smiles.split(" ")
    elif ',' in smiles:
        smiles = smiles.split(",")
    else:
        smiles = [smiles]


    model = AutoModel.from_pretrained("Huhujingjing/custom-mxm", trust_remote_code=True).to(device)

    output, df = model.predict_smiles(smiles)

    return output, df

iface = gr.Interface(fn=predict_smiles, inputs="text", outputs=["text", "dataframe"])
iface.launch(share=True)


In [None]:
train['predicted_label'].value_counts()

599    2999
644     395
892      22
409      15
530      14
677       5
845       4
Name: predicted_label, dtype: int64

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.7 MB/s[0m eta [36m0:00:

In [None]:
index_label = [599, 644, 677, 892, 409, 530, 845]

import pandas as pd
import torch
from PIL import Image
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoConfig
import numpy as np

# Define your labels

df_train = pd.DataFrame(0, columns=index_label, index=range(train.shape[0]))
df_train

Unnamed: 0,599,644,677,892,409,530,845
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
3449,0,0,0,0,0,0,0
3450,0,0,0,0,0,0,0
3451,0,0,0,0,0,0,0
3452,0,0,0,0,0,0,0


In [None]:
import pandas as pd
import torch
from PIL import Image
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoConfig, AutoImageProcessor

# 주어진 라벨 리스트
index_label = [599, 644, 677, 892, 409, 530, 845]

# 모델 및 데이터 경로
MODEL_PATH = "microsoft/resnet-50"
DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/2023신약개발/data/'


# 라벨을 확률로 변환하는 함수
def get_label_probabilities(image_path):
    # 이미지 열기
    image = Image.open(image_path)

    # 이미지 처리기 및 모델 초기화
    processor = AutoImageProcessor.from_pretrained(MODEL_PATH)
    model = AutoModelForImageClassification.from_pretrained(MODEL_PATH)

    # 이미지를 모델에 입력
    inputs = processor(images=image, return_tensors="pt")

    # 확률 예측
    with torch.no_grad():
        logits = model(**inputs).logits

    # 확률로 변환
    probs = torch.nn.functional.softmax(logits, dim=-1)[0].numpy()

    # 주어진 라벨에 대한 확률 추출
    label_probabilities = {str(label): prob for label, prob in zip(index_label, probs)}

    return label_probabilities


In [None]:
# 이미지 파일에 대해 라벨 확률 얻기
i = 0
image_path = f'{DATA_PATH}molecule_{i}.png'  # 특정 이미지 파일 경로를 여기에 입력
label_probabilities = get_label_probabilities(image_path)

# 결과 출력
print(label_probabilities)


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


{'599': 7.948518e-06, '644': 7.793412e-06, '677': 4.955637e-06, '892': 7.137471e-06, '409': 4.9558453e-06, '530': 9.333687e-06, '845': 5.1614866e-06}


In [None]:
# 새로운 라벨 확률 칼럼을 추가할 빈 데이터프레임 생성
label_probabilities_df = pd.DataFrame(columns=[f'label_{label}' for label in index_label])

# 각 이미지 파일에 대해 라벨 확률 계산 및 데이터프레임에 추가
for i in tqdm(range(len(train))):
    image_path = f'{DATA_PATH}molecule_{i}.png'
    label_probabilities = get_label_probabilities(image_path)  # 이미지 파일에 대한 라벨 확률을 계산하는 함수

    # 결과를 데이터프레임에 추가
    label_probabilities_df = label_probabilities_df.append(label_probabilities, ignore_index=True)

# 기존의 train 데이터프레임과 새로운 라벨 확률 데이터프레임을 합침
train_with_label_probabilities = pd.concat([train, label_probabilities_df], axis=1)


  0%|          | 0/3454 [00:00<?, ?it/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  label_probabilities_df = label_probabilities_df.append(label_probabilities, ignore_index=True)
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  label_probabilities_df = label_probabilities_df.append(label_probabilities, ignore_index=True)
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  label_probabilities_df = label_probabilities_df.append(label_probabilities, ignore_index=True)
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  label_probabilities

KeyboardInterrupt: ignored

In [None]:
def get_labels_probability(i, DATA_PATH, df_train):
    image_path = f'{DATA_PATH}molecule_{i}.png'
    image = Image.open(image_path)

    # Load the feature extractor and model
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50")
    config = AutoConfig.from_pretrained("microsoft/resnet-50")
    model = AutoModelForImageClassification.from_pretrained("microsoft/resnet-50", config=config)

    inputs = feature_extractor(images=image, return_tensors="pt")
    labels = torch.tensor([index_label], dtype=torch.float32)

    with torch.no_grad():
        # Get the logits from the model
        logits = model(**inputs).logits

    # Apply softmax to get label probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=1)[0].numpy()

    # Create a dictionary with label probabilities
    label_probabilities = {label: probability for label, probability in zip(index_label, probabilities)}

    # Append the label probabilities to the DataFrame
    df_train[i] = pd.concat([df_train[i], pd.DataFrame(label_probabilities, index=[0])], ignore_index=True)

    return df_train


In [None]:
import pandas as pd
import numpy as np
from PIL import Image
from transformers import AutoFeatureExtractor, AutoConfig, AutoModelForImageClassification
import torch

def initialize_df(num_images, index_label):
    # Create an empty DataFrame with columns for each label
    df_train = pd.DataFrame(np.zeros((num_images, len(index_label))), columns=index_label)
    return df_train

def get_labels_probability(i, DATA_PATH, df_train, index_label):
    image_path = f'{DATA_PATH}molecule_{i}.png'
    image = Image.open(image_path)

    # Load the feature extractor and model
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50")
    config = AutoConfig.from_pretrained("microsoft/resnet-50")
    model = AutoModelForImageClassification.from_pretrained("microsoft/resnet-50", config=config)

    inputs = feature_extractor(images=image, return_tensors="pt")

    with torch.no_grad():
        # Get the logits from the model
        logits = model(**inputs).logits

    # Apply softmax to get label probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=1)[0].numpy()

    # Update the DataFrame with label probabilities
    df_train.loc[i] = probabilities

    return df_train


In [None]:
get_labels_probability(0, DATA_PATH, df_train, index_label)

ValueError: ignored

In [None]:
# 모든 이미지의 라벨 확률을 저장할 데이터프레임 초기화
df_train = initialize_df(num_images=len(train), index_label=index_label)

# 각 이미지에 대해 라벨 확률 계산 및 데이터프레임에 추가
for i in range(len(train)):
    df_train = get_labels_probability(i, DATA_PATH, df_train, index_label)

# 결과 데이터프레임 확인
print(df_train)




ValueError: ignored

# 모델학습, 검증, 제출

In [None]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,graph_x,graph_index,graph_attr,mode_label_proba,predicted_label
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.0100,50.6800,3.259,400.495,5,2,8,3.259,...,28,20,3,4,0,28.0,1550.0,78.0,0.00000,599
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.2700,50.5900,2.169,301.407,2,1,2,2.172,...,21,16,1,3,0,21.0,898.0,59.0,0.00000,599
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.5860,80.8920,1.593,297.358,5,0,3,1.585,...,22,15,0,7,0,22.0,1026.0,65.0,0.00000,599
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.7100,2.0000,4.771,494.652,6,0,5,3.475,...,35,26,2,6,0,35.0,2586.0,99.0,0.00000,599
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.2700,99.9900,2.335,268.310,3,0,1,2.337,...,20,16,2,2,0,20.0,820.0,58.0,0.00000,599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,TRAIN_1833,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,28.4630,24.1955,4.609,340.381,4,1,2,4.736,...,26,20,0,6,0,26.0,1516.0,86.0,0.00000,599
3450,TRAIN_0898,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,56.0435,66.1260,4.282,553.480,5,0,7,4.009,...,38,29,3,4,2,38.0,3028.0,106.0,0.00000,599
3451,TRAIN_2575,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,56.3130,63.7320,4.304,459.520,6,1,4,4.304,...,33,24,3,5,0,33.0,2284.0,107.0,0.00000,599
3452,TRAIN_0216,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,70.2150,93.8400,-1.133,385.417,7,0,4,-1.133,...,28,19,4,5,0,28.0,1636.0,76.0,0.00000,599


In [None]:
import pandas as pd

In [5]:
train = pd.read_csv('train_label_proba_3.csv')

FileNotFoundError: ignored

In [None]:
test = pd.read_csv('test_label_proba_3 (1).csv')

In [None]:
train.shape, test.shape

((3454, 30), (483, 28))

In [None]:
test.columns

Index(['id', 'SMILES', 'AlogP', 'Molecular_Weight', 'Num_H_Acceptors',
       'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea', 'mol', 'num_of_atoms',
       'num_of_heavy_atoms', 'num_of_C_atoms', 'num_of_O_atoms',
       'num_of_N_atoms', 'num_of_Cl_atoms', 'graph_x', 'graph_index',
       'graph_attr', 'mode_label_proba', 'predicted_label', '599', '644',
       '677', '892', '409', '530', '845'],
      dtype='object')

In [None]:
train.columns

Index(['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea', 'mol', 'num_of_atoms',
       'num_of_heavy_atoms', 'num_of_C_atoms', 'num_of_O_atoms',
       'num_of_N_atoms', 'num_of_Cl_atoms', 'graph_x', 'graph_index',
       'graph_attr', 'mode_label_proba', 'predicted_label', '599', '644',
       '677', '892', '409', '530', '845'],
      dtype='object')

In [None]:
cols = ['mol','mode_label_proba']

train = train.drop(columns = cols,axis=1)
test = test.drop(columns = cols,axis=1)

In [None]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(y_valid, pred):
    mse = mean_squared_error(y_valid, pred)
    return np.sqrt(mse)

In [None]:
from sklearn.metrics import make_scorer

rmse_score = make_scorer(rmse, greater_is_better=False)

In [None]:
import numpy as np

In [None]:
# 결측치 채우기
train["AlogP"] = np.where(pd.isna(train["AlogP"]), train["LogD"], train["AlogP"])
test["AlogP"] = np.where(pd.isna(test["AlogP"]), test["LogD"], test["AlogP"])

In [None]:
train_prop = train.iloc[:, 4:]
test_prop = test.iloc[:, 2:]

target_1 = train['MLM']
target_2 = train['HLM']

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max 스케일링을 위한 객체 생성
scaler = MinMaxScaler()

# Train 데이터를 스케일링하고 변환
train_prop = scaler.fit_transform(train_prop)

# Test 데이터를 스케일링하고 변환
test_prop = scaler.transform(test_prop)


In [None]:
train_prop = pd.DataFrame(train_prop)
train_prop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.488883,0.408759,0.416667,0.250,0.470588,0.525975,0.546835,0.481928,0.439024,0.405405,...,0.246679,0.406250,0.393375,0.001658,0.000043,0.000828,0.003171,0.000176,0.000167,0.000848
1,0.394950,0.231359,0.166667,0.125,0.117647,0.439098,0.336496,0.337349,0.268293,0.297297,...,0.129623,0.257812,0.393375,0.067370,0.002725,0.010616,0.042179,0.010222,0.010862,0.026665
2,0.345312,0.224110,0.416667,0.000,0.176471,0.392184,0.283695,0.349398,0.292683,0.270270,...,0.152603,0.304688,0.393375,0.002326,0.000095,0.001376,0.002848,0.000786,0.000554,0.001741
3,0.619183,0.577331,0.500000,0.000,0.294118,0.543238,0.428154,0.686747,0.609756,0.567568,...,0.432675,0.570312,0.393375,0.002730,0.000117,0.003968,0.003857,0.000673,0.000504,0.003264
4,0.409255,0.172104,0.250000,0.000,0.058824,0.452286,0.187773,0.289157,0.243902,0.297297,...,0.115619,0.250000,0.393375,0.000776,0.000128,0.000123,0.000860,0.000141,0.000122,0.000122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,0.605222,0.301135,0.333333,0.125,0.117647,0.644022,0.328255,0.361446,0.390244,0.405405,...,0.240575,0.468750,0.393375,0.000154,0.000003,0.000041,0.000343,0.000011,0.000012,0.000079
3450,0.577042,0.682652,0.416667,0.000,0.411765,0.585918,0.300608,0.674699,0.682927,0.648649,...,0.512029,0.625000,0.393375,0.002433,0.000042,0.000416,0.002805,0.000119,0.000202,0.000549
3451,0.578938,0.514433,0.500000,0.125,0.235294,0.609495,0.528437,0.506024,0.560976,0.513514,...,0.378456,0.632812,0.393375,0.000103,0.000004,0.000044,0.000401,0.000013,0.000022,0.000048
3452,0.110393,0.381764,0.583333,0.000,0.235294,0.174952,0.505055,0.469880,0.439024,0.378378,...,0.262118,0.390625,0.393375,0.067604,0.006111,0.075257,0.041146,0.064905,0.047159,0.071757


In [None]:
test_prop = pd.DataFrame(test_prop)
test_prop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.435626,0.338954,0.333333,0.250,0.411765,0.476103,0.428921,0.481928,0.365854,0.351351,...,0.196409,0.304688,0.393375,0.001658,0.000043,0.000828,0.003171,0.000176,0.000167,0.000848
1,0.258445,0.354877,0.416667,0.000,0.176471,0.312260,0.311772,0.445783,0.414634,0.405405,...,0.235548,0.390625,0.393375,0.067370,0.002725,0.010616,0.042179,0.010222,0.010862,0.026665
2,0.576525,0.313726,0.333333,0.500,0.294118,0.608376,0.429400,0.421687,0.390244,0.405405,...,0.214722,0.406250,0.393375,0.002326,0.000095,0.001376,0.002848,0.000786,0.000554,0.001741
3,0.362720,0.310045,0.416667,0.000,0.117647,0.408967,0.373581,0.349398,0.390244,0.351351,...,0.231239,0.445312,0.393375,0.002730,0.000117,0.003968,0.003857,0.000673,0.000504,0.003264
4,0.313082,0.324475,0.333333,0.000,0.117647,0.279012,0.277466,0.445783,0.390244,0.378378,...,0.227648,0.398438,0.393375,0.000776,0.000128,0.000123,0.000860,0.000141,0.000122,0.000122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,0.570579,0.240375,0.166667,0.125,0.411765,0.601742,0.248622,0.481928,0.292683,0.351351,...,0.138241,0.210938,0.486542,0.021712,0.008025,0.007350,0.011953,0.005778,0.007749,0.006768
479,0.155636,0.292214,0.416667,0.000,0.058824,0.126758,0.320636,0.445783,0.341463,0.324324,...,0.179892,0.265625,0.486542,0.080607,0.026296,0.149997,0.119138,0.081741,0.041467,0.107646
480,0.362461,0.317251,0.250000,0.125,0.176471,0.408728,0.318528,0.397590,0.390244,0.405405,...,0.229084,0.390625,0.393375,0.220604,0.072417,0.176721,0.306973,0.105562,0.118196,0.224077
481,0.276112,0.302479,0.250000,0.250,0.117647,0.299313,0.318145,0.216867,0.243902,0.216216,...,0.107720,0.250000,0.393375,0.000460,0.000021,0.000059,0.000692,0.000032,0.000092,0.000132


In [None]:
train_prop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.488883,0.408759,0.416667,0.250,0.470588,0.525975,0.546835,0.481928,0.439024,0.405405,...,0.246679,0.406250,0.393375,0.001658,0.000043,0.000828,0.003171,0.000176,0.000167,0.000848
1,0.394950,0.231359,0.166667,0.125,0.117647,0.439098,0.336496,0.337349,0.268293,0.297297,...,0.129623,0.257812,0.393375,0.067370,0.002725,0.010616,0.042179,0.010222,0.010862,0.026665
2,0.345312,0.224110,0.416667,0.000,0.176471,0.392184,0.283695,0.349398,0.292683,0.270270,...,0.152603,0.304688,0.393375,0.002326,0.000095,0.001376,0.002848,0.000786,0.000554,0.001741
3,0.619183,0.577331,0.500000,0.000,0.294118,0.543238,0.428154,0.686747,0.609756,0.567568,...,0.432675,0.570312,0.393375,0.002730,0.000117,0.003968,0.003857,0.000673,0.000504,0.003264
4,0.409255,0.172104,0.250000,0.000,0.058824,0.452286,0.187773,0.289157,0.243902,0.297297,...,0.115619,0.250000,0.393375,0.000776,0.000128,0.000123,0.000860,0.000141,0.000122,0.000122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,0.605222,0.301135,0.333333,0.125,0.117647,0.644022,0.328255,0.361446,0.390244,0.405405,...,0.240575,0.468750,0.393375,0.000154,0.000003,0.000041,0.000343,0.000011,0.000012,0.000079
3450,0.577042,0.682652,0.416667,0.000,0.411765,0.585918,0.300608,0.674699,0.682927,0.648649,...,0.512029,0.625000,0.393375,0.002433,0.000042,0.000416,0.002805,0.000119,0.000202,0.000549
3451,0.578938,0.514433,0.500000,0.125,0.235294,0.609495,0.528437,0.506024,0.560976,0.513514,...,0.378456,0.632812,0.393375,0.000103,0.000004,0.000044,0.000401,0.000013,0.000022,0.000048
3452,0.110393,0.381764,0.583333,0.000,0.235294,0.174952,0.505055,0.469880,0.439024,0.378378,...,0.262118,0.390625,0.393375,0.067604,0.006111,0.075257,0.041146,0.064905,0.047159,0.071757


In [None]:
test_prop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.435626,0.338954,0.333333,0.250,0.411765,0.476103,0.428921,0.481928,0.365854,0.351351,...,0.196409,0.304688,0.393375,0.001658,0.000043,0.000828,0.003171,0.000176,0.000167,0.000848
1,0.258445,0.354877,0.416667,0.000,0.176471,0.312260,0.311772,0.445783,0.414634,0.405405,...,0.235548,0.390625,0.393375,0.067370,0.002725,0.010616,0.042179,0.010222,0.010862,0.026665
2,0.576525,0.313726,0.333333,0.500,0.294118,0.608376,0.429400,0.421687,0.390244,0.405405,...,0.214722,0.406250,0.393375,0.002326,0.000095,0.001376,0.002848,0.000786,0.000554,0.001741
3,0.362720,0.310045,0.416667,0.000,0.117647,0.408967,0.373581,0.349398,0.390244,0.351351,...,0.231239,0.445312,0.393375,0.002730,0.000117,0.003968,0.003857,0.000673,0.000504,0.003264
4,0.313082,0.324475,0.333333,0.000,0.117647,0.279012,0.277466,0.445783,0.390244,0.378378,...,0.227648,0.398438,0.393375,0.000776,0.000128,0.000123,0.000860,0.000141,0.000122,0.000122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,0.570579,0.240375,0.166667,0.125,0.411765,0.601742,0.248622,0.481928,0.292683,0.351351,...,0.138241,0.210938,0.486542,0.021712,0.008025,0.007350,0.011953,0.005778,0.007749,0.006768
479,0.155636,0.292214,0.416667,0.000,0.058824,0.126758,0.320636,0.445783,0.341463,0.324324,...,0.179892,0.265625,0.486542,0.080607,0.026296,0.149997,0.119138,0.081741,0.041467,0.107646
480,0.362461,0.317251,0.250000,0.125,0.176471,0.408728,0.318528,0.397590,0.390244,0.405405,...,0.229084,0.390625,0.393375,0.220604,0.072417,0.176721,0.306973,0.105562,0.118196,0.224077
481,0.276112,0.302479,0.250000,0.250,0.117647,0.299313,0.318145,0.216867,0.243902,0.216216,...,0.107720,0.250000,0.393375,0.000460,0.000021,0.000059,0.000692,0.000032,0.000092,0.000132


#### randomforest

In [None]:
SEED= 42

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = RandomForestRegressor(random_state=SEED)
    model.fit(x_train, y_train_1)
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    #HLM
    model = RandomForestRegressor(random_state=SEED)
    model.fit(x_train, y_train_2)
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

[31.541353140506537, 34.10027044057181, 32.304015477774676, 31.863021896633345, 31.94387272173747]


32.35050673544477

In [None]:
model.fit(train_prop, target_1)
rf_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
rf_pred_2 = model.predict(test_prop)

#### Cat

In [None]:
!pip install CatBoost

Collecting CatBoost
  Downloading catboost-1.2.1.1-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: CatBoost
Successfully installed CatBoost-1.2.1.1


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from tqdm import tqdm


cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

scores = []
for tri, val in tqdm(cv.split(train_prop, target_1)):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = CatBoostRegressor(random_state=SEED, verbose=0)
    model.fit(x_train, y_train_1)
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    #HLM
    model = CatBoostRegressor(random_state=SEED, verbose=0)
    model.fit(x_train, y_train_2)
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

5it [00:39,  7.91s/it]

[31.809980687767396, 34.17842997753213, 32.2417358281167, 31.41700347408246, 31.912028960807802]





32.3118357856613

In [None]:
model.fit(train_prop, target_1)
cat_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
cat_pred_2 = model.predict(test_prop)

#### XGBRegressor

In [None]:
scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = XGBRegressor(random_state=SEED)
    model.fit(x_train, y_train_1, eval_set=[(x_valid, y_valid_1)], eval_metric='rmse')
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    # HLM
    model = XGBRegressor(random_state=SEED)
    model.fit(x_train, y_train_2, eval_set=[(x_valid, y_valid_2)], eval_metric='rmse')
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)

    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

[0]	validation_0-rmse:41.13919
[1]	validation_0-rmse:36.21409
[2]	validation_0-rmse:33.48314
[3]	validation_0-rmse:32.06604
[4]	validation_0-rmse:31.43113
[5]	validation_0-rmse:31.12224
[6]	validation_0-rmse:31.07157
[7]	validation_0-rmse:30.87820
[8]	validation_0-rmse:30.97562
[9]	validation_0-rmse:30.95120
[10]	validation_0-rmse:31.00939
[11]	validation_0-rmse:30.96274
[12]	validation_0-rmse:31.12083
[13]	validation_0-rmse:31.21950
[14]	validation_0-rmse:31.13458
[15]	validation_0-rmse:31.22657
[16]	validation_0-rmse:31.26774




[17]	validation_0-rmse:31.14847
[18]	validation_0-rmse:31.17894
[19]	validation_0-rmse:31.23148
[20]	validation_0-rmse:31.29314
[21]	validation_0-rmse:31.35033
[22]	validation_0-rmse:31.33825
[23]	validation_0-rmse:31.39655
[24]	validation_0-rmse:31.36781
[25]	validation_0-rmse:31.40158
[26]	validation_0-rmse:31.39163
[27]	validation_0-rmse:31.40072
[28]	validation_0-rmse:31.48144
[29]	validation_0-rmse:31.49089
[30]	validation_0-rmse:31.53657
[31]	validation_0-rmse:31.43640
[32]	validation_0-rmse:31.51481
[33]	validation_0-rmse:31.50810
[34]	validation_0-rmse:31.59983
[35]	validation_0-rmse:31.59430
[36]	validation_0-rmse:31.68781
[37]	validation_0-rmse:31.69180
[38]	validation_0-rmse:31.67445
[39]	validation_0-rmse:31.70361
[40]	validation_0-rmse:31.73691
[41]	validation_0-rmse:31.79107
[42]	validation_0-rmse:31.85297
[43]	validation_0-rmse:31.89223
[44]	validation_0-rmse:31.91140
[45]	validation_0-rmse:31.91303
[46]	validation_0-rmse:31.90165
[47]	validation_0-rmse:31.93409
[48]	val



[16]	validation_0-rmse:33.53682
[17]	validation_0-rmse:33.57409
[18]	validation_0-rmse:33.68400
[19]	validation_0-rmse:33.79693
[20]	validation_0-rmse:33.94817
[21]	validation_0-rmse:34.06432
[22]	validation_0-rmse:34.08671
[23]	validation_0-rmse:34.11642
[24]	validation_0-rmse:34.07331
[25]	validation_0-rmse:34.09173
[26]	validation_0-rmse:34.20728
[27]	validation_0-rmse:34.23714
[28]	validation_0-rmse:34.19901
[29]	validation_0-rmse:34.20224
[30]	validation_0-rmse:34.27766
[31]	validation_0-rmse:34.39375
[32]	validation_0-rmse:34.40857
[33]	validation_0-rmse:34.38681
[34]	validation_0-rmse:34.40067
[35]	validation_0-rmse:34.42864
[36]	validation_0-rmse:34.45763
[37]	validation_0-rmse:34.46320
[38]	validation_0-rmse:34.52299
[39]	validation_0-rmse:34.62457
[40]	validation_0-rmse:34.58936
[41]	validation_0-rmse:34.62861
[42]	validation_0-rmse:34.66527
[43]	validation_0-rmse:34.69930
[44]	validation_0-rmse:34.74006
[45]	validation_0-rmse:34.76302
[46]	validation_0-rmse:34.82319
[47]	val



[15]	validation_0-rmse:34.92884
[16]	validation_0-rmse:35.01111
[17]	validation_0-rmse:35.03460
[18]	validation_0-rmse:35.09356
[19]	validation_0-rmse:35.10881
[20]	validation_0-rmse:35.08023
[21]	validation_0-rmse:35.07950
[22]	validation_0-rmse:35.15924
[23]	validation_0-rmse:35.14405
[24]	validation_0-rmse:35.14108
[25]	validation_0-rmse:35.15048
[26]	validation_0-rmse:35.22119
[27]	validation_0-rmse:35.19434
[28]	validation_0-rmse:35.22772
[29]	validation_0-rmse:35.24379
[30]	validation_0-rmse:35.30529
[31]	validation_0-rmse:35.32280
[32]	validation_0-rmse:35.35013
[33]	validation_0-rmse:35.45374
[34]	validation_0-rmse:35.43559
[35]	validation_0-rmse:35.49321
[36]	validation_0-rmse:35.52279
[37]	validation_0-rmse:35.51025
[38]	validation_0-rmse:35.56578
[39]	validation_0-rmse:35.61854
[40]	validation_0-rmse:35.63274
[41]	validation_0-rmse:35.62236
[42]	validation_0-rmse:35.62346
[43]	validation_0-rmse:35.61337
[44]	validation_0-rmse:35.56583
[45]	validation_0-rmse:35.57305
[46]	val



[15]	validation_0-rmse:35.27557
[16]	validation_0-rmse:35.25849
[17]	validation_0-rmse:35.18516
[18]	validation_0-rmse:35.17025
[19]	validation_0-rmse:35.37392
[20]	validation_0-rmse:35.41135
[21]	validation_0-rmse:35.45476
[22]	validation_0-rmse:35.47695
[23]	validation_0-rmse:35.49473
[24]	validation_0-rmse:35.51697
[25]	validation_0-rmse:35.55688
[26]	validation_0-rmse:35.60922
[27]	validation_0-rmse:35.78480
[28]	validation_0-rmse:35.85355
[29]	validation_0-rmse:35.85979
[30]	validation_0-rmse:35.86673
[31]	validation_0-rmse:35.96136
[32]	validation_0-rmse:35.92475
[33]	validation_0-rmse:35.92199
[34]	validation_0-rmse:35.90579
[35]	validation_0-rmse:35.98173
[36]	validation_0-rmse:35.99339
[37]	validation_0-rmse:36.01625
[38]	validation_0-rmse:35.99585
[39]	validation_0-rmse:35.98812
[40]	validation_0-rmse:35.97766
[41]	validation_0-rmse:35.98106
[42]	validation_0-rmse:36.03640
[43]	validation_0-rmse:36.05525
[44]	validation_0-rmse:36.02798
[45]	validation_0-rmse:36.00303
[46]	val



[17]	validation_0-rmse:32.91734
[18]	validation_0-rmse:32.90977
[19]	validation_0-rmse:32.89581
[20]	validation_0-rmse:32.94303
[21]	validation_0-rmse:32.94317
[22]	validation_0-rmse:33.03713
[23]	validation_0-rmse:33.20275
[24]	validation_0-rmse:33.14323
[25]	validation_0-rmse:33.24408
[26]	validation_0-rmse:33.20532
[27]	validation_0-rmse:33.27970
[28]	validation_0-rmse:33.27395
[29]	validation_0-rmse:33.24886
[30]	validation_0-rmse:33.32686
[31]	validation_0-rmse:33.33329
[32]	validation_0-rmse:33.32049
[33]	validation_0-rmse:33.37278
[34]	validation_0-rmse:33.49389
[35]	validation_0-rmse:33.49531
[36]	validation_0-rmse:33.49795
[37]	validation_0-rmse:33.50724
[38]	validation_0-rmse:33.52037
[39]	validation_0-rmse:33.51804
[40]	validation_0-rmse:33.57939
[41]	validation_0-rmse:33.62516
[42]	validation_0-rmse:33.65035
[43]	validation_0-rmse:33.65141
[44]	validation_0-rmse:33.63695
[45]	validation_0-rmse:33.62832
[46]	validation_0-rmse:33.59135
[47]	validation_0-rmse:33.60549
[48]	val



[17]	validation_0-rmse:33.22703
[18]	validation_0-rmse:33.20317
[19]	validation_0-rmse:33.19697
[20]	validation_0-rmse:33.21705
[21]	validation_0-rmse:33.27051
[22]	validation_0-rmse:33.28265
[23]	validation_0-rmse:33.35057
[24]	validation_0-rmse:33.35767
[25]	validation_0-rmse:33.39405
[26]	validation_0-rmse:33.47000
[27]	validation_0-rmse:33.41483
[28]	validation_0-rmse:33.47935
[29]	validation_0-rmse:33.58520
[30]	validation_0-rmse:33.58178
[31]	validation_0-rmse:33.58389
[32]	validation_0-rmse:33.63659
[33]	validation_0-rmse:33.65528
[34]	validation_0-rmse:33.66706
[35]	validation_0-rmse:33.72194
[36]	validation_0-rmse:33.71680
[37]	validation_0-rmse:33.81203
[38]	validation_0-rmse:33.80809
[39]	validation_0-rmse:33.83304
[40]	validation_0-rmse:33.86508
[41]	validation_0-rmse:33.92380
[42]	validation_0-rmse:33.96025
[43]	validation_0-rmse:33.95387
[44]	validation_0-rmse:33.95552
[45]	validation_0-rmse:34.01262
[46]	validation_0-rmse:33.98465
[47]	validation_0-rmse:33.99812
[48]	val



[16]	validation_0-rmse:31.37061
[17]	validation_0-rmse:31.39602
[18]	validation_0-rmse:31.43239
[19]	validation_0-rmse:31.44124
[20]	validation_0-rmse:31.46305
[21]	validation_0-rmse:31.48527
[22]	validation_0-rmse:31.43111
[23]	validation_0-rmse:31.39848
[24]	validation_0-rmse:31.38585
[25]	validation_0-rmse:31.35337
[26]	validation_0-rmse:31.34952
[27]	validation_0-rmse:31.40473
[28]	validation_0-rmse:31.48099
[29]	validation_0-rmse:31.53600
[30]	validation_0-rmse:31.56273
[31]	validation_0-rmse:31.64133
[32]	validation_0-rmse:31.67757
[33]	validation_0-rmse:31.72523
[34]	validation_0-rmse:31.79248
[35]	validation_0-rmse:31.80079
[36]	validation_0-rmse:31.78117
[37]	validation_0-rmse:31.74858
[38]	validation_0-rmse:31.83085
[39]	validation_0-rmse:31.86020
[40]	validation_0-rmse:31.86473
[41]	validation_0-rmse:31.91326
[42]	validation_0-rmse:31.91394
[43]	validation_0-rmse:31.91769
[44]	validation_0-rmse:31.98896
[45]	validation_0-rmse:31.99759
[46]	validation_0-rmse:32.07168
[47]	val



[15]	validation_0-rmse:32.61027
[16]	validation_0-rmse:32.66714
[17]	validation_0-rmse:32.65238
[18]	validation_0-rmse:32.59437
[19]	validation_0-rmse:32.62507
[20]	validation_0-rmse:32.63934
[21]	validation_0-rmse:32.72159
[22]	validation_0-rmse:32.72273
[23]	validation_0-rmse:32.75418
[24]	validation_0-rmse:32.79705
[25]	validation_0-rmse:32.85087
[26]	validation_0-rmse:32.89106
[27]	validation_0-rmse:32.92516
[28]	validation_0-rmse:32.90114
[29]	validation_0-rmse:32.91868
[30]	validation_0-rmse:32.93473
[31]	validation_0-rmse:33.02128
[32]	validation_0-rmse:33.05953
[33]	validation_0-rmse:33.04744
[34]	validation_0-rmse:33.01821
[35]	validation_0-rmse:33.00980
[36]	validation_0-rmse:33.02789
[37]	validation_0-rmse:33.04558
[38]	validation_0-rmse:33.07727
[39]	validation_0-rmse:33.05712
[40]	validation_0-rmse:33.10845
[41]	validation_0-rmse:33.14212
[42]	validation_0-rmse:33.21504
[43]	validation_0-rmse:33.26855
[44]	validation_0-rmse:33.26259
[45]	validation_0-rmse:33.27294
[46]	val



[16]	validation_0-rmse:32.07337
[17]	validation_0-rmse:32.07885
[18]	validation_0-rmse:32.09678
[19]	validation_0-rmse:32.12269
[20]	validation_0-rmse:32.14395
[21]	validation_0-rmse:32.13343
[22]	validation_0-rmse:32.08475
[23]	validation_0-rmse:32.16631
[24]	validation_0-rmse:32.14906
[25]	validation_0-rmse:32.10074
[26]	validation_0-rmse:32.10602
[27]	validation_0-rmse:32.08846
[28]	validation_0-rmse:32.19241
[29]	validation_0-rmse:32.16188
[30]	validation_0-rmse:32.27009
[31]	validation_0-rmse:32.27923
[32]	validation_0-rmse:32.23680
[33]	validation_0-rmse:32.25095
[34]	validation_0-rmse:32.22342
[35]	validation_0-rmse:32.19568
[36]	validation_0-rmse:32.13975
[37]	validation_0-rmse:32.13477
[38]	validation_0-rmse:32.13251
[39]	validation_0-rmse:32.27652
[40]	validation_0-rmse:32.29286
[41]	validation_0-rmse:32.29518
[42]	validation_0-rmse:32.41583
[43]	validation_0-rmse:32.40206
[44]	validation_0-rmse:32.47881
[45]	validation_0-rmse:32.56330
[46]	validation_0-rmse:32.57050
[47]	val



[15]	validation_0-rmse:32.51237
[16]	validation_0-rmse:32.51376
[17]	validation_0-rmse:32.55439
[18]	validation_0-rmse:32.52868
[19]	validation_0-rmse:32.59822
[20]	validation_0-rmse:32.57962
[21]	validation_0-rmse:32.53155
[22]	validation_0-rmse:32.40160
[23]	validation_0-rmse:32.45767
[24]	validation_0-rmse:32.41423
[25]	validation_0-rmse:32.40925
[26]	validation_0-rmse:32.44360
[27]	validation_0-rmse:32.43944
[28]	validation_0-rmse:32.42764
[29]	validation_0-rmse:32.43990
[30]	validation_0-rmse:32.47216
[31]	validation_0-rmse:32.40869
[32]	validation_0-rmse:32.42589
[33]	validation_0-rmse:32.46161
[34]	validation_0-rmse:32.45825
[35]	validation_0-rmse:32.48847
[36]	validation_0-rmse:32.53742
[37]	validation_0-rmse:32.59212
[38]	validation_0-rmse:32.57943
[39]	validation_0-rmse:32.53384
[40]	validation_0-rmse:32.53577
[41]	validation_0-rmse:32.55933
[42]	validation_0-rmse:32.57425
[43]	validation_0-rmse:32.56624
[44]	validation_0-rmse:32.57544
[45]	validation_0-rmse:32.58603
[46]	val

34.18899767422867

In [None]:
model.fit(train_prop, target_1)
xgb_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
xgb_pred_2 = model.predict(test_prop)

#### LGBMRegressor

In [None]:
scores = []
for tri, val in cv.split(train_prop, target_1):
    # 학습데이터
    x_train = train_prop.iloc[tri]
    y_train_1 = target_1.iloc[tri]
    y_train_2 = target_2.iloc[tri]

    # 검증데이터
    x_valid = train_prop.iloc[val]
    y_valid_1 = target_1.iloc[val]
    y_valid_2 = target_2.iloc[val]

    # MLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_1, eval_set=[(x_valid, y_valid_1)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_1 = rmse(y_valid_1, pred)

    # HLM
    model = LGBMRegressor(random_state=SEED, objective='regression')
    model.fit(x_train, y_train_2, eval_set=[(x_valid, y_valid_2)], eval_metric="rmse")
    pred = model.predict(x_valid)
    score_2 = rmse(y_valid_2, pred)


    score = 0.5*score_1 + 0.5*score_2
    scores.append(score)

print(scores)
np.mean(scores)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3390
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 24
[LightGBM] [Info] Start training from score 37.549743
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3390
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 24
[LightGBM] [Info] Start training from score 53.023585
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3387
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 24
[LightGBM] [Info] Start training from score 37.414670
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3387
[LightGBM] [Info] Number of data points in the train set: 2763, number of used features: 24
[LightGBM] [Info] Start training from 

32.85109820929331

In [None]:
model.fit(train_prop, target_1)
lgbm_pred_1 = model.predict(test_prop)

model.fit(train_prop, target_2)
lgbm_pred_2 = model.predict(test_prop)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3393
[LightGBM] [Info] Number of data points in the train set: 3454, number of used features: 24
[LightGBM] [Info] Start training from score 37.276169
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3393
[LightGBM] [Info] Number of data points in the train set: 3454, number of used features: 24
[LightGBM] [Info] Start training from score 53.008790


In [None]:
mlm_pred = (rf_pred_1 + cat_pred_1 + lgbm_pred_1) / 3
hlm_pred = (rf_pred_2 + cat_pred_2 + lgbm_pred_2) / 3

In [None]:
submission

NameError: ignored

In [None]:
submission['MLM'] = mlm_pred
submission['HLM'] = hlm_pred

NameError: ignored

In [None]:
submission

In [None]:
submission.to_csv("submission_ML18(preprocess).csv", index=False)