In [None]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm.auto import tqdm
import random
import os


import matplotlib.pyplot as plt
# import seaborn as sns


DATA_PATH = 'data/data_features/'
SEED = 42


def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

In [None]:
# 1. 극단치만 판단해서 18개 제거
outliers = train[(train['MLM'] > 100.0) | (train['HLM'] > 100.0) | (train['AlogP'] < -3) | (train['Molecular_Weight'] > 800)
     | (train['Num_H_Acceptors'] > 14) | (train['Num_H_Donors'] > 9) | (train['Num_RotatableBonds'] > 20)
     | (train['LogD'] < -4) | (train['Molecular_PolarSurfaceArea'] > 250)]


outliers.index

len(outliers.index)

train = train.drop(index=outliers.index, axis=0)


train.shape # 3498 -> 3480

(3480, 11)

In [None]:
duplicates = train[train.duplicated(subset=['SMILES'], keep=False)]

duplicates.sort_values(by='SMILES')


duplicates.shape # 이상치 제거로 27 -> 26쌍

processed_duplicate = duplicates.groupby('SMILES')[['MLM','HLM']].mean().reset_index()
processed_duplicate

tmp = pd.merge(processed_duplicate,duplicates,how='left',on='SMILES')
tmp

### -> 중복치 제거 (총 26개)

tmp = tmp.iloc[0::2]  # 첫 번째 행부터 시작하여 2씩 증가하여 행 선택
tmp.drop(['MLM_y','HLM_y'],axis =1,inplace=True)

tmp.columns = ['SMILES', 'MLM', 'HLM','id',  'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea']

processed_duplicate = tmp[['id', 'SMILES', 'MLM', 'HLM', 'AlogP', 'Molecular_Weight',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea']]

train = train.drop_duplicates(subset=['SMILES'], keep=False) #기존 train에서 중복치들을 우선 모두 제거
train.shape

processed_duplicate.shape

train = pd.concat([train, processed_duplicate], axis=0, ignore_index=True)
train.shape


(3454, 11)

In [None]:
!pip install Chem

!pip install rdkit

from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole     # 화면에 출력하기 위한 옵션
IPythonConsole.ipython_useSVG=True

train['mol'] = train['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
train['mol'] = train['mol'].apply(lambda x: Chem.AddHs(x))
train['num_of_atoms'] = train['mol'].apply(lambda x: x.GetNumAtoms())
train['num_of_heavy_atoms'] = train['mol'].apply(lambda x: x.GetNumHeavyAtoms())
train.head()

# 탄소 패턴을 지정한다
c_patt = Chem.MolFromSmiles('C')
# 이 패턴이 들어있는 곳을 찾는다. 패턴의 수를 세면 탄소 원자가 몇개 들어있는지 알 수 있다
print(train['mol'][0].GetSubstructMatches(c_patt))

# - num_of_{}_atoms 추가

# 임의의 패턴(원자)를 몇개 포함하고 있는지를 얻는 함수
def number_of_atoms(atom_list, df):
    for i in atom_list:
        df['num_of_{}_atoms'.format(i)] = df['mol'].apply(lambda x: len(x.GetSubstructMatches(Chem.MolFromSmiles(i))))

number_of_atoms(['C', 'O', 'N', 'Cl'], train) # 탄소, 산소, 질소, 염소
train.head()

# - 테스트 데이터에도 추가

test['mol'] = test['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
test['mol'] = test['mol'].apply(lambda x: Chem.AddHs(x))
test['num_of_atoms'] = test['mol'].apply(lambda x: x.GetNumAtoms())
test['num_of_heavy_atoms'] = test['mol'].apply(lambda x: x.GetNumHeavyAtoms())

number_of_atoms(['C','O', 'N', 'Cl'], test)
test.head()



Defaulting to user installation because normal site-packages is not writeable
[0m[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mDefaulting to user installation because normal site-packages is not writeable
[0m[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m((0,), (1,), (3,), (4,), (5,), (6,), (7,), (9,), (11,), (12,), (13,), (14,), (16,)

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee510>,52,25,18,1,5,0
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee330>,49,27,20,5,2,0
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee3f0>,47,26,20,1,5,0
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee450>,41,26,18,1,7,0
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee390>,49,26,19,2,5,0


In [None]:
# GNN

!pip install torch


!pip install torch-geometric




Defaulting to user installation because normal site-packages is not writeable
[0m[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mDefaulting to user installation because normal site-packages is not writeable
[0m[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [None]:

import torch
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MolFromSmiles
from torch_geometric.data import Data

In [None]:



# SMILES 문자열을 입력으로 받아 RDKit Mol 객체로 변환
def smiles_to_mol(smiles):
    mol = MolFromSmiles(smiles)
    mol = Chem.AddHs(mol)  # 수소 원자 추가
    mol = Chem.MolToSmiles(mol)  # 정규화된 SMILES로 변환
    mol = Chem.MolFromSmiles(mol)
    return mol

# RDKit Mol 객체를 PyTorch Geometric Data로 변환
def mol_to_geometric_data(mol):
    num_atoms = mol.GetNumAtoms()
    edge_indices = []
    edge_attr = []

    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_indices.append((start, end))
        edge_indices.append((end, start))  # 무방향 그래프이므로 역방향 엣지 추가
        edge_attr.extend([bond.GetBondTypeAsDouble()] * 2)

    x = torch.eye(num_atoms)  # 노드 피처는 항등 행렬로 초기화

    data = Data(x=x, edge_index=torch.tensor(edge_indices).t().contiguous(), edge_attr=torch.tensor(edge_attr))
    return data

# SMILES 문자열을 입력으로 받아 그래프 데이터로 변환
def smiles_to_graph(smiles):
    mol = smiles_to_mol(smiles)
    data = mol_to_geometric_data(mol)
    return data


In [None]:

# # 그래프 데이터에서 필요한 정보를 추출하여 사용하세요

df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])
df['SMILES'] = train['SMILES']

df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])


In [None]:
# 예제 SMILES 문자열
df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])


smiles_strings = train['SMILES']  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환하고 데이터프레임에 추가
for smiles_string in tqdm(smiles_strings):
    graph_data = smiles_to_graph(smiles_string)  # 앞서 정의한 함수를 사용하여 SMILES를 그래프로 변환

#     # 데이터프레임에 추가
#     df = df.append({'SMILES': smiles_string,
#                           'graph_x': graph_data.x.numpy().sum(),
#                           'graph_index': graph_data.edge_index.numpy().sum(),
#                           'graph_attr': graph_data.edge_attr.numpy().sum()}, ignore_index=True)




    # 데이터프레임에 추가할 데이터 생성
    new_data = {'SMILES': smiles_string,
                'graph_x': graph_data.x.numpy().sum(),
                'graph_index': graph_data.edge_index.numpy().sum(),
                'graph_attr': graph_data.edge_attr.numpy().sum()}

    # 데이터프레임에 데이터 추가
    df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)


  0%|          | 0/3454 [00:00<?, ?it/s]

In [None]:
df

Unnamed: 0,SMILES,graph_x,graph_index,graph_attr
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,28.0,1550,78.0
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,21.0,898,59.0
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,22.0,1026,65.0
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,35.0,2586,99.0
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,20.0,820,58.0
...,...,...,...,...
3449,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,26.0,1516,86.0
3450,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,38.0,3028,106.0
3451,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,33.0,2284,107.0
3452,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,28.0,1636,76.0


In [None]:
df_train = df[['graph_x','graph_index','graph_attr']]
df_train

train = pd.concat([train,df_train],axis=1)



In [None]:
df = pd.DataFrame(columns=['SMILES', 'graph_x', 'graph_index', 'graph_attr'])

smiles_strings = test['SMILES']  # 여기에 원하는 SMILES 문자열을 넣으세요

# SMILES를 그래프로 변환하고 데이터프레임에 추가
for smiles_string in tqdm(smiles_strings):
    graph_data = smiles_to_graph(smiles_string)  # 앞서 정의한 함수를 사용하여 SMILES를 그래프로 변환

    # 데이터프레임에 추가할 데이터 생성
    new_data = {'SMILES': smiles_string,
                'graph_x': graph_data.x.numpy().sum(),
                'graph_index': graph_data.edge_index.numpy().sum(),
                'graph_attr': graph_data.edge_attr.numpy().sum()}

    # 데이터프레임에 데이터 추가
    df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)


# 화합물 군집화

  0%|          | 0/483 [00:00<?, ?it/s]

In [None]:
df

Unnamed: 0,SMILES,graph_x,graph_index,graph_attr
0,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,25.0,1270,65.0
1,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,27.0,1488,76.0
2,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,26.0,1372,78.0
3,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,26.0,1464,83.0
4,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,26.0,1444,77.0
...,...,...,...,...
478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,22.0,946,53.0
479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,24.0,1178,60.0
480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,26.0,1452,76.0
481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,20.0,776,58.0


In [None]:

df_test = df[['graph_x','graph_index','graph_attr']]
df_test

test = pd.concat([test,df_test],axis=1)

train['graph_index'] = train['graph_index'].astype(float)
test['graph_index'] = test['graph_index'].astype(float)


In [None]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,graph_x,graph_index,graph_attr
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.0100,50.6800,3.259,400.495,5,2,8,3.259,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca101b0>,52,28,20,3,4,0,28.0,1550.0,78.0
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.2700,50.5900,2.169,301.407,2,1,2,2.172,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca10210>,40,21,16,1,3,0,21.0,898.0,59.0
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.5860,80.8920,1.593,297.358,5,0,3,1.585,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca10150>,41,22,15,0,7,0,22.0,1026.0,65.0
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.7100,2.0000,4.771,494.652,6,0,5,3.475,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca10390>,69,35,26,2,6,0,35.0,2586.0,99.0
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.2700,99.9900,2.335,268.310,3,0,1,2.337,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca10450>,36,20,16,2,2,0,20.0,820.0,58.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,TRAIN_1833,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,28.4630,24.1955,4.609,340.381,4,1,2,4.736,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9624b0>,42,26,20,0,6,0,26.0,1516.0,86.0
3450,TRAIN_0898,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,56.0435,66.1260,4.282,553.480,5,0,7,4.009,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c962510>,68,38,29,3,4,2,38.0,3028.0,106.0
3451,TRAIN_2575,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,56.3130,63.7320,4.304,459.520,6,1,4,4.304,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c962570>,54,33,24,3,5,0,33.0,2284.0,107.0
3452,TRAIN_0216,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,70.2150,93.8400,-1.133,385.417,7,0,4,-1.133,...,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9625d0>,51,28,19,4,5,0,28.0,1636.0,76.0


In [None]:
test

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,graph_x,graph_index,graph_attr
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee510>,52,25,18,1,5,0,25.0,1270.0,65.0
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee330>,49,27,20,5,2,0,27.0,1488.0,76.0
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.290,92.86,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee3f0>,47,26,20,1,5,0,26.0,1372.0,78.0
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee450>,41,26,18,1,7,0,26.0,1464.0,83.0
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee390>,49,26,19,2,5,0,26.0,1444.0,77.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,TEST_478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2,1,7,4.207,55.13,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05a50>,52,22,18,2,2,0,22.0,946.0,53.0
479,TEST_479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5,0,1,-1.736,70.16,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05ab0>,49,24,17,4,3,0,24.0,1178.0,60.0
480,TEST_480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3,1,3,1.792,69.72,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05b10>,45,26,20,3,3,0,26.0,1452.0,76.0
481,TEST_481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3,2,2,0.423,69.64,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05b70>,30,20,13,3,2,0,20.0,776.0,58.0


In [None]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
[0m[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [None]:
!pip install rdkit

Defaulting to user installation because normal site-packages is not writeable
[0m[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [None]:
!pip install torch

Defaulting to user installation because normal site-packages is not writeable
[0m[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [None]:
!pip install dgl

Defaulting to user installation because normal site-packages is not writeable
[0mCollecting dgl
  Obtaining dependency information for dgl from https://files.pythonhosted.org/packages/d1/9c/9bf782f567215114bc000afd578f2f40a8cfd32ffeb0c5141e0a1ea4c6a2/dgl-1.1.2-cp38-cp38-manylinux1_x86_64.whl.metadata
  Downloading dgl-1.1.2-cp38-cp38-manylinux1_x86_64.whl.metadata (558 bytes)
Collecting networkx>=2.1 (from dgl)
  Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading dgl-1.1.2-cp38-cp38-manylinux1_x86_64.whl (6.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[0m[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info

In [None]:
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.DataStructs import BulkTanimotoSimilarity
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt


In [None]:
from PIL import Image

import torch
from transformers import AutoImageProcessor, ResNetForImageClassification
from PIL import Image
import torch.nn.functional as F

In [None]:
# SMILES 문자열을 RDKit Mol 객체로 변환
def smiles_to_mol(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol

# 분자 그림 생성
def draw_molecule(mol, file_name):
    img = Draw.MolToImage(mol)
    img.save(file_name)

In [None]:
for i, mol in tqdm(enumerate(train['mol'])):
    draw_molecule(mol, f'{DATA_PATH}molecule_{i}.png')

0it [00:00, ?it/s]

In [None]:
def get_honeycomb_probability(i):
    image_path = f'{DATA_PATH}molecule_{i}.png'
    image = Image.open(image_path)

    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    # 로짓을 확률로 변환
    probs = F.softmax(logits, dim=-1)

    # 'honeycomb' 라벨의 확률 추출
    honeycomb_label_index = model.config.label2id['honeycomb']
    honeycomb_probability = probs[0][honeycomb_label_index].item()

    return honeycomb_probability


In [None]:
train['mode_label_proba'] = 0

In [None]:
from tqdm import tqdm

# train 데이터프레임의 처음 10개 행에 대해서만 작업을 수행
for proba in tqdm(train['mode_label_proba'], desc="Processing", ncols=100):
    train['mode_label_proba'][i] = get_honeycomb_probability(i)


Processing:   0%|                                                          | 0/3454 [00:00<?, ?it/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
2023-09-16 07:13:00.960477: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-16 07:13:01.622124: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading (…)lve/main/config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/103M [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['mode_label_proba'][i] = get_honeycomb_probability(i)
Processing:   0%|                                                | 1/3454 [00:08<8:06:29,  8.45s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Processing:   0%|                                                | 2/3454 [00:09<3:44:38,  3.90s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Processing:   0%|                                                | 3/3454 [00:09<2:20:49,  2.45s/it]Could not find image processor class in the image processor config or the model config. 

In [None]:
for i, mol in tqdm(enumerate(test['mol'])):
    draw_molecule(mol, f'{DATA_PATH}molecule_test_{i}.png')

483it [00:06, 73.83it/s]


In [None]:
def get_honeycomb_probability_test(i):
    image_path = f'{DATA_PATH}molecule_test_{i}.png'
    image = Image.open(image_path)

    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    # 로짓을 확률로 변환
    probs = F.softmax(logits, dim=-1)

    # 'honeycomb' 라벨의 확률 추출
    honeycomb_label_index = model.config.label2id['honeycomb']
    honeycomb_probability = probs[0][honeycomb_label_index].item()

    return honeycomb_probability


In [None]:
test['mode_label_proba']=0

In [None]:
from tqdm import tqdm

# train 데이터프레임의 처음 10개 행에 대해서만 작업을 수행
for proba in tqdm(test['mode_label_proba'], desc="Processing", ncols=100):
    test['mode_label_proba'][i] = get_honeycomb_probability(i)

Processing:   0%|                                                           | 0/483 [00:00<?, ?it/s]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['mode_label_proba'][i] = get_honeycomb_probability(i)
Processing:   0%|                                                   | 1/483 [00:01<08:55,  1.11s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Processing:   0%|▏                                                  | 2/483 [00:01<07:03,  1.14it/s]Could not find image processor class in the image processor config or the model config. L

In [None]:
test.drop(columns='model_label_proba',axis=1, inplace=True)

In [None]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,graph_x,graph_index,graph_attr,mode_label_proba
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.0100,50.6800,3.259,400.495,5,2,8,3.259,...,52,28,20,3,4,0,28.0,1550.0,78.0,0.00000
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.2700,50.5900,2.169,301.407,2,1,2,2.172,...,40,21,16,1,3,0,21.0,898.0,59.0,0.00000
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.5860,80.8920,1.593,297.358,5,0,3,1.585,...,41,22,15,0,7,0,22.0,1026.0,65.0,0.00000
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.7100,2.0000,4.771,494.652,6,0,5,3.475,...,69,35,26,2,6,0,35.0,2586.0,99.0,0.00000
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.2700,99.9900,2.335,268.310,3,0,1,2.337,...,36,20,16,2,2,0,20.0,820.0,58.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,TRAIN_1833,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,28.4630,24.1955,4.609,340.381,4,1,2,4.736,...,42,26,20,0,6,0,26.0,1516.0,86.0,0.00000
3450,TRAIN_0898,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,56.0435,66.1260,4.282,553.480,5,0,7,4.009,...,68,38,29,3,4,2,38.0,3028.0,106.0,0.00000
3451,TRAIN_2575,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,56.3130,63.7320,4.304,459.520,6,1,4,4.304,...,54,33,24,3,5,0,33.0,2284.0,107.0,0.00000
3452,TRAIN_0216,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,70.2150,93.8400,-1.133,385.417,7,0,4,-1.133,...,51,28,19,4,5,0,28.0,1636.0,76.0,0.00000


In [None]:
test

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,graph_x,graph_index,graph_attr,mode_label_proba
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee510>,52,25,18,1,5,0,25.0,1270.0,65.0,0.000000
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee330>,49,27,20,5,2,0,27.0,1488.0,76.0,0.000000
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.290,92.86,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee3f0>,47,26,20,1,5,0,26.0,1372.0,78.0,0.000000
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee450>,41,26,18,1,7,0,26.0,1464.0,83.0,0.000000
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee390>,49,26,19,2,5,0,26.0,1444.0,77.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,TEST_478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2,1,7,4.207,55.13,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05a50>,52,22,18,2,2,0,22.0,946.0,53.0,0.000000
479,TEST_479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5,0,1,-1.736,70.16,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05ab0>,49,24,17,4,3,0,24.0,1178.0,60.0,0.000000
480,TEST_480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3,1,3,1.792,69.72,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05b10>,45,26,20,3,3,0,26.0,1452.0,76.0,0.000000
481,TEST_481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3,2,2,0.423,69.64,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05b70>,30,20,13,3,2,0,20.0,776.0,58.0,0.000000


In [None]:
train.to_csv(f"{DATA_PATH}train_mode_label_proba_1.csv", index=False)
test.to_csv(f"{DATA_PATH}test_mode_label_proba_1.csv", index=False)

In [None]:
def get_image_label(i):
    image_path = f'{DATA_PATH}molecule_{i}.png'
    image = Image.open(image_path)

    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_label = logits.argmax().item()
    return predicted_label

In [None]:
# tqdm로 진행 상황을 보면서 이미지 라벨을 얻습니다.
predicted_labels = []
for i in tqdm(range(len(train))):  # train 데이터프레임의 크기에 따라 수정하세요.
    label = get_image_label(i)
    predicted_labels.append(label)

# predicted_labels을 train 데이터프레임에 추가합니다.
train['predicted_label'] = predicted_labels

  0%|          | 0/3454 [00:00<?, ?it/s]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 1/3454 [00:01<1:23:32,  1.45s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 2/3454 [00:02<58:42,  1.02s/it]  Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 3/3454 [00:03<59:26,  1.03s/it]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 4/3454 [00:04<1:00:30,  1.05s/it]Could not find image processor class in the image processor config or the 

In [None]:
def get_image_label_test(i):
    image_path = f'{DATA_PATH}molecule_test_{i}.png'
    image = Image.open(image_path)

    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_label = logits.argmax().item()
    return predicted_label

In [None]:
# tqdm로 진행 상황을 보면서 이미지 라벨을 얻습니다.
predicted_labels = []
for i in tqdm(range(len(test))):  # train 데이터프레임의 크기에 따라 수정하세요.
    label = get_image_label_test(i)
    predicted_labels.append(label)

# predicted_labels을 train 데이터프레임에 추가합니다.
test['predicted_label'] = predicted_labels

  0%|          | 0/483 [00:00<?, ?it/s]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 1/483 [00:00<05:55,  1.36it/s]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  0%|          | 2/483 [00:01<05:53,  1.36it/s]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  1%|          | 3/483 [00:02<05:50,  1.37it/s]Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  1%|          | 4/483 [00:02<05:48,  1.37it/s]Could not find image processor class in the image processor config or the model confi

In [None]:
train['predicted_label'].unique()

array([599, 644, 677, 892, 409, 530, 845])

In [None]:
test['predicted_label'].unique()

array([599, 644, 892, 530, 845])

In [None]:
train.to_csv(f"{DATA_PATH}train_label_proba_2.csv", index=False)
test.to_csv(f"{DATA_PATH}test_label_proba_2.csv", index=False)

In [None]:
print(model.config.label2id['honeycomb'])


599


In [None]:
for i in train['predicted_label'].unique():
    label_id = model.config.id2label[i]
    print(label_id)

honeycomb
matchstick
nail
wall clock
analog clock
digital clock
syringe


In [None]:
model.config

ResNetConfig {
  "_name_or_path": "microsoft/resnet-50",
  "architectures": [
    "ResNetForImageClassification"
  ],
  "depths": [
    3,
    4,
    6,
    3
  ],
  "downsample_in_first_stage": false,
  "embedding_size": 64,
  "hidden_act": "relu",
  "hidden_sizes": [
    256,
    512,
    1024,
    2048
  ],
  "id2label": {
    "0": "tench, Tinca tinca",
    "1": "goldfish, Carassius auratus",
    "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
    "3": "tiger shark, Galeocerdo cuvieri",
    "4": "hammerhead, hammerhead shark",
    "5": "electric ray, crampfish, numbfish, torpedo",
    "6": "stingray",
    "7": "cock",
    "8": "hen",
    "9": "ostrich, Struthio camelus",
    "10": "brambling, Fringilla montifringilla",
    "11": "goldfinch, Carduelis carduelis",
    "12": "house finch, linnet, Carpodacus mexicanus",
    "13": "junco, snowbird",
    "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
    "15": "robin, Ameri

In [None]:
index_label = [599, 644, 677, 892, 409, 530, 845]

In [None]:
df_train = pd.DataFrame(0, columns=index_label, index=range(train.shape[0]))
df_train

Unnamed: 0,599,644,677,892,409,530,845
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
3449,0,0,0,0,0,0,0
3450,0,0,0,0,0,0,0
3451,0,0,0,0,0,0,0
3452,0,0,0,0,0,0,0


In [None]:
ㄴindex_label = [599, 644, 677, 892, 409, 530, 845]

import pandas as pd
import torch
from PIL import Image
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoConfig
import numpy as np

# Define your labels

def get_labels_probability(i, DATA_PATH, df_train):
    image_path = f'{DATA_PATH}molecule_{i}.png'
    image = Image.open(image_path)

    # Load the feature extractor and model
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50")
    config = AutoConfig.from_pretrained("microsoft/resnet-50")
    model = AutoModelForImageClassification.from_pretrained("microsoft/resnet-50", config=config)

    inputs = feature_extractor(images=image, return_tensors="pt")
    labels = torch.tensor([index_label], dtype=torch.float32)

    with torch.no_grad():
        # Get the logits from the model
        logits = model(**inputs).logits

    # Apply softmax to get label probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=1)[0].numpy()

    # Create a dictionary with label probabilities
    label_probabilities = {label: probability for label, probability in zip(index_label, probabilities)}

    # Append the label probabilities to the DataFrame
    df_train = pd.concat([df_train, pd.DataFrame(label_probabilities, index=[0])], ignore_index=True)

    return df_train



In [None]:
# Example usage:
# Initialize an empty DataFrame
df_train = pd.DataFrame(columns=index_label)

# Define the number of test images (replace with your actual value)

# Iterate through your test images and update the DataFrame
for i in tqdm(range(train.shape[0])):
    df_train = get_labels_probability(i, DATA_PATH, df_train)

# Display the DataFrame
print(df_train)


100%|██████████| 3454/3454 [46:36<00:00,  1.24it/s]

               599           644           677           892           409  \
0     3.850327e-07  4.714931e-08  2.964005e-07  3.672964e-07  1.236351e-07   
1     1.561324e-05  2.984224e-06  3.796620e-06  4.877138e-06  7.153630e-06   
2     5.398044e-07  1.039881e-07  4.922246e-07  3.299579e-07  5.504496e-07   
3     6.333834e-07  1.287260e-07  1.419117e-06  4.466020e-07  4.713968e-07   
4     1.807379e-07  1.397058e-07  4.409355e-08  1.001132e-07  9.883728e-08   
...            ...           ...           ...           ...           ...   
3449  3.652243e-08  2.886769e-09  1.467746e-08  4.026949e-08  8.061966e-09   
3450  5.645429e-07  4.619418e-08  1.490166e-07  3.249211e-07  8.366377e-08   
3451  2.458780e-08  4.849249e-09  1.568974e-08  4.703590e-08  8.944228e-09   
3452  1.566753e-05  6.692989e-06  2.691478e-05  4.757677e-06  4.541992e-05   
3453  3.765951e-08  4.623397e-09  3.019535e-08  3.317234e-08  1.976569e-08   

               530           845  
0     7.799269e-08  9.643183




In [None]:
train = pd.concat([train,df_train],axis=1)
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,graph_attr,mode_label_proba,predicted_label,599,644,677,892,409,530,845
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.0100,50.6800,3.259,400.495,5,2,8,3.259,...,78.0,0.00000,599,3.850327e-07,4.714931e-08,2.964005e-07,3.672964e-07,1.236351e-07,7.799269e-08,9.643183e-08
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.2700,50.5900,2.169,301.407,2,1,2,2.172,...,59.0,0.00000,599,1.561324e-05,2.984224e-06,3.796620e-06,4.877138e-06,7.153630e-06,5.059904e-06,3.028648e-06
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.5860,80.8920,1.593,297.358,5,0,3,1.585,...,65.0,0.00000,599,5.398044e-07,1.039881e-07,4.922246e-07,3.299579e-07,5.504496e-07,2.583589e-07,1.978212e-07
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.7100,2.0000,4.771,494.652,6,0,5,3.475,...,99.0,0.00000,599,6.333834e-07,1.287260e-07,1.419117e-06,4.466020e-07,4.713968e-07,2.351361e-07,3.708277e-07
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.2700,99.9900,2.335,268.310,3,0,1,2.337,...,58.0,0.00000,599,1.807379e-07,1.397058e-07,4.409355e-08,1.001132e-07,9.883728e-08,5.704584e-08,1.388257e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3449,TRAIN_1833,Cc1ccc(-c2cc(-c3nc4c5ccccc5ncn4n3)[nH]n2)c(C)c1,28.4630,24.1955,4.609,340.381,4,1,2,4.736,...,86.0,0.00000,599,3.652243e-08,2.886769e-09,1.467746e-08,4.026949e-08,8.061966e-09,5.806772e-09,9.016412e-09
3450,TRAIN_0898,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,56.0435,66.1260,4.282,553.480,5,0,7,4.009,...,106.0,0.00000,599,5.645429e-07,4.619418e-08,1.490166e-07,3.249211e-07,8.366377e-08,9.431290e-08,6.245198e-08
3451,TRAIN_2575,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,56.3130,63.7320,4.304,459.520,6,1,4,4.304,...,107.0,0.00000,599,2.458780e-08,4.849249e-09,1.568974e-08,4.703590e-08,8.944228e-09,1.046075e-08,5.487400e-09
3452,TRAIN_0216,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,70.2150,93.8400,-1.133,385.417,7,0,4,-1.133,...,76.0,0.00000,599,1.566753e-05,6.692989e-06,2.691478e-05,4.757677e-06,4.541992e-05,2.196831e-05,8.150343e-06


In [None]:
def get_labels_probability_test(i, DATA_PATH, df_train):
    image_path = f'{DATA_PATH}molecule_test_{i}.png'
    image = Image.open(image_path)

    # Load the feature extractor and model
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50")
    config = AutoConfig.from_pretrained("microsoft/resnet-50")
    model = AutoModelForImageClassification.from_pretrained("microsoft/resnet-50", config=config)

    inputs = feature_extractor(images=image, return_tensors="pt")
    labels = torch.tensor([index_label], dtype=torch.float32)

    with torch.no_grad():
        # Get the logits from the model
        logits = model(**inputs).logits

    # Apply softmax to get label probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=1)[0].numpy()

    # Create a dictionary with label probabilities
    label_probabilities = {label: probability for label, probability in zip(index_label, probabilities)}

    # Append the label probabilities to the DataFrame
    df_test = pd.concat([df_train, pd.DataFrame(label_probabilities, index=[0])], ignore_index=True)

    return df_test

In [None]:
# Example usage:
# Initialize an empty DataFrame
df_test = pd.DataFrame(columns=index_label)

# Define the number of test images (replace with your actual value)

# Iterate through your test images and update the DataFrame
for i in tqdm(range(test.shape[0])):
    df_test = get_labels_probability(i, DATA_PATH, df_test)

# Display the DataFrame
print(df_test)

100%|██████████| 483/483 [06:29<00:00,  1.24it/s]

              599           644           677           892           409  \
0    3.850327e-07  4.714931e-08  2.964005e-07  3.672964e-07  1.236351e-07   
1    1.561324e-05  2.984224e-06  3.796620e-06  4.877138e-06  7.153630e-06   
2    5.398044e-07  1.039881e-07  4.922246e-07  3.299579e-07  5.504496e-07   
3    6.333834e-07  1.287260e-07  1.419117e-06  4.466020e-07  4.713968e-07   
4    1.807379e-07  1.397058e-07  4.409355e-08  1.001132e-07  9.883728e-08   
..            ...           ...           ...           ...           ...   
478  5.032433e-06  8.788252e-06  2.628836e-06  1.382537e-06  4.043608e-06   
479  1.868070e-05  2.879873e-05  5.364458e-05  1.377469e-05  5.720182e-05   
480  5.112404e-05  7.930943e-05  6.320205e-05  3.549116e-05  7.387088e-05   
481  1.075247e-07  2.357480e-08  2.108159e-08  8.058865e-08  2.247606e-08   
482  7.366234e-06  4.003631e-06  4.013834e-06  4.060432e-06  9.186190e-06   

              530           845  
0    7.799269e-08  9.643183e-08  
1    5.




In [None]:
test = pd.read_csv(f"{DATA_PATH}test_label_proba_2.csv")
test

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,mol,...,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,graph_x,graph_index,graph_attr,mode_label_proba,predicted_label
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee510>,...,25,18,1,5,0,25.0,1270.0,65.0,0.000000,599
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee330>,...,27,20,5,2,0,27.0,1488.0,76.0,0.000000,599
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.290,92.86,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee3f0>,...,26,20,1,5,0,26.0,1372.0,78.0,0.000000,599
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee450>,...,26,18,1,7,0,26.0,1464.0,83.0,0.000000,599
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,<rdkit.Chem.rdchem.Mol object at 0x7f2f3c9ee390>,...,26,19,2,5,0,26.0,1444.0,77.0,0.000000,599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,TEST_478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2,1,7,4.207,55.13,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05a50>,...,22,18,2,2,0,22.0,946.0,53.0,0.000000,644
479,TEST_479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5,0,1,-1.736,70.16,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05ab0>,...,24,17,4,3,0,24.0,1178.0,60.0,0.000000,644
480,TEST_480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3,1,3,1.792,69.72,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05b10>,...,26,20,3,3,0,26.0,1452.0,76.0,0.000000,599
481,TEST_481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3,2,2,0.423,69.64,<rdkit.Chem.rdchem.Mol object at 0x7f2f3ca05b70>,...,20,13,3,2,0,20.0,776.0,58.0,0.000000,599


In [None]:
df_test

Unnamed: 0,599,644,677,892,409,530,845
0,3.850327e-07,4.714931e-08,2.964005e-07,3.672964e-07,1.236351e-07,7.799269e-08,9.643183e-08
1,1.561324e-05,2.984224e-06,3.796620e-06,4.877138e-06,7.153630e-06,5.059904e-06,3.028648e-06
2,5.398044e-07,1.039881e-07,4.922246e-07,3.299579e-07,5.504496e-07,2.583589e-07,1.978212e-07
3,6.333834e-07,1.287260e-07,1.419117e-06,4.466020e-07,4.713968e-07,2.351361e-07,3.708277e-07
4,1.807379e-07,1.397058e-07,4.409355e-08,1.001132e-07,9.883728e-08,5.704584e-08,1.388257e-08
...,...,...,...,...,...,...,...
478,5.032433e-06,8.788252e-06,2.628836e-06,1.382537e-06,4.043608e-06,3.610053e-06,7.687655e-07
479,1.868070e-05,2.879873e-05,5.364458e-05,1.377469e-05,5.720182e-05,1.931656e-05,1.222659e-05
480,5.112404e-05,7.930943e-05,6.320205e-05,3.549116e-05,7.387088e-05,5.505913e-05,2.545091e-05
481,1.075247e-07,2.357480e-08,2.108159e-08,8.058865e-08,2.247606e-08,4.325810e-08,1.508215e-08


In [None]:
test = pd.concat([test,df_test],axis=1)
test.shape

(483, 28)

In [None]:
train.to_csv(f"{DATA_PATH}train_label_proba_3.csv", index=False)
test.to_csv(f"{DATA_PATH}test_label_proba_3.csv", index=False)

In [None]:
df_test.to_csv(f"{DATA_PATH}df_test.csv", index=False)