In [1]:
import os
import uuid

import numpy as np
import pandas as pd
from PIL import Image
from rdkit import Chem
from rdkit.Chem import Draw
from tqdm import tqdm

In [24]:
def smiles_to_image(smiles, img_size=(224, 224)):
    """Render a SMILES string as a 2D drawing of the molecule.

    Args:
        smiles: SMILES string describing the molecule.
        img_size: Image size forwarded to ``Draw.MolToImage`` as ``size``.

    Returns:
        The image object produced by ``Draw.MolToImage``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None rather than raising.
        # Fail here and name the offending string so a bad row is easy to locate.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    return Draw.MolToImage(mol, size=img_size)

In [25]:
# Load the training table from the `balance` data directory (relative to the working directory).
train_balance = pd.read_csv('../data/balance/train.csv')

In [26]:
# Preview the first rows; the frame carries the Smiles, IC50_nM and pIC50 columns.
train_balance.head()

Unnamed: 0,Smiles,IC50_nM,pIC50
0,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...,0.022,10.66
1,N([C@H](C(=O)N1[C@H](C(=O)N[C@H]2c3ccccc3CCC2)...,0.022,10.66
2,O=C(N)c1c(OC)cc2c(OC[C@H]3NC(=O)CC3)ncc(C#CCOC...,0.022,10.66
3,c1(c(cc2c(c(ncc2C#CCOCCOCCOCCC(N[C@H]2C[C@@H](...,0.022,10.66
4,O(C)c1cc2c(ncc(c2cc1C(N)=O)C#CCOCCOCCOCCC(=O)N...,0.022,10.66


In [28]:
# Render every training molecule to a JPEG and record where each image was saved.
list_sample = []
root = "/mnt/4726C8D91DA6DCE5/doannghia/dacon/data/balance/image_train"
# Create the output directory up front; saving into a missing folder would fail.
os.makedirs(root, exist_ok=True)

# Select the columns by name so an extra or reordered CSV column cannot break the
# unpacking, and iterate with itertuples() instead of building one Series per row.
rows = train_balance[["Smiles", "IC50_nM", "pIC50"]].itertuples(index=False, name=None)
for i, (smile, ic50, pic50) in enumerate(tqdm(rows, total=len(train_balance))):
    # The uuid4 prefix is random, so every run writes a fresh set of file names;
    # the positional suffix ties the file back to its row.
    name = str(uuid.uuid4()) + '_' + str(i) + '.jpg'
    path_save = f"{root}/{name}"
    img = smiles_to_image(smile)
    img.save(path_save, format="JPEG")

    list_sample.append([path_save, smile, ic50, pic50])

100%|██████████| 28520/28520 [01:16<00:00, 374.91it/s]


In [29]:
# Tabulate the collected training rows: image path, SMILES, IC50_nM and pIC50.
train_columns = ['Path', 'Smiles', 'IC50_nM', 'pIC50']
df_train = pd.DataFrame(list_sample, columns=train_columns)

In [35]:
# Load the validation table from the `balance` data directory (relative to the working directory).
val_balance = pd.read_csv('../data/balance/val.csv')

In [36]:
# Preview the first rows; the frame carries the Smiles, IC50_nM and pIC50 columns.
val_balance.head()

Unnamed: 0,Smiles,IC50_nM,pIC50
0,NC(=O)[C@@H]1[C@H](Nc2nc(Nc3ccnc(N4CCCC4)c3)nc...,206.0,6.69
1,O=c1[nH]c(N2CCOCC2)nc(N[C@@H]2CCCNC2)c1-c1nccs1,1830.0,5.74
2,CC(C)(O)[C@H](F)CNC(=O)c1cnc(-n2ncc3cc(C#N)cnc...,9.6,8.02
3,COc1cc2cc[nH]c2cc1NC(=O)c1cnn2cccnc12,787.0,6.1
4,CC(C)Nc1cc(-n2ncc3cc(C#N)cnc32)ncc1C(=O)NC1CCN...,15.7,7.8


In [None]:
# Render every validation molecule to a JPEG and build the validation manifest.
# The rows are collected under their own name and turned into `df_val` in this same
# cell, so the frame can never be built from training rows left in `list_sample`
# when cells are run out of order.
list_sample_val = []
root = "/mnt/4726C8D91DA6DCE5/doannghia/dacon/data/balance/image_val"
# Create the output directory up front; saving into a missing folder would fail.
os.makedirs(root, exist_ok=True)

# Select the columns by name so an extra or reordered CSV column cannot break the
# unpacking, and iterate with itertuples() instead of building one Series per row.
rows = val_balance[["Smiles", "IC50_nM", "pIC50"]].itertuples(index=False, name=None)
for i, (smile, ic50, pic50) in enumerate(tqdm(rows, total=len(val_balance))):
    # The uuid4 prefix is random, so every run writes a fresh set of file names;
    # the positional suffix ties the file back to its row.
    name = str(uuid.uuid4()) + '_' + str(i) + '.jpg'
    path_save = f"{root}/{name}"
    img = smiles_to_image(smile)
    img.save(path_save, format="JPEG")

    list_sample_val.append([path_save, smile, ic50, pic50])

df_val = pd.DataFrame(columns=['Path', 'Smiles', 'IC50_nM', 'pIC50'], data=list_sample_val)

In [30]:
# Load the test table (relative to the working directory).
test = pd.read_csv("../data/test.csv")

In [31]:
# Render every test molecule to a JPEG and record where each image was saved.
list_sample_test = []
root = "/mnt/4726C8D91DA6DCE5/doannghia/dacon/data/balance/image_test"
# Create the output directory up front; saving into a missing folder would fail.
os.makedirs(root, exist_ok=True)

# Rows are unpacked positionally as (identifier, SMILES), exactly as before; the
# identifier is bound to `sample_id` so the builtin `id` is no longer shadowed
# for the rest of the kernel session.
rows = test.itertuples(index=False, name=None)
for i, (sample_id, smile) in enumerate(tqdm(rows, total=len(test))):
    # The uuid4 prefix is random, so every run writes a fresh set of file names;
    # the positional suffix ties the file back to its row.
    name = str(uuid.uuid4()) + '_' + str(i) + '.jpg'
    path_save = f"{root}/{name}"
    img = smiles_to_image(smile)
    img.save(path_save, format="JPEG")

    list_sample_test.append([path_save, sample_id, smile])

  0%|          | 0/113 [00:00<?, ?it/s]

100%|██████████| 113/113 [00:00<00:00, 354.24it/s]


In [32]:
# Tabulate the collected test rows: image path, identifier and SMILES.
test_columns = ['Path', "ID", "Smiles"]
df_test = pd.DataFrame(list_sample_test, columns=test_columns)

In [33]:
# Row counts of the train and test manifests (df_val is not included in this check).
len(df_train), len(df_test)

(28520, 113)

In [34]:
# Persist the three manifests as CSV files; index=False leaves the DataFrame index out.
train_csv = "../data/balance/train_image.csv"
val_csv = "../data/balance/val_image.csv"
test_csv = "../data/balance/test_image.csv"

df_train.to_csv(train_csv, index=False)
df_val.to_csv(val_csv, index=False)
df_test.to_csv(test_csv, index=False)