In [23]:
import numpy as np

from rdkit.Chem import AllChem as Chem
from rdkit import DataStructs

In [24]:
def validity(smiles):
    """Return the fraction of SMILES strings that parse into a usable molecule.

    A string counts as valid when it is non-empty, ``Chem.MolFromSmiles``
    returns a molecule (not ``None``), and that molecule has more than one
    atom.

    Args:
        smiles: Sized collection (e.g. list) of SMILES strings.

    Returns:
        Number of valid strings divided by ``len(smiles)``, or ``0.0`` when
        ``smiles`` is empty.
    """
    def verify_sequence(smile):
        # An empty string is never valid, so skip the parser for it.
        if smile == '':
            return False

        mol = Chem.MolFromSmiles(smile)

        return mol is not None and mol.GetNumAtoms() > 1

    n_smiles = len(smiles)
    if n_smiles == 0:
        # Nothing to score: avoid a ZeroDivisionError and report zero validity.
        return 0.0

    # True/False sum as 1/0, giving the count of valid strings.
    n_valid_smiles = sum(map(verify_sequence, smiles))

    return n_valid_smiles / n_smiles

In [25]:
def uniqueness(smiles):
    """Return the fraction of distinct strings in ``smiles``.

    Strings are compared exactly as given (via ``set``); no chemical
    canonicalization is applied, so two different spellings of the same
    molecule count as two unique entries.

    Args:
        smiles: Sized collection (e.g. list) of hashable SMILES strings.

    Returns:
        ``len(set(smiles)) / len(smiles)``, or ``0.0`` when ``smiles`` is
        empty.
    """
    n_smiles = len(smiles)
    if n_smiles == 0:
        # Nothing to score: avoid a ZeroDivisionError.
        return 0.0

    n_uniq_smiles = len(set(smiles))

    return n_uniq_smiles / n_smiles

In [26]:
# Read the sampled SMILES, one per line, dropping lines that are exactly the
# "EMPTY" placeholder after trimming line terminators and spaces.
with open("./result/sampled_smiles_pre_vae.txt") as f:
    trimmed_lines = (line.strip("\r\n ") for line in f)
    list_smiles = [smile for smile in trimmed_lines if smile != "EMPTY"]

In [27]:
# Sanity check: how many sampled SMILES remain after dropping "EMPTY" lines.
len(list_smiles)

100

In [28]:
# Load at most the first 100 lines of the test set as reference SMILES.
with open("./data/test.txt") as f:
    list_test_smiles = []
    for i, e in enumerate(f):
        if i == 100:
            # Stop once 100 lines have been collected.
            break
        # Trim the line terminator and surrounding spaces so each entry is a
        # bare SMILES string, cleaned the same way as the sampled SMILES.
        list_test_smiles.append(e.strip("\r\n "))

In [29]:
# Fraction of sampled SMILES that parse into a molecule with more than one atom.
validity(list_smiles)

1.0

In [30]:
# NOTE(review): `uniqueness` is already defined in an earlier cell of this
# notebook; this later definition is the one in effect after a top-to-bottom run.
def uniqueness(smiles):
    """Return the fraction of distinct strings in ``smiles``.

    Strings are compared exactly as given (via ``set``); no chemical
    canonicalization is applied.

    Args:
        smiles: Sized collection (e.g. list) of hashable SMILES strings.

    Returns:
        ``len(set(smiles)) / len(smiles)``, or ``0.0`` when ``smiles`` is
        empty.
    """
    n_smiles = len(smiles)
    if n_smiles == 0:
        # Nothing to score: avoid a ZeroDivisionError.
        return 0.0

    n_uniq_smiles = len(set(smiles))

    return n_uniq_smiles / n_smiles

In [31]:
# Fraction of sampled SMILES strings that are distinct (exact string match).
uniqueness(list_smiles)

1.0

In [32]:
def diversity(smiles, other_smiles=None):
    """Score how far ``smiles`` are, on average, from a reference set.

    Each valid SMILES is turned into a 2048-bit Morgan fingerprint (radius 4).
    Its mean Tanimoto distance to every reference fingerprint is linearly
    rescaled from the window [0.9, 0.945] to [0, 1] and clipped to that range.
    Strings that are empty, fail to parse, or have a single atom score 0.

    Args:
        smiles: Iterable of SMILES strings to score.
        other_smiles: Iterable of reference SMILES strings. When ``None``,
            ``smiles`` is compared against itself. References that fail to
            parse are ignored.

    Returns:
        Mean of the per-molecule scores. Returns ``0.0`` when ``smiles`` is
        empty or when no reference SMILES could be parsed, instead of the NaN
        (plus RuntimeWarning) that averaging an empty list would produce.
    """
    def remap(x, x_min, x_max):
        # Linear rescale sending x_min -> 0 and x_max -> 1. The two guards
        # cover a zero-width range, where the division would be undefined.
        if x_max == 0 and x_min == 0:
            return 0

        if x_max - x_min == 0:
            return x

        return (x - x_min) / (x_max - x_min)

    def calc_diversity(smile, fps):
        # Unusable inputs contribute a score of 0 rather than being dropped,
        # so they pull the overall mean down.
        if smile == '':
            return 0

        mol = Chem.MolFromSmiles(smile)
        if mol is None or mol.GetNumAtoms() <= 1:
            return 0

        ref_fps = Chem.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
        # returnDistance=True yields distances instead of similarities.
        dist = DataStructs.BulkTanimotoSimilarity(ref_fps, fps, returnDistance=True)
        mean_dist = np.mean(dist)

        # Bounds of the rescaling window applied to the mean distance.
        # NOTE(review): the names suggest empirically chosen reference
        # distances; their origin is not shown in this notebook.
        low_rand_dst, mean_div_dst = 0.9, 0.945
        val = remap(mean_dist, low_rand_dst, mean_div_dst)

        return np.clip(val, 0.0, 1.0)

    # Materialize once so a one-shot iterable can serve as both the scored set
    # and (when other_smiles is None) the reference set.
    smiles = list(smiles)
    if other_smiles is None:
        other_smiles = smiles

    if not smiles:
        return 0.0

    mols = [Chem.MolFromSmiles(s) for s in other_smiles]
    fps = [Chem.GetMorganFingerprintAsBitVect(m, 4, nBits=2048) for m in mols if m is not None]
    if not fps:
        # No usable reference fingerprints: every mean distance would be NaN.
        return 0.0

    divs = [calc_diversity(s, fps=fps) for s in smiles]

    return np.mean(divs)

In [33]:
# Diversity of the sampled SMILES relative to the first 100 test-set lines.
diversity(list_smiles, list_test_smiles)

0.3024808672481752

In [34]:
import pandas as pd

In [35]:
# Load the dataset table (SMILES, property columns and a `split` label per row).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it — consider a configurable data directory.
data = pd.read_csv('/data/Insilico/mmnist_dataset/data/mcf_dataset_splitted.csv')

In [49]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,ID,SMILES,logP,SA,QED,split
0,ZINC50764925,CCCCNC(=O)C1CCCN(C(=O)CCC(C)C)C1,2.5775,2.4214,0.7296,train
1,ZINC65292537,Cc1ccccc1C(=O)N1CCC(Cc2nc3cccnc3n2C)C1,2.9815,2.6987,0.7398,train
2,ZINC31820077,CCc1c(C(=O)N2CCC(C(=O)NC)CC2)cnn1-c1ccccn1,1.4279,2.3242,0.9122,test_scaffolds
3,ZINC65513427,CC(=O)Nc1ccc2nc(NC(=O)C3CC3c3ccco3)sc2c1,3.5899,3.0427,0.7594,test
4,ZINC02782238,CCn1cc(C(=O)Nc2cn(CC)nc2C(N)=O)c(C)n1,0.779,2.508,0.8477,test


In [37]:
# Keep only the rows whose `split` label is "train".
is_train_row = data['split'] == "train"
train_data = data.loc[is_train_row]

In [38]:
# (rows, columns) of the training split.
train_data.shape

(1759412, 6)

In [45]:
# Draw a 200k-row random subsample of the training split. The fixed
# random_state makes the draw (and the CSV exported from it) reproducible
# across kernel restarts.
a = train_data.sample(200_000, random_state=42)

In [47]:
# Preview the first rows of the random training subsample.
a.head()

Unnamed: 0,ID,SMILES,logP,SA,QED,split
24536,ZINC75840918,FC(F)(F)c1ccc2nnc(C3CCCC3)n2n1,2.8007,2.5409,0.7875,train
1978886,ZINC76867352,Cc1nc(C)c(CNC(C)COc2ccc(F)cc2F)s1,3.5952,2.7771,0.8848,train
605073,ZINC62466901,CCC(C)OC(=O)Cn1c(-c2ccc(N)cc2)csc1=O,2.5007,2.8558,0.6801,train
1340705,ZINC19404995,Cc1c(N)cccc1NC(=O)C1Cc2ccccc2O1,2.5194,2.4691,0.8225,train
35366,ZINC55211885,CCN(Cc1nc(COC)no1)c1nc(C)nc2oc(C)nc12,1.7905,3.0739,0.6723,train


In [48]:
# Write the training subsample to a CSV in the current working directory.
# NOTE(review): no `index` argument is passed, so the DataFrame index is
# presumably written as an extra leading column — confirm whether downstream
# readers expect it.
a.to_csv("./mcf_dataset_train_200k.csv")

In [52]:
# Select the two held-out splits by their `split` label.
test_data = data.loc[data['split'] == "test"]
test_sc_data = data.loc[data['split'] == "test_scaffolds"]

# Export a 20k-row random subsample of each held-out split. The fixed
# random_state makes the exported files reproducible across runs.
test_data.sample(20000, random_state=42).to_csv("./mcf_dataset_test_20k.csv")
test_sc_data.sample(20000, random_state=42).to_csv("./mcf_dataset_test_scaffolds_20k.csv")