In [1]:
#!/usr/bin/env python3
import copy
import os
import random

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list, cut_tree
from rdkit.Chem import AllChem
from collections import Counter
import pandas as pd
import numpy as np
from collections import defaultdict
import joblib

In [2]:

# copy from xiong et al. attentivefp
class ScaffoldGenerator(object):
    """
    Generate molecular scaffolds.
    Parameters
    ----------
    include_chirality : : bool, optional (default False)
        Include chirality in scaffolds.
    """

    def __init__(self, include_chirality=False):
        self.include_chirality = include_chirality

    def get_scaffold(self, mol):
        """
        Get Murcko scaffolds for molecules.
        Murcko scaffolds are described in DOI: 10.1021/jm9602928. They are
        essentially that part of the molecule consisting of rings and the
        linker atoms between them.
        Parameters
        ----------
        mols : array_like
            Molecules.
        """
        return MurckoScaffold.MurckoScaffoldSmiles(
            mol=mol, includeChirality=self.include_chirality)


# copy from xiong et al. attentivefp
def generate_scaffold(smiles, include_chirality=False):
    """Compute the Bemis-Murcko scaffold for a SMILES string."""
    mol = Chem.MolFromSmiles(smiles)
    engine = ScaffoldGenerator(include_chirality=include_chirality)
    scaffold = engine.get_scaffold(mol)
    return scaffold



In [3]:
df = pd.read_csv(os.path.join('./datasets/drugbank1/drugbank.csv'))
scaffolds = {}
print(len(df))

191808


In [4]:
drug_set = set()

for i in range(len(df)):
    drug_set.add(df.loc[i, 'Drug1'])
    drug_set.add(df.loc[i, 'Drug2'])

print(len(drug_set))

for d in drug_set:
    try:
        scaffold = generate_scaffold(d)
        if scaffolds.__contains__(scaffold):
            scaffolds[scaffold].append(d)
        else:
            scaffolds[scaffold] = [d]
    except:
        print("error", d)
        # df.drop(index=i, inplace=True)
        continue
# df = df.reset_index()
joblib.dump(scaffolds, "datasets/drugbank1/scaffolds.pkl")


1706


[12:10:40] Explicit valence for atom # 0 N, 4, is greater than permitted


error [H][N]([H])([H])[Pt](Cl)(Cl)[N]([H])([H])[H]


[12:10:40] Explicit valence for atom # 0 N, 4, is greater than permitted
[12:10:41] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[12:10:41] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'
[12:10:41] Explicit valence for atom # 0 N, 4, is greater than permitted


error [H][N]([H])([H])[Pt]1(OCC(=O)O1)[N]([H])([H])[H]
error OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
error [H][N]([H])([H])[Pt]1(OC(=O)C2(CCC2)C(=O)O1)[N]([H])([H])[H]
error [H][N]1([H])[C@@H]2CCCC[C@H]2[N]([H])([H])[Pt]11OC(=O)C(=O)O1


[12:10:41] Explicit valence for atom # 0 N, 4, is greater than permitted


['datasets/drugbank1/scaffolds.pkl']

In [11]:
smile_scafold = {}
for i in range(len(df)):
    smile_scafold[df.loc[i,'Drug1']] = generate_scaffold(df.loc[i,'Drug1'])
    smile_scafold[df.loc[i,'Drug2']] = generate_scaffold(df.loc[i,'Drug2'])

In [73]:
all_key = scaffolds.keys()
print(all_key)

dict_keys(['O=c1ccc2cc3ccoc3cc2o1', 'C1=CCC2C(=C1)c1cc3ccc(cc4nc(cc5ccc(cc2n1)[nH]5)C=C4)[nH]3', '', 'O=C(c1ccccc1)c1cccs1', 'c1ccc2c(c1)Nc1ccccc1S2', 'O=C(c1ccccc1)c1ccccc1', 'O=c1nc2[nH]c3ccccc3nc-2c(=O)[nH]1', 'c1ccc2c(c1)[nH]c1ccccc12', 'O=C(OC1C[N+]2(CCCOc3ccccc3)CCC1CC2)C(c1cccs1)c1cccs1', 'O=C(Cc1ccccc1)OC1CC2[NH2+]C(C1)C1OC21', 'O=C(Cc1ccccc1)OC1CC2CCC(C1)N2', 'O=C1Nc2cccnc2N(C(=O)CN2CCNCC2)c2ccccc21', 'c1ccc2c(c1)Cc1ccccc1N1CCNCC21', 'O=C1N(Cc2ccccc2)C2C[S+]3CCCC3C2N1Cc1ccccc1', 'O=C(NCC1CCCN1)c1ccccc1', 'C1CC2CCC(C1)[NH2+]2', 'c1ccc(COCC[N+]23CCC(C(c4ccccc4)c4ccccc4)(CC2)CC3)cc1', 'C(CC[NH+]1CCCC1)CC[NH+]1CCCC1', 'c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2', 'c1ccccc1', 'O=C(CC[NH+]1CCc2ccccc2C1Cc1ccccc1)OCCCCCOC(=O)CC[NH+]1CCc2ccccc2C1Cc1ccccc1.c1ccccc1.c1ccccc1', 'O=C(OC1CC2[NH2+]C(C1)C1OC21)C(c1cccs1)c1cccs1', 'C1=CC2CC1CC2C(CCN1CCCCC1)c1ccccc1', 'O=C(OC1CC[NH2+]C1)C(c1ccccc1)C1CCCC1', 'O=C(Cc1ccccc1)OC1CC2CCC(C1)[NH2+]2', 'C1=Cc2ccccc2C(=C2CCNCC2)c2ccccc21', 'c1ccc(-c2ccc(NCCN3C

In [43]:
all_key = scaffolds.keys()
print(all_key)
train_scaffold = random.sample(all_key, round(len(all_key) * 0.88))

train_idx = []
test1_idx = []
test2_idx = []
for i in range(len(df)):
    if train_scaffold.__contains__(smile_scafold[df.loc[i, 'Drug1']]) \
            and train_scaffold.__contains__(smile_scafold[df.loc[i, 'Drug2']]):
        train_idx.append(i)
    elif not train_scaffold.__contains__(smile_scafold[df.loc[i, 'Drug1']]) \
            and not train_scaffold.__contains__(smile_scafold[df.loc[i, 'Drug2']]):
        test2_idx.append(i)
    else:
        test1_idx.append(i)
print(len(train_idx), len(test1_idx), len(test2_idx), len(train_idx)+len(test1_idx)+len(test2_idx))
if len(train_idx) > 150000:
    df_old = df.loc[train_idx]
    df_old = df_old.reset_index()
    df_old.to_csv('./datasets/drugbank1/drugbank_train.csv')

    df_test1_idx = df.loc[test1_idx]
    df_test1_idx = df_test1_idx.reset_index()
    df_test1_idx.to_csv('./datasets/drugbank1/drugbank_test1.csv')

    df_test2_idx = df.loc[test2_idx]
    df_test2_idx = df_test2_idx.reset_index()
    df_test2_idx.to_csv('./datasets/drugbank1/drugbank_test2.csv')
    print("done")


153853 35508 2276 191637
done


In [4]:
def search_index(unique_smiles, df, num_class, num_limit):

    vec_list = []
    for smi in unique_smiles:
        m1 = Chem.MolFromSmiles(smi)
        fp4 = list(AllChem.GetMorganFingerprintAsBitVect(m1, radius=2, nBits=256))
        vec_list.append(fp4)
    print("@@@@@@@@@@@", len(vec_list))
    Z = linkage(vec_list, 'average', metric='jaccard')
    cluster = cut_tree(Z, num_class).ravel()
    stat_dict = {k: v for k, v in sorted(Counter(cluster).items(), key=lambda item: item[1], reverse=True)}

    num = 0
    data_dict = defaultdict(list)
    for k,v in stat_dict.items():
        pos = np.nonzero(cluster==k)[0]
        # print(k, stat_dict[k], len(pos))
        smi_idx = []
        for idx in pos:
            smi_single = df[df["Drug1"] == unique_smiles[idx]]
            smi_idx.append(smi_single)
        df_tmp = pd.concat(smi_idx)
        num += len(df_tmp)
        data_dict[k] = df_tmp
    print("@@@@@@@@@@@", num)

    num = 0
    all_keys = list(data_dict.keys())
    class_num = -1
    meat_class = {}
    for k,v in data_dict.items():
        if len(v) > num_limit:
            class_num += 1
            meat_class[class_num] = v
            num += len(v)
            all_keys.remove(k)

    random.shuffle(all_keys)

    smi_idx = []
    smi_idx_num = 0
    for i,k in enumerate(all_keys):
        # print(i, len(data_dict[k]))
        if smi_idx_num < num_limit:
            smi_idx.append(data_dict[k])
            smi_idx_num += len(data_dict[k])
        else:
            class_num += 1
            meat_class[class_num] = pd.concat(smi_idx)
            num += len(meat_class[class_num])

            smi_idx = []
            smi_idx_num = 0
            smi_idx.append(data_dict[k])
            smi_idx_num += len(data_dict[k])

        if i == len(all_keys) -1:
            class_num += 1
            meat_class[class_num] = pd.concat(smi_idx)
            num += len(meat_class[class_num])


    print(class_num, len(meat_class[class_num]),num)

    if len(meat_class[class_num]) < 10:
        meat_class.pop(class_num)

    num = 0
    for k,v in meat_class.items():
        num += len(v)
    print(num)

    return meat_class

In [2]:
df_train = pd.read_csv('datasets/drugbank1/drugbank_train.csv')
unique_smi = df["Drug1"].unique()
meat_class = search_index(unique_smi, df_train, 100, 150)
# meat_class = search_index(unique_smi, df, 100, 100)
meta_train = {}
meta_train_num = 0
meta_train_k_num = 0
meta_val = {}
meta_val_num = 0
meta_val_k_num = 0
meta_keys = list(meat_class.keys())
random.shuffle(meta_keys)
for k in meta_keys:
    if len(meta_train.keys()) < len(meta_keys) *0.8:
        meta_train[k] = meat_class[k]
        meta_train_num += len(meat_class[k])
        meta_train_k_num += 1
    else:
        meta_val_k_num +=1
        meta_val[k] = meat_class[k]
        meta_val_num += len(meat_class[k])
print(meta_train_num, meta_train_k_num,meta_val_num,meta_val_k_num, meta_train_num+meta_val_num)
joblib.dump(meta_train, "datasets/drugbank1/meta_train.pkl")
joblib.dump(meta_val, "datasets/drugbank1/meta_val.pkl")


NameError: name 'df' is not defined

In [5]:
# df = pd.read_csv('datasets/drugbank1/drugbank_test1.csv')
df = pd.read_csv('datasets/drugbank1/drugbank_test2.csv')
unique_smi = df["Drug1"].unique()
meat_class = search_index(unique_smi, df, 100, 200)
# joblib.dump(meat_class, "datasets/drugbank1/meta_test1.pkl")
joblib.dump(meat_class, "datasets/drugbank1/meta_test2.pkl")

@@@@@@@@@@@ 141
@@@@@@@@@@@ 2276
9 271 2276
2276


['datasets/drugbank1/meta_test2.pkl']

In [3]:
meta_train = joblib.load("datasets/drugbank1/meta_train.pkl")
meta_val = joblib.load("datasets/drugbank1/meta_val.pkl")
train_pd = []
for k,v in meta_train.items():
    train_pd.append(v)
df_tmp = pd.concat(train_pd)
df_tmp = df_tmp.reset_index(drop=True)
df_tmp.to_csv('./datasets/drugbank1/drugbank_train.csv')


test_pd = []
for k,v in meta_val.items():
    test_pd.append(v)
df_tmp = pd.concat(test_pd)
df_tmp = df_tmp.reset_index(drop=True)
df_tmp.to_csv('./datasets/drugbank1/drugbank_val.csv')