# Import and Setup

In [None]:
# Clone the entire repo
%rm -rf .temp/
!git clone -l -s https://github.com/manbaritone/GraphEGFR .temp
%rm .temp/run.ipynb
%mv .temp/* ./
%rm -rf .temp/*

In [None]:
# Install Python dependencies
# NOTE(review): no versions are pinned, so a fresh environment may resolve to
# newer releases than the ones this notebook was last executed with.
# `torch` and `pandas` are imported by the import cell but not installed here;
# they are assumed to be preinstalled in the runtime — TODO confirm.
%pip install rdkit
%pip install deepchem
%pip install JPype1
# torch_geometric provides the DataLoader used to batch the dataset.
%pip install torch_geometric
%pip install dgl dgllife
# NOTE(review): confirm the PyPI distribution named `pybel` is the module the
# project actually needs (Open Babel's `pybel` module is, as far as I know,
# shipped with the `openbabel` package instead) — TODO verify.
%pip install pybel

In [None]:
# Download large required project files for the test
!wget https://zenodo.org/records/8051021/files/GraphEGFR.tar.gz
!tar -xvf /content/GraphEGFR.tar.gz -C .

In [1]:
import pandas as pd
import os
import torch
from torch_geometric.loader import DataLoader

from graphegfr.models import GraphEGFR
from graphegfr.configs import Configs
from graphegfr.fingerprint import Fingerprint
from graphegfr.featurizer import generate_npdata, clean_smiles
from graphegfr.dataset import load_dataset

2024-05-06 00:20:47.043095: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Target groups reused by the multi-task entries below.
_her124 = ['HER1', 'HER2', 'HER4']
_all_mt = ['T790M_L858R', 'L858R', 'delE746_A750', 'T790M']

# Maps every selectable model name to the ordered list of targets it covers.
# The position of a target in its list is used later as the index into the
# model output, so the order of each list matters.
target_dict = {
    'MTL_HER124': list(_her124),
    'MTL_ALL_WT_MT': _her124 + _all_mt,
    'MTL_HER1_ALL_MT': ['HER1'] + _all_mt,
    'MTL_ALL_MT': list(_all_mt),
}
# Each single-target model covers exactly the target it is named after.
target_dict.update({name: [name] for name in _her124 + _all_mt})

# Main Section

In [3]:
datapath = ".temp" # path to save generated fingerprint 
smiles_path = "resources/LigEGFR/data_prep/0_rawdata/drug/FDA_drug.csv"

# The fingerprint files are written to and read back from `datapath`; create
# the directory up front so the notebook also runs when `.temp` is absent
# (e.g. when the clone cell was skipped).
os.makedirs(datapath, exist_ok=True)

# Read the SMILES column and pass it through the project's cleaning helper.
smiles_raw = pd.read_csv(smiles_path)["Isomeric_SMILES"] # series of SMILES
smiles = clean_smiles(smiles_raw)
# Show the cleaned SMILES, one per line.
for smi in smiles.tolist():
    print(smi)

Number of defect: 0
C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1
C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C
COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1
CN(C)C/C=C/C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1O[C@H]1CCOC1
COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)/C=C/CN1CCCCC1
CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5cccc(F)c5)c(Cl)c4)c3c2)o1
CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2cc1NC(=O)/C=C/CN(C)C
COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1
C=CC(=O)Nc1cc(Nc2ncc(C(=O)OC(C)C)c(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C
COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(Nc2ccccc2P(C)(C)=O)n1
Cc1cc(Nc2ncnc3ccc(NC4=NC(C)(C)CO4)cc23)ccc1Oc1ccn2ncnc2c1


In [4]:
# Generate the fingerprint files for the cleaned SMILES under `datapath`
# (they are read back in the next cell), then build the array inputs from the
# same molecules.
print("Generating fingerprint...")
Fingerprint(smiles, datapath)
npdata = generate_npdata(smiles, datapath)
adj, feature, graph, edge = npdata

print("Done")

Generating fingerprint...


0it [00:00, ?it/s]
100%|██████████| 11/11 [00:01<00:00,  6.89it/s]
100%|██████████| 11/11 [00:00<00:00, 18.53it/s]
100%|██████████| 11/11 [00:00<00:00, 24.68it/s]
100%|██████████| 11/11 [00:00<00:00, 14.97it/s]
100%|██████████| 11/11 [00:00<00:00, 46.83it/s]
100%|██████████| 11/11 [00:00<00:00, 24.93it/s]
100%|██████████| 11/11 [00:00<00:00, 50.35it/s]
100%|██████████| 11/11 [00:00<00:00, 13.25it/s]
100%|██████████| 11/11 [00:00<00:00, 52.92it/s]
100%|██████████| 11/11 [00:00<00:00, 433.97it/s]
100%|██████████| 11/11 [00:00<00:00, 354.53it/s]
100%|██████████| 11/11 [00:00<00:00, 462.34it/s]
100%|██████████| 11/11 [00:00<00:00, 686.39it/s]
100%|██████████| 11/11 [00:00<00:00, 995.97it/s]
100%|██████████| 11/11 [00:00<00:00, 402.92it/s]
100%|██████████| 11/11 [00:00<00:00, 27.70it/s]
0it [00:00, ?it/s]
100%|██████████| 11/11 [00:00<00:00, 422.38it/s]
100%|██████████| 11/11 [00:00<00:00, 424.35it/s]
100%|██████████| 11/11 [00:00<00:00, 426.05it/s]
100%|██████████| 11/11 [00:00<00:00, 933.

Done


In [5]:
# Read back the two fingerprint tables written by the previous cell.
fpc = pd.read_csv(f'{datapath}/fingerprint-nonhash.csv').to_numpy()
fpf = pd.read_csv(f'{datapath}/fingerprint-hash.csv').to_numpy()

# Width of each fingerprint table; passed to the model constructor later.
fingfeaf = fpf.shape[-1]
fingfeac = fpc.shape[-1]

# One float tensor per row; the non-hash rows get an extra trailing dimension.
fpfs = [torch.FloatTensor(hash_row) for hash_row in fpf]
fpcs = [torch.FloatTensor(nonhash_row).unsqueeze(1) for nonhash_row in fpc]

# No labels are supplied: one None placeholder per entry of `edge`.
Label = [None] * len(edge)

In [6]:
# Bundle the adjacency data, SMILES, placeholder labels and both fingerprint
# tensor lists into the dataset consumed by the DataLoader; `smiles_list` is
# zipped with the batches when collecting predictions.
dataset, smiles_list = load_dataset(adj, smiles, Label, fpfs, fpcs)

In [7]:
# batch_size=1 with shuffle=False yields one molecule per batch in dataset
# order; the prediction loop relies on this when it zips the loader with
# `smiles_list` and reads row 0 of each model output.
test_loader = DataLoader(dataset,batch_size=1,shuffle=False)

In [11]:
# Print every key of `target_dict`, i.e. the values accepted for `target` below.
for target_name in target_dict:
    print(target_name)

MTL_HER124
MTL_ALL_WT_MT
MTL_HER1_ALL_MT
MTL_ALL_MT
HER1
HER2
HER4
T790M_L858R
L858R
delE746_A750
T790M


In [54]:
# enter target here
target = "T790M_L858R"
print_architecture = False

# Hyperparameters for the chosen target are read from its sample config file.
configs = Configs.parse(f"configs/sample/{target}-conf.json")
hpconfig = configs['hyperparam']
num_atom_features = hpconfig["num_atom_features"]
edge_dim = hpconfig["edge_dim"]
fingerprint_dim = hpconfig["fingerprint_dim"]
num_layers = hpconfig["num_layers"]
num_timesteps = hpconfig["num_timesteps"]
dropout = 0 # not used in eval mode regardless

# The fingerprint widths come from the fingerprint tables loaded earlier.
model = GraphEGFR(num_atom_features,edge_dim, fingerprint_dim,
                  num_layers, num_timesteps, dropout, fingfeaf, 
                  fingfeac, configs)
# map_location="cpu": inference runs on CPU below, so load the weights onto
# the CPU even if the checkpoint was saved from a GPU; without it torch.load
# fails on a machine that has no CUDA device.
state_dict = torch.load(f"./state_dict/{target}.pt", map_location="cpu")
model.load_state_dict(state_dict)
model.cpu().eval()
if print_architecture: 
    print("== Model Architecture ==")
    print("Target:", target)
    print("Model:\n",model)
else:
    print()




In [55]:
# Run the model on every molecule and collect one row per SMILES.
# `task_names` is the ordered list of targets of the loaded config; the
# position of a name in it is the column index into the model output.
task_names = target_dict[configs["target"]]
is_single_task = "MTL" not in target

records = {"smiles":[]}
if is_single_task:
    # Single-target model: keep only the column of the requested target.
    index_ans = task_names.index(target)
    actual_target = task_names[index_ans]
    records[actual_target] = []
else:
    # Multi-task model: one output column per target.
    for t in task_names:
        records[t] = []

# Inference only: no_grad avoids building an autograd graph for every forward
# pass; the predicted values themselves are unchanged.
with torch.no_grad():
    for data, smi in zip(test_loader, smiles_list):
        output = model(data)
        if is_single_task:
            # Each batch holds a single molecule, so row 0 is its prediction.
            records[actual_target].append(output[0,index_ans].item())
        else:
            for i, v in enumerate(output[0,:].tolist()):
                records[task_names[i]].append(v)
        records["smiles"].append(smi)
df_records = pd.DataFrame(records)
df_records

Unnamed: 0,smiles,T790M_L858R
0,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1,5.054158
1,C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(O...,8.412347
2,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1,6.121329
3,CN(C)C/C=C/C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2...,7.915504
4,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)/C=C/C...,7.489783
5,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,7.514225
6,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,7.232996
7,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,6.271029
8,C=CC(=O)Nc1cc(Nc2ncc(C(=O)OC(C)C)c(-c3cn(C)c4c...,8.828916
9,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,7.301531
