# Imports and Setup

In [1]:
import pandas as pd
import os
import torch
from torch_geometric.loader import DataLoader

from graphegfr.models import GraphEGFR
from graphegfr.configs import Configs
from graphegfr.fingerprint import Fingerprint
from graphegfr.featurizer import generate_npdata, clean_smiles
from graphegfr.dataset import load_dataset

2024-05-03 10:43:56.472004: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-03 10:43:56.477008: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-05-03 10:43:56.477025: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Maps a model/configuration name to the ordered list of targets it covers.
# The position of a target inside its list is significant: it is looked up with
# `.index(target)` later in the notebook to pick the matching model output column.
target_dict = {
    # Multi-target groupings
    "MTL_HER124": ["HER1", "HER2", "HER4"],
    "MTL_ALL_WT_MT": [
        "HER1", "HER2", "HER4",
        "T790M_L858R", "L858R", "delE746_A750", "T790M",
    ],
    "MTL_HER1_ALL_MT": [
        "HER1",
        "T790M_L858R", "L858R", "delE746_A750", "T790M",
    ],
    "MTL_ALL_MT": ["T790M_L858R", "L858R", "delE746_A750", "T790M"],
    # Single-target entries: each name maps to a one-element list of itself
    "HER1": ["HER1"],
    "HER2": ["HER2"],
    "HER4": ["HER4"],
    "T790M_L858R": ["T790M_L858R"],
    "L858R": ["L858R"],
    "delE746_A750": ["delE746_A750"],
    "T790M": ["T790M"],
}

In [3]:
# Index the checkpoint files found directly inside "models": every regular file
# whose name ends in ".pt" is stored under the part of its file name that
# precedes the first underscore. A later file with the same key replaces an
# earlier one.
# NOTE(review): a file named like "T790M_L858R_*.pt" would be keyed as "T790M"
# and could clash with another checkpoint — confirm the file naming scheme.
# NOTE(review): `modelpaths` is not referenced by the cells visible below, which
# load weights from "./state_dict/" instead — confirm whether it is still needed.
modelpaths = {
    fname.split("_")[0]: os.path.join("models", fname)
    for fname in os.listdir("models")
    if os.path.isfile(os.path.join("models", fname)) and fname.endswith(".pt")
}

# Main Section

In [4]:
# Directory used for the generated fingerprint files, and the CSV of input drugs.
datapath = ".temp"
smiles_path = "resources/LigEGFR/data_prep/0_rawdata/drug/FDA_drug.csv"

# Take the "Isomeric_SMILES" column (a pandas Series) and pass it through
# clean_smiles before any featurisation.
drug_table = pd.read_csv(smiles_path)
smiles_raw = drug_table["Isomeric_SMILES"]
smiles = clean_smiles(smiles_raw)

# Echo the cleaned SMILES strings, one per line.
for smi in smiles.tolist():
    print(smi)

Number of defect: 0
C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1
C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C
COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1
CN(C)C/C=C/C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1O[C@H]1CCOC1
COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)/C=C/CN1CCCCC1
CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5cccc(F)c5)c(Cl)c4)c3c2)o1
CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2cc1NC(=O)/C=C/CN(C)C
COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1
C=CC(=O)Nc1cc(Nc2ncc(C(=O)OC(C)C)c(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C
COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(Nc2ccccc2P(C)(C)=O)n1
Cc1cc(Nc2ncnc3ccc(NC4=NC(C)(C)CO4)cc23)ccc1Oc1ccn2ncnc2c1


In [5]:
print("Generating fingerprint...")

# Called without keeping the return value.
# NOTE(review): presumably this writes the fingerprint CSV files under
# `datapath` that the next cell reads — confirm against graphegfr.fingerprint.
Fingerprint(smiles, datapath)

# Four arrays/collections describing the molecules; `adj` and `edge` are used
# by the following cells.
adj, feature, graph, edge = generate_npdata(smiles, datapath)

print("Done")
# To inspect the results, print adj.shape, feature.shape, graph.shape, len(edge).

Generating fingerprint...


0it [00:00, ?it/s]
100%|██████████| 11/11 [00:01<00:00,  6.94it/s]
100%|██████████| 11/11 [00:00<00:00, 17.04it/s]
100%|██████████| 11/11 [00:00<00:00, 20.58it/s]
100%|██████████| 11/11 [00:00<00:00, 18.21it/s]
100%|██████████| 11/11 [00:00<00:00, 26.24it/s]
100%|██████████| 11/11 [00:00<00:00, 18.68it/s]
100%|██████████| 11/11 [00:00<00:00, 31.72it/s]
100%|██████████| 11/11 [00:00<00:00, 11.56it/s]
100%|██████████| 11/11 [00:00<00:00, 33.16it/s]
100%|██████████| 11/11 [00:00<00:00, 427.38it/s]
100%|██████████| 11/11 [00:00<00:00, 344.06it/s]
100%|██████████| 11/11 [00:00<00:00, 459.35it/s]
100%|██████████| 11/11 [00:00<00:00, 138.55it/s]
100%|██████████| 11/11 [00:00<00:00, 325.51it/s]
100%|██████████| 11/11 [00:00<00:00, 202.83it/s]
100%|██████████| 11/11 [00:00<00:00, 17.98it/s]
0it [00:00, ?it/s]
100%|██████████| 11/11 [00:00<00:00, 296.60it/s]
100%|██████████| 11/11 [00:00<00:00, 402.28it/s]
100%|██████████| 11/11 [00:00<00:00, 410.53it/s]
100%|██████████| 11/11 [00:00<00:00, 896.

Done


In [6]:
# Load the two fingerprint tables (generated by the cell above) as NumPy arrays.
fpc = pd.read_csv(f'{datapath}/fingerprint-nonhash.csv').to_numpy()
fpf = pd.read_csv(f'{datapath}/fingerprint-hash.csv').to_numpy()

# Size of the last axis of each table; both are passed to the model constructor.
fingfeaf = fpf.shape[-1]
fingfeac = fpc.shape[-1]

# One float tensor per table row. Rows of the non-hash table additionally get
# a new axis inserted at position 1.
fpfs = [torch.FloatTensor(row) for row in fpf]
fpcs = [torch.FloatTensor(row).unsqueeze(1) for row in fpc]

# Placeholder labels: one None per entry of `edge`.
Label = [None] * len(edge)

In [7]:
# Build the dataset from the adjacency data, the SMILES, the placeholder labels
# and the two fingerprint tensor lists. `smiles_list` is iterated alongside the
# data loader when predictions are collected.
dataset, smiles_list = load_dataset(
    adj,
    smiles,
    Label,
    fpfs,
    fpcs,
)

In [8]:
# One sample per batch, in dataset order (no shuffling), so that batches can be
# paired positionally with `smiles_list`.
test_loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
)

In [9]:
# Target to predict. Candidate names (from the original, disabled loop):
# 'HER1','HER2','HER4','T790M_L858R','L858R','delE746_A750','T790M'
target = "HER1"

# Per-target configuration; the hyper-parameters below come from its
# 'hyperparam' section.
configs = Configs.parse(f"configs/sample/{target}-conf.json")
hpconfig = configs['hyperparam']
num_atom_features = hpconfig["num_atom_features"]
edge_dim = hpconfig["edge_dim"]
fingerprint_dim = hpconfig["fingerprint_dim"]
num_layers = hpconfig["num_layers"]
num_timesteps = hpconfig["num_timesteps"]
dropout = 0 # not used in eval mode regardless

# fingfeaf / fingfeac are the fingerprint widths measured from the loaded
# fingerprint tables.
model = GraphEGFR(num_atom_features, edge_dim, fingerprint_dim,
                  num_layers, num_timesteps, dropout, fingfeaf,
                  fingfeac, configs)

# map_location="cpu" remaps every stored tensor onto the CPU while loading.
# Without it, a checkpoint that was saved from a CUDA device cannot be
# deserialized on a machine without CUDA, even though the model is run on the
# CPU below.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
state_dict = torch.load(f"./state_dict/{target}.pt", map_location="cpu")
model.load_state_dict(state_dict)
model.cpu().eval()

# Position of `target` in the target list of the loaded configuration; used to
# select the matching column of the model output.
index_ans = target_dict[configs["target"]].index(target)

In [10]:
# Print a banner, then leave `model` as the cell's last expression so the
# notebook renders the module's repr (its layer structure) as the cell output.
print("== Model Architecture ==")
model

== Model Architecture ==


GraphEGFR(
  (mol_model): AttentiveFP_DGL(
    (gnn): AttentiveFPGNN(
      (init_context): GetContext(
        (project_node): Sequential(
          (0): Linear(in_features=75, out_features=275, bias=True)
          (1): LeakyReLU(negative_slope=0.01)
        )
        (project_edge1): Sequential(
          (0): Linear(in_features=86, out_features=275, bias=True)
          (1): LeakyReLU(negative_slope=0.01)
        )
        (project_edge2): Sequential(
          (0): Dropout(p=0, inplace=False)
          (1): Linear(in_features=550, out_features=1, bias=True)
          (2): LeakyReLU(negative_slope=0.01)
        )
        (attentive_gru): AttentiveGRU1(
          (edge_transform): Sequential(
            (0): Dropout(p=0, inplace=False)
            (1): Linear(in_features=275, out_features=275, bias=True)
          )
          (gru): GRUCell(275, 275)
        )
      )
      (gnn_layers): ModuleList()
    )
    (readout): AttentiveFPReadout(
      (readouts): ModuleList(
        (0)

In [12]:
# Run the model over every molecule and tabulate the prediction for the
# selected target.
print("Target index (in output):",index_ans)
records = {"smiles":[], "predicted_values":[]}

# Inference only: disable autograd so no computation graph is built for each
# forward pass. The predicted values are unaffected.
with torch.no_grad():
    # The loader yields one molecule per batch, so row 0 is the only row of the
    # model output; `index_ans` selects the column of the chosen target.
    # zip() stops at the shorter of the two iterables.
    for data, smi in zip(test_loader, smiles_list):
        value = model(data)[0,index_ans].item()
        records["predicted_values"].append(value)
        records["smiles"].append(smi)

df_records = pd.DataFrame(records)
df_records

Target index (in output): 0


Unnamed: 0,smiles,predicted_values
0,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1,6.483565
1,C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(O...,7.429565
2,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1,8.14937
3,CN(C)C/C=C/C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2...,8.557239
4,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)/C=C/C...,8.500108
5,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,8.139348
6,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,8.696309
7,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,6.957242
8,C=CC(=O)Nc1cc(Nc2ncc(C(=O)OC(C)C)c(-c3cn(C)c4c...,6.816389
9,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,4.693923
