<a href="https://colab.research.google.com/github/manbaritone/GraphEGFR/blob/main/run-colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **GraphEGFR on Google Colab**

<img src="https://github.com/manbaritone/GraphEGFR/blob/main/graphegfr_architect.png?raw=true">

**GraphEGFR** is a model specifically designed to enhance molecular representation for the prediction of inhibitor bioactivity (pIC50) against wild-type HER1, HER2, HER4, and mutant HER1 proteins. GraphEGFR incorporates deep learning techniques such as multi-task learning and transfer learning; its architecture is inspired by a graph attention mechanism for molecular graphs, and by deep neural networks and convolutional neural networks for molecular fingerprints.

**For more information, please visit:**

**GitHub:** https://github.com/manbaritone/GraphEGFR

**Publication:** [Boonyarit B., Yamprasert N., Kaewnuratchadasorn P., Kinchakawat J., Prommin C., Rungrotmongkol T., Nutanong S. GraphEGFR: Multi‐task and transfer learning based on molecular graph attention mechanism and fingerprints improving inhibitor bioactivity prediction for EGFR family proteins on data scarcity.
*Journal of Computational Chemistry*, 2024](https://onlinelibrary.wiley.com/doi/full/10.1002/jcc.27388)

In [None]:
#@title Clone the entire GraphEGFR repository

!git clone -l -s https://github.com/manbaritone/GraphEGFR .proj.temp/
%rm .proj.temp/run.ipynb
%mv .proj.temp/* ./
%rm -r .proj.temp/

In [None]:
#@title Install CondaColab

# Install condacolab and use it to set up conda inside the Colab runtime;
# the dependency cell later installs openbabel through conda.
!pip install -q condacolab
import condacolab
# NOTE(review): sys is not used by any cell visible in this notebook.
import sys
condacolab.install()
# NOTE(review): condacolab.install() is documented to restart the kernel when
# it finishes — confirm whether the two lines below still execute, and whether
# the %env setting survives that restart.
# Print the PYTHONPATH currently exported to shell commands.
!echo $PYTHONPATH
# Override PYTHONPATH for this kernel and the shell commands it spawns.
# NOTE(review): the value is the path of an interpreter rather than a package
# directory — presumably a workaround for Colab's default PYTHONPATH; verify.
%env PYTHONPATH=/usr/local/bin/python3.10

In [None]:
#@title CondaColab Check

# Import condacolab again (this cell is meant to run after the install cell)
# and ask it to verify the conda setup.
# NOTE(review): check() presumably reports or raises when conda is not usable —
# confirm against the condacolab documentation.
import condacolab
condacolab.check()

In [None]:
#@title Install dependencies (~4 mins)

!pip install scikit-learn==1.2.2
!pip install imblearn
!conda install conda-forge::openbabel
!python -m pip install rdkit
!python -m pip install deepchem==2.5.0
!python -m pip install JPype1
!pip install torch==2.0.0
!pip install torch_geometric
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
!python -m pip install torchmetrics
!pip install dgl==1.1.3 -f https://data.dgl.ai/wheels/cu118/repo.html
!pip install dgllife


In [None]:
#@title Download required project files from Zenodo (~2.9 GB with ~4 mins)

!wget https://zenodo.org/records/11122146/files/GraphEGFR.tar.gz
!tar -xvf /content/GraphEGFR.tar.gz --warning=no-unknown-keyword -C .
!rm GraphEGFR/*.ipynb
!mv GraphEGFR/* ./
!rm -r GraphEGFR

In [None]:
#@title Import packages and Define targets

from google.colab import files
import pandas as pd
import os
import io
import torch
from torch_geometric.loader import DataLoader

from graphegfr.models import GraphEGFR
from graphegfr.configs import Configs
from graphegfr.fingerprint import Fingerprint
from graphegfr.featurizer import generate_npdata, clean_smiles
from graphegfr.dataset import load_dataset

# Maps every selectable target name to the ordered list of task names it covers.
# Multi-task (MTL_*) entries are spelled out; each single-task entry maps a
# name to a one-element list holding that same name.
target_dict = {
    "MTL_HER124": ["HER1", "HER2", "HER4"],
    "MTL_ALL_WT_MT": [
        "HER1", "HER2", "HER4",
        "T790M_L858R", "L858R", "delE746_A750", "T790M",
    ],
    "MTL_HER1_ALL_MT": ["HER1", "T790M_L858R", "L858R", "delE746_A750", "T790M"],
    "MTL_ALL_MT": ["T790M_L858R", "L858R", "delE746_A750", "T790M"],
    **{
        single: [single]
        for single in (
            "HER1", "HER2", "HER4",
            "T790M_L858R", "L858R", "delE746_A750", "T790M",
        )
    },
}

In [None]:
#@title Input target and Upload a SMI file for desired compounds in SMILES format

target = "MTL_HER1_ALL_MT" # @param ["HER1", "HER2", "HER4", "T790M_L858R", "L858R", "delE746_A750", "T790M", "MTL_HER124", "MTL_ALL_WT_MT", "MTL_HER1_ALL_MT", "MTL_ALL_MT"]
print_architecture = False # @param {type:"boolean"}
datapath = ".temp" # path to save generated fingerprint
smiles_file = "FDA_drugs" # @param ["FDA_drugs", "custom"]
# @markdown * Please select a **target** by choosing either a single protein target or multi-task learning (MTL) for simultaneous prediction:
# @markdown        1. MTL_HER124: ['HER1','HER2','HER4']
# @markdown        2. MTL_ALL_WT_MT: ['HER1','HER2','HER4','T790M_L858R','L858R','delE746_A750','T790M']
# @markdown        3. MTL_HER1_ALL_MT: ['HER1','T790M_L858R','L858R','delE746_A750','T790M']
# @markdown        4. MTL_ALL_MT: ['T790M_L858R','L858R','delE746_A750','T790M']
# @markdown      **Note:** The mutation targets are based on the HER1 protein.
# @markdown
# @markdown   – You can upload an SMI file (Recommend: Isomeric SMILES) by selecting "**custom**" for the **smiles_file** option; otherwise, example "**FDA_drugs**" will be used.

# Placeholder; only the bundled-example branch assigns a real path, so this
# stays an empty list when a custom file is uploaded.
smiles_path = []

if smiles_file == 'FDA_drugs':
    # Bundled example: FDA-approved drugs, read from the "Isomeric_SMILES" column.
    smiles_path = "examples/FDA_drug.csv"
    smiles_raw = pd.read_csv(smiles_path)["Isomeric_SMILES"] # series of SMILES
else:
    upload_smiles = files.upload() # upload isomeric SMILES
    # The upload dialog yields an empty mapping when it is cancelled; fail with
    # a clear message instead of a bare StopIteration from next(iter(...)).
    if not upload_smiles:
        raise ValueError(
            "No file was uploaded. Re-run this cell and choose a SMILES file, "
            'or set smiles_file to "FDA_drugs".'
        )
    # Only the first uploaded file is used.
    file_name = next(iter(upload_smiles))
    data = io.BytesIO(upload_smiles[file_name])
    # The file is parsed as header-less CSV; the first column holds the SMILES.
    smiles_raw = pd.read_csv(data, header=None)[0] # series of SMILES
    print(smiles_raw)

# Clean the raw strings with the project helper and list what will be predicted.
smiles = clean_smiles(smiles_raw)
for smi in smiles.tolist():
    print(smi)

In [None]:
#@title Generate features from input SMILES string(s)

print("Generating fingerprint...")
# Fingerprint(...) is called only for its effect; its return value is discarded.
# NOTE(review): presumably it writes the fingerprint CSV files read below into
# `datapath` — confirm in graphegfr.fingerprint.
Fingerprint(smiles, datapath)
# Graph-side inputs; shapes can be inspected via adj.shape, feature.shape,
# graph.shape and len(edge) when debugging.
adj, feature, graph, edge = generate_npdata(smiles, datapath)

print("Done")

# Fingerprint tables as NumPy arrays: "nonhash" -> fpc, "hash" -> fpf.
fpc = pd.read_csv(f'{datapath}/fingerprint-nonhash.csv').to_numpy()
fpf = pd.read_csv(f'{datapath}/fingerprint-hash.csv').to_numpy()

# Size of the last axis of each table, passed to the model constructor later.
fingfeaf = fpf.shape[-1]
fingfeac = fpc.shape[-1]

# One float tensor per row; rows of fpc additionally get a trailing axis.
fpfs = [torch.FloatTensor(hashed_row) for hashed_row in fpf]
fpcs = [torch.FloatTensor(plain_row).unsqueeze(1) for plain_row in fpc]

# Prediction only: no known labels, so use one None placeholder per entry of `edge`.
Label = [None] * len(edge)

In [None]:
#@title Load data and config

# Wrap the generated inputs in a dataset and iterate it one item at a time,
# in input order, so predictions line up with `smiles_list`.
dataset, smiles_list = load_dataset(adj, smiles, Label, fpfs, fpcs)
test_loader = DataLoader(dataset, batch_size=1, shuffle=False)

# Per-target configuration file; its "hyperparam" section holds the model sizes.
configs = Configs.parse(f"configs/sample/{target}-conf.json")
hpconfig = configs['hyperparam']
(
    num_atom_features,
    edge_dim,
    fingerprint_dim,
    num_layers,
    num_timesteps,
) = (
    hpconfig[key]
    for key in (
        "num_atom_features",
        "edge_dim",
        "fingerprint_dim",
        "num_layers",
        "num_timesteps",
    )
)
# Dropout is fixed at 0 here; it is not used in eval mode regardless.
dropout = 0

In [None]:
#@title Run GraphEGFR

# Build the network from the configured hyperparameters and fingerprint widths.
model = GraphEGFR(num_atom_features, edge_dim, fingerprint_dim,
                  num_layers, num_timesteps, dropout, fingfeaf,
                  fingfeac, configs)
# map_location="cpu": inference below runs on the CPU, so map the stored
# tensors to the CPU while loading. Without it, a checkpoint saved from a GPU
# fails to deserialize on a runtime that has no CUDA device.
state_dict = torch.load(f"./state_dict/{target}.pt", map_location="cpu")
model.load_state_dict(state_dict)
# CPU inference in evaluation mode.
model.cpu().eval()
if print_architecture:
    print("== Model Architecture ==")
    print("Target:", target)
    print("Model:\n",model)
else:
    print()

In [None]:
#@title Print predicted value(s) in pIC50
# @markdown **Note:** pIC50 = -log10(IC50)

# Task names, in output-column order, for the model family named in the config.
task_names = target_dict[configs["target"]]
is_multitask = "MTL" in target

# One "smiles" column plus one column per reported task.
records = {"smiles": []}
if is_multitask:
    for t in task_names:
        records[t] = []
else:
    # Single-task selection: report only the output column of the chosen target.
    index_ans = task_names.index(target)
    actual_target = task_names[index_ans]
    records[actual_target] = []

# Inference only: skip autograd bookkeeping. The predicted values are unchanged.
with torch.no_grad():
    for data, smi in zip(test_loader, smiles_list):
        output = model(data)  # batch_size is 1, so row 0 is the only row
        if is_multitask:
            for i, v in enumerate(output[0, :].tolist()):
                records[task_names[i]].append(v)
        else:
            records[actual_target].append(output[0, index_ans].item())
        records["smiles"].append(smi)

df_records = pd.DataFrame(records)
df_records