In [None]:
!mamba install -c conda-forge rdkit

%pip install -U numpy

!pip install --pre deepchem
import deepchem

!wget https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/Deep%20Dives/AI%20%2B%20Healthcare/Sessions%201-5/Drug%20Discovery%20with%20GNNs/delaney-processed.csv
!wget https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/Deep%20Dives/AI%20%2B%20Healthcare/Sessions%201-5/Drug%20Discovery%20with%20GNNs/tox21.csv.gz

import gzip
import shutil
with gzip.open('tox21.csv.gz', 'rb') as f_in:
    with open('tox21.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


from deepchem.utils.save import load_from_disk

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics

from deepchem.models import GraphConvModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the solubility table from the CSV in the working directory.
solubility_data = pd.read_csv("delaney-processed.csv")

# Print (rows, columns); the preview of the first rows is the cell output.
print(solubility_data.shape)
solubility_data.head()

In [None]:
# Descriptor columns used as model inputs.
input_cols = [
    'Minimum Degree',
    'Molecular Weight',
    'Number of H-Bond Donors',
    'Number of Rings',
    'Number of Rotatable Bonds',
    'Polar Surface Area',
]
# Target column; kept as a one-element list so `y` is a DataFrame, not a Series.
output_cols = ['measured log solubility in mols per litre']

X = solubility_data.loc[:, input_cols]
y = solubility_data.loc[:, output_cols]

In [None]:
# 80/20 split with shuffling disabled, so rows keep their original order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
import sklearn
from sklearn import linear_model

# Linear-regression baseline on the descriptor columns.
lm = linear_model.LinearRegression()

# The fit call is the last expression, so the fitted estimator is the cell output.
lm.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and report R^2 for the linear baseline.
y_pred = lm.predict(X_test)
print(metrics.r2_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Scatter of true vs. predicted log solubility for the linear baseline.
plt.plot(y_test, y_pred, '.')
plt.legend(['LinReg Predictions'])
plt.xlabel("True Log Solubility")
plt.ylabel("Predicted Log Solubility")
# Render explicitly, like the other plotting cells in this notebook, so the
# cell output is the figure rather than the repr of the final `plt.ylabel` call.
plt.show()

In [None]:
# ESOL predictions stored in the table, restricted to the rows of the test split
# (selected by y_test's index labels).
esol_pred = solubility_data.loc[
    y_test.index,
    'ESOL predicted log solubility in mols per litre',
]

In [None]:
# R^2 on the test rows: linear baseline first, then the ESOL column.
for preds in (y_pred, esol_pred):
    print(metrics.r2_score(y_test, preds))

# Overlay both sets of predictions against the true values (same order as the legend).
for preds in (y_pred, esol_pred):
    plt.plot(y_test, preds, '.')
plt.legend(['LinReg Predictions', 'ESOL Predictions'])
plt.xlabel("True Log Solubility")
plt.ylabel("Predicted Log Solubility")
plt.show()

In [None]:
# Build a dataset from the CSV: the "smiles" column is featurised with
# ConvMolFeaturizer and measured log solubility is the single task.
conv_featurizer = deepchem.feat.ConvMolFeaturizer()
loader = deepchem.data.CSVLoader(
    tasks=["measured log solubility in mols per litre"],
    feature_field="smiles",
    featurizer=conv_featurizer,
)
solubility_smiles_data = loader.create_dataset("delaney-processed.csv")

# Index-based split with 80% of the entries assigned to training.
splitter = deepchem.splits.IndexSplitter()
sol_train_dataset, sol_test_dataset = splitter.train_test_split(
    solubility_smiles_data,
    frac_train=0.8,
)

In [None]:
import warnings

# Install a blanket "ignore" filter so warnings are suppressed from here on.
warnings.filterwarnings(action='ignore')

# Single-task graph-convolution regressor with dropout 0.2, trained for 600 epochs.
# The fit call is the last expression, so its return value is the cell output.
sol_gnn = GraphConvModel(
    n_tasks=1,
    mode='regression',
    dropout=0.2,
)
sol_gnn.fit(sol_train_dataset, nb_epoch=600)

In [None]:
# GNN predictions for the held-out split.
gnn_pred = sol_gnn.predict(sol_test_dataset)

# R^2 for each model, in the order: linear baseline, ESOL column, GNN.
for preds in (y_pred, esol_pred, gnn_pred):
    print(metrics.r2_score(y_test, preds))

# Overlay all three prediction sets against the true values (same order as the legend).
for preds in (y_pred, esol_pred, gnn_pred):
    plt.plot(y_test, preds, '.')
plt.legend(['LinReg Predictions', 'ESOL Predictions', 'GNN Predictions'])
plt.xlabel("True Log Solubility")
plt.ylabel("Predicted Log Solubility")
plt.show()

In [None]:
# Flatten the GNN predictions into a Series that reuses y_test's index, so the
# selected entries can be looked up in the original table.
gnn_pred_series = pd.Series(gnn_pred.squeeze(), index=y_test.index)

# Entries whose predicted log solubility is above zero, and their share of the test set.
soluble = gnn_pred_series.loc[gnn_pred_series > 0]
print(len(soluble) / len(gnn_pred_series))

# Cell output: the full table rows for those entries.
solubility_data.loc[soluble.index]

In [None]:
# Print, in turn: the whole dataset object, its X array (the molecules),
# and the first molecule in that array.
for view in (sol_train_dataset, sol_train_dataset.X, sol_train_dataset.X[0]):
    print(view)

In [None]:
# Keep the first training entry around and print its atom count.
my_molecule = sol_train_dataset.X[0]
print(my_molecule.n_atoms)

In [None]:
# Print the degree bounds, degree list and canonical adjacency list
# of the first training entry, one per line.
first_entry = sol_train_dataset.X[0]
for attr_name in ('max_deg', 'min_deg', 'deg_list', 'canon_adj_list'):
    print(getattr(first_entry, attr_name))

In [None]:
# Per-atom feature array of the selected molecule, followed by its shape.
atom_features = my_molecule.get_atom_features()
print(atom_features, atom_features.shape, sep='\n')

In [None]:
# Read the decompressed Tox21 table; the cell output previews its first five rows.
toxicity_data = pd.read_csv(filepath_or_buffer="tox21.csv")
toxicity_data.head(n=5)

In [None]:
# Load Tox21 through MoleculeNet with graph-convolution features. The loader
# returns three items; the first one is discarded here.
tox21_bundle = deepchem.molnet.load_tox21(featurizer='GraphConv')
_, datasets, transformers = tox21_bundle

# The datasets item unpacks into train / validation / test parts.
(tox_train_dataset,
 tox_valid_dataset,
 tox_test_dataset) = datasets

In [None]:
import warnings

# Install a blanket "ignore" filter so warnings are suppressed from here on.
warnings.filterwarnings(action='ignore')

# 12-task graph-convolution classifier with dropout 0.2, trained for 20 epochs.
# The fit call is the last expression, so its return value is the cell output.
tox_gnn = GraphConvModel(
    n_tasks=12,
    mode='classification',
    dropout=0.2,
)
tox_gnn.fit(tox_train_dataset, nb_epoch=20)

In [None]:
# Raw model output for the test split, then the slice at index 1 of the third axis
# (presumably the probability of the positive/"toxic" class - confirm).
tox_raw_output = tox_gnn.predict(tox_test_dataset)
tox_pred = tox_raw_output[:, :, 1]
tox_pred

In [None]:
# Boolean array: True wherever the predicted value exceeds the 0.5 threshold.
tox_pred_class = tox_pred > 0.5

In [None]:
# Not every (molecule, task) pair in Tox21 has a measurement. DeepChem marks
# missing labels with weight 0 (the value in `y` is only a placeholder), so
# score only the entries that carry a non-zero weight.
# NOTE(review): assumes `w` has the same shape as `y` - confirm.
labeled = tox_test_dataset.w != 0

# Share of positive labels among the measured entries; always predicting
# "negative" would be right for the remainder.
print ("Baseline Accuracy is 100% minus")
print(tox_test_dataset.y[labeled].mean())

# Element-wise agreement between thresholded predictions and labels,
# averaged over the measured entries only.
print ("Accuracy")
right_ans = tox_pred_class == tox_test_dataset.y
print(right_ans[labeled].mean())

In [None]:
# One flag per row: True when any entry along axis 1 is predicted positive.
any_toxic = tox_pred_class.any(axis=1)
print(any_toxic.shape)

In [None]:
# Recompute the per-row "any positive prediction" flag; the cell output is
# the fraction of rows with no positive prediction at all.
any_toxic = tox_pred_class.any(axis=1)
1 - any_toxic.mean()

In [None]:
# Product of two hard-coded fractions.
# NOTE(review): presumably values read off earlier cell outputs - confirm their origin.
0.31 * 0.16