### Evaluate the WeightedModel on all of the binary classification tasks in ADME

The models and model weights were tuned on the training data from cyp2c9_veith.

The molecules are embedded using ColorRefinement, an algorithm I wrote to detect identical, arbitrarily sized subgraphs.

In [None]:
# Environment setup: shell commands executed through IPython's `!` escape.
# pysmiles supplies `read_smiles`, pyTDC supplies the `tdc` benchmark API, and
# TDC-DeepLearning supplies `utils.ColorRefinement`, all imported in later cells.
# NOTE(review): rdkit-pypi is not referenced directly by the cells visible here;
# presumably it is a transitive requirement -- confirm before removing.
!git clone https://github.com/parkerburchett/pysmiles
!pip install pyTDC
!git clone https://github.com/parkerburchett/TDC-DeepLearning
!pip3 install rdkit-pypi

Cloning into 'pysmiles'...
remote: Enumerating objects: 420, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 420 (delta 12), reused 18 (delta 4), pack-reused 390[K
Receiving objects: 100% (420/420), 134.29 KiB | 1.66 MiB/s, done.
Resolving deltas: 100% (251/251), done.
Collecting pyTDC
  Downloading PyTDC-0.3.1.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.6 MB/s 
[?25hCollecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Building wheels for collected packages: pyTDC
  Building wheel for pyTDC (setup.py) ... [?25l[?25hdone
  Created wheel for pyTDC: filename=PyTDC-0.3.1-py3-none-any.whl size=116110 sha256=0ee772d9050df257fc3e357a13a1e001e427b62617de3e7ca82bb362b6f01666
  Stored in directory: /root/.cache/pip/wheels/a9/54/07/50251965a66a68eb6c0e2b3022588cc992cc4c2a2e69d8c7ec
Successfully built pyTDC
Installing collected packages: fuzzywuzzy, pyTDC
Successfully

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from pysmiles.pysmiles import read_smiles
import os
os.chdir('/content/TDC-DeepLearning/')
from utils import ColorRefinement as cr 

In [None]:
# Blend weights for the ensemble, one per entry of `tuned_models` (same order).
# WeightedModel.predict takes the dot product of the per-model predictions with
# this vector.
tuned_weights = np.array([0.23494501, 0.50427143, 0.16934516, 0.0914384])

# One LightGBM regressor per hop: WeightedModel trains the model at list
# position i on the hop-i feature table. The hyperparameters below are literal
# tuning results and must stay in step with `tuned_weights`.
tuned_models = [LGBMRegressor(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.45921506474872353, importance_type='split',
               learning_rate=0.003605978989205916, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=997, n_jobs=-1, num_leaves=171, objective=None,
               random_state=None, reg_alpha=0.06136193030050688, reg_lambda=0.0,
               silent=True, subsample=0.664374000848817,
               subsample_for_bin=200000, subsample_freq=1),
 LGBMRegressor(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.19190373976042552, importance_type='split',
               learning_rate=0.018880733945270692, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=772, n_jobs=-1, num_leaves=412, objective=None,
               random_state=None, reg_alpha=0.1319602189105627, reg_lambda=0.0,
               silent=True, subsample=0.953435263598222,
               subsample_for_bin=200000, subsample_freq=1),
 LGBMRegressor(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.0846115062976256, importance_type='split',
               learning_rate=0.061904626017968235, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=710, n_jobs=-1, num_leaves=218, objective=None,
               random_state=None, reg_alpha=0.09722107305351997, reg_lambda=0.0,
               silent=True, subsample=0.7625401046034898,
               subsample_for_bin=200000, subsample_freq=1),
 LGBMRegressor(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.10234165146414909, importance_type='split',
               learning_rate=0.021876318417714605, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=574, n_jobs=-1, num_leaves=193, objective=None,
               random_state=None, reg_alpha=0.08093256266597965, reg_lambda=0.0,
               silent=True, subsample=0.9986613307559011,
               subsample_for_bin=200000, subsample_freq=1)]

In [None]:
class WeightedModel:
  """Weighted ensemble with one model per ColorRefinement hop.

  Each SMILES string is parsed into a graph and embedded into one feature
  table per hop. The model at position ``i`` of ``models`` is trained on, and
  predicts from, the hop-``i`` table; the final prediction is the dot product
  of the per-model predictions with ``model_weights``.
  """

  def __init__(self,
               cr_num_hops:int,
               cr_num_colors:int,
               models:list,
               model_weights:list):
    """Store the embedding settings and the ensemble members.

    Args:
      cr_num_hops: forwarded as ``num_hops`` to ``cr.create_hop_feature_dfs``.
      cr_num_colors: forwarded as ``num_colors`` to ``cr.create_hop_feature_dfs``.
      models: estimators exposing ``fit(X, y)`` and ``predict(X)``; position
        ``i`` is paired with the hop-``i`` features.
      model_weights: one blend weight per model (list or 1-D array).

    Raises:
      ValueError: if ``models`` is empty or its length differs from
        ``model_weights``.
    """
    # Fail fast: a length mismatch would otherwise only surface as a shape
    # error inside ``predict``, after every model has already been trained.
    if len(models) == 0:
      raise ValueError('models must contain at least one estimator')
    if len(models) != len(model_weights):
      raise ValueError(
          'models and model_weights must have the same length, got '
          f'{len(models)} and {len(model_weights)}')

    self.cr_num_hops = cr_num_hops
    self.cr_num_colors = cr_num_colors
    self.models = models
    self.model_weights = model_weights


  def _create_embeddings(self, smiles):
    """Parse each SMILES string and build the per-hop feature tables.

    Returns whatever ``cr.create_hop_feature_dfs`` produces; the callers index
    it by hop number and read ``.values`` from each element.

    NOTE(review): ``fit`` and ``predict`` embed their inputs in separate
    calls. Whether the feature columns line up between the two calls depends
    on ``cr.create_hop_feature_dfs`` -- confirm against that helper.
    """
    graphs = [read_smiles(s) for s in smiles]
    hop_feature_dfs = cr.create_hop_feature_dfs(graphs=graphs,
                                                num_hops=self.cr_num_hops,
                                                num_colors=self.cr_num_colors)
    return hop_feature_dfs


  def fit(self, smiles, targets):
    """Train every model on its own hop's features.

    Args:
      smiles: iterable of SMILES strings.
      targets: labels aligned with ``smiles``; a pandas Series, NumPy array
        or plain list.

    Returns:
      ``self``, so calls can be chained.
    """
    hop_feature_dfs = self._create_embeddings(smiles)
    print('embedded for fitting')
    # Converted once, outside the loop; np.asarray also accepts lists and
    # arrays, which have no ``.values`` attribute.
    y = np.asarray(targets)
    for hop_num, model in enumerate(self.models):
      model.fit(hop_feature_dfs[hop_num].values, y)
    return self


  def predict(self, smiles):
    """Return the weighted blend of the per-hop model predictions.

    Args:
      smiles: iterable of SMILES strings.

    Returns:
      1-D NumPy array with one blended score per input molecule.
    """
    hop_feature_dfs = self._create_embeddings(smiles)
    print('embedded for prediction')
    per_model_predictions = [
        model.predict(hop_feature_dfs[hop_num].values)
        for hop_num, model in enumerate(self.models)
    ]
    # Shape (n_samples, n_models): one column per model, in model order.
    stacked = np.column_stack(per_model_predictions)
    weighted_predictions = stacked.dot(np.asarray(self.model_weights))
    return weighted_predictions


In [None]:
from sklearn.metrics import precision_recall_curve,auc
def compute_auprc(y_true,y_pred):
    """Area under the precision-recall curve for scores ``y_pred`` against labels ``y_true``."""
    # precision_recall_curve returns (precision, recall, thresholds);
    # only the first two are needed for the area.
    curve = precision_recall_curve(y_true, y_pred)
    return auc(curve[1], curve[0])

In [None]:
%%time
from tdc.benchmark_group import admet_group
# Select the cyp2c9_veith task from the ADMET benchmark group.
group = admet_group(path = 'data/')
benchmark = group.get('cyp2c9_veith')

# The task name and the train_val/test frames do not depend on the seed,
# so they are read once up front.
name = benchmark['name']
train_val, test = benchmark['train_val'], benchmark['test']

# One {task name: test predictions} dict per seed.
test_predictions_list = []

for seed in range(1, 6):
  # Only the train/valid partition changes from seed to seed.
  train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)

  final_model = WeightedModel(cr_num_hops=4,
                              cr_num_colors=2_000,
                              models=tuned_models,
                              model_weights=tuned_weights)
  final_model.fit(train['Drug'], train['Y'])
  print('fit model')

  y_preds = final_model.predict(test['Drug'].values)
  predictions = {name: y_preds}
  test_predictions_list.append(predictions)

  # Per-seed AUPRC on the held-out test set.
  print(compute_auprc(test['Y'].values, y_preds))

Found local copy...
generating training, validation splits...
100%|██████████| 9673/9673 [00:03<00:00, 2942.15it/s]


embedded for fitting
fit model
embedded for prediction


generating training, validation splits...


0.762515017335501


100%|██████████| 9673/9673 [00:03<00:00, 3066.53it/s]


embedded for fitting
fit model
embedded for prediction


generating training, validation splits...


0.7690281851884446


100%|██████████| 9673/9673 [00:03<00:00, 3001.46it/s]


embedded for fitting
fit model
embedded for prediction


generating training, validation splits...


0.7661692844559881


100%|██████████| 9673/9673 [00:03<00:00, 3018.67it/s]


embedded for fitting
fit model
embedded for prediction


generating training, validation splits...


0.7660245925139474


100%|██████████| 9673/9673 [00:03<00:00, 2921.31it/s]


embedded for fitting
fit model
embedded for prediction
0.7702735061834148
CPU times: user 2h 8min 59s, sys: 42.3 s, total: 2h 9min 41s
Wall time: 1h 56min 13s


In [None]:
# Score the per-seed test predictions collected above with the benchmark
# group's own evaluator.
# NOTE(review): the recorded output is a {task: [a, b]} pair, presumably
# [mean, std] across the seeds -- confirm against the TDC documentation.
group.evaluate_many(test_predictions_list)

{'cyp2c9_veith': [0.767, 0.003]}