In [None]:
!git clone https://github.com/parkerburchett/pysmiles
!pip install pyTDC
!git clone https://github.com/parkerburchett/TDC-DeepLearning

fatal: destination path 'pysmiles' already exists and is not an empty directory.
fatal: destination path 'TDC-DeepLearning' already exists and is not an empty directory.


In [None]:
# Mount Google Drive first: the tuned hyper-parameter files read later in the
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import lightgbm as lgb


from sklearn.metrics import precision_recall_curve, auc
from pysmiles.pysmiles import read_smiles
from tdc.single_pred import ADME
import os
# Order matters here: the working directory is switched to the cloned
# TDC-DeepLearning checkout *before* the `utils` import below.
# NOTE(review): presumably this is what makes `utils` importable - confirm.
os.chdir('/content/TDC-DeepLearning/')
from utils import ColorRefinement as cr # graph-embedding (Color Refinement) implementation written by the notebook author



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def compute_auprc(y_true, y_pred):
    """Return the area under the precision-recall curve, rounded to 6 decimals.

    Both arguments are forwarded unchanged to
    ``sklearn.metrics.precision_recall_curve``; the resulting curve is then
    integrated with ``sklearn.metrics.auc`` using recall as the x-axis and
    precision as the y-axis.
    """
    # https://stats.stackexchange.com/questions/157012/area-under-precision-recall-curve-auc-of-pr-curve-and-average-precision-ap
    pr_curve = precision_recall_curve(y_true, y_pred)
    # The curve is (precision, recall, thresholds); thresholds are not needed here.
    precision_vals, recall_vals = pr_curve[0], pr_curve[1]
    return round(auc(recall_vals, precision_vals), 6)


In [None]:
import json

# Load the artefacts produced by the hyper-parameter search from Google Drive.
# The JSON file is opened in a `with` block so the handle is closed as soon as
# it has been parsed (the previous `json.load(open(...))` never closed it).
with open('/content/drive/MyDrive/SpringBoard/Therapeutic Data Commons Projects/HyperParamTuning/bestmodels.json', 'r') as params_file:
    best_models_params = json.load(params_file)
# The first CSV column is used as the index of the weight table.
best_weights = pd.read_csv('/content/drive/MyDrive/SpringBoard/Therapeutic Data Commons Projects/HyperParamTuning/best_4_regression_weights.csv', index_col=0)


best_models_params

{'0': "{'colsample_bytree': 0.45921506474872353, 'learning_rate': 0.003605978989205916, 'n_estimators': 997, 'num_leaves': 171, 'reg_alpha': 0.06136193030050688, 'subsample': 0.664374000848817}",
 '1': "{'colsample_bytree': 0.19190373976042552, 'learning_rate': 0.018880733945270692, 'n_estimators': 772, 'num_leaves': 412, 'reg_alpha': 0.1319602189105627, 'subsample': 0.953435263598222}",
 '2': "{'colsample_bytree': 0.0846115062976256, 'learning_rate': 0.061904626017968235, 'n_estimators': 710, 'num_leaves': 218, 'reg_alpha': 0.09722107305351997, 'subsample': 0.7625401046034898}",
 '3': "{'colsample_bytree': 0.10234165146414909, 'learning_rate': 0.021876318417714605, 'n_estimators': 574, 'num_leaves': 193, 'reg_alpha': 0.08093256266597965, 'subsample': 0.9986613307559011}",
 '4': "{'colsample_bytree': 0.29036206035748535, 'learning_rate': 0.09316958191707549, 'n_estimators': 473, 'num_leaves': 101, 'reg_alpha': 0.1992406954388929, 'subsample': 0.005705227464680718}"}

In [None]:
# Take every row of the weight table except the last and flatten the result
# into a 1-D vector of ensemble weights.
# NOTE(review): why the last row is excluded is not visible here; the model
# list is trimmed the same way (`model_params[:-1]`) - confirm they correspond.
tuned_weights = best_weights.iloc[:-1].to_numpy().flatten()
tuned_weights

array([0.23494501, 0.50427143, 0.16934516, 0.0914384 ])

In [None]:
import ast

# Each value in best_models_params is the *repr of a Python dict* stored as a
# string, not JSON. ast.literal_eval parses that literal directly and safely,
# instead of rewriting quotes and hoping the result happens to be valid JSON.
model_params = [ast.literal_eval(params_repr) for params_repr in best_models_params.values()]
# The last parameter set is dropped, leaving one regressor per remaining entry.
# subsample_freq=1 turns bagging on at every iteration; LightGBM ignores
# `subsample` while the frequency is left at its default of 0.
tuned_models = [lgb.LGBMRegressor(subsample_freq=1, **p) for p in model_params[:-1]]
tuned_models

[LGBMRegressor(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.45921506474872353, importance_type='split',
               learning_rate=0.003605978989205916, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=997, n_jobs=-1, num_leaves=171, objective=None,
               random_state=None, reg_alpha=0.06136193030050688, reg_lambda=0.0,
               silent=True, subsample=0.664374000848817,
               subsample_for_bin=200000, subsample_freq=1),
 LGBMRegressor(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.19190373976042552, importance_type='split',
               learning_rate=0.018880733945270692, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=772, n_jobs=-1, num_leaves=412, objective=None,
               random_state=None, reg_alpha=0.1319602189105627, reg_lambda=0.0,
          

In [97]:
class WeightedModel:
  """Weighted ensemble with one regressor per Color Refinement hop.

  SMILES strings are parsed into graphs with ``read_smiles`` and embedded with
  ``cr.create_hop_feature_dfs``. Model ``i`` is trained on, and predicts from,
  the ``i``-th feature frame that call returns. ``predict`` combines the
  per-model predictions as a dot product with ``model_weights``.
  """

  def __init__(self,
               cr_num_hops:int,
               cr_num_colors:int,
               models:list,
               model_weights:list):
    """Store the embedding settings, the estimators and their weights.

    Args:
      cr_num_hops: forwarded as ``num_hops`` to ``cr.create_hop_feature_dfs``.
      cr_num_colors: forwarded as ``num_colors`` to ``cr.create_hop_feature_dfs``.
      models: estimators exposing ``fit(X, y)`` and ``predict(X)``; they are
        stored by reference and fitted in place.
      model_weights: one weight per model, in the same order as ``models``.
    """
    self.cr_num_hops = cr_num_hops
    self.cr_num_colors = cr_num_colors
    self.models = models
    self.model_weights = model_weights


  def faster_fit(self, model_df_pairs, features=None, target=None):
    """
      Fit from model_df_pairs, skipping the embedding step.

      Args:
        model_df_pairs: sequence of mappings, each with a ``'df'`` entry (a
          DataFrame holding both feature and target columns) and a
          ``'model_object'`` entry (the estimator to fit on that frame).
          ``self.models`` is replaced by these estimators, in order.
        features: column label(s) selecting the feature columns of each frame.
        target: column label selecting the target column of each frame.

      ``features`` / ``target`` fall back to attributes of the same name when
      they have been set on the instance by hand; that was the only way the
      previous implementation could run, since the class never defines them.

      Raises:
        ValueError: if no feature/target columns are available.
    """
    if features is None:
      features = getattr(self, 'features', None)
    if target is None:
      target = getattr(self, 'target', None)
    # `is None` checks (not truthiness): a pandas Index has no unambiguous truth value.
    if features is None or target is None:
      raise ValueError(
          'faster_fit needs the feature and target columns: pass '
          'features=... and target=... (or set self.features / self.target).')

    hop_feature_dfs = [pair['df'] for pair in model_df_pairs]
    self.models = [pair['model_object'] for pair in model_df_pairs]
    for df, model in zip(hop_feature_dfs, self.models):
      model.fit(df[features], df[target])


  def _create_embeddings(self, smiles):
    """Parse each SMILES string into a graph and embed the graphs.

    Returns whatever ``cr.create_hop_feature_dfs`` returns; callers index it
    by hop number and read ``.values`` from each element.
    NOTE(review): presumably one DataFrame per hop - confirm against ``cr``.
    """
    graphs = [read_smiles(s, silent=True) for s in smiles]
    hop_feature_dfs = cr.create_hop_feature_dfs(graphs=graphs,
                                                num_hops=self.cr_num_hops,
                                                num_colors=self.cr_num_colors)
    print('embedded with Color Refinement')
    return hop_feature_dfs


  def fit(self, smiles, targets):
    """Embed ``smiles`` and fit model ``i`` on the ``i``-th hop features.

    Args:
      smiles: iterable of SMILES strings.
      targets: target values aligned with ``smiles``; a pandas Series, a
        list or an ndarray are all accepted.
    """
    hop_feature_dfs = self._create_embeddings(smiles)
    # Same target vector for every model, so convert it once up front.
    # np.asarray also accepts plain sequences, not only objects with `.values`.
    y = np.asarray(targets)
    for hop_num, model in enumerate(self.models):
      X = hop_feature_dfs[hop_num].values
      model.fit(X,y)
      print('fit a model')


  def predict(self, smiles):
    """Return the weighted combination of the per-hop model predictions.

    Args:
      smiles: iterable of SMILES strings.

    Returns:
      The per-model prediction matrix (one column per model) dotted with
      ``model_weights``.

    Raises:
      ValueError: if the number of weights differs from the number of models.
    """
    # Fail before the embedding step with a message that names the real
    # problem, rather than an opaque shape error from the dot product.
    if len(self.model_weights) != len(self.models):
      raise ValueError(
          'model_weights has %d entries but there are %d models'
          % (len(self.model_weights), len(self.models)))
    hop_feature_dfs = self._create_embeddings(smiles)
    prediction_df = pd.DataFrame()
    for hop_num, model in enumerate(self.models):
      prediction_df[hop_num] = model.predict(hop_feature_dfs[hop_num].values)
    weighted_predictions = prediction_df.values.dot(self.model_weights)
    return weighted_predictions


In [None]:
from tdc.benchmark_group import admet_group
# Load the ADMET benchmark group; 'data/' is relative to the current working directory.
group = admet_group(path = 'data/')
benchmark = group.get('cyp2c9_veith')

# NOTE: despite its name, valid_df holds the benchmark's 'test' split; the
# combined 'train_val' split is used for training.
valid_df = benchmark['test'][['Drug', 'Y']]
train_df = benchmark['train_val'][['Drug', 'Y']]
name = benchmark['name']
# Maps benchmark name -> predictions, the structure passed to group.evaluate below.
predictions = {}

# The estimators in tuned_models are passed by reference, so fitting
# final_model fits those same objects in place.
final_model = WeightedModel(cr_num_hops=4,
                            cr_num_colors=10_000,
                            models=tuned_models,
                            model_weights=tuned_weights)

# 'Drug' holds the SMILES strings, 'Y' the targets.
train_smiles, train_targets = train_df['Drug'], train_df['Y']
final_model.fit(train_smiles, train_targets)
preds = final_model.predict(valid_df['Drug'])

# Cross-check: compute the PR-AUC locally before asking the benchmark group
# to score the same predictions.
selfCalcuated_auprc = compute_auprc(valid_df['Y'], preds)
print('you calcuated auprc at ', selfCalcuated_auprc)

predictions[name] = preds 
print(group.evaluate(predictions))

Found local copy...


embedded with Color Refinement
fit a model
fit a model
fit a model
fit a model
embedded with Color Refinement
you calcuated auprc at  0.770984
{'cyp2c9_veith': {'pr-auc': 0.771}}
