# Explore hyperparameters on the 10k-color hop DataFrames



#### Load Data

In [None]:
!pip install pyTDC



#### Constants

In [None]:
# Index of the hop embedding file to tune on; it is also embedded in the results file name.
hop_number = 2  # the only value that needs to change in each copy of this notebook
# Copies of this notebook with different hop numbers can be run in parallel.

In [None]:
from google.colab import drive
# Mount Google Drive so the data and results paths under /content/drive are reachable.
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
from tdc import utils
from tdc.benchmark_group import admet_group
import lightgbm as lgb
import glob
from tdc.benchmark_group import admet_group
from scipy.stats import uniform, randint
from datetime import datetime
import time
from typing import  Callable
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# load data
def get_larger_file_names():
  """
    Returns the file locations for the 10_000 bucket color embeddings.

    The list is sorted because `glob.glob` makes no ordering guarantee, and
    callers pick a file by its position in this list.
  """
  pattern = '/content/drive/MyDrive/SpringBoard/Therapeutic Data Commons Projects/data/more_buckets_data/*hop_larger_embedding.csv'
  # NOTE(review): lexicographic order equals numeric hop order only while the hop
  # numbers in the file names are single-digit or zero-padded -- confirm.
  return sorted(glob.glob(pattern))

def load_feature_df(hop_num:int):
  """
      Load the color features for one hop and attach the training targets.

      hop_num: position of the wanted file in the list returned by get_larger_file_names().

      Returns a dataframe holding the feature columns read from that CSV (cast to int16)
      plus a boolean `target` column taken from the `train_val` split of the
      `cyp2c9_veith` benchmark.
  """
  hop_file_name = get_larger_file_names()[hop_num]
  df = pd.read_csv(hop_file_name, index_col=0)
  df = df.astype(np.int16)
  group = admet_group(path = 'data/')
  benchmark = group.get('cyp2c9_veith')
  # `np.bool` was deprecated in NumPy 1.20 and removed in 1.24; the builtin `bool`
  # is the spelling that works on every NumPy version.
  targets =  benchmark['train_val']['Y'].astype(bool)
  # NOTE(review): if `targets` is a pandas Series this assignment aligns on index
  # labels -- assumes the CSV's index matches the benchmark frame's index; confirm.
  df['target'] =  targets
  return df



# Constants
def compute_auprc(y_true, y_pred):
    """
    Area under the precision-recall curve for `y_pred` scored against `y_true`,
    rounded to 6 decimal places.
    """
    # Background on AUPRC vs. average precision:
    # https://stats.stackexchange.com/questions/157012/area-under-precision-recall-curve-auc-of-pr-curve-and-average-precision-ap
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return round(auc(recall, precision), 6)


# Path prefix shared by every results CSV of this notebook; it embeds the hop number.
parent_save_location_name = f'/content/drive/MyDrive/SpringBoard/Therapeutic Data Commons Projects/HyperParamTuning/RandomSearchOutcomes/results_hop_number__{hop_number}__'


# Base estimator handed to the random search.
model_object = lgb.LGBMRegressor(subsample_freq=1) # This makes the subsample param work

# Sampling distributions for the random search.
# scipy's `randint(low, high)` draws integers from [low, high) and
# `uniform(loc, scale)` draws floats from [loc, loc + scale].
param_distributions = {
    # Start at 1: a draw of 0 trees cannot produce a useful model.
    'n_estimators': randint(1, 1000),
    'learning_rate': uniform(0, .1),
    # Start at 2: LightGBM requires num_leaves > 1, so draws of 0 or 1 would only
    # burn search iterations on failed fits.
    'num_leaves': randint(2, 500),
    'reg_alpha': uniform(0, .2),
    'colsample_bytree' :uniform(.0, 1),
    'subsample': uniform(0, 1)
    }

# Training frame for the selected hop, plus the column names the search will use.
df = load_feature_df(hop_number)
target_name = 'target'
# every column whose name contains 'color' is treated as a feature
feature_names = [column for column in df.columns if 'color' in column]



Downloading Benchmark Group...
100%|██████████| 1.47M/1.47M [00:01<00:00, 1.13MiB/s]
Extracting zip file...
Done!


### Random Search helper methods


In [None]:
def create_results_df(cv_results_):
  """
      Turn a search's `cv_results_` mapping into a dataframe that is easy to read:
      the score / fit-time summary columns come first, followed by every other
      column in its original order, and rows are ordered by `mean_test_score`
      from highest to lowest.
  """
  leading_cols = ['mean_test_score', 'std_test_score', 'mean_fit_time']
  ranked = pd.DataFrame(cv_results_).sort_values(by=['rank_test_score'])
  trailing_cols = [name for name in ranked.columns if name not in leading_cols]
  # headline metrics first so the saved table is easier to scan
  reordered = ranked[leading_cols + trailing_cols]
  return reordered.sort_values('mean_test_score', ascending=False)


def get_file_name_to_save() -> str:
  """
      Build the path of the next results CSV: the shared parent prefix followed by
      the current unix time in whole seconds and a `.csv` extension.
  """
  now_struct = datetime.now().timetuple()
  unix_seconds = int(time.mktime(now_struct))
  return f'{parent_save_location_name}{unix_seconds}.csv'


def save_results(results_df):
  """
      Write `results_df` (header row included) to a freshly timestamped CSV.
  """
  results_df.to_csv(get_file_name_to_save(), header=True)


def run_random_search(n_iter:int,
                      model_object,
                      param_distributions: dict,
                      score_func: Callable,
                      df:pd.DataFrame,
                      feature_names:list,
                      target_name:str,
                      random_state=None) -> pd.DataFrame:
  """
      Do randomized search based on the model object and param distribution.

      n_iter: number of parameter settings to sample.
      model_object: estimator to tune.
      param_distributions: mapping of parameter name -> distribution to sample from.
      score_func: metric called as score_func(y_true, y_pred); higher is better.
      df: frame holding both the feature columns and the target column.
      feature_names: columns of `df` used as model inputs.
      target_name: column of `df` used as the label.
      random_state: optional seed for the parameter sampling. The default (None)
          leaves the sampling unseeded, so repeated calls explore different settings.

      Prints the best mean CV score, saves the full results table to a CSV and
      returns that table (best setting first).
  """
  random_search = RandomizedSearchCV(model_object,
                                    param_distributions,
                                    n_iter=n_iter,
                                    scoring=make_scorer(score_func, greater_is_better=True),
                                    n_jobs=-1,
                                    cv=5,
                                    random_state=random_state)

  search_results = random_search.fit(df[feature_names],df[target_name])

  results_df = create_results_df(search_results.cv_results_)
  print(f'CV test Score: {round(search_results.best_score_, 6)} out of {n_iter} random searches')
  save_results(results_df)
  # The signature promises a DataFrame; previously the function fell through and returned None.
  return results_df


# Short 5-iteration search on the loaded hop frame.
# NOTE(review): presumably a smoke test of the pipeline before the long loop below -- confirm.
run_random_search(
      n_iter = 5,
      model_object = model_object,
      param_distributions = param_distributions,
      score_func = compute_auprc,
      df = df,
      feature_names = feature_names,
      target_name = target_name
  )


CV test Score: 0.739674 out of 5 random searches


# Do Random Search

Test 10k random hyperparameter settings (100 rounds of 100 sampled settings each) with 5-fold CV on the given hop feature DataFrame.

In [None]:
# 100 rounds x 100 sampled settings per round; each round saves its own results CSV.
for _ in range(100):
  start = datetime.now()
  run_random_search(
      n_iter = 100,
      model_object = model_object,
      param_distributions = param_distributions,
      score_func = compute_auprc,
      df = df,
      feature_names = feature_names,
      target_name = target_name
  )
  # Dividing a timedelta by 60 yields another timedelta, so the old message showed
  # a duration 60x too small under a "minutes" label. Convert to a float instead.
  minutes = (datetime.now() - start).total_seconds() / 60
  print(f'Took {minutes:.2f} minutes for 100 random searches')

CV test Score: 0.75482 out of 100 random searches
Took 0:00:13.348269 minutes for 100 random searches
CV test Score: 0.753907 out of 100 random searches
Took 0:00:11.046611 minutes for 100 random searches
CV test Score: 0.757967 out of 100 random searches
Took 0:00:14.426456 minutes for 100 random searches
CV test Score: 0.756788 out of 100 random searches
Took 0:00:13.317451 minutes for 100 random searches
CV test Score: 0.753474 out of 100 random searches
Took 0:00:12.561162 minutes for 100 random searches
CV test Score: 0.759663 out of 100 random searches
Took 0:00:11.820288 minutes for 100 random searches
CV test Score: 0.756597 out of 100 random searches
Took 0:00:12.338642 minutes for 100 random searches
CV test Score: 0.757778 out of 100 random searches
Took 0:00:13.309009 minutes for 100 random searches
CV test Score: 0.756233 out of 100 random searches
Took 0:00:14.515806 minutes for 100 random searches
CV test Score: 0.755686 out of 100 random searches
Took 0:00:11.640691 min