# Create shortlist with split UUIDs - split datasets with different splits
- This will use the hyperparam_search_wrapper.py script
- do NOT run this until you're completely done featurizing your datasets

In [22]:
# import packages

import pandas as pd
import json
import os
import numpy as np
from atomsci.ddm.pipeline import model_tracker as trkr
from atomsci.ddm.utils import struct_utils as su
from atomsci.ddm.utils import curate_data as cd
%config Completer.use_jedi = False

In [37]:
# specify your home path and the folder where your datasets are
# my_username has to exist before it is interpolated into home_path; on a fresh
# kernel (Restart & Run All) this cell runs before the parameter cell that also
# sets it, so define it here. Keep both definitions in sync.
my_username="amandapaulson"
home_path=f"/global/home/users/{my_username}"
data_path=os.path.join(home_path, 'data')
curated_data_path=os.path.join(data_path,'curated_datasets')

In [38]:
# set up parameters: the cluster user name plus the paths derived from it
my_username = "amandapaulson"

# directory of the wrapper script and directory that receives the config/results
script_dir = "/global/home/users/{my_username}/repos/AMPL/atomsci/ddm/utils".format(
    my_username=my_username
)
result_dir = "/global/home/users/{my_username}/data/featurize_datasets_shortlist".format(
    my_username=my_username
)

# change to what you called your shortlist
shortlist_key = "/global/home/users/{my_username}/data/bsep_feat_shortlist.csv".format(
    my_username=my_username
)

In [39]:
# build the configuration dictionary that is later dumped to JSON for the
# wrapper script; every value is passed as a string, paths come from the
# parameter cell above

params = dict(
    # fixed settings - don't change these
    hyperparam="True",
    slurm_account="ic_engin296ma",
    slurm_partition="savio2_htc",
    slurm_time_limit="60",
    slurm_nodes="None",
    slurm_options="#SBATCH@--cpus-per-task=1",
    system="LC",
    datastore="False",
    save_results="True",

    # task-specific settings - these will be modified depending on what you need to do
    split_only="True",
    use_shortlist="True",
    shortlist_key=shortlist_key,
    result_dir=result_dir,
    smiles_col="base_rdkit_smiles",
    collection_name="featurize",
)

In [40]:
# save the dict as a .json file in your results directory
# create the results directory first: open(..., 'w') raises FileNotFoundError
# when the parent folder is missing, and exist_ok=True makes this a no-op
# when the folder is already there
os.makedirs(result_dir, exist_ok=True)
fname=f'{result_dir}/bsep_split_shortlist.json'
with open(fname, 'w') as fp:
    json.dump(params, fp, indent=4)

# print out the command you'll run in your terminal
print(f'python {script_dir}/hyperparam_search_wrapper.py --config_file {fname}')

python /global/home/users/amandapaulson/repos/AMPL/atomsci/ddm/utils/hyperparam_search_wrapper.py --config_file /global/home/users/amandapaulson/data/featurize_datasets_shortlist/bsep_split_shortlist.json


## copy/paste the command ^ above into a terminal
## you MUST have your atomsci env activated first
`conda activate atomsci`

Output will be printed directly to the terminal window. Watch it as it runs and check whether any errors arise.

# inspect new shortlist file

In [41]:
# load the original shortlist CSV and show it as the cell output
sl = pd.read_csv(filepath_or_buffer=shortlist_key)
sl

Unnamed: 0,dataset_key,response_cols,collection
0,/global/home/users/amandapaulson/data/curated_...,substrate,BSEP_substrate_class
1,/global/home/users/amandapaulson/data/curated_...,"pIC50,active",BSEP_pIC50_reg
2,/global/home/users/amandapaulson/data/curated_...,inhibitor,BSEP_inhibitor_class
3,/global/home/users/amandapaulson/data/curated_...,"pct_inhib,active",BSEP_pct_inhib_reg


In [42]:
# derive the path of the shortlist that carries the split UUID columns by
# swapping the '.csv' ending for '_with_uuids.csv'
# NOTE(review): presumably this mirrors the file name the wrapper script writes - confirm
sl_uuid_key = shortlist_key.replace(
    '.csv',
    '_with_uuids.csv',
)
sl_uuid_key

'/global/home/users/amandapaulson/data/bsep_feat_shortlist_with_uuids.csv'

In [43]:
# load the shortlist stored at sl_uuid_key (the '_with_uuids.csv' file) and display it
sl = pd.read_csv(filepath_or_buffer=sl_uuid_key)
sl

Unnamed: 0,dataset_key,bucket,response_cols,collection,random_10_10,random_15_15,random_10_20,random_20_20,scaffold_10_10,scaffold_15_15,scaffold_10_20,scaffold_20_20,fingerprint_10_10,fingerprint_15_15,fingerprint_10_20,fingerprint_20_20
0,/global/home/users/amandapaulson/data/curated_...,public,substrate,BSEP_substrate_class,eb6056a0-bea5-4bf5-82a8-c547ec3ee2b3,a7b083c4-387e-4050-80f9-6cca7351287d,fda59802-0723-40de-be41-46d5c43a632f,545cc841-a02f-42ab-bed3-6514ef173179,,,,,573f3258-6694-4783-95ef-c4926a6c7079,d38ca641-4b44-473e-8721-8b343ef96ca8,f5023b1d-9417-4c95-a3ef-4437c99281d6,4cdd9b12-596e-4c8c-b8dc-b3b73787943b
1,/global/home/users/amandapaulson/data/curated_...,public,"pIC50,active",BSEP_pIC50_reg,1f0202ed-07a9-41d0-bf83-449b177ee81d,d3f0f361-3da9-4ec2-8f66-4bdd6fe1884a,4b5081a8-0223-4b73-a2dd-d41250dc6cd3,adcd3233-a4ef-45b0-aef6-9e3ff31457a9,190c68e4-19c0-4115-886a-1723dda50881,d3ddff01-3560-4464-9420-073039e16186,f05c88a3-7da6-4bd0-bde7-6901ae989a94,e4d70f88-cffc-40ca-b6e1-4e85d34affa3,4c1e760e-57ce-419f-bf9d-a2d0c97ef463,1483e461-ad4c-42ad-9fdc-c3db009687fd,1582aa5b-5eb9-4287-a483-0b1616402840,1c81bbb4-7f6b-4a55-b1aa-c106282b7758
2,/global/home/users/amandapaulson/data/curated_...,public,inhibitor,BSEP_inhibitor_class,943587de-9305-4044-8ed7-349536255bd3,a0bccea3-88c1-4971-94b6-c46f194db98c,2209f236-10c1-403c-96ec-b506bfd26b06,a9bd68c6-334e-4c41-abe3-44228f0da4d6,ec7c99c5-c1b2-4858-8423-4a774b3214ce,8674c5a1-caf7-4649-b493-c91f75dd3fd3,3a986e53-c0bb-4ef8-ada7-e273d6a29192,425dfa23-6786-4b88-a784-14d4bb7ec142,1ab1271f-7bdd-4966-ad72-ccb49c472cd4,089d55f8-5821-4095-9091-a8b1ba72817e,c2a79a88-1f0e-4b9e-a6c0-27d7041a3c72,9b344058-0dd4-4018-9363-73ae1753c92f
3,/global/home/users/amandapaulson/data/curated_...,public,"pct_inhib,active",BSEP_pct_inhib_reg,a6123948-87e4-46ac-ba30-317d13914d47,7dca8371-cc27-4753-beef-c7c3d862a4dd,f2d3be22-6293-4104-90cc-680c3887b5d5,c5ffbc0a-d630-4800-9a5c-bdeab5ecdee2,ab5502dd-5b03-47a7-b502-ee4f8db40807,64a14558-33fa-4547-b4b1-f28b7ad4254e,83791887-bdde-49eb-95d1-da3fcc382f09,5b5a3e8c-cb3e-4d98-ad5b-e26383cc36b9,f7c42ada-dbb9-44ec-80bc-6d5a383b9f8d,3abdbe2a-3be3-4ded-844e-ee30555f210a,d378a83e-9ccf-45bc-a11e-18b147bad296,8212780f-8488-4afd-ae81-132b6347602a


### now you have a bunch of new columns
- each UUID corresponds to a split file in your curated_data directory
- random_10_10 means the data was split randomly, with 10% going into the validation set, 10% into the test set, and the remaining 80% into the training set
- **remember not to delete these split files, especially once you start training models**

In [54]:
# show the curated-datasets directory (built in the path-setup cell) as the cell output
curated_data_path

'/global/home/users/amandapaulson/data/curated_datasets'

In [55]:
# walk the curated-data directory tree and print, for every folder, its path
# followed by the names of the files it contains in sorted order
for folder, _, filenames in os.walk(curated_data_path):
    print(f'\n{folder}\n')
    for filename in sorted(filenames):
        print(filename)


/global/home/users/amandapaulson/data/curated_datasets

combined_class_inhibitor_data_cur.csv
combined_class_inhibitor_data_cur_train_valid_test_fingerprint_14b7bf3a-6272-43e0-8e9d-c6e385a14e41.csv
combined_class_inhibitor_data_cur_train_valid_test_fingerprint_262113db-910f-42d2-abc8-28d5a5beb318.csv
combined_class_inhibitor_data_cur_train_valid_test_fingerprint_57fade82-dd21-45f8-a5b8-9ed3d5d2112a.csv
combined_class_inhibitor_data_cur_train_valid_test_fingerprint_b75234da-dd2c-458b-8e1f-98b688046403.csv
combined_class_inhibitor_data_cur_train_valid_test_random_61b485b9-ceff-4742-b3d7-b02d72ba0f74.csv
combined_class_inhibitor_data_cur_train_valid_test_random_7624990e-0093-4653-967a-e88fa147eb13.csv
combined_class_inhibitor_data_cur_train_valid_test_random_90da2246-be18-407b-83a7-2737e3beb9cf.csv
combined_class_inhibitor_data_cur_train_valid_test_random_b9da9cf9-e2de-443d-91a9-d122e439bf15.csv
combined_class_inhibitor_data_cur_train_valid_test_scaffold_31934b3e-0f25-4bda-b639-55472648a

# if you have NaNs instead of split UUIDs, email me for more info!

# you're ready to start modeling now!