# Create shortlist with split UUIDs - split datasets with different splits
- This will use the hyperparam_search_wrapper.py code
- do NOT run this until you're completely done featurizing your datasets

In [3]:
# import packages

import pandas as pd
import json
import os
import numpy as np
from atomsci.ddm.pipeline import model_tracker as trkr
from atomsci.ddm.utils import struct_utils as su
from atomsci.ddm.utils import curate_data as cd
%config Completer.use_jedi = False

In [4]:
# Point the notebook at your home directory and at the folder holding your datasets
my_username = "amandapaulson"

home_path = "/global/home/users/" + my_username
data_path = os.path.join(home_path, "data")
curated_data_path = os.path.join(data_path, "curated_datasets")
curated_data_path

'/global/home/users/amandapaulson/data/curated_datasets'

In [5]:
# set up parameters
# Every location is derived from home_path / data_path (defined in the cell
# above) so the home directory only has to be changed in one place.

script_dir=os.path.join(home_path, 'repos', 'AMPL', 'atomsci', 'ddm', 'utils')
result_dir=os.path.join(data_path, 'featurize_datasets_shortlist')

# change to what you called your shortlist
shortlist_key=os.path.join(data_path, 'bsep_feat_shortlist.csv')

In [6]:
# Build the parameter dictionary that gets handed to the AMPL software.

# Fixed settings - don't change these
params = {
    "hyperparam": "True",
    "slurm_account": "ic_engin296ma",
    "slurm_partition": "savio2_htc",
    "slurm_time_limit": "60",
    "slurm_nodes": "None",
    "slurm_options": "#SBATCH@--cpus-per-task=1",
    "system": "LC",
    "datastore": "False",
    "save_results": "True",
}

# Settings you will modify depending on what you need to do
params.update({
    "split_only": "True",
    "use_shortlist": "True",
    "shortlist_key": shortlist_key,
    "result_dir": result_dir,
    "smiles_col": "base_rdkit_smiles",
    "collection_name": "featurize",
})

params

{'hyperparam': 'True',
 'slurm_account': 'ic_engin296ma',
 'slurm_partition': 'savio2_htc',
 'slurm_time_limit': '60',
 'slurm_nodes': 'None',
 'slurm_options': '#SBATCH@--cpus-per-task=1',
 'system': 'LC',
 'datastore': 'False',
 'save_results': 'True',
 'split_only': 'True',
 'use_shortlist': 'True',
 'shortlist_key': '/global/home/users/amandapaulson/data/bsep_feat_shortlist.csv',
 'result_dir': '/global/home/users/amandapaulson/data/featurize_datasets_shortlist',
 'smiles_col': 'base_rdkit_smiles',
 'collection_name': 'featurize'}

In [40]:
# save the dict as a .json file in your results directory
# (create the directory first: open() raises FileNotFoundError if it is missing)
os.makedirs(result_dir, exist_ok=True)
fname=f'{result_dir}/bsep_split_shortlist.json'
with open(fname, 'w') as fp:
    json.dump(params, fp, indent=4)

# print out the command you'll run in your terminal
print(f'python {script_dir}/hyperparam_search_wrapper.py --config_file {fname}')

python /global/home/users/amandapaulson/repos/AMPL/atomsci/ddm/utils/hyperparam_search_wrapper.py --config_file /global/home/users/amandapaulson/data/featurize_datasets_shortlist/bsep_split_shortlist.json


## copy/paste the command ^ above into a terminal
## you MUST have your atomsci env activated first
`conda activate atomsci`

Output will be printed directly to the terminal window. Watch what it says and check whether any errors arise.

# inspect new shortlist file

In [7]:
# Load the shortlist CSV named by shortlist_key and display it
sl = pd.read_csv(shortlist_key)
sl

Unnamed: 0,dataset_key,response_cols,collection
0,/global/home/users/amandapaulson/data/curated_...,substrate,BSEP_substrate_class
1,/global/home/users/amandapaulson/data/curated_...,"pIC50,active",BSEP_pIC50_reg
2,/global/home/users/amandapaulson/data/curated_...,inhibitor,BSEP_inhibitor_class
3,/global/home/users/amandapaulson/data/curated_...,"pct_inhib,active",BSEP_pct_inhib_reg


In [8]:
# Name of the shortlist copy that carries the split UUID columns.
# NOTE(review): replace() swaps every '.csv' in the path, not only the
# extension - presumably this mirrors how the wrapper names its output; confirm.
sl_uuid_key = shortlist_key.replace(
    '.csv',
    '_with_uuids.csv',
)
sl_uuid_key

'/global/home/users/amandapaulson/data/bsep_feat_shortlist_with_uuids.csv'

In [9]:
# Re-load `sl`, this time from the '_with_uuids' shortlist file, and display it
sl = pd.read_csv(sl_uuid_key)
sl

Unnamed: 0,dataset_key,bucket,response_cols,collection,random_10_10,random_15_15,random_10_20,random_20_20,scaffold_10_10,scaffold_15_15,scaffold_10_20,scaffold_20_20,fingerprint_10_10,fingerprint_15_15,fingerprint_10_20,fingerprint_20_20
0,/global/home/users/amandapaulson/data/curated_...,public,substrate,BSEP_substrate_class,9b4c1491-27d8-460b-8531-fdf88ebab3a9,dd241db5-8735-4249-98ee-febbe5b3590f,ea5505cf-05ab-4b99-95a0-c9c39eb895b5,5925ea42-3ac0-4d06-b01d-ba857f7fd4e3,,,,,f87dc56c-6615-49d8-b41a-d2a82fe2d5c9,55a1d301-d2c0-45d1-a9c4-8a136a66268f,1176149d-2c84-49de-8c9b-6f7a0f469d08,c2d89b38-97ba-43ab-a7b5-6115b214bf33
1,/global/home/users/amandapaulson/data/curated_...,public,"pIC50,active",BSEP_pIC50_reg,a3957fff-9857-49f2-b596-2e50bef64f61,d371b7c7-8705-4057-beec-3a42a6f6196f,97c576a7-9d95-45a7-94bb-2eb82ccc64d2,1675990e-e55b-48e8-9227-c2db8dd3497e,26021005-0cd6-4ee6-872f-e2a036fe0981,754a5b73-fcca-4b0a-a823-80f2717ba6fd,9a8c9002-888d-49a4-9eb6-6c517c165eb2,55574502-4524-4b39-acb8-5d41eab8a25c,0fde29ef-f8e1-4c81-9135-4166fcb77129,374c75f0-04ff-4ace-97a6-37ba3ff0774e,8efc2521-830e-47fa-8c60-ed528b76f9a1,315c00fe-4fb4-4095-adc3-7f86ce3345e6
2,/global/home/users/amandapaulson/data/curated_...,public,inhibitor,BSEP_inhibitor_class,61b485b9-ceff-4742-b3d7-b02d72ba0f74,7624990e-0093-4653-967a-e88fa147eb13,b9da9cf9-e2de-443d-91a9-d122e439bf15,90da2246-be18-407b-83a7-2737e3beb9cf,31934b3e-0f25-4bda-b639-55472648a36c,9c147930-65e8-4e1c-ba26-09a44fd77d31,b2c48656-7143-4c51-aeb7-9b41f540ea77,6a6ccbfa-e69d-43fb-bf4c-5ae74099d48e,14b7bf3a-6272-43e0-8e9d-c6e385a14e41,262113db-910f-42d2-abc8-28d5a5beb318,57fade82-dd21-45f8-a5b8-9ed3d5d2112a,b75234da-dd2c-458b-8e1f-98b688046403
3,/global/home/users/amandapaulson/data/curated_...,public,"pct_inhib,active",BSEP_pct_inhib_reg,596df618-5760-43ac-9b40-332dee64f2be,8e47bf5f-8fdb-47e2-aefd-eb373de5a961,da2a26e5-185f-41f4-9cf9-2c952c91efd0,3fc09959-10fb-4888-8373-a370d75f59bd,62b2c43a-213d-4f40-959c-dc24767f2bc2,d5e21e98-0b7b-4a6e-a454-d64eafffd600,7c754b0c-49e9-4905-ab3b-b5e2183b875b,b1c877af-0a06-4f2c-9215-ef576cdbb410,301908ba-2a8b-4fe4-9f2e-2a8a72c2d252,6f84fe3f-287e-463b-b204-26d6dcbfeb3c,0cb8ae0b-335f-4dc6-abc6-bc5d134e77a2,89f8ab28-ae70-4892-9977-b93cbd42dc8f


### now you have a bunch of new columns
- each UUID corresponds to a split file in your curated_data directory
- random_10_10 means you split the data randomly and 10% goes into valid, 10% into test, 80% into train
- **remember not to delete these split files, especially once you start training models**

In [54]:
# Display the curated data directory (the folder walked in the next cell)
curated_data_path

'/global/home/users/amandapaulson/data/curated_datasets'

In [55]:
# Walk the curated data folder and print each directory followed by its
# files in alphabetical order.
for folder, _subfolders, filenames in os.walk(curated_data_path):
    print(f'\n{folder}\n')
    for filename in sorted(filenames):
        print(filename)


/global/home/users/amandapaulson/data/curated_datasets

combined_class_inhibitor_data_cur.csv
combined_class_inhibitor_data_cur_train_valid_test_fingerprint_14b7bf3a-6272-43e0-8e9d-c6e385a14e41.csv
combined_class_inhibitor_data_cur_train_valid_test_fingerprint_262113db-910f-42d2-abc8-28d5a5beb318.csv
combined_class_inhibitor_data_cur_train_valid_test_fingerprint_57fade82-dd21-45f8-a5b8-9ed3d5d2112a.csv
combined_class_inhibitor_data_cur_train_valid_test_fingerprint_b75234da-dd2c-458b-8e1f-98b688046403.csv
combined_class_inhibitor_data_cur_train_valid_test_random_61b485b9-ceff-4742-b3d7-b02d72ba0f74.csv
combined_class_inhibitor_data_cur_train_valid_test_random_7624990e-0093-4653-967a-e88fa147eb13.csv
combined_class_inhibitor_data_cur_train_valid_test_random_90da2246-be18-407b-83a7-2737e3beb9cf.csv
combined_class_inhibitor_data_cur_train_valid_test_random_b9da9cf9-e2de-443d-91a9-d122e439bf15.csv
combined_class_inhibitor_data_cur_train_valid_test_scaffold_31934b3e-0f25-4bda-b639-55472648a

# if you have NaNs instead of split UUIDs, email me for more info!

# you're ready to start modeling now!