# Featurize shortlist (list of datasets) using slurm
- Featurizing a whole shortlist of datasets isn't built into the AMPL software yet, so we need some helper files
- Those helper files are what you pulled from GitHub in the ampl_berkeley repo

In [296]:
import pandas as pd
import json
import os
import numpy as np
from atomsci.ddm.pipeline import model_tracker as trkr
from atomsci.ddm.utils import struct_utils as su
from atomsci.ddm.utils import curate_data as cd
%config Completer.use_jedi = False

# Upload curated files to cluster file system
- From interactive jupyter, create folders like:
```
<home_dir>/data
<home_dir>/data/curated_datasets
```
- Upload your files into the `curated_datasets` folder (the code below reads from this exact folder name)

# Create shortlist csv

In [297]:
# replace this with your own cluster username;
# every path below is built from it
my_username = "amandapaulson"

In [298]:
# your home directory on the cluster, the data folder inside it,
# and the folder inside that which holds the curated datasets
home_path=f"/global/home/users/{my_username}"
data_path=os.path.join(home_path, 'data')
curated_data_path=os.path.join(home_path, 'data', 'curated_datasets')

In [299]:
# create list of datasets by joining the data path and the file name
# - keep only files whose name ends in .csv (a plain 'csv' substring test would
#   also match names like 'notes_csv.txt' or 'data.csv.bak')
# - skip *train_valid_test* split files: they show up in this folder during
#   featurization, and including them would make the shortlist longer than the
#   response_cols / collections lists defined below
dataset_keys= [
    os.path.join(curated_data_path, x)
    for x in os.listdir(curated_data_path)
    if x.endswith('.csv') and 'train_valid_test' not in x
]
# double check output
dataset_keys

['/global/home/users/amandapaulson/data/curated_datasets/combined_substrate_data_cur.csv',
 '/global/home/users/amandapaulson/data/curated_datasets/combined_ic50_inhibitor_data_cur.csv',
 '/global/home/users/amandapaulson/data/curated_datasets/combined_class_inhibitor_data_cur.csv',
 '/global/home/users/amandapaulson/data/curated_datasets/combined_pct_inhibitor_data_cur.csv']

In [300]:
# check for duplicates - use this if you get errors in your sbatch submission
# NOTE: this cell is commented out. If you uncomment it, for every dataset it will
#   1. drop a column named 'index' if there is one,
#   2. sort the rows by compound_id and reset the row index,
#   3. OVERWRITE the original csv with that result, and
#   4. display every row whose compound_id occurs more than once.

# for dset in dataset_keys:
#     df=pd.read_csv(dset)
#     try:
#         df=df.drop(columns='index')
#     except:
#         pass
#     df=df.sort_values('compound_id').reset_index(drop=True)
#     df.to_csv(dset, index=False)
#     display(df[df.compound_id.duplicated(keep=False)])

In [301]:
# preview the first row of every dataset so you can read off
# which columns are response columns
for csv_path in dataset_keys:
    first_row=pd.read_csv(csv_path).head(1)
    print(csv_path)
    display(first_row)

/global/home/users/amandapaulson/data/curated_datasets/combined_substrate_data_cur.csv


Unnamed: 0,compound_id,base_rdkit_smiles,substrate
0,(-)-Roehybridine(626578),COc1cc2[nH]c3c(c2cc1OC)CCN[C@]31CC[C@@]2(C[C@@...,0


/global/home/users/amandapaulson/data/curated_datasets/combined_ic50_inhibitor_data_cur.csv


Unnamed: 0,compound_id,base_rdkit_smiles,relation,pIC50,active
0,CHEMBL100,CC1(C)Oc2ccc(C#N)cc2[C@@H](N2CCCC2=O)[C@@H]1O,,3.451856,0


/global/home/users/amandapaulson/data/curated_datasets/combined_class_inhibitor_data_cur.csv


Unnamed: 0,compound_id,base_rdkit_smiles,relation,inhibitor
0,1-methyl-4- phenylpyridinium,C[n+]1ccc(-c2ccccc2)cc1,,0.0


/global/home/users/amandapaulson/data/curated_datasets/combined_pct_inhibitor_data_cur.csv


Unnamed: 0,compound_id,base_rdkit_smiles,relation,pct_inhib,active
0,1-methyl-4- phenylpyridinium,C[n+]1ccc(-c2ccccc2)cc1,,19.9,0


In [302]:
# set response_cols to all cols possible with no spaces between
# The entries are keyed by file name rather than listed by position: os.listdir
# returns files in arbitrary order, so a positional list could silently attach
# the wrong response columns to a dataset.
response_cols_by_file={
    "combined_substrate_data_cur.csv": "substrate",
    "combined_ic50_inhibitor_data_cur.csv": "pIC50,active",
    "combined_class_inhibitor_data_cur.csv": "inhibitor",
    "combined_pct_inhibitor_data_cur.csv": "pct_inhib,active",
}
# one entry per dataset, in the same order as dataset_keys;
# raises KeyError if a dataset has no entry above, instead of mislabelling it
response_cols=[response_cols_by_file[os.path.basename(dset)] for dset in dataset_keys]

In [303]:
# choose an informative name for the collection 
# of models you will create for each dataset. 
# useful for separating them later
# The names are keyed by file name rather than listed by position: os.listdir
# returns files in arbitrary order, so a positional list could silently attach
# a collection name to the wrong dataset.
collections_by_file={
    "combined_substrate_data_cur.csv": "BSEP_substrate_class",
    "combined_ic50_inhibitor_data_cur.csv": "BSEP_pIC50_reg",
    "combined_class_inhibitor_data_cur.csv": "BSEP_inhibitor_class",
    "combined_pct_inhibitor_data_cur.csv": "BSEP_pct_inhib_reg",
}
# one entry per dataset, in the same order as dataset_keys;
# raises KeyError if a dataset has no entry above, instead of mislabelling it
collections=[collections_by_file[os.path.basename(dset)] for dset in dataset_keys]

In [304]:
# start from an empty table with one column per shortlist field;
# the following cells fill the columns in one at a time
sl=pd.DataFrame(
    data=[],
    columns=['dataset_key','response_cols','collection'],
)
sl

Unnamed: 0,dataset_key,response_cols,collection


In [305]:
# fill the dataset_key column with the full path of each dataset
sl['dataset_key']=dataset_keys
sl

Unnamed: 0,dataset_key,response_cols,collection
0,/global/home/users/amandapaulson/data/curated_...,,
1,/global/home/users/amandapaulson/data/curated_...,,
2,/global/home/users/amandapaulson/data/curated_...,,
3,/global/home/users/amandapaulson/data/curated_...,,


In [306]:
# fill the response_cols column (one comma-separated string per dataset)
sl['response_cols']=response_cols
sl

Unnamed: 0,dataset_key,response_cols,collection
0,/global/home/users/amandapaulson/data/curated_...,substrate,
1,/global/home/users/amandapaulson/data/curated_...,"pIC50,active",
2,/global/home/users/amandapaulson/data/curated_...,inhibitor,
3,/global/home/users/amandapaulson/data/curated_...,"pct_inhib,active",


In [307]:
# fill the collection column with the collection name chosen for each dataset
sl['collection']=collections
sl

Unnamed: 0,dataset_key,response_cols,collection
0,/global/home/users/amandapaulson/data/curated_...,substrate,BSEP_substrate_class
1,/global/home/users/amandapaulson/data/curated_...,"pIC50,active",BSEP_pIC50_reg
2,/global/home/users/amandapaulson/data/curated_...,inhibitor,BSEP_inhibitor_class
3,/global/home/users/amandapaulson/data/curated_...,"pct_inhib,active",BSEP_pct_inhib_reg


In [308]:
# the shortlist csv is written into the data folder
sl_path = os.path.join(data_path, "bsep_feat_shortlist.csv")
sl_path

'/global/home/users/amandapaulson/data/bsep_feat_shortlist.csv'

In [309]:
# write the shortlist without the row index, then show what was saved
sl.to_csv(path_or_buf=sl_path, index=False)
sl

Unnamed: 0,dataset_key,response_cols,collection
0,/global/home/users/amandapaulson/data/curated_...,substrate,BSEP_substrate_class
1,/global/home/users/amandapaulson/data/curated_...,"pIC50,active",BSEP_pIC50_reg
2,/global/home/users/amandapaulson/data/curated_...,inhibitor,BSEP_inhibitor_class
3,/global/home/users/amandapaulson/data/curated_...,"pct_inhib,active",BSEP_pct_inhib_reg


# set up to run script

In [310]:
# you shouldn't have to change anything here
# All three locations are derived from home_path / data_path, so editing
# home_path once near the top of the notebook is enough; with the default
# home_path they resolve to exactly the same strings as before.
script_dir=os.path.join(home_path, 'repos', 'ampl_berkeley')
python_path=os.path.join(home_path, '.conda', 'envs', 'atomsci', 'bin', 'python')
result_dir=os.path.join(data_path, 'featurize_datasets_shortlist')

In [311]:
# create the result folder (and any missing parents); do nothing if it is already there
os.makedirs(name=result_dir, exist_ok=True)

In [312]:
# this means "cluster, <run this script> <using this version of python> <with> <these> <options>"
# the pieces are listed one per line and joined with single spaces
print(" ".join([
    "sbatch",
    f"{script_dir}/featurize_shortlist.sh",
    python_path,
    script_dir,
    result_dir,
    sl_path,
]))

sbatch /global/home/users/amandapaulson/repos/ampl_berkeley/featurize_shortlist.sh /global/home/users/amandapaulson/.conda/envs/atomsci/bin/python /global/home/users/amandapaulson/repos/ampl_berkeley /global/home/users/amandapaulson/data/featurize_datasets_shortlist /global/home/users/amandapaulson/data/bsep_feat_shortlist.csv


## copy and paste the output ^ into a terminal window

- Depending on how many compounds you have in your datasets, these commands could take a while (on the order of hours)
- I have set the time limit on the .sh file to two hours
- If you don't get all the files featurized before the two hour limit, simply run the command again
- The previously featurized files will be skipped

## Useful commands to check progress in terminal:

- list jobs and how long they've been running

`squeue -u <your_username>`

- look at the end of the out file to see if there are errors

`tail ~/feat_shortlist.out`

- monitor how much progress has been made, if no errors. Before featurizing a file, the script looks to see whether the featurized file already exists; if it doesn't, it prints a `No such file or directory` error for that path. So the paths printed out by this command are the ones that are being featurized.

`cat ~/feat_shortlist.out | grep "scaled_descriptors"`

## if you need to logout and come back later:
- submitting a job to the cluster means it will run without you monitoring it
- you can safely logout after saving your notebooks, etc, and log back in later to check progress

# monitor progress (optional)

In [313]:
# you can type these commands with ! in front which will act like a terminal prompt
# if no output, you don't have a job running
! squeue -u amandapaulson

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)


In [314]:
# show only the lines of the job's out file that mention scaled_descriptors
! cat ~/feat_shortlist.out | grep "scaled_descriptors"

[Errno 2] No such file or directory: '/global/home/users/amandapaulson/data/curated_datasets/scaled_descriptors/combined_pct_inhibitor_data_cur_with_mordred_filtered_descriptors.csv'


In [315]:
# tail -1 means the last line; -2 last two lines
! tail -1 ~/feat_shortlist.out

/global/home/users/amandapaulson/data/featurize_datasets_shortlist


# Inspect file system when complete
- in your curated data path, you will have your original data files
- in a subfolder called `scaled_descriptors` you will have your featurized data files
- for each curated dataset, you should have an rdkit_raw and mordred_filtered descriptor file
- the train_valid_test .csv's are meant to be deleted once featurization finishes; any that are still there are removed in the "delete split files" section below

In [316]:
# reminder of the folder that is inspected below
curated_data_path

'/global/home/users/amandapaulson/data/curated_datasets'

In [285]:
# walk the curated data folder and print each folder followed by
# its file names in alphabetical order
for folder, _subfolders, filenames in os.walk(curated_data_path):
    print('\n'+folder+'\n')
    for filename in sorted(filenames):
        print(filename)


/global/home/users/amandapaulson/data/curated_datasets

combined_class_inhibitor_data_cur.csv
combined_class_inhibitor_data_cur_train_valid_test_random_1df527fb-ab53-4283-8ac2-187c18c3b718.csv
combined_class_inhibitor_data_cur_train_valid_test_random_48446fdf-f34f-4ef5-9494-6dcda2300204.csv
combined_class_inhibitor_data_cur_train_valid_test_random_90cb93b6-12e2-4057-9969-bf68ad7446d8.csv
combined_class_inhibitor_data_cur_train_valid_test_random_911a709e-e6df-4f9b-8c0c-ceb98786cfe0.csv
combined_ic50_inhibitor_data_cur.csv
combined_ic50_inhibitor_data_cur_train_valid_test_random_7a1a54dc-a31b-44b0-88e6-01555194630b.csv
combined_ic50_inhibitor_data_cur_train_valid_test_random_8b0b93d9-33d4-404a-bfda-c9cd76890e30.csv
combined_ic50_inhibitor_data_cur_train_valid_test_random_dead5902-c4f1-4149-aa11-ece2207a0050.csv
combined_ic50_inhibitor_data_cur_train_valid_test_random_fab0a674-4cb5-4ef9-aa2a-edd614d823ba.csv
combined_pct_inhibitor_data_cur.csv
combined_pct_inhibitor_data_cur_train_valid_

## delete split files since you will explicitly create them in the next notebook

In [317]:
# remove every file whose name contains 'train_valid_test',
# in the curated data folder and all of its subfolders
for folder, _subfolders, filenames in os.walk(curated_data_path, topdown=False):
    split_files=[name for name in filenames if 'train_valid_test' in name]
    for name in split_files:
        os.remove(os.path.join(folder, name))

## inspect again

In [318]:
# list the folder tree again to confirm only the curated and
# featurized files are left
for folder, _subfolders, filenames in os.walk(curated_data_path):
    print('\n'+folder+'\n')
    for filename in sorted(filenames):
        print(filename)


/global/home/users/amandapaulson/data/curated_datasets

combined_class_inhibitor_data_cur.csv
combined_ic50_inhibitor_data_cur.csv
combined_pct_inhibitor_data_cur.csv
combined_substrate_data_cur.csv

/global/home/users/amandapaulson/data/curated_datasets/scaled_descriptors

combined_class_inhibitor_data_cur_with_mordred_filtered_descriptors.csv
combined_class_inhibitor_data_cur_with_rdkit_raw_descriptors.csv
combined_ic50_inhibitor_data_cur_with_mordred_filtered_descriptors.csv
combined_ic50_inhibitor_data_cur_with_rdkit_raw_descriptors.csv
combined_pct_inhibitor_data_cur_with_mordred_filtered_descriptors.csv
combined_pct_inhibitor_data_cur_with_rdkit_raw_descriptors.csv
combined_substrate_data_cur_with_mordred_filtered_descriptors.csv
combined_substrate_data_cur_with_rdkit_raw_descriptors.csv


# You're done! move on to split_shortlist notebook