In [5]:
import polaris as po
import datamol as mo
from splito.simpd import SIMPDSplitter
from pathlib import Path
import pandas as pd
import numpy as np

# Load data and save
For some reason, retrieving the cached data doesn't work, so we save the data to a CSV the old-school way and reload it from there on later runs.

In [3]:
# Download the competition training set once and keep a local CSV copy;
# subsequent runs read the CSV instead of going back to Polaris.
challenge = "antiviral-admet-2025"
datadir = Path('../admet/data')
datadir.mkdir(exist_ok=True, parents=True)
data_fname = datadir.joinpath('train_data.csv')

# `exists` has to be *called*: the bare bound method is always truthy, which
# would send a fresh checkout straight into read_csv on a missing file.
if data_fname.exists():
    data = pd.read_csv(data_fname)
else:
    competition = po.load_competition(f"asap-discovery/{challenge}")
    # NOTE: `test` is only defined when this branch runs, i.e. when no cached CSV exists.
    train, test = competition.get_train_test_split()
    data = train.as_dataframe()
    # Write to the same path that the cache check reads from.
    data.to_csv(data_fname, index=False)

# Splitting algorithm

## Classify target values

The molecules need to be classified as active (good) / inactive (bad). In the paper they use a cutoff for potency which gives a 40/60 active/inactive split (I think!). I think this is just to ensure we have an approximately fixed fraction of good and bad molecules in the training and test sets.

In general, the ideal drug will have:

1. High solubility -> high KSOL.
2. High CNS penetrance -> high MDR1-MDCKII.
3. Long half life -> low MLM and HLM.
4. Low lipophilicity -> low LogD.

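So let's rank the compounds on each property (so that a larger rank is always better), leaving missing values as NaN, and then take each compound's mean rank over the properties it actually has measurements for. Then we'll choose a mean-rank cutoff which gives a 60:40 inactive/active split.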

If you're doing this on potency, just select an IC50 value which you think is good (20 nM is the minimal value from the Target Candidate Profiles).

In [None]:
# Rank each endpoint so that a larger rank is always the more desirable value.
# Maps endpoint column -> True when a larger raw value is better (rank ascending),
# False when a smaller raw value is better (rank descending).
rank_ascending = {
    'KSOL': True,
    'LogD': False,
    'MLM': False,
    'HLM': False,
    'MDR1-MDCKII': True,
}

# na_option='keep' leaves missing measurements as NaN instead of ranking them.
for endpoint, higher_is_better in rank_ascending.items():
    data[f'{endpoint}_rank'] = data[endpoint].rank(na_option='keep', ascending=higher_is_better)

# Average over an explicit column list rather than a 'rank' regex: the regex
# also matches 'mean_rank', so re-running the cell would fold the previous
# mean back into the new one. skipna=True ignores missing endpoints; a row
# with no measurements at all gets NaN.
rank_cols = [f'{endpoint}_rank' for endpoint in rank_ascending]
data['mean_rank'] = data[rank_cols].mean(axis=1, skipna=True)

Let's find the rank that gives 40% 'active': 

In [23]:
# Scan candidate mean-rank cutoffs and print the share of compounds whose
# mean rank lies strictly above each one (i.e. would be labelled active).
for c in range(190, 200):
    frac_active = (data['mean_rank'] > c).mean()
    print(f"{c}: fraction active: {frac_active: 4.1%}")

190: fraction active:  43.3%
191: fraction active:  42.4%
192: fraction active:  42.4%
193: fraction active:  41.7%
194: fraction active:  41.2%
195: fraction active:  40.3%
196: fraction active:  40.1%
197: fraction active:  38.9%
198: fraction active:  38.5%
199: fraction active:  38.5%


In [24]:
# 195 is the cutoff from the scan whose active fraction (40.3%) is closest to
# the 40% target. Rows with a NaN mean rank compare False, so they are
# labelled inactive.
rank_cutoff = 195
data['is_active'] = data['mean_rank'].gt(rank_cutoff)