In [5]:
import pandas as pd
import numpy as np

## Loading the data

In [2]:
# Point `data_folder` at the directory that holds T2.csv, T3.csv and T4.csv
# (can be downloaded from here: https://app.box.com/folder/114032352224 )
data_folder = '../../../public-data-preparation/processing/deliverables/regression'

# Read the three tables in one pass; each file is named after its table.
T2, T3, T4 = (
    pd.read_csv(f'{data_folder}/{table_name}.csv')
    for table_name in ('T2', 'T3', 'T4')
)

## Creating new inputs

### T0:

- `use_in_regression` column added and set to `True` for all assays
- `direction` column added and set to `high` for all ADME assays (left as NaN for all other assay types)
- Five `expert_threshold_{i}` columns (`i` = 1…5) added, all set to NaN

In [27]:
# Preview the first rows of T3.
# NOTE(review): the saved output already lists `use_in_regression`, `direction`
# and the `expert_threshold_*` columns, which are assigned in the following
# cell; the execution counts are out of order, so a fresh top-to-bottom run
# may show a different set of columns here.
T3.head()

Unnamed: 0,input_assay_id,regression_task_id,assay_type,target_id,use_in_regression,direction,expert_threshold_1,expert_threshold_2,expert_threshold_3,expert_threshold_4,expert_threshold_5
0,729459,0,ADME,81020,True,high,,,,,
1,942987,1,ADME,81135,True,high,,,,,
2,1617702,2,PANEL,22221,True,,,,,,
3,1678646,3,ADME,102723,True,high,,,,,
4,305428,4,ADME,12594,True,high,,,,,


In [8]:
# Every assay takes part in the regression.
T3['use_in_regression'] = True

# Only ADME assays receive a direction; rows of any other assay type are not
# touched by this assignment.
adme_rows = T3['assay_type'] == 'ADME'
T3.loc[adme_rows, 'direction'] = 'high'

# Five placeholder threshold columns, all missing.
threshold_columns = [f'expert_threshold_{i}' for i in range(1, 6)]
for column in threshold_columns:
    T3[column] = np.nan

In [9]:
# T0 keeps the assay-level annotation columns of T3, in this fixed order.
# NOTE(review): `input_assay_id` is not carried over, so rows are identified
# only by their index — confirm this is the intended T0 layout.
t0_columns = (
    ['assay_type', 'use_in_regression']
    + [f'expert_threshold_{i}' for i in range(1, 6)]
    + ['direction']
)
T0 = T3[t0_columns]

In [37]:
# Print the dimensions and column index of T0, then display its first rows.
for summary in (T0.shape, T0.columns):
    print(summary)
T0.head()

(3262, 8)
Index(['assay_type', 'use_in_regression', 'expert_threshold_1',
       'expert_threshold_2', 'expert_threshold_3', 'expert_threshold_4',
       'expert_threshold_5', 'direction'],
      dtype='object')


Unnamed: 0,assay_type,use_in_regression,expert_threshold_1,expert_threshold_2,expert_threshold_3,expert_threshold_4,expert_threshold_5,direction
0,ADME,True,,,,,,high
1,ADME,True,,,,,,high
2,PANEL,True,,,,,,
3,ADME,True,,,,,,high
4,ADME,True,,,,,,high


### T1

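- All values with unit nM are converted to log space (pIC50/pEC50, i.e. $-\log_{10}$ of the molar concentration); values in other units are left unchanged
- The qualifiers of the converted values are inverted (e.g. `>` becomes `<`), since taking the negative log reverses the ordering
- `input_assay_id` is obtained by joining T4 with T3 on `regression_task_id`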

In [10]:
# Preview the first rows of T4 (one measurement per row: compound, task,
# value, relation, units and type, per the displayed columns).
T4.head()

Unnamed: 0,input_compound_id,regression_task_id,standard_value,standard_relation,standard_units,standard_type
0,1075529,0,25000.0,>,nM,EC50
1,1203610,1,67000.0,=,nM,IC50
2,1203845,1,100000.0,>,nM,IC50
3,1203966,1,62000.0,=,nM,IC50
4,2176418,2,60000.0,=,nM,EC50


In [12]:
# Count how many measurements are reported in each unit; only the nM rows are
# converted to log scale further down.
T4['standard_units'].value_counts()

nM         1568430
ug.mL-1       5925
Name: standard_units, dtype: int64

In [14]:
def convert_to_log_scale(x, epsilon=1e-16):
    """Return the negative base-10 logarithm of ``x`` scaled by 1e-9.

    The input is multiplied by 1e-9 (nanomolar to molar when ``x`` is in nM)
    and ``epsilon`` is added before the logarithm is taken, so that an input
    of zero yields a finite value instead of infinity.

    Parameters
    ----------
    x : float or array-like
        Value(s) to convert. Anything ``np.log10`` accepts works, including
        NumPy arrays and pandas Series (evaluated element-wise).
    epsilon : float, optional
        Small offset added to the scaled value before taking the logarithm.

    Returns
    -------
    Same shape as ``x``
        ``-log10(1e-9 * x + epsilon)``.
    """
    scaled = 1e-9*x + epsilon
    return -np.log10(scaled)

# Convert the nM measurements to log scale. `convert_to_log_scale` is built
# from NumPy ufuncs, so it is called once on the whole Series instead of once
# per row through `.apply`, and the row mask is computed a single time.
#
# NOTE: this overwrites `standard_value` in place while `standard_units` still
# reads 'nM', so running this cell a second time would transform the already
# converted values again. Reload T4 before re-running.
nM_rows = T4['standard_units'] == 'nM'
T4.loc[nM_rows, 'standard_value'] = convert_to_log_scale(T4.loc[nM_rows, 'standard_value'])


In [15]:
# List every relation symbol present in T4 with its frequency; each of them
# needs an entry in the qualifier mapping defined below.
T4['standard_relation'].value_counts()

=     1479878
>      115829
<        5951
>=        547
<=        158
~          12
>>          5
Name: standard_relation, dtype: int64

In [24]:
# Mapping from each relation symbol to its mirror image. Equality-like
# symbols map to themselves.
qualifier_conversion = {
    '=': '=',
    '~': '~',
    '<': '>',
    '>': '<',
    '<=': '>=',
    '>=': '<=',
    '<<': '>>',
    '>>': '<<',
}

# Rows in nM get the mirrored relation; every other row keeps its original
# relation unchanged. A relation missing from the mapping becomes NaN on nM rows.
is_nM = T4['standard_units'] == 'nM'
mirrored_relation = T4['standard_relation'].map(qualifier_conversion)
T4['standard_qualifier'] = T4['standard_relation'].where(~is_nM, mirrored_relation)


In [28]:
# Build a regression_task_id -> input_assay_id lookup from T3 and attach it to
# every T4 row through the shared `regression_task_id` key.
assay_id_by_task = T3.set_index('regression_task_id')['input_assay_id']
T4_join = T4.join(assay_id_by_task, on='regression_task_id')

In [29]:
# Check the dimensions of the joined frame.
# NOTE(review): the row count should equal len(T4) provided that
# `regression_task_id` is unique in T3 — TODO confirm.
T4_join.shape

(1606640, 8)

In [32]:
# T1 keeps one row per measurement: the compound, the assay, the qualifier
# and the value.
t1_columns = [
    'input_compound_id',
    'input_assay_id',
    'standard_qualifier',
    'standard_value',
]
T1 = T4_join[t1_columns]

In [34]:
# Print the dimensions and column index of T1, then display its first rows.
for summary in (T1.shape, T1.columns):
    print(summary)
T1.head()

(1606640, 4)
Index(['input_compound_id', 'input_assay_id', 'standard_qualifier',
       'standard_value'],
      dtype='object')


Unnamed: 0,input_compound_id,input_assay_id,standard_qualifier,standard_value
1425163,176394,517,<,6.0
1421980,176083,517,=,6.0
1421713,175944,517,=,6.09691
1427265,175970,517,=,6.869666
1427266,176251,517,=,8.69897
