# A & T Assays

In [1]:
from pandas import read_csv, isna

In [2]:
from assay_processing import delete_assays

In [3]:
# CSV extract that the next cell loads.
file_path = 'deliverables/chembl_extract.csv'
# Rows are kept only when their `standard_type` is one of these.
types_to_keep = [
    'IC50',
    'EC50',
    'Ki',
    'Kd',
    'Potency',
    'AC50',
]
# Assays with fewer non-missing `standard_value` entries than this are dropped.
min_values_per_assay = 50

In [4]:
# Load the extract, then keep only rows that are both an 'A'/'T' assay type
# and one of the configured measurement types.
df = read_csv(file_path, low_memory=False)
is_at_assay = df['assay_type'].isin(['A', 'T'])
has_kept_type = df['standard_type'].isin(types_to_keep)
df = df.loc[is_at_assay & has_kept_type]

---
## Only keep assays with at least one pchembl value

In [5]:
# Distinct assay ids, in order of first appearance in the frame.
assays = list(df['assay_id'].unique())

In [6]:
# Count the non-missing pchembl values of every assay in one grouped pass,
# rather than re-filtering the whole frame once per assay.
pchembl_counts = df.groupby('assay_id')['pchembl_value'].count().to_dict()
# An assay is flagged when none of its rows carries a pchembl value.
# Iterating over `assays` keeps the original order and element types.
assay_ids_to_drop = [
    assay for assay in assays
    if pchembl_counts.get(assay) == 0
]

In [7]:
# Number of distinct assays before this filtering step.
print(f'Before: {len(assays)}')

Before: 11520


In [8]:
# Number of assays flagged for removal because they have no pchembl value.
print(f'Bad: {len(assay_ids_to_drop)}')

Bad: 4586


In [9]:
# Replace the working frame with the result of removing the flagged assays.
# NOTE(review): delete_assays is imported from assay_processing and not shown
# here - presumably it returns df without the rows of the listed assay ids; confirm.
df = delete_assays(assay_ids_to_drop, df)

### 6934 remaining assays

---
## Only keep assays with at least 50 data points

In [10]:
# Distinct assay ids still present after the previous filtering step.
assays = list(df['assay_id'].unique())

In [11]:
# Count the non-missing standard values of every assay in one grouped pass,
# rather than re-filtering the whole frame once per assay.
standard_value_counts = df.groupby('assay_id')['standard_value'].count().to_dict()
# An assay is flagged when it has fewer than `min_values_per_assay` usable
# data points. Ids absent from the grouped counts are treated as having none.
assay_ids_to_drop = [
    assay for assay in assays
    if standard_value_counts.get(assay, 0) < min_values_per_assay
]

In [12]:
# Number of distinct assays before the minimum-data-points filter.
print(f'Before: {len(assays)}')

Before: 6934


In [13]:
# Number of assays flagged for having too few non-missing standard values.
print(f'Bad: {len(assay_ids_to_drop)}')

Bad: 6814


In [14]:
# Replace the working frame with the result of removing the flagged assays.
# NOTE(review): delete_assays is imported from assay_processing and not shown
# here - presumably it returns df without the rows of the listed assay ids; confirm.
df = delete_assays(assay_ids_to_drop, df)

### 120 remaining assays

---
## Unit filtering

In [15]:
# Sanity check: 'nM' must be the only value left in `standard_units`.
# The message reports what was actually found so a failure is diagnosable.
found_units = list(df.standard_units.unique())
assert found_units == ['nM'], f"expected only 'nM' in standard_units, found {found_units}"

---
## Export

In [16]:
from pickle import dump
# One record per remaining assay; both keys carry the same assay id.
reference = [
    dict(assay_id=assay_id, input_assay_id=assay_id)
    for assay_id in df['assay_id'].unique()
]
# Write the record list to disk as a pickle.
with open('reference_at.pkl', mode='wb') as f:
    dump(reference, f)

In [16]:
# Write the filtered frame to CSV without the index column.
df.to_csv(path_or_buf='data/admet_tox.csv', index=False)