# Binding (B) & Functional (F) Assays

In [1]:
from pandas import read_csv, isna

In [2]:
from assay_processing import delete_assays
from assay_fusion import cluster_assays, merge_assays

In [3]:
# CSV file loaded by the data-loading cell.
file_path = 'deliverables/chembl_extract.csv'
# Values of `standard_type` that are retained; rows of any other type are dropped.
types_to_keep = ['IC50', 'EC50', 'Ki', 'Kd', 'Potency', 'AC50']
# Minimum number of non-null `standard_value` entries an assay must have to be kept.
min_values_per_assay = 50

In [4]:
# Load the extract, then keep only rows that are both a binding/functional
# assay (assay_type 'B' or 'F') and one of the configured standard types.
raw = read_csv(file_path, low_memory=False)
is_bf_assay = raw['assay_type'].isin(['B', 'F'])
is_kept_type = raw['standard_type'].isin(types_to_keep)
df = raw.loc[is_bf_assay & is_kept_type]

---
## Only keep assays with at least one pchembl value

In [5]:
# Distinct assay ids present after the type filters, in order of first appearance.
assays = df['assay_id'].unique()

In [6]:
# Flag assays in which no row carries a pchembl value.
# GroupBy.count() tallies the non-null entries per assay in a single pass,
# replacing a full-frame boolean scan for every assay id. sort=False keeps
# the ids in order of first appearance, as df.assay_id.unique() does.
pchembl_counts = df.groupby('assay_id', sort=False)['pchembl_value'].count()
assay_ids_to_drop = pchembl_counts.index[pchembl_counts == 0].tolist()

In [7]:
# Number of distinct assays before removing those without any pchembl value.
print(f'Before: {len(assays)}')

Before: 2997


In [8]:
# Number of assays flagged because every pchembl value is missing.
print(f'Bad: {len(assay_ids_to_drop)}')

Bad: 0


In [9]:
# Remove the flagged assays. delete_assays is imported from assay_processing;
# presumably it drops every row whose assay_id is in the list - confirm there.
df = delete_assays(assay_ids_to_drop, df)

### 2997 remaining assays

---
## Statistical Assay Fusion

In [10]:
%%time
# Group assays for fusion (cluster_assays comes from assay_fusion).
# NOTE(review): the recorded run took about 1h 22min wall time, so a full
# re-run of the notebook is expensive - consider caching assay_groups.
assay_groups = cluster_assays(df)

CPU times: user 2min 35s, sys: 6.66 s, total: 2min 42s
Wall time: 1h 22min 1s


In [11]:
from pickle import dump

# Build one record per assay, pairing it with the smallest assay id of the
# group it belongs to, then persist the records for later lookup.
reference = []
for cluster in assay_groups:
    representative = min(cluster)
    for member in cluster:
        reference.append({'assay_id': member, 'input_assay_id': representative})

with open('reference_bf.pkl', 'wb') as handle:
    dump(reference, handle)

In [20]:
# Apply the grouping to the frame (merge_assays comes from assay_fusion).
# NOTE(review): judging by the warning recorded below, it rewrites assay_id
# to the minimum id of each group - confirm in assay_fusion.
df = merge_assays(df, assay_groups)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['assay_id'][df_['assay_id'].isin(batch)] = batch.min()


In [21]:
# Number of distinct assay ids left after merging.
len(df['assay_id'].unique())

2979

---
## Only keep assays with at least 50 data points

In [22]:
# Distinct assay ids after fusion, in order of first appearance.
assays = df['assay_id'].unique()

In [23]:
# Flag assays with fewer than `min_values_per_assay` non-null standard values.
# GroupBy.count() tallies the non-null entries per assay in a single pass,
# replacing a full-frame boolean scan for every assay id. sort=False keeps
# the ids in order of first appearance, as df.assay_id.unique() does.
# Rows with a missing assay_id form no group and are therefore never listed.
value_counts = df.groupby('assay_id', sort=False)['standard_value'].count()
assay_ids_to_drop = value_counts.index[value_counts < min_values_per_assay].tolist()

In [24]:
# Number of distinct assays before applying the minimum data-point threshold.
print(f'Before: {len(assays)}')

Before: 2979


In [25]:
# Number of assays flagged for having too few non-null standard values.
print(f'Bad: {len(assay_ids_to_drop)}')

Bad: 0


In [17]:
# Remove the flagged assays. delete_assays is imported from assay_processing;
# presumably it drops every row whose assay_id is in the list - confirm there.
df = delete_assays(assay_ids_to_drop, df)

### 2979 remaining assays

---
## Unit filtering

In [26]:
# Keep only measurements reported in nanomolar.
# NOTE(review): this runs after the minimum data-point filter, so an assay can
# end up with fewer than min_values_per_assay rows here - confirm intended.
in_nanomolar = df['standard_units'] == 'nM'
df = df.loc[in_nanomolar]

---
## Export

In [28]:
# Write the filtered, fused table to disk without the pandas row index.
df.to_csv('data/binding_functional.csv', index=False)