In [1]:
# Put the parent directory on the import path -- presumably so the project
# modules imported below (bigbind, run) resolve; verify against the repo layout.
import sys
sys.path.append('../')

# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing a cell so edits to source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [154]:
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import yaml
import pickle
import pandas as pd
from bigbind import *
from run import *

In [3]:
# Parse the YAML configuration; later cells look up settings such as
# cfg["bigbind_folder"] from the resulting object.
with open("../cfg.yaml", "r") as cfg_file:
    cfg = yaml.safe_load(cfg_file)

In [208]:
# Load the raw activity table from the configured BigBind folder.
activities_unfiltered = pd.read_csv(cfg["bigbind_folder"] + "/activities_unfiltered.csv")

# Remove mixtures, i.e. rows whose SMILES contains a ".".
# The dot is matched literally (regex=False) instead of through a "\." regex:
# in a normal (non-raw) string literal that backslash is an invalid escape
# sequence, which recent Python versions warn about. The selected rows are
# the same.
is_mixture = activities_unfiltered["canonical_smiles"].str.contains(".", regex=False)
activities = activities_unfiltered[~is_mixture].reset_index(drop=True)

# Remove anything ChEMBL thinks could be sketchy.
activities = activities.query("potential_duplicate == 0 and data_validity_comment == 'valid' and confidence_score >= 8 and standard_relation == '='")

# We don't have these values for everything after deduping, so just drop them.
activities = activities.drop(
    columns=[
        "compound_chembl_id",
        "potential_duplicate",
        "data_validity_comment",
        "confidence_score",
        "target_chembl_id",
        "target_type",
        "assay_id",
    ]
)

# Collapse repeated measurements of the same (ligand, protein) pair into a
# single row. For now, just use the median pchembl_value of the replicates.
dedupe_keys = ["canonical_smiles", "protein_accession"]
dup_indexes = activities.duplicated(keep=False, subset=dedupe_keys)
dup_df = activities[dup_indexes]


def _consensus_standard_type(types):
    """Return the standard_type shared by every row of a group, else "mixed"."""
    distinct = types.unique()
    return distinct[0] if len(distinct) == 1 else "mixed"


def _median_pchembl(values):
    """Median of a group's pchembl values; as with np.median, any NaN yields NaN."""
    return np.median(values.to_numpy())


# A single groupby pass replaces the former iterrows() loop, which built a
# pandas Series for every duplicated row just to bucket it by key.
# sort=False keeps groups in order of first appearance; dropna=False keeps
# pairs whose key is missing instead of silently discarding them.
new_data_df = (
    dup_df
    .groupby(dedupe_keys, sort=False, dropna=False)
    .agg(
        standard_type=("standard_type", _consensus_standard_type),
        pchembl_value=("pchembl_value", _median_pchembl),
    )
    .reset_index()
)

# The merged rows are reported as exact values in nM; the nM value is derived
# from the median pchembl as 10 ** (9 - pchembl).
new_data_df["standard_relation"] = "="
new_data_df["standard_value"] = 10 ** (9 - new_data_df["pchembl_value"])
new_data_df["standard_units"] = "nM"
new_data_df = new_data_df[
    [
        "canonical_smiles",
        "standard_type",
        "standard_relation",
        "standard_value",
        "standard_units",
        "pchembl_value",
        "protein_accession",
    ]
]

# Keep the rows that were never duplicated and append the merged ones.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# appended rows would reuse labels already present in `activities`.
activities = activities[~dup_indexes].reset_index(drop=True)
activities = pd.concat([activities, new_data_df], ignore_index=True)