In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

from paths import path_out_data, path_data

In [2]:
#read in the DTC data and test data
file_DTC = "DtcDrugTargetInteractions.csv"
file_test = "round_1_template.csv"
columns_related = ['standard_inchi_key', 'target_id', 'standard_type', 
                    'standard_value', 'standard_units']

DTC = pd.read_csv("%s/%s"%(path_data,file_DTC),usecols= columns_related)
data_test = pd.read_csv("%s/%s"%(path_data,file_test))

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# slice the rows that have a kd measured in "NM" unit
#the merge replicate by taking the average
mask_KD = DTC.standard_type.apply(lambda x: x in set(["Kd", "KD","KDAPP"]))
mask_KD_value = ~ DTC.standard_value.isna()
mask_KD_unit = DTC.standard_units == "NM"

mask_KD = mask_KD & mask_KD_value  & mask_KD_unit

DTC_kd = DTC[mask_KD].groupby(by=["standard_inchi_key","target_id"])['standard_value'].mean().reset_index()

In [4]:
DTC_kd.head()

Unnamed: 0,standard_inchi_key,target_id,standard_value
0,AAAKPBKWFMPTSQ-UHFFFAOYSA-N,P00374,2077.372727
1,AAAQFGUYHFJNHI-SFHVURJKSA-N,O60885,68.533333
2,AAAQFGUYHFJNHI-SFHVURJKSA-N,P25440,116.116667
3,AAAQFGUYHFJNHI-SFHVURJKSA-N,Q15059,60.225
4,AAAQFGUYHFJNHI-SFHVURJKSA-N,Q6PL18,10000.0


In [5]:
# find all the measured bioacivity 
type_counts = DTC.standard_type.value_counts()
terms = type_counts[type_counts > DTC_kd.shape[0]].index[:-1] # the last one is KD

In [6]:
#merge other bioactivity with kd by the standard_inchi_ky and target_id
pbar = tqdm(enumerate(terms), total = len(terms))

for i, term in pbar:
    if term not in set(['Kd','KD','KDAPP']):
       mask = DTC.standard_type.apply(lambda x: x==term)
       DTC_other  = DTC[mask].groupby(by=["standard_inchi_key","target_id"])['standard_value'].mean().reset_index()
       DTC_kd = pd.merge(DTC_kd,DTC_other, how = 'left', left_on= ["standard_inchi_key","target_id"],
                 right_on = ["standard_inchi_key","target_id"])


100%|██████████| 7/7 [00:08<00:00,  1.10s/it]


In [7]:
DTC_kd.head()

Unnamed: 0,standard_inchi_key,target_id,standard_value_x,standard_value_y,standard_value_x.1,standard_value_y.1,standard_value_x.2,standard_value_y.2,standard_value_x.3,standard_value_y.3
0,AAAKPBKWFMPTSQ-UHFFFAOYSA-N,P00374,2077.372727,,,,,,,
1,AAAQFGUYHFJNHI-SFHVURJKSA-N,O60885,68.533333,,232.273846,35.55,,,,
2,AAAQFGUYHFJNHI-SFHVURJKSA-N,P25440,116.116667,,363.0,53.0,,,,
3,AAAQFGUYHFJNHI-SFHVURJKSA-N,Q15059,60.225,,206.96,42.1,,,,
4,AAAQFGUYHFJNHI-SFHVURJKSA-N,Q6PL18,10000.0,,,,,,,


In [8]:
DTC_kd.columns =  ['standard_inchi_key', 'target_id',"Kd"] + list(terms)

In [9]:
# check the completeness of the each column
ratios = DTC_kd.isnull().sum(axis = 0)/float(DTC_kd.shape[0])
ratios = list(ratios)
ratios.sort()
ratios

[0.0,
 0.0,
 0.0,
 0.8640935450137845,
 0.8858446620401179,
 0.9186234432930886,
 0.9761003897708908,
 0.9810818518870615,
 0.9979085464397757,
 0.9996767753588744]

In [10]:
#save kd only as other bioactivity is missing for most reacords (more than 86%)
DTC_kd.iloc[:,:3].to_csv(path_out_data+"/DTC_train.csv",index = False)

In [11]:
# a helper function to check the scenario for each record in the testing

k_u_d = set(DTC_kd.standard_inchi_key.unique()) # k_u_d: key_unique_DTC
p_u_d = set(DTC_kd.target_id.unique())          #p_u_d: protein_unique_DTC

def test_scenario(x):
    b1 = x.Compound_InchiKeys in k_u_d # b1: boolean 1
    b2 = x.UniProt_Id in p_u_d
    if b1:
        if b2:
            return 1
        else:
            return 3
    else:
        if b2:
            return 2
        else:
            return 4

In [12]:
data_test["scenario"] = data_test.apply(test_scenario, axis = 1)

In [13]:
data_test.head()

Unnamed: 0,Compound_SMILES,Compound_InchiKeys,Compound_Name,UniProt_Id,Entrez_Gene_Symbol,DiscoveRx_Gene_Symbol,scenario
0,CS(=O)(=O)Nc1cccc(c1)-c1ccc2c(NC(=O)C3CC3)n[nH...,FUQAHBLEGDXXKS-UHFFFAOYSA-N,SB-742864,Q2M2I8,AAK1,AAK1,2
1,CC(C)OC(N(CC1)CCN1C2=NC3=C(C4=CN=C(C(F)(F)F)C=...,NDDGTHBFQLPYJL-UHFFFAOYSA-N,NK-92,Q2M2I8,AAK1,AAK1,2
2,O=S(NC1=CC(C2=CC=C(C(NC(C3CC3)=O)=NN4)C4=C2)=C...,UCBIQZUJJSVQHL-UHFFFAOYSA-N,UNC-AA-1-0013,Q2M2I8,AAK1,AAK1,2
3,O=C(C1CC1)NC2=NNC3=C2C=CC(C4=CC=CC(NS(N(C)CC)(...,NKCMEGZSWOVKIL-UHFFFAOYSA-N,UNC-AA-1-0017,Q2M2I8,AAK1,AAK1,2
4,[nH]1c2c(c(c1)C(=O)NC)cc(cc2)Nc3nc(ccn3)-c4ncccc4,YNYMPWKHPFKBTO-UHFFFAOYSA-N,BI01078627,P00519,ABL1,ABL1-nonphosphorylated,2


In [14]:
data_test.scenario.value_counts()

2    404
1     17
4      7
3      2
Name: scenario, dtype: int64