# Create validation indication sets

In [1]:
import pandas

In [2]:
import json

# Read the two JSON lookup tables used below to translate DrugBank and
# DOID identifiers.
with open('dbid.json') as dbid_file:
    dbid_map = json.load(dbid_file)

with open('doid.json') as doid_file:
    doid_map = json.load(doid_file)

## Add DrugCentral novel indications

In [3]:
# Fetch the pinned DrugCentral indications table, keep only rows whose
# `category` is missing, and reduce them to compound-disease pairs carrying
# a constant status flag.
url = 'https://github.com/dhimmel/drugcentral/blob/e80a0c966a53ce48650d98069b126801c2793517/rephetio/indications.tsv?raw=true'
drugcentral_df = (
    pandas.read_table(url)
    .loc[lambda table: table['category'].isnull()]
    .rename(columns={'doid_id': 'disease_id', 'drugbank_id': 'compound_id'})
    .loc[:, ['compound_id', 'disease_id']]
    .assign(status_drugcentral=1)
)
drugcentral_df.head(2)

Unnamed: 0,compound_id,disease_id,status_drugcentral
5,DB00389,DOID:12361,1
19,DB00988,DOID:14330,1


In [4]:
# Translate identifiers through the lookup tables; an identifier with no
# entry is left unchanged.
drugcentral_df['compound_id'] = drugcentral_df['compound_id'].map(
    lambda compound: dbid_map.get(compound, compound)
)
drugcentral_df['disease_id'] = drugcentral_df['disease_id'].map(
    lambda disease: doid_map.get(disease, disease)
)

drugcentral_df.head(2)

Unnamed: 0,compound_id,disease_id,status_drugcentral
5,Q414013,DOID:12361,1
19,Q170304,Q11085,1


In [5]:
# Row count before filtering.
print(len(drugcentral_df))

# Keep only pairs where both identifiers start with 'Q', i.e. were translated.
# `na=False` treats a missing identifier as "not translated" instead of
# leaving NaN in the mask, which `.loc` cannot index with.
has_compound = drugcentral_df['compound_id'].str.startswith('Q', na=False)
has_disease = drugcentral_df['disease_id'].str.startswith('Q', na=False)

drugcentral_df = drugcentral_df.loc[has_compound & has_disease]
len(drugcentral_df)

210


208

## Add ClinicalTrials.gov novel indications

In [6]:
# Fetch the pinned ClinicalTrials.gov compound-disease table.
url = 'https://github.com/dhimmel/clintrials/blob/7c65dec7b69322ca2f8ba2b170c1b3dbd92ebff8/data/DrugBank-DO-slim.tsv?raw=true'
trial_df = pandas.read_table(url)

# Count the rows for each compound-disease pair with the built-in group size
# rather than a per-group Python `len` call, naming the count column directly.
trial_df = (
    trial_df
    .groupby(['compound_id', 'disease_id'])
    .size()
    .reset_index(name='n_trials')
)
trial_df['status_trials'] = 1
trial_df.head(2)

Unnamed: 0,compound_id,disease_id,n_trials,status_trials
0,DB00014,DOID:10283,75,1
1,DB00014,DOID:11476,2,1


In [7]:
# Translate each identifier column through its lookup table; an identifier
# with no entry is left unchanged.
for id_column, lookup in (('compound_id', dbid_map), ('disease_id', doid_map)):
    trial_df[id_column] = trial_df[id_column].map(lambda key: lookup.get(key, key))

trial_df.head(2)

Unnamed: 0,compound_id,disease_id,n_trials,status_trials
0,Q1992653,Q181257,75,1
1,Q1992653,Q165328,2,1


In [8]:
# Row count before filtering.
print(len(trial_df))

# Keep only pairs where both identifiers start with 'Q', i.e. were translated.
# `na=False` treats a missing identifier as "not translated" instead of
# leaving NaN in the mask, which `.loc` cannot index with.
has_compound = trial_df['compound_id'].str.startswith('Q', na=False)
has_disease = trial_df['disease_id'].str.startswith('Q', na=False)

trial_df = trial_df.loc[has_compound & has_disease]
len(trial_df)

6382


6291

In [10]:
# Load the pinned indications catalog in this cell so the preview does not
# depend on a later cell having been run first (on a fresh kernel `pkdb`
# would otherwise be undefined here).
url = 'https://raw.githubusercontent.com/dhimmel/indications/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv'
pkdb = pandas.read_table(url)
pkdb.head()

Unnamed: 0,doid_id,drugbank_id,disease,drug,category,n_curators,n_resources
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM,2,1
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM,1,4
2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM,1,3
3,DOID:10652,DB00989,Alzheimer's disease,Rivastigmine,DM,1,3
4,DOID:10652,DB00245,Alzheimer's disease,Benzatropine,SYM,3,1


In [12]:
# Read the pinned indications catalog and add translated identifier columns.
url = 'https://raw.githubusercontent.com/dhimmel/indications/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv'
pkdb = pandas.read_table(url)

# Mapping a Series through a dict leaves NaN wherever the key has no entry,
# so unmapped rows can be found with `dropna`.
pkdb['compound_id'] = pkdb['drugbank_id'].map(dbid_map)
pkdb['disease_id'] = pkdb['doid_id'].map(doid_map)

pkdb.head(2)

Unnamed: 0,doid_id,drugbank_id,disease,drug,category,n_curators,n_resources,compound_id,disease_id
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM,2,1,Q415081,Q11081
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM,1,4,Q412690,Q11081


In [14]:
# (rows with both translated identifiers present, total rows)
pkdb.dropna(subset=['compound_id', 'disease_id']).shape[0], pkdb.shape[0]

(1375, 1388)

In [15]:
# Write the table to a tab-separated file, omitting the row index.
pkdb.to_csv('pkdb.tsv', index=False, sep='\t')

## Apply to indications with an unknown category

In [27]:
# Load all compound-disease pairs and keep those without a category.
pair_df = pandas.read_table('./../prediction/features/compound-disease-pairs.tsv.bz2')
pair_df = pair_df[pair_df.category.isnull()]
pair_df = pair_df.drop(['category', 'status'], axis='columns')

# Name the join keys explicitly so the merges cannot silently start matching
# on some other column that happens to share a name across the frames.
pair_keys = ['compound_id', 'disease_id']
pair_df = pair_df.merge(trial_df, how='left', on=pair_keys)
pair_df = pair_df.merge(drugcentral_df, how='left', on=pair_keys)

# Pairs absent from a validation set come back as NaN after the left join;
# store them as integer zeros.
for column in 'status_drugcentral', 'status_trials', 'n_trials':
    pair_df[column] = pair_df[column].fillna(0).astype(int)

pair_df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,compound_id,compound_name,disease_id,disease_name,n_trials,status_trials,status_drugcentral
0,Q161459,"(+)-1,2-diaminocyclohexane",Q4596888,2-hydroxyglutaric aciduria,0,0,0
1,Q161459,"(+)-1,2-diaminocyclohexane",Q3335660,3-M syndrome,0,0,0


In [28]:
# Number of pairs with and without a clinical-trial status flag.
pair_df['status_trials'].value_counts()

0    36138608
1        5159
Name: status_trials, dtype: int64

In [29]:
# Number of pairs with and without a DrugCentral status flag.
pair_df['status_drugcentral'].value_counts()

0    36143646
1         121
Name: status_drugcentral, dtype: int64

In [30]:
# Write the pairs with their validation status columns to a tab-separated
# file, omitting the row index.
pair_df.to_csv('validation-statuses.tsv', index=False, sep='\t')