In [2]:
import pandas as pd

In [5]:
data_dir = '../data/'

## Data Sources Analysis

### DrugBank

num node types = 6

In [37]:
drugs_df = pd.read_csv(data_dir + 'drugbank/drug_id_name_map.csv', index_col=[0])
drug_salts_df = pd.read_csv(data_dir + 'triplets/drug_salts.tsv', sep='\t', index_col=[0])
drug_salts_cas_num_df = pd.read_csv(data_dir + 'triplets/salts_cas_numebr.tsv', sep='\t', index_col=[0])
drug_subclass_df = pd.read_csv(data_dir + 'triplets/drug_subclass.tsv', sep='\t', index_col=[0])
pathway_df = pd.read_csv(data_dir + 'drugbank/pathways.csv', index_col=[0])
pathway_enzyme_df = pd.read_csv(data_dir + 'drugbank/pathways_enzym.csv', index_col=[0])

num_drugs = drugs_df.shape[0]
print(f'Number of drugs in DrugBank: {num_drugs}')

num_salts = len(drug_salts_df.salt_id.unique())
print(f'Number of salts in DrugBank: {num_salts}')

num_subclasses = len(drug_subclass_df.subclass.unique())
print(f'Number of subclasses in DrugBank: {num_subclasses}')

num_pathways = len(pathway_df.smpdb_id.unique())
print(f'Number of pathways in DrugBank: {num_pathways}')

num_pathways_cat = len(pathway_df.category.unique())
print(f'Number of pathways categories in DrugBank: {num_pathways_cat}')

num_enzymes = len(pathway_enzyme_df.enzyme_id.unique())
print(f'Number of enzymes in DrugBank: {num_enzymes}')

Number of drugs in DrugBank: 4225
Number of salts in DrugBank: 1849
Number of subclasses in DrugBank: 324
Number of pathways in DrugBank: 758
Number of pathways categories in DrugBank: 6
Number of enzymes in DrugBank: 1030


In [46]:
ddi_df = pd.read_csv(data_dir + '/triplets/ddi.tsv', sep='\t', index_col=[0])
dfi_df = pd.read_csv(data_dir + '/triplets/dfi_processed.tsv', sep='\t', index_col=[0])
herbs_df = pd.read_csv(data_dir + '/triplets/herbs-di.tsv', sep='\t', index_col=[0])

print(f'Number of DDI in DrugBank: {ddi_df.shape[0]}')
print(f'Number of DFI in DrugBank: {dfi_df.shape[0]}')
print(f'Number of herb-drug interactions in mskcc: {herbs_df.shape[0]}')

print()
print(f'Number of types of interactions in DrugBank: {len(ddi_df.interaction.unique())}')

Number of DDI in DrugBank: 1681172
Number of DFI in DrugBank: 1918
Number of herb-drug interactions in mskcc: 566

Number of types of interactions in DrugBank: 202


In [52]:
# TODO: plot - types of interactions
# TODO: ddi_df.groupby(by='drug1').count()

### FooDB

Num node types = 2

In [60]:
food_df = pd.read_csv(data_dir + 'fooDB/food.csv', index_col=[0])
food_compounds_df = pd.read_csv(data_dir + 'fooDB/compound.csv', index_col=[0])
food_content_df = pd.read_csv(data_dir + 'fooDB/content.csv', index_col=[0])

print(f'Number of foods in FooDB: {food_df.shape[0]}')
print(f'Number of food compound in FooDB: {food_compounds_df.shape[0]}')

Number of foods in FooDB: 992
Number of food compound in FooDB: 70477


In [97]:
# Number of compounds in each food

tmp = food_content_df[["food_id", "id"]].groupby(by=['food_id']).count().sort_values('id', ascending=False)
food_id2name = dict(zip(food_df.id, food_df.name))
food_ids = tmp.index
food_ids = [food_id2name.get(i, i) for i in food_ids]
food_compounds_count = tmp.id

food_compounds_counts_dict = dict(zip(food_ids, food_compounds_count))
food_compounds_counts_dict

{'Cattle (Beef, Veal)': 50385,
 'Domestic pig': 24768,
 'Breakfast cereal': 13784,
 'Baby food': 13578,
 'Soup': 11102,
 'Cheese': 8792,
 'Sausage': 6793,
 'Pastry': 5426,
 'Common bean': 5220,
 'Biscuit': 5008,
 'Corn': 4893,
 'Potato': 4821,
 'Pizza': 3752,
 'Salad dressing': 2930,
 'Cake': 2813,
 'Sauce': 2806,
 'Pasta': 2760,
 'Hamburger': 2735,
 'Snack bar': 2646,
 'Cucurbita': 2582,
 'Other candy': 2574,
 'Soy bean': 2409,
 'Other snack food': 2381,
 'Cracker': 2313,
 'Eggs': 2303,
 'Pepper': 2290,
 'Wheat bread': 2172,
 'Pudding': 2164,
 'Yogurt': 2108,
 'Vegetarian food': 2041,
 'Other dish': 1958,
 'Rice': 1949,
 'Ice cream': 1900,
 'Wheat': 1839,
 'Breakfast sandwich': 1814,
 'Milk (Cow)': 1765,
 'Apple': 1694,
 'Common pea': 1689,
 'Peanut': 1645,
 'Garden tomato': 1613,
 'Pie': 1603,
 'Other bread': 1572,
 'Other sandwich': 1358,
 'Carrot': 1297,
 'Cream': 1296,
 'White bread': 1241,
 'Allium': 1167,
 'Rye bread': 1134,
 'Apricot': 1132,
 'Peach': 1129,
 'Other beverage': 1

### MSKCC - Herbs

In [101]:
herbs_df = pd.read_csv(data_dir + 'triplets/herbs-di.tsv', sep='\t', index_col=[0])
print(f'Number of herbs from mskcc: {len(herbs_df.drug2.unique())}')

Number of herbs from mskcc: 86


### BioKG - OGB

num node types = 5

The ogbl-biokg dataset is a Knowledge Graph (KG), which we created using data from a large number of biomedical data repositories. It contains 5 types of entities: diseases (10,687 nodes), proteins (17,499), drugs (10,533 nodes), side effects (9,969 nodes), and protein functions (45,085 nodes). There are 51 types of directed relations connecting two types of entities, including 38 kinds of drug-drug interactions, 8 kinds of protein-protein interaction, as well as drug-protein, drug-side effect, function-function relations.

In [152]:
# subgraph

biokg_df = pd.read_csv(data_dir + 'triplets/biokg_subgraph.tsv', sep='\t', index_col=[0])
print(f'Number of all triplets: {biokg_df.shape[0]}')

drugs = set(biokg_df.drug1[biokg_df.drug1.str.match('DB.*')]).union(set(biokg_df.drug2[biokg_df.drug2.str.match('DB.*')]))
print(f'Number of drugs in BioKG subgraph: {len(drugs)}')

diseases = set(biokg_df.drug1[biokg_df.drug1.str.match('disease.*')]).union(set(biokg_df.drug2[biokg_df.drug2.str.match('disease.*')]))
print(f'Number of disease in BioKG subgraph: {len(diseases)}')

proteins = set(biokg_df.drug1[biokg_df.drug1.str.match('protein.*')]).union(set(biokg_df.drug2[biokg_df.drug2.str.match('protein.*')]))
print(f'Number of proteins in BioKG subgraph: {len(proteins)}')

side_effects = set(biokg_df.drug1[biokg_df.drug1.str.match('side.*')]).union(set(biokg_df.drug2[biokg_df.drug2.str.match('side.*')]))
print(f'Number of side effects in BioKG subgraph: {len(side_effects)}')

functions = set(biokg_df.drug1[biokg_df.drug1.str.match('function.*')]).union(set(biokg_df.drug2[biokg_df.drug2.str.match('function.*')]))
print(f'Number of functions in BioKG subgraph: {len(functions)}')

Number of all triplets: 1263637
Number of drugs in BioKG subgraph: 879
Number of disease in BioKG subgraph: 4193
Number of proteins in BioKG subgraph: 14282
Number of side effects in BioKG subgraph: 8777
Number of functions in BioKG subgraph: 13434


In [134]:
print(f'Number of relation types: {len(biokg_df.relation.unique())}')
tmp_df = biokg_df.groupby(by='relation').count().sort_values('drug1', ascending=False)
print("Number of occurences of each relation type:")
dict(zip(tmp_df.index, tmp_df.drug1))

Number of relation types: 50
Number of occurences of each relation type:


{'protein-function': 298590,
 'protein-protein_reaction': 145498,
 'protein-protein_catalysis': 130541,
 'protein-protein_binding': 126427,
 'drug-sideeffect': 82966,
 'drug-protein': 78499,
 'disease-protein': 36915,
 'protein-protein_activation': 31101,
 'drug-drug_cardiovascular_system_disease': 25424,
 'drug-drug_gastrointestinal_system_disease': 22588,
 'drug-drug_nervous_system_disease': 22120,
 'drug-drug_respiratory_system_disease': 21798,
 'drug-drug_hematopoietic_system_disease': 20472,
 'drug-drug_integumentary_system_disease': 19928,
 'drug-drug_urinary_system_disease': 18072,
 'drug-drug_acquired_metabolic_disease': 17932,
 'drug-drug_musculoskeletal_system_disease': 16886,
 'drug-drug_endocrine_system_disease': 15902,
 'drug-drug_cancer': 13678,
 'protein-protein_inhibition': 12149,
 'drug-drug_cognitive_disorder': 10910,
 'drug-drug_viral_infectious_disease': 10644,
 'drug-drug_inherited_metabolic_disorder': 10550,
 'drug-drug_immune_system_disease': 9696,
 'drug-drug_fu

### Hetionet

node types description - https://github.com/hetio/hetionet/blob/main/describe/nodes/metanodes.tsv

relations description - https://github.com/hetio/hetionet/blob/main/describe/edges/metaedges.tsv

In [151]:
hetionet_df = pd.read_csv(data_dir + 'triplets/hetionet.tsv', sep='\t', index_col=[0])
print(f'Number of all triplets: {hetionet_df.shape[0]}')

node_types = ["DB", "Gene", "Anatomy", "Biological Process", "Cellular Component", 
              "Disease", "Pathway", "Pharmacologic Class", "Side Effect", "Symptom"]

for node in node_types:
    count = len(set(hetionet_df['head'][hetionet_df['head'].str.match(node + '.*')]).union(set(hetionet_df['tail'][hetionet_df['tail'].str.match(node + '.*')])))
    print(f'Number of {node} in BioKG subgraph: {count}')

Number of all triplets: 1221982
Number of DB in BioKG subgraph: 1534
Number of Gene in BioKG subgraph: 16945
Number of Anatomy in BioKG subgraph: 389
Number of Biological Process in BioKG subgraph: 10310
Number of Cellular Component in BioKG subgraph: 1241
Number of Disease in BioKG subgraph: 135
Number of Pathway in BioKG subgraph: 1775
Number of Pharmacologic Class in BioKG subgraph: 263
Number of Side Effect in BioKG subgraph: 5311
Number of Symptom in BioKG subgraph: 407


In [150]:
print(f'Number of relation types: {len(hetionet_df.relation.unique())}')
tmp_df = hetionet_df.groupby(by='relation').count().sort_values('head', ascending=False)
print("Number of occurences of each relation type:")
dict(zip(tmp_df.index, tmp_df['head']))

Number of relation types: 24
Number of occurences of each relation type:


{'Gr>G': 259502,
 'GpBP': 232224,
 'AeG': 201805,
 'CcSE': 138489,
 'GiG': 100384,
 'GpPW': 39931,
 'GpMF': 38385,
 'AuG': 33126,
 'AdG': 32964,
 'GpCC': 28460,
 'GcG': 26489,
 'CdG': 20874,
 'CuG': 18435,
 'DaG': 11681,
 'CbG': 11163,
 'DuG': 7115,
 'DdG': 7047,
 'CrC': 6287,
 'DlA': 2595,
 'DpS': 2504,
 'PCiC': 931,
 'CtD': 748,
 'DrD': 454,
 'CpD': 389}

## Datasets Analysis

### Drugbank (full)

In [None]:
drugbank_df = pd.read_csv(data_dir + 'triplets/train_drugbank.tsv') 

### DrugBank + BioKG subgraph

### DrugBank + Hetionet subgraph

## Detailed Analysis of One Drug

## Other

### Common drugs in DrugBank

### Food compounds in drugs

### Drug salts cas-number vs food comounds cas-number