## DrugBank Data Exploration 

In [1]:
import pandas as pd

In [2]:
# Directory holding the DrugBank CSV files. The trailing slash is required:
# file names are appended to this value with plain string concatenation.
drug_dir = 'data/drugbank/'

#### Drug IDs with names

In [3]:
# DrugBank id -> drug name mapping; the first CSV column becomes the row index.
drugs = pd.read_csv(
    drug_dir + 'drug_id_name_map.csv',
    index_col=[0],
)
drugs.head(n=5)

Unnamed: 0,id,drug_name
1,DB00001,Lepirudin
2,DB00002,Cetuximab
3,DB00003,Dornase alfa
4,DB00004,Denileukin diftitox
5,DB00005,Etanercept


In [9]:
# (number of rows, number of columns) of the id/name table.
drugs.shape

(14594, 2)

In [4]:
# Number of entries in the `id` column (one per row; duplicates, if any, are
# counted too).
num_drugs = drugs['id'].size
print(f'Number of drugs in dataset is {num_drugs}')

Number of drugs in dataset is 14594


#### Drug categories

In [5]:
# mesh_id: the Medical Subject Headings (MeSH) identifier for the category.
# Rows containing any missing value are dropped right after loading.
categories = (
    pd.read_csv(drug_dir + 'drug_category.csv', index_col=[0])
    .dropna()
)
categories.head(n=5)

Unnamed: 0,mesh_id,category
1,D000602,"Amino Acids, Peptides, and Proteins"
2,D000925,Anticoagulants
3,D058833,Antithrombin Proteins
4,D000991,Antithrombins
6,D002317,Cardiovascular Agents


In [6]:
# Per category name, count the non-null values in each remaining column.
categories.groupby(by='category').count()

Unnamed: 0_level_0,mesh_id
category,Unnamed: 1_level_1
11-Hydroxycorticosteroids,16
14-alpha Demethylase Inhibitors,8
17-Hydroxycorticosteroids,17
17-Ketosteroids,17
2-Chloroadenosine,1
...,...
meta-Aminobenzoates,3
ortho-Aminobenzoates,8
p38 Mitogen-Activated Protein Kinases,1
para-Aminobenzoates,19


#### Drug classification

In [4]:
# Chemical taxonomy per drug; the first CSV column becomes the row index.
drug_class = pd.read_csv(
    drug_dir + 'drug_classification.csv',
    index_col=[0],
)

In [4]:
# Preview the first five rows of the classification table.
drug_class.head()

Unnamed: 0,id,kingdom,superclass,class,subclass,substituent
1,DB00001,Organic Compounds,Organic Acids,Carboxylic Acids and Derivatives,"Amino Acids, Peptides, and Analogues",
2,DB00002,Organic Compounds,Organic Acids,Carboxylic Acids and Derivatives,"Amino Acids, Peptides, and Analogues",
3,DB00003,Organic Compounds,Organic Acids,Carboxylic Acids and Derivatives,"Amino Acids, Peptides, and Analogues",
4,DB00004,Organic Compounds,Organic Acids,Carboxylic Acids and Derivatives,"Amino Acids, Peptides, and Analogues",
5,DB00005,Organic Compounds,Organic Acids,Carboxylic Acids and Derivatives,"Amino Acids, Peptides, and Analogues",


In [7]:
# Distinct values of the `class` column, as an (unordered) Python set.
{*drug_class['class']}

{"(3'->5')-dinucleotides and analogues",
 "(5'->5')-dinucleotides",
 "2',3'-dideoxy-3'-thionucleoside monophosphates",
 "2',5'-dideoxyribonucleosides",
 '2-arylbenzofuran flavonoids',
 "5'-deoxyribonucleosides",
 '6,7-benzomorphans',
 'Acyl halides',
 'Ajmaline-sarpagine alkaloids',
 'Alkali metal organides',
 'Alkali metal oxoanionic compounds',
 'Alkali metal salts',
 'Alkaline earth metal organides',
 'Alkaline earth metal oxoanionic compounds',
 'Alkaline earth metal salts',
 'Alkyl halides',
 'Allocolchicine alkaloids',
 'Allyl-type 1,3-dipolar organic compounds',
 'Amaryllidaceae alkaloids',
 'Anthracenes',
 'Anthracyclines',
 'Aporphines',
 'Aryl halides',
 'Aryltetralin lignans',
 'Aurone flavonoids',
 'Azaspirodecane derivatives',
 'Azepanes',
 'Azepines',
 'Azetidines',
 'Azobenzenes',
 'Azoles',
 'Azolidines',
 'Azolines',
 'Benzazepines',
 'Benzene and substituted derivatives',
 'Benzimidazole ribonucleosides and ribonucleotides',
 'Benzimidazoles',
 'Benzisoxazoles',
 'Ben

In [8]:
# Per subclass, count the non-null values in each remaining column.
drug_class.groupby(by='subclass').count()

Unnamed: 0_level_0,id,kingdom,superclass,class,substituent
subclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
(3'->5')-dinucleotides,5,5,5,5,5
"1,2-oxazinanes",1,1,1,1,1
"1,3,5-triazinanes",2,2,2,2,2
"1,3,5-triazines",2,2,2,2,2
"1,3-diazepanes",1,1,1,1,1
...,...,...,...,...,...
Vinyl bromides,1,1,1,1,1
Vinyl chlorides,1,1,1,1,1
Vitamin D and derivatives,19,19,19,19,19
Xylenes,12,12,12,12,12


#### Drug interactions

In [8]:
# Drug-drug interaction records; the first CSV column becomes the row index.
interactions = pd.read_csv(
    drug_dir + 'drug_interactions.csv',
    index_col=[0],
)

In [9]:
# Preview the first five interaction rows (long descriptions are truncated
# in the rendered table).
interactions.head()

Unnamed: 0,id,interacted_drug_name,description
1,DB06605,Apixaban,Apixaban may increase the anticoagulant activi...
2,DB06695,Dabigatran etexilate,Dabigatran etexilate may increase the anticoag...
3,DB01254,Dasatinib,The risk or severity of bleeding and hemorrhag...
4,DB01609,Deferasirox,The risk or severity of gastrointestinal bleed...
5,DB01586,Ursodeoxycholic acid,The risk or severity of bleeding and bruising ...


In [17]:
# Print one interaction description in full, since the table preview
# truncates it. `.loc` makes explicit that 3 is an index *label* (the index is
# read from the CSV's first column), not a row position, and avoids relying on
# attribute-style column access.
print(interactions.loc[3, 'description'])

The risk or severity of bleeding and hemorrhage can be increased when Dasatinib is combined with Lepirudin.


In [3]:
# Drug-food interaction notes; the first CSV column becomes the row index.
food_interactions = pd.read_csv(
    drug_dir + 'drug_food_interactions.csv',
    index_col=[0],
)

In [4]:
# Preview the first five food-interaction rows.
food_interactions.head()

Unnamed: 0,id,description
1,DB00001,Avoid herbs and supplements with anticoagulant...
2,DB00006,Avoid echinacea.
3,DB00006,Avoid herbs and supplements with anticoagulant...
4,DB00008,Drink plenty of fluids.
5,DB00009,Avoid herbs and supplements with anticoagulant...


In [7]:
# Show the first 20 food-interaction descriptions in full (the table preview
# truncates long text). Slice before converting, so only 20 values rather
# than the whole column are turned into a Python list.
food_interactions['description'].head(20).tolist()

['Avoid herbs and supplements with anticoagulant/antiplatelet activity. Examples include garlic, ginger, bilberry, danshen, piracetam, and ginkgo biloba.',
 'Avoid echinacea.',
 'Avoid herbs and supplements with anticoagulant/antiplatelet activity. Examples include garlic, ginger, bilberry, danshen, piracetam, and ginkgo biloba.',
 'Drink plenty of fluids.',
 'Avoid herbs and supplements with anticoagulant/antiplatelet activity. Examples include garlic, ginger, bilberry, danshen, piracetam, and ginkgo biloba.',
 'Avoid alcohol.',
 'Administer iron supplement. When initiating an erythropoiesis-stimulating agent, evaluate iron stores and start iron supplementation if indicated. Most patients with chronic kidney disease require iron supplementation while taking an erythropoiesis-stimulating agent.',
 'Avoid herbs and supplements with anticoagulant/antiplatelet activity. Examples include garlic, ginger, bilberry, danshen, piracetam, and ginkgo biloba.',
 'Avoid herbs and supplements with a

#### Drug mixtures

In [13]:
# Mixture (product) names with their ingredients; the first CSV column
# becomes the row index.
mixtures = pd.read_csv(
    drug_dir + 'drug_mixtures.csv',
    index_col=[0],
)

In [14]:
# Preview the first five mixture rows.
mixtures.head()

Unnamed: 0,drug_name,ingredients
1,Refludan,Lepirudin
2,Erbitux,Cetuximab
3,Pulmozyme,Dornase alfa
4,Pulmozyme 1mg/ml,Dornase alfa
5,Ontak,Denileukin diftitox


#### Drug InChIKey

In [15]:
# InChIKey per drug id; the first CSV column becomes the row index.
inchi_key = pd.read_csv(
    drug_dir + 'drug_inchi_key.csv',
    index_col=[0],
)
inchi_key.head(n=5)

Unnamed: 0,id,inchi_key
1,DB00006,OIRCOABEOLEUMC-GEJPAHFPSA-N
2,DB00007,GFIJNRVAKGFPGQ-LIJARHBVSA-N
3,DB00014,BLCLNMBMMGCOAS-URPVMXJPSA-N
4,DB00027,NDAYQJDHGXTBJL-MWWSRJDJSA-N
5,DB00035,NFLWUMRGJYTJIN-PNIOQBSNSA-N


#### Drug molecules

In [16]:
# Molecular formula per drug id; the first CSV column becomes the row index.
# The file name is spelled 'drug_molecul.csv' on disk, so it is kept as-is.
molecules = pd.read_csv(
    drug_dir + 'drug_molecul.csv',
    index_col=[0],
)
molecules.head(n=5)

Unnamed: 0,id,molecule
1,DB00001,C287H440N80O110S6
2,DB00002,C6484H10042N1732O2023S36
3,DB00003,C1321H1999N339O396S9
4,DB00004,C2560H4042N678O799S17
5,DB00005,C2224H3475N621O698S36


#### Drug salts

In [17]:
# Salt forms, with the id of the drug each one belongs to in the `drug`
# column; the first CSV column becomes the row index.
salts = pd.read_csv(
    drug_dir + 'drug_salts.csv',
    index_col=[0],
)
salts.head(n=5)

Unnamed: 0,id,name,cas_number,inchi_key,drug
1,DBSALT000105,Leuprolide acetate,74381-53-6,YFDMUNOZURYOCP-XNHQSDQCSA-N,DB00007
2,DBSALT003182,Leuprolide mesylate,944347-41-5,MBIDSOMXPLCOHS-XNHQSDQCSA-N,DB00007
3,DBSALT001439,Sermorelin acetate,114466-38-5,,DB00010
4,DBSALT000093,Goserelin acetate,145781-92-6,IKDXDQDKCZPQSZ-JHYYTBFNSA-N,DB00014
5,DBSALT001733,Insulin human zinc suspension,,,DB00030
