In [1]:
import pandas as pd
import re

#### How much are phytochemicals in drugs?

In [4]:
drug_ingredients = pd.read_csv('data/triplets/ingredients.tsv', sep='\t', index_col=[0])
drug_ingredients.head()

Unnamed: 0,drug_name,relation,ingredient
0,Refludan,contains,Lepirudin
1,Erbitux,contains,Cetuximab
2,Pulmozyme,contains,Dornase alfa
3,Pulmozyme 1mg/ml,contains,Dornase alfa
4,Ontak,contains,Denileukin diftitox


In [21]:
with open('data/phytochemicals.txt', 'r') as f:
    lines = f.readlines()  

phytochemicals = [re.sub('\n', '', x) for x in lines]   
phytochemicals = [x.lower() for x in phytochemicals] 

print(phytochemicals[:10])
phytochemicals = set(phytochemicals)

['alpha-carotene', 'beta-carotene', 'gamma-carotene', 'delta-carotene', 'epsilon-carotene', 'lycopene', 'neurosporene', 'phytofluene', 'phytoene', 'canthaxanthin']


In [25]:
drug_names = set(drug_ingredients.drug_name)

drugs_with_phytochemicals = dict()
phytochemicals_in_drugs = set()

for drug_name in drug_names:
    ingredients = list(drug_ingredients[drug_ingredients['drug_name'] == drug_name].ingredient.values)
    ingredients = [re.sub(' ', '', x.lower()) for x in ingredients]
    common_ingredients = list(set(ingredients) & phytochemicals)
    if len(common_ingredients) > 0:
        drugs_with_phytochemicals[drug_name] = len(common_ingredients) 
        for ing in common_ingredients:
            phytochemicals_in_drugs.add(ing)

print(phytochemicals_in_drugs)
print()
print('Number of drugs that contains phytochemicals:', len(drugs_with_phytochemicals), '/ out of', len(drug_names))

{'phenylalanine', 'testosterone', 'camphor', 'pectin', 'phenol', 'tocopherol', 'piperine', 'lutein', 'astaxanthin', 'curcumin', 'genistein', 'histidine', 'tryptophan', 'lycopene', 'hesperidin', 'cholecalciferol', 'squalane', 'thymol', 'tyrosine', 'rutin', 'resveratrol', 'capsaicin', 'zeaxanthin', 'borneol', 'eucalyptol', 'paclitaxel', 'geraniol', 'diosmetin', 'menthol'}

Number of drugs that contains phytochemicals: 4593 / out of 106826


### Data Analysis
(in training dataset)

#### Drugs

In [7]:
drug_inchi_key = pd.read_csv('data/triplets/drugs_inchi_key.tsv', sep='\t', index_col=[0])
print(drug_inchi_key.head(3))
print()

drug_molecule = pd.read_csv('data/triplets/drugs_molecule.tsv', sep='\t', index_col=[0])
print(drug_molecule.head(3))
print()

drug_subclass = pd.read_csv('data/triplets/drug_subclass.tsv', sep='\t', index_col=[0])
print(drug_subclass.head(3))
print()

drug_salts = pd.read_csv('data/triplets/drug_salts.tsv', sep='\t', index_col=[0])
print(drug_salts.head(3))
print()

drug_ingredients = pd.read_csv('data/triplets/ingredients.tsv', sep='\t', index_col=[0])
print(drug_ingredients.head(3))
print()

drug_interactions = pd.read_csv('data/triplets/ddi.tsv', sep='\t', index_col=[0])
print(drug_interactions.head(3))
print()

          drug       relation                    inchi_key
1  Bivalirudin  has_inchi_key  OIRCOABEOLEUMC-GEJPAHFPSA-N
2   Leuprolide  has_inchi_key  GFIJNRVAKGFPGQ-LIJARHBVSA-N
3    Goserelin  has_inchi_key  BLCLNMBMMGCOAS-URPVMXJPSA-N

           drug      relation                  molecule
1     Lepirudin  has_molecule         C287H440N80O110S6
2     Cetuximab  has_molecule  C6484H10042N1732O2023S36
3  Dornase alfa  has_molecule      C1321H1999N339O396S9

           drug     relation                              subclass
1     Lepirudin  in_subclass  Amino Acids, Peptides, and Analogues
2     Cetuximab  in_subclass  Amino Acids, Peptides, and Analogues
3  Dornase alfa  in_subclass  Amino Acids, Peptides, and Analogues

         drug  relation                 salt
1  Leuprolide  contains   Leuprolide acetate
2  Leuprolide  contains  Leuprolide mesylate
3  Sermorelin  contains   Sermorelin acetate

   drug_name  relation    ingredient
0   Refludan  contains     Lepirudin
1    Erbitux  

#### Food

In [9]:
food_compounds = pd.read_csv('data/triplets/food_compound.tsv', sep='\t', index_col=[0])
print(food_compounds.head())
print()

compounds_cas_num = pd.read_csv('data/triplets/compounds_cas_number.tsv', sep='\t', index_col=[0])
print(compounds_cas_num.head())
print()

       food  relation    compound
0  Angelica  contains    Apigenin
1  Angelica  contains    Luteolin
2  Angelica  contains  Kaempferol
3  Angelica  contains   Myricetin
4  Angelica  contains   Quercetin

                    compound        relation  cas_number
0     Quercetin 3-rutinoside  has_cas_number    153-18-4
1  2,6-Dihydroxybenzoic acid  has_cas_number    303-07-1
2             (S)-Naringenin  has_cas_number    480-41-1
3            L-Chicoric acid  has_cas_number  70831-56-0
4                   Phytoene  has_cas_number    540-04-5



#### Drug Supplements

In [10]:
ds_ingredients = pd.read_csv('data/triplets/ds_ingredients.tsv', sep='\t', index_col=[0])
print(ds_ingredients.head())
print()

ds_relations = pd.read_csv('data/triplets/ds_relations.tsv', sep='\t', index_col=[0])
print(ds_relations.head())
print()

                                        CUI1             REL  \
0     Douglas Laboratories - L-Lysine 500 mg  has_ingredient   
1  Met-Rx - Pure Protein Shake Vanilla Cream  has_ingredient   
2  Met-Rx - Pure Protein Shake Vanilla Cream  has_ingredient   
3  Met-Rx - Pure Protein Shake Vanilla Cream  has_ingredient   
4  Met-Rx - Pure Protein Shake Vanilla Cream  has_ingredient   

                    CUI2  
0     L-Lysine Free Form  
1             Saccharide  
2     Sodium-D-aspartate  
3          Oleovitamin A  
4  Magnesium Trisilicate  

                               CUI1             REL  \
689301  1-Androsten-3beta-ol-17-one  interacts_with   
689302                       4-DHEA  interacts_with   
689319                    Blackbush  interacts_with   
689320                    Blackbush  interacts_with   
689322                    Blackbush  interacts_with   

                                  CUI2  
689301                    Testosterone  
689302                    Testosterone 