### Herb-drug Interactions Extraction
Extracting herb-drug interactions from https://www.mskcc.org/cancer-care/diagnosis-treatment/symptom-management/integrative-medicine/herbs/search.

The __herb-drug_interactions.json__ is created from a webscraped file __herbs_scrapping.json__. 

In this notebook, we extract names of drugs and map them to their ATC codes (using drug_id_name_map.csv, drug_synonyms.csv, drug_atc_code.csv). The output is a .tsv file in a following format: __\<drug's atc_code\>, interacts_with, \<herb\>__  

In [1]:
import pandas as pd
import json
import re
import itertools

In [2]:
with open('herb-drug_interactions.json', 'r') as f:
    herbs_drug_interactions = json.load(f)

herbs_drug_interactions

{'5-HTP': {'drugs': ['Antidepressants/anxiolytics (tricyclics, MAOIs, and SSRIs)',
   'Monamine oxdiase inhibitors',
   'Linezolid (Zyvox, an antibiotic MAOI)',
   'Carbidopa (Lodosyn, a dopamine promoter)'],
  'url': 'https://www.mskcc.org/cancer-care/integrative-medicine/herbs/5-htp-01'},
 '714X': {'drugs': [],
  'url': 'https://www.mskcc.org/cancer-care/integrative-medicine/herbs/714x'},
 'Acai Berry': {'drugs': [],
  'url': 'https://www.mskcc.org/cancer-care/integrative-medicine/herbs/acai-berry'},
 'Agaricus': {'drugs': ['CYP450 substrates'],
  'url': 'https://www.mskcc.org/cancer-care/integrative-medicine/herbs/agaricus'},
 'AHCC': {'drugs': ['CYP450 substrates', 'Aromatase inhibitors'],
  'url': 'https://www.mskcc.org/cancer-care/integrative-medicine/herbs/ahcc'},
 'Aloe Vera': {'drugs': ['Cytochrome P450 substrates', 'Sevoflurane'],
  'url': 'https://www.mskcc.org/cancer-care/integrative-medicine/herbs/aloe-vera'},
 'Alpha-Lipoic Acid': {'drugs': ['Hypoglycemic agents'],
  'url

In [3]:
herbs_drugs_df = pd.DataFrame({'herb': herbs_drug_interactions.keys(),
                               'drugs': [elem['drugs'] for elem in list(herbs_drug_interactions.values())]})
herbs_drugs_df.head()

Unnamed: 0,herb,drugs
0,5-HTP,"[Antidepressants/anxiolytics (tricyclics, MAOI..."
1,714X,[]
2,Acai Berry,[]
3,Agaricus,[CYP450 substrates]
4,AHCC,"[CYP450 substrates, Aromatase inhibitors]"


In [4]:
def transform_drug_names(drugs):
    new_drugs = [re.sub('-', ' ', drug).lower() for drug in drugs]
    new_drugs = list(set(new_drugs))
    return new_drugs

In [5]:
herbs_drugs_df['drugs_transformed'] = herbs_drugs_df['drugs'].apply(transform_drug_names)
herbs_drugs_df.head()

Unnamed: 0,herb,drugs,drugs_transformed
0,5-HTP,"[Antidepressants/anxiolytics (tricyclics, MAOI...","[monamine oxdiase inhibitors, linezolid (zyvox..."
1,714X,[],[]
2,Acai Berry,[],[]
3,Agaricus,[CYP450 substrates],[cyp450 substrates]
4,AHCC,"[CYP450 substrates, Aromatase inhibitors]","[cyp450 substrates, aromatase inhibitors]"


In [6]:
herbs_drugs_df = herbs_drugs_df.explode('drugs_transformed').dropna().drop(columns=['drugs'])

In [7]:
# split by '(', ',', '/'
# frist, try match the first thing, then the second ...

splitted_drugs = []

for drug in list(herbs_drugs_df.drugs_transformed):
    parts = re.split(r',|\(|/|\)', drug)
    splitted_drugs.append((parts))


splitted_drugs = [[d.strip() for d in drugs if d != ''] for drugs in splitted_drugs]
# splitted_drugs

In [8]:
herbs_drugs_df['splitted_drugs'] = splitted_drugs
herbs_drugs_df_transformed = herbs_drugs_df.explode("splitted_drugs")
herbs_drugs_df_transformed.head()

Unnamed: 0,herb,drugs_transformed,splitted_drugs
0,5-HTP,monamine oxdiase inhibitors,monamine oxdiase inhibitors
0,5-HTP,"linezolid (zyvox, an antibiotic maoi)",linezolid
0,5-HTP,"linezolid (zyvox, an antibiotic maoi)",zyvox
0,5-HTP,"linezolid (zyvox, an antibiotic maoi)",an antibiotic maoi
0,5-HTP,"carbidopa (lodosyn, a dopamine promoter)",carbidopa


In [9]:
drug_names = pd.read_csv('../data/drugbank/drug_id_name_map.csv', index_col=[0])
print(drug_names.head())

drug_atc = pd.read_csv('../data/drugbank/drug_atc_code.csv', index_col=[0])
print(drug_atc.head())

        id            drug_name
1  DB00001            Lepirudin
2  DB00002            Cetuximab
3  DB00003         Dornase alfa
4  DB00004  Denileukin diftitox
5  DB00005           Etanercept
        id atc_code
1  DB00001  B01AE02
2  DB00002  L01XC06
3  DB00003  R05CB13
4  DB00004  L01XX29
5  DB00005  L04AB01


In [10]:
drug_synonyms = pd.read_csv('../data/drugbank/drug_synonyms.csv', index_col = [0])
drug_synonyms

Unnamed: 0,drug_id,synonym
1,DB00001,Hirudin variant-1
2,DB00001,Lepirudin
3,DB00001,Lepirudin recombinant
4,DB00002,Cetuximab
5,DB00002,Cétuximab
...,...,...
33177,DB16736,Allogenic thymocyte-depleted thymus tissue-agdc
33178,DB16737,Ac-GQFR-kbt
33179,DB16741,Bortezomib D-mannitol ester
33180,DB16741,Bortezomib D-mannitol symmetrical ester


In [11]:
drugs_dict = dict(zip(drug_names.drug_name, drug_names.id))
drugs_dict_lower = {}
for k, v in drugs_dict.items():
    new_k = re.sub('-', ' ', k)
    new_k = new_k.lower()
    drugs_dict_lower[new_k] = v

drugs_synonyms_dict = dict(zip(drug_synonyms.synonym, drug_synonyms.drug_id))
drugs_synonyms_dict_lower = {}
for k, v in drugs_synonyms_dict.items():
    new_k = re.sub('-', ' ', k)
    new_k = new_k.lower()
    drugs_synonyms_dict_lower[new_k] = v

In [12]:
herbs_drugs_df_transformed['drugbank_id'] = list(itertools.repeat(None, herbs_drugs_df_transformed.shape[0]))
herbs_drugs_df_transformed = herbs_drugs_df_transformed.reset_index(drop=True)
herbs_drugs_df_transformed.head()

Unnamed: 0,herb,drugs_transformed,splitted_drugs,drugbank_id
0,5-HTP,monamine oxdiase inhibitors,monamine oxdiase inhibitors,
1,5-HTP,"linezolid (zyvox, an antibiotic maoi)",linezolid,
2,5-HTP,"linezolid (zyvox, an antibiotic maoi)",zyvox,
3,5-HTP,"linezolid (zyvox, an antibiotic maoi)",an antibiotic maoi,
4,5-HTP,"carbidopa (lodosyn, a dopamine promoter)",carbidopa,


In [13]:
# for extracted_drug in splitted_drugs:
for row in herbs_drugs_df_transformed.itertuples():
    d = row[3]
    # for d in extracted_drug:
    if d in drugs_dict_lower.keys():
        herbs_drugs_df_transformed['drugbank_id'][row[0]] = drugs_dict_lower[d]

    elif d in drugs_synonyms_dict_lower.keys():
        herbs_drugs_df_transformed['drugbank_id'][row[0]] = drugs_synonyms_dict_lower[d]

In [14]:
herbs_drugs_df_transformed

Unnamed: 0,herb,drugs_transformed,splitted_drugs,drugbank_id
0,5-HTP,monamine oxdiase inhibitors,monamine oxdiase inhibitors,
1,5-HTP,"linezolid (zyvox, an antibiotic maoi)",linezolid,DB00601
2,5-HTP,"linezolid (zyvox, an antibiotic maoi)",zyvox,
3,5-HTP,"linezolid (zyvox, an antibiotic maoi)",an antibiotic maoi,
4,5-HTP,"carbidopa (lodosyn, a dopamine promoter)",carbidopa,DB00190
...,...,...,...,...
682,Zyflamend,"chemotherapy drugs (gemcitabine, taxol, doxoru...",chemotherapy drugs,
683,Zyflamend,"chemotherapy drugs (gemcitabine, taxol, doxoru...",gemcitabine,DB00441
684,Zyflamend,"chemotherapy drugs (gemcitabine, taxol, doxoru...",taxol,
685,Zyflamend,"chemotherapy drugs (gemcitabine, taxol, doxoru...",doxorubicin,DB00997


In [15]:
herbs_drugs_df_transformed_atc = herbs_drugs_df_transformed.merge(drug_atc, left_on="drugbank_id", right_on="id", how="left").drop(columns=["id", "drugs_transformed"]).dropna()
herbs_drugs_df_transformed_atc

Unnamed: 0,herb,splitted_drugs,drugbank_id,atc_code
1,5-HTP,linezolid,DB00601,J01XX08
15,Aloe Vera,sevoflurane,DB01236,N01AB08
18,Andrographis,aminophylline,DB01223,R03DA55
19,Andrographis,aminophylline,DB01223,R03DA05
20,Andrographis,aminophylline,DB01223,R03DA20
...,...,...,...,...
1054,Zinc,penicillamine,DB00859,M01CC01
1057,Zyflamend,gemcitabine,DB00441,L01BC05
1059,Zyflamend,doxorubicin,DB00997,L01DB01
1060,Zyflamend,bicalutamide,DB01128,L02BB03


In [16]:
herbs_drugs_atc = herbs_drugs_df_transformed_atc[["herb", "atc_code"]]
herbs_drugs_atc["relation"] = list(itertools.repeat("interacts_with", herbs_drugs_atc.shape[0]))
herbs_drugs_atc = herbs_drugs_atc.iloc[:,[1,2,0]]
herbs_drugs_atc = herbs_drugs_atc.rename(columns={"atc_code": "drug"})
herbs_drugs_atc = herbs_drugs_atc.reset_index(drop=True)
herbs_drugs_atc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  herbs_drugs_atc["relation"] = list(itertools.repeat("interacts_with", herbs_drugs_atc.shape[0]))


Unnamed: 0,drug,relation,herb
0,J01XX08,interacts_with,5-HTP
1,N01AB08,interacts_with,Aloe Vera
2,R03DA55,interacts_with,Andrographis
3,R03DA05,interacts_with,Andrographis
4,R03DA20,interacts_with,Andrographis
...,...,...,...
561,M01CC01,interacts_with,Zinc
562,L01BC05,interacts_with,Zyflamend
563,L01DB01,interacts_with,Zyflamend
564,L02BB03,interacts_with,Zyflamend


In [17]:
herbs_drugs_atc.to_csv("../data/triplets/herbs-di.tsv", sep="\t")