In [75]:
import sys
from os import listdir
import os

from xml.dom.minidom import parse
import pandas as pd
import nltk

In [76]:
# Root folder of the training corpus. Set the DDI_DATA_ROOT environment variable
# to point the notebook at another checkout instead of editing this cell; the
# default is the location that was previously hard-coded here.
DATA_ROOT = os.environ.get(
    "DDI_DATA_ROOT",
    "/Users/lluccardoner/PycharmProjects/MET_AHLT_Lab_2019/data/Train",
)
# The trailing slash is kept so the default values are byte-identical to the
# previously hard-coded paths.
datadir_drugbank = DATA_ROOT.rstrip("/") + "/DrugBank/"
datadir_medline = DATA_ROOT.rstrip("/") + "/MedLine/"

# Load data

In [77]:
def parse_XML_file(datadir):
    """Parse every XML document found directly inside ``datadir``.

    Parameters
    ----------
    datadir : str
        Directory that holds the XML files. A trailing slash is optional.

    Returns
    -------
    list of xml.dom.minidom.Document
        One parsed DOM tree per ``.xml`` file, in sorted file-name order.
    """
    trees = []
    # Sort the names so the result does not depend on the order in which
    # listdir() happens to return the directory entries.
    for f in sorted(listdir(datadir)):
        path = os.path.join(datadir, f)
        # Skip sub-directories and stray non-XML entries (for example a macOS
        # ".DS_Store"); handing those to minidom would raise.
        if not f.lower().endswith(".xml") or not os.path.isfile(path):
            continue
        trees.append(parse(path))
    return trees

def parsed_to_df(parsed):
    """Flatten parsed documents into a DataFrame with one row per entity.

    Parameters
    ----------
    parsed : iterable of xml.dom.minidom.Document
        DOM trees whose ``sentence`` elements carry ``id`` and ``text``
        attributes and contain ``entity`` elements with ``id``,
        ``charOffset``, ``text`` and ``type`` attributes.

    Returns
    -------
    pandas.DataFrame
        Columns ``s_id``, ``s_txt``, ``e_id``, ``e_offset``, ``e_name`` and
        ``e_type``; every value is the raw attribute string.
    """
    def _attr(node, key):
        # Raw string value of an XML attribute (KeyError if it is missing).
        return node.attributes[key].value

    rows = []
    for document in parsed:
        for sentence in document.getElementsByTagName("sentence"):
            # Sentence-level fields are read once and repeated on every entity row.
            sentence_id = _attr(sentence, "id")
            sentence_text = _attr(sentence, "text")
            for entity in sentence.getElementsByTagName("entity"):
                row = {"s_id": sentence_id, "s_txt": sentence_text}
                row["e_id"] = _attr(entity, "id")
                row["e_offset"] = _attr(entity, "charOffset")
                row["e_name"] = _attr(entity, "text")
                row["e_type"] = _attr(entity, "type")
                rows.append(row)

    return pd.DataFrame(rows)

def load_data(datadir, training_set):
    """Load one corpus directory into an entity-level DataFrame.

    Parameters
    ----------
    datadir : str
        Directory passed on to ``parse_XML_file``.
    training_set : str
        Label written to the ``training_set`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Output of ``parsed_to_df`` plus the constant ``training_set`` column.
    """
    entities = parsed_to_df(parse_XML_file(datadir))
    # Tag every row with the corpus it came from.
    entities['training_set'] = training_set
    return entities

In [78]:
# Entity rows of the DrugBank directory, labelled "drugbank".
df_drugbank = load_data(datadir=datadir_drugbank, training_set="drugbank")
df_drugbank.shape

(12929, 7)

In [79]:
# Entity rows of the MedLine directory, labelled "medline".
df_medline = load_data(datadir=datadir_medline, training_set="medline")
df_medline.shape

(1836, 7)

In [80]:
# Stack both corpora into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x.
df = pd.concat([df_drugbank, df_medline], ignore_index=True)
df.shape

(14765, 7)

## Entities with sentences

In [81]:
# Preview the first five DrugBank entity rows.
df_drugbank.head(n=5)

Unnamed: 0,e_id,e_name,e_offset,e_type,s_id,s_txt,training_set
0,DDI-DrugBank.d481.s0.e0,calcium,25-31,drug,DDI-DrugBank.d481.s0,"Milk, milk products, and calcium-rich foods or...",drugbank
1,DDI-DrugBank.d481.s0.e1,EMCYT,82-86,brand,DDI-DrugBank.d481.s0,"Milk, milk products, and calcium-rich foods or...",drugbank
2,DDI-DrugBank.d419.s0.e0,allopurinol,33-43,drug,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank
3,DDI-DrugBank.d419.s0.e1,ampicillin,49-58,drug,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank
4,DDI-DrugBank.d419.s0.e2,ampicillin,175-184,drug,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank


In [82]:
# Preview the first five MedLine entity rows.
df_medline.head(n=5)

Unnamed: 0,e_id,e_name,e_offset,e_type,s_id,s_txt,training_set
0,DDI-MedLine.d69.s0.e0,contortrostatin,70-84,drug_n,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
1,DDI-MedLine.d69.s0.e1,echistatin,141-150,drug_n,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
2,DDI-MedLine.d69.s0.e2,flavoridin,156-165,drug_n,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
3,DDI-MedLine.d69.s1.e0,contortrostatin,28-42,drug_n,DDI-MedLine.d69.s1,The homodimeric disintegrin contortrostatin wa...,medline
4,DDI-MedLine.d69.s1.e1,echistatin,96-105,drug_n,DDI-MedLine.d69.s1,The homodimeric disintegrin contortrostatin wa...,medline


### Total loaded rows

In [83]:
# Number of entity rows per corpus (non-null e_type values per group).
df.groupby("training_set")[["e_type"]].count()

Unnamed: 0_level_0,e_type
training_set,Unnamed: 1_level_1
drugbank,12929
medline,1836


In [84]:
# Frequency of each entity type within each corpus.
df.groupby("training_set")["e_type"].value_counts()

training_set  e_type
drugbank      drug      8197
              group     3206
              brand     1423
              drug_n     103
medline       drug      1228
              drug_n     401
              group      193
              brand       14
Name: e_type, dtype: int64

In [85]:
# Frequency of each entity name within each corpus.
df.groupby("training_set")["e_name"].value_counts()

training_set  e_name                                                                      
drugbank      digoxin                                                                         176
              warfarin                                                                        176
              phenytoin                                                                       157
              lithium                                                                         119
              theophylline                                                                    115
              ketoconazole                                                                    107
              cimetidine                                                                       89
              carbamazepine                                                                    81
              aspirin                                                                          79
              erythromycin 

In [86]:
# count / unique / top / freq of every column, computed per corpus.
df.groupby("training_set").describe()

Unnamed: 0_level_0,e_id,e_id,e_id,e_id,e_name,e_name,e_name,e_name,e_offset,e_offset,...,e_type,e_type,s_id,s_id,s_id,s_id,s_txt,s_txt,s_txt,s_txt
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,...,top,freq,count,unique,top,freq,count,unique,top,freq
training_set,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
drugbank,12929,12929,DDI-DrugBank.d549.s3.e2,1,12929,2655,digoxin,176,12929,3957,...,drug,8197,12929,4683,DDI-DrugBank.d64.s87,55,12929,4505,Drugs that reportedly may increase oral antico...,55
medline,1836,1836,DDI-MedLine.d63.s8.e0,1,1836,556,digoxin,33,1836,1206,...,drug,1228,1836,877,DDI-MedLine.d111.s5,14,1836,877,"acetaminophen/theophylline, lidocaine/quinidin...",14


## Entities and type 

In [87]:
# Unique (name, type, corpus) combinations.
# `.copy()` makes df_e an independent frame, so the feature columns that later
# cells add to it in place do not trigger pandas' SettingWithCopyWarning.
df_e = df[["e_name", "e_type", "training_set"]].drop_duplicates().copy()
df_e.head(10)

Unnamed: 0,e_name,e_type,training_set
0,calcium,drug,drugbank
1,EMCYT,brand,drugbank
2,allopurinol,drug,drugbank
3,ampicillin,drug,drugbank
7,AUGMENTIN XR,brand,drugbank
12,broad-spectrum antibiotics,group,drugbank
14,contraceptives,group,drugbank
15,PROCRIT,brand,drugbank
16,central nervous system depressants,group,drugbank
17,benzodiazepines,group,drugbank


In [88]:
# Per corpus, `count` exceeds `unique` for e_name: some names occur with more
# than one entity type.
df_e.groupby("training_set").describe()

Unnamed: 0_level_0,e_name,e_name,e_name,e_name,e_type,e_type,e_type,e_type
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
training_set,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
drugbank,2719,2655,ergot,2,2719,4,drug,1393
medline,560,556,ACTH,2,560,4,drug,306


In [89]:
# First rows whose entity name is exactly "ACTH".
df.loc[df["e_name"].eq("ACTH")].head()

Unnamed: 0,e_id,e_name,e_offset,e_type,s_id,s_txt,training_set
4156,DDI-DrugBank.d318.s2.e2,ACTH,35-38,group,DDI-DrugBank.d318.s2,Corticosteroids and Corticotropin (ACTH): may ...,drugbank
9155,DDI-DrugBank.d162.s6.e1,ACTH,17-20,drug,DDI-DrugBank.d162.s6,"Corticosteroids, ACTH: intensified electrolyte...",drugbank
9628,DDI-DrugBank.d17.s9.e3,ACTH,52-55,drug,DDI-DrugBank.d17.s9,Amphotericin B or Corticosteroids or Corticotr...,drugbank
10913,DDI-DrugBank.d46.s9.e1,ACTH,19-22,drug,DDI-DrugBank.d46.s9,"- Corticosteroids, ACTH: Intensified electroly...",drugbank
11721,DDI-DrugBank.d10.s1.e1,ACTH,111-114,drug,DDI-DrugBank.d10.s1,Although studies designed to examine drug inte...,drugbank


## Feature analysis

### Upper case

In [90]:
# Flag names for which str.isupper() is true.
df_e['is_upper'] = df_e['e_name'].map(str.isupper)
df_e.head()

Unnamed: 0,e_name,e_type,training_set,is_upper
0,calcium,drug,drugbank,False
1,EMCYT,brand,drugbank,True
2,allopurinol,drug,drugbank,False
3,ampicillin,drug,drugbank,False
7,AUGMENTIN XR,brand,drugbank,True


In [91]:
# Count unique names per corpus, entity type and upper-case flag.
upper_case_keys = ['training_set', 'e_type', 'is_upper']
df_e.groupby(upper_case_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,e_name
training_set,e_type,is_upper,Unnamed: 3_level_1
drugbank,brand,False,225
drugbank,brand,True,165
drugbank,drug,False,1356
drugbank,drug,True,37
drugbank,drug_n,False,57
drugbank,drug_n,True,9
drugbank,group,False,852
drugbank,group,True,18
medline,brand,False,6
medline,drug,False,296


### Suffixes

In [92]:
# Last five characters of each name, lower-cased (the whole name if shorter).
df_e['suffix_5'] = df_e['e_name'].str[-5:].str.lower()
df_e.head()

Unnamed: 0,e_name,e_type,training_set,is_upper,suffix_5
0,calcium,drug,drugbank,False,lcium
1,EMCYT,brand,drugbank,True,emcyt
2,allopurinol,drug,drugbank,False,rinol
3,ampicillin,drug,drugbank,False,illin
7,AUGMENTIN XR,brand,drugbank,True,in xr


In [98]:
# Hand-written list of candidate suffixes for the "drug" type; the print below
# compares its length with the length of new_type_suffix_5["drug"].
# NOTE(review): `new_type_suffix_5` is not defined in any cell shown here (the
# execution counter jumps from In [92] to In [98]); on a fresh kernel this cell
# presumably raises NameError unless the defining cells are restored - confirm.
# NOTE(review): the `suffix_5` column is lower-cased when it is built, so entries
# containing upper-case letters ("D", "MAO", "SAIDs") can only match if
# new_type_suffix_5 is derived differently - TODO confirm.
initial_drug_suffixes = ["idase", "idone", "uride", "ogens", "rinol", "amate", "lones", "pamil", "olone", "parin", "ssant", "udine",
            "D", "etron", "adiol", "feine", "pines", "zines", "toxin", "MAO", "opram", "ophen", "sides", "talis",
            "ulant", "ylate", "osine", "oxide", "caine", "illin", "itant", "limus", "pride", "sulin", "oride", "abine",
            "hrine", "iazem", "atory", "oidal", "emide", "hanol", "phine", "SAIDs", "coxib", "necid", "nists", "esium",
            "acids", "nolol", "nafil", "azine", "exate", "rates", "cking", "lcium", "azide", "zepam", "arone", "rofen",
            "ampin", "ergic", "roids", "tamin", "adine", "odium", "bital", "pirin", "lants", "orine", "mines", "zolam",
            "apine", "cohol", "ckers", "ipine", "acid", "yclic", "otics", "tives", "xacin", "etine", "epine", "drugs",
            "tatin", "thium", "lline", "amide", "sants", "etics", "itors", "ytoin", "gents", "navir", "goxin", "farin",
            "mycin", "idine", "amine", "azole"]
print(len(initial_drug_suffixes), len(new_type_suffix_5["drug"]))

100 411


In [99]:
# Size of the overlap between the hand-written suffix list and
# new_type_suffix_5["drug"].
# NOTE(review): neither `intersection` nor `new_type_suffix_5` is defined in the
# visible cells - presumably they come from cells that are not shown; confirm
# they exist before relying on Restart & Run All.
len(intersection(initial_drug_suffixes, new_type_suffix_5["drug"]))

34

In [100]:
#for k in new_type_suffix_5:
#    with open(k+'_suffix_5_DrugBank.txt', 'w') as f:
#            for item in new_type_suffix_5[k]:
#                f.write("%s\n" % item)

In [101]:
#loaded = {}
#for k in new_type_suffix_5:
#    with open(k+'_suffix_5_DrugBank.txt', 'r') as f:
#        loaded[k] = f.read().splitlines()

In [102]:
# df_e[df_e["e_name"].str.contains("propranolo")]

Unnamed: 0,e_name,e_type,training_set,is_upper,suffix_5
1045,propranolol,drug,drugbank,False,nolol
14704,propranolol,drug,medline,False,nolol


### Plural

In [103]:
# Rough plural heuristic: the name ends with a lower-case "s".
df_e['is_plural'] = df_e['e_name'].str.endswith('s')
df_e.head(10)

Unnamed: 0,e_name,e_type,training_set,is_upper,suffix_5,is_plural
0,calcium,drug,drugbank,False,lcium,False
1,EMCYT,brand,drugbank,True,emcyt,False
2,allopurinol,drug,drugbank,False,rinol,False
3,ampicillin,drug,drugbank,False,illin,False
7,AUGMENTIN XR,brand,drugbank,True,in xr,False
12,broad-spectrum antibiotics,group,drugbank,False,otics,True
14,contraceptives,group,drugbank,False,tives,True
15,PROCRIT,brand,drugbank,True,ocrit,False
16,central nervous system depressants,group,drugbank,False,sants,True
17,benzodiazepines,group,drugbank,False,pines,True


In [104]:
# Count unique names per corpus, entity type and plural flag.
plural_keys = ['training_set', "e_type", "is_plural"]
df_e.groupby(plural_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,e_name,is_upper,suffix_5
training_set,e_type,is_plural,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
drugbank,brand,False,383,383,383
drugbank,brand,True,7,7,7
drugbank,drug,False,1355,1355,1355
drugbank,drug,True,38,38,38
drugbank,drug_n,False,64,64,64
drugbank,drug_n,True,2,2,2
drugbank,group,False,227,227,227
drugbank,group,True,643,643,643
medline,brand,False,6,6,6
medline,drug,False,294,294,294


### Part Of Speech

In [105]:
def _single_token_pos(name):
    # The full entity name (even a multi-word one) is handed to the tagger as a
    # single token; keep the tag of that one token.
    tagged = nltk.pos_tag([name])
    return tagged[0][1]

df_e['e_pos'] = df_e['e_name'].map(_single_token_pos)
df_e.head(10)

Unnamed: 0,e_name,e_type,training_set,is_upper,suffix_5,is_plural,e_pos
0,calcium,drug,drugbank,False,lcium,False,NN
1,EMCYT,brand,drugbank,True,emcyt,False,NN
2,allopurinol,drug,drugbank,False,rinol,False,NN
3,ampicillin,drug,drugbank,False,illin,False,NN
7,AUGMENTIN XR,brand,drugbank,True,in xr,False,NN
12,broad-spectrum antibiotics,group,drugbank,False,otics,True,NNS
14,contraceptives,group,drugbank,False,tives,True,NNS
15,PROCRIT,brand,drugbank,True,ocrit,False,NN
16,central nervous system depressants,group,drugbank,False,sants,True,NNS
17,benzodiazepines,group,drugbank,False,pines,True,NNS


In [106]:
# Count unique names per corpus, entity type and POS tag.
pos_keys = ['training_set', "e_type", "e_pos"]
df_e.groupby(pos_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,e_name,is_upper,suffix_5,is_plural
training_set,e_type,e_pos,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
drugbank,brand,IN,2,2,2,2
drugbank,brand,JJ,9,9,9,9
drugbank,brand,NN,354,354,354,354
drugbank,brand,NNP,18,18,18,18
drugbank,brand,NNS,4,4,4,4
drugbank,brand,RB,1,1,1,1
drugbank,brand,VB,1,1,1,1
drugbank,brand,VBN,1,1,1,1
drugbank,drug,IN,3,3,3,3
drugbank,drug,JJ,30,30,30,30
