In [15]:
import os
import string
import sys
from os import listdir
from xml.dom.minidom import parse

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Root of the DDI training corpus. Set the DDI_TRAIN_DIR environment variable
# to run the notebook on another machine; the default is the location used
# when the notebook was first written.
DDI_TRAIN_DIR = os.environ.get(
    "DDI_TRAIN_DIR",
    "/Users/lluccardoner/PycharmProjects/MET_AHLT_Lab_2019/data/Train",
)
# The trailing "" keeps the trailing path separator the original values had.
datadir_drugbank = os.path.join(DDI_TRAIN_DIR, "DrugBank", "")
datadir_medline = os.path.join(DDI_TRAIN_DIR, "MedLine", "")

# Load data

In [3]:
def parse_XML_file(datadir):
    """Parse every XML file found directly inside ``datadir``.

    Parameters
    ----------
    datadir : str
        Directory containing the XML files. A trailing separator is optional.

    Returns
    -------
    list
        One ``xml.dom.minidom`` document per ``*.xml`` file, ordered by file
        name so repeated runs yield the same sequence.
    """
    trees = []
    # listdir() returns entries in arbitrary order and may include non-XML
    # files (for example ``.DS_Store`` on macOS) that would make parse() fail.
    for f in sorted(listdir(datadir)):
        if not f.lower().endswith(".xml"):
            continue
        # parse XML file
        trees.append(parse(os.path.join(datadir, f)))
    return trees

def parsed_to_df(parsed):
    """Flatten parsed documents into a DataFrame with one row per ``pair``.

    Parameters
    ----------
    parsed : iterable
        DOM documents exposing ``getElementsByTagName``.

    Returns
    -------
    pandas.DataFrame
        Columns ``s_id``, ``s_txt``, ``p_id``, ``p_ddi``, ``p_type``, ``p_e1``
        and ``p_e2``. ``p_type`` is the pair's ``type`` attribute when ``ddi``
        is ``"true"`` and the attribute exists; otherwise it is the string
        ``"null"``.
    """
    def attr(node, name):
        # Direct DOM lookup; raises KeyError when the attribute is absent.
        return node.attributes[name].value

    rows = []
    for document in parsed:
        for sentence in document.getElementsByTagName("sentence"):
            sentence_id = attr(sentence, "id")
            sentence_text = attr(sentence, "text")
            for pair in sentence.getElementsByTagName("pair"):
                pair_id = attr(pair, "id")
                ddi_flag = attr(pair, "ddi")
                typed_interaction = ddi_flag == "true" and pair.hasAttribute("type")
                interaction = attr(pair, "type") if typed_interaction else "null"
                first_entity = attr(pair, "e1")
                second_entity = attr(pair, "e2")
                rows.append({
                    "s_id": sentence_id,
                    "s_txt": sentence_text,
                    "p_id": pair_id,
                    "p_ddi": ddi_flag,
                    "p_type": interaction,
                    "p_e1": first_entity,
                    "p_e2": second_entity,
                })

    return pd.DataFrame(rows)

def load_data(datadir, training_set):
    """Load one corpus directory into a DataFrame tagged with its origin.

    Parameters
    ----------
    datadir : str
        Directory passed to ``parse_XML_file``.
    training_set : str
        Label stored in the ``training_set`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Output of ``parsed_to_df`` plus the ``training_set`` column.
    """
    corpus_frame = parsed_to_df(parse_XML_file(datadir))
    corpus_frame["training_set"] = training_set
    return corpus_frame

In [4]:
# Load the DrugBank part of the corpus and show (rows, columns).
df_drugbank = load_data(datadir=datadir_drugbank, training_set="drugbank")
df_drugbank.shape

(26005, 8)

In [5]:
# Load the MedLine part of the corpus and show (rows, columns).
df_medline = load_data(datadir=datadir_medline, training_set="medline")
df_medline.shape

(1787, 8)

In [6]:
# Stack both corpora into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same result on every pandas version.
df = pd.concat([df_drugbank, df_medline], ignore_index=True)
df.shape

(27792, 8)

## Drug-Drug Interaction Pairs

In [7]:
# Preview the first five DrugBank pairs.
df_drugbank.head(5)

Unnamed: 0,p_ddi,p_e1,p_e2,p_id,p_type,s_id,s_txt,training_set
0,True,DDI-DrugBank.d481.s0.e0,DDI-DrugBank.d481.s0.e1,DDI-DrugBank.d481.s0.p0,mechanism,DDI-DrugBank.d481.s0,"Milk, milk products, and calcium-rich foods or...",drugbank
1,True,DDI-DrugBank.d419.s0.e0,DDI-DrugBank.d419.s0.e1,DDI-DrugBank.d419.s0.p0,effect,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank
2,False,DDI-DrugBank.d419.s0.e0,DDI-DrugBank.d419.s0.e2,DDI-DrugBank.d419.s0.p1,,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank
3,False,DDI-DrugBank.d419.s0.e1,DDI-DrugBank.d419.s0.e2,DDI-DrugBank.d419.s0.p2,,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank
4,True,DDI-DrugBank.d419.s1.e0,DDI-DrugBank.d419.s1.e1,DDI-DrugBank.d419.s1.p0,effect,DDI-DrugBank.d419.s1,It is not known whether this potentiation of a...,drugbank


In [8]:
# Preview the first five MedLine pairs.
df_medline.head(5)

Unnamed: 0,p_ddi,p_e1,p_e2,p_id,p_type,s_id,s_txt,training_set
0,False,DDI-MedLine.d69.s0.e0,DDI-MedLine.d69.s0.e1,DDI-MedLine.d69.s0.p0,,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
1,False,DDI-MedLine.d69.s0.e0,DDI-MedLine.d69.s0.e2,DDI-MedLine.d69.s0.p1,,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
2,False,DDI-MedLine.d69.s0.e1,DDI-MedLine.d69.s0.e2,DDI-MedLine.d69.s0.p2,,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
3,False,DDI-MedLine.d69.s1.e0,DDI-MedLine.d69.s1.e1,DDI-MedLine.d69.s1.p0,,DDI-MedLine.d69.s1,The homodimeric disintegrin contortrostatin wa...,medline
4,False,DDI-MedLine.d69.s1.e0,DDI-MedLine.d69.s1.e2,DDI-MedLine.d69.s1.p1,,DDI-MedLine.d69.s1,The homodimeric disintegrin contortrostatin wa...,medline


### Total loaded rows

In [9]:
# Number of pair ids loaded from each corpus.
df.groupby("training_set")[["p_id"]].count()

Unnamed: 0_level_0,p_id
training_set,Unnamed: 1_level_1
drugbank,26005
medline,1787


In [10]:
# Interacting vs. non-interacting pairs per corpus.
df.groupby("training_set")["p_ddi"].value_counts()

training_set  p_ddi
drugbank      false    22216
              true      3789
medline       false     1555
              true       232
Name: p_ddi, dtype: int64

In [11]:
# Interaction-type distribution per corpus ("null" marks pairs without a type).
df.groupby("training_set")["p_type"].value_counts()

training_set  p_type   
drugbank      null         22217
              effect        1535
              mechanism     1257
              advise         818
              int            178
medline       null          1555
              effect         152
              mechanism       62
              int             10
              advise           8
Name: p_type, dtype: int64

In [12]:
# Summary statistics of both label columns, split by corpus.
df[["training_set", "p_ddi", "p_type"]].groupby("training_set").describe()

Unnamed: 0_level_0,p_ddi,p_ddi,p_ddi,p_ddi,p_type,p_type,p_type,p_type
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
training_set,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
drugbank,26005,2,False,22216,26005,5,,22217
medline,1787,2,False,1555,1787,5,,1555


### Words between entity pairs

In [17]:
def tokenize(txt, tokenizer=None):
    """Split ``txt`` into tokens annotated with their character offsets.

    Parameters
    ----------
    txt : str
        Text to tokenize.
    tokenizer : callable, optional
        Function mapping a string to a sequence of token strings. Defaults to
        ``word_tokenize``.

    Returns
    -------
    list of tuple
        ``(token, start, end)`` triples where ``start`` and ``end`` are
        inclusive character offsets into ``txt``.

    Notes
    -----
    Some tokenizers rewrite double quotes as `` and ''. Those tokens do not
    occur verbatim in ``txt``, so they are also searched for as ``"``; the
    reported span then covers the single quote character. A token that cannot
    be located at all is reported at the current scan position, and the scan
    position is left unchanged so later tokens are still searched from the
    right place.
    """
    if tokenizer is None:
        tokenizer = word_tokenize
    cursor = 0
    tks = []
    for t in tokenizer(txt):
        candidates = (t, '"') if t in ("``", "''") else (t,)
        # Take the earliest match at or after the cursor among the candidates.
        start, surface = -1, t
        for candidate in candidates:
            pos = txt.find(candidate, cursor)
            if pos != -1 and (start == -1 or pos < start):
                start, surface = pos, candidate
        if start == -1:
            # Without this guard the cursor would jump back to the beginning
            # of the text and corrupt the offsets of all following tokens.
            tks.append((t, cursor, cursor + len(t) - 1))
            continue
        tks.append((t, start, start + len(surface) - 1))
        cursor = start + len(surface)
    return tks

In [38]:
# directories with files to process
datadirs = [datadir_drugbank, datadir_medline]

# Build one record per (typed interacting pair, sentence token). Each record
# says whether the token lies before the first entity, between the two
# entities, or after the second entity of that pair.
items = []
for datadir in datadirs:
    # listdir() order is arbitrary and the directory may hold non-XML entries
    # (e.g. ``.DS_Store``), so sort the names and keep only XML files.
    for f in sorted(listdir(datadir)):
        if not f.lower().endswith(".xml"):
            continue
        # parse XML file, obtaining a DOM tree
        tree = parse(os.path.join(datadir, f))
        # process each sentence in the file
        for s in tree.getElementsByTagName("sentence"):
            s_id = s.attributes["id"].value  # get sentence id
            s_txt = s.attributes["text"].value  # get sentence text
            tokens = tokenize(s_txt)
            # entity id -> pieces of its charOffset attribute split on "-";
            # the first and last pieces are used below as start and end.
            entities = {
                ent.attributes["id"].value: ent.attributes["charOffset"].value.split("-")
                for ent in s.getElementsByTagName("entity")
            }
            # for each pair in the sentence, extract entities
            # and words in between
            for p in s.getElementsByTagName("pair"):
                id_e1 = p.attributes["e1"].value
                id_e2 = p.attributes["e2"].value
                ddi = p.attributes["ddi"].value
                # only pairs marked as interacting and carrying a type are used
                if ddi != "true" or "type" not in p.attributes:
                    continue

                e1_start = int(entities[id_e1][0])
                e1_end = int(entities[id_e1][-1])
                e2_start = int(entities[id_e2][0])
                e2_end = int(entities[id_e2][-1])
                p_type = p.attributes["type"].value

                for t in tokens:
                    # Evaluated in this order so at most one flag is True,
                    # mirroring an if / elif / elif chain.
                    between = t[1] > e1_end and t[2] < e2_start
                    before = (not between) and t[2] < e1_start
                    after = (not between) and (not before) and t[1] > e2_end
                    items.append({"s_id": s_id, "p_type": p_type, "token": t[0], "between": between, "before": before, "after": after})

df_words = pd.DataFrame(items)

In [39]:
# Show (rows, columns) of the token table, then its first five rows.
print((len(df_words.index), len(df_words.columns)))
df_words.head(5)

(145698, 6)


Unnamed: 0,after,before,between,p_type,s_id,token
0,False,True,False,mechanism,DDI-DrugBank.d481.s0,Milk
1,False,True,False,mechanism,DDI-DrugBank.d481.s0,","
2,False,True,False,mechanism,DDI-DrugBank.d481.s0,milk
3,False,True,False,mechanism,DDI-DrugBank.d481.s0,products
4,False,True,False,mechanism,DDI-DrugBank.d481.s0,","


In [40]:
# Summary of the three position flags for each interaction type.
df_words[["p_type", "after", "before", "between"]].groupby("p_type").describe()

Unnamed: 0_level_0,after,after,after,after,before,before,before,before,between,between,between,between
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
p_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
advise,26513,2,False,17849,26513,2,False,19955,26513,2,False,17184
effect,58984,2,False,39033,58984,2,False,46607,58984,2,False,36825
int,10609,2,False,5650,10609,2,False,9657,10609,2,False,6341
mechanism,49592,2,False,30769,49592,2,False,39116,49592,2,False,32304


In [51]:
ps = PorterStemmer()
lem = WordNetLemmatizer()

# Tag, stem and lemma depend only on the token string (each token is tagged
# on its own, without sentence context). Compute them once per distinct token
# and map them back instead of repeating the NLTK calls for every row.
unique_tokens = df_words["token"].unique()
pos_by_token = {tok: nltk.pos_tag([tok])[0][1] for tok in unique_tokens}
stem_by_token = {tok: ps.stem(tok) for tok in unique_tokens}
lemma_by_token = {tok: lem.lemmatize(tok) for tok in unique_tokens}

df_words["lower"] = df_words["token"].str.lower()
df_words["PoS"] = df_words["token"].map(pos_by_token)
df_words["stemm"] = df_words["token"].map(stem_by_token)
df_words["lema"] = df_words["token"].map(lemma_by_token)

In [63]:
# English stop words from the NLTK corpus, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Drop rows whose lower-cased token is a stop word. The column is selected
# with brackets so the lookup cannot be confused with a DataFrame attribute.
df_words = df_words[~df_words["lower"].isin(stop_words)]

In [71]:
# Drop rows whose token is a single ASCII punctuation character.
# Multi-character tokens made of punctuation are kept.
df_words = df_words[~df_words["lower"].isin(list(string.punctuation))]

In [72]:
# Preview the filtered token table.
df_words.head(5)

Unnamed: 0,after,before,between,p_type,s_id,token,PoS,stemm,lema,lower
0,False,True,False,mechanism,DDI-DrugBank.d481.s0,Milk,NN,milk,Milk,milk
2,False,True,False,mechanism,DDI-DrugBank.d481.s0,milk,NN,milk,milk,milk
3,False,True,False,mechanism,DDI-DrugBank.d481.s0,products,NNS,product,product,products
6,False,False,False,mechanism,DDI-DrugBank.d481.s0,calcium-rich,JJ,calcium-rich,calcium-rich,calcium-rich
7,False,False,True,mechanism,DDI-DrugBank.d481.s0,foods,NNS,food,food,foods


In [73]:
n = 10
# The n most frequent lower-cased tokens that occur after the second entity.
# Passing a renaming dict to SeriesGroupBy.agg is deprecated and was removed
# in pandas 1.0, so the group sizes are counted and the Series named instead.
(
    df_words[df_words["after"]]
    .groupby("lower")["PoS"]
    .size()
    .rename("PoS_count")
    .sort_values(ascending=False)
    .head(n)
    .reset_index()
)

is deprecated and will be removed in a future version
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,lower,PoS_count
0,may,508
1,drugs,259
2,increase,241
3,levels,206
4,plasma,190
5,inhibitors,182
6,increased,178
7,warfarin,165
8,sodium,164
9,acid,154


In [48]:
# Hand-picked words suggesting an "effect" interaction.
effect_clues = (
    "administered concurrently concomitantly increase increases increased effect "
    "effects prevent prevents prevented potentiate potentiates potentiated"
).split()
# Hand-picked words suggesting a "mechanism" interaction.
mechanism_clues = (
    "reduce reduces reduced decrease decreases decreased change changes changed "
    "elevate elevates elevated interfere interferes interfered"
).split()

In [50]:
# For every clue word print: word, Porter stem, WordNet lemma and the PoS tag
# obtained by tagging the word on its own. Effect clues come first.
for clue_list in (effect_clues, mechanism_clues):
    for clue in clue_list:
        print(clue, ps.stem(clue), lem.lemmatize(clue), nltk.pos_tag([clue])[0][1])

administered administ administered VBN
concurrently concurr concurrently RB
concomitantly concomitantli concomitantly RB
increase increas increase NN
increases increas increase NNS
increased increas increased VBN
effect effect effect NN
effects effect effect NNS
prevent prevent prevent NN
prevents prevent prevents NNS
prevented prevent prevented VBN
potentiate potenti potentiate NN
potentiates potenti potentiates NNS
potentiated potenti potentiated VBN
reduce reduc reduce VB
reduces reduc reduces NNS
reduced reduc reduced VBN
decrease decreas decrease NN
decreases decreas decrease NNS
decreased decreas decreased VBN
change chang change NN
changes chang change NNS
changed chang changed VBN
elevate elev elevate NN
elevates elev elevates NNS
elevated elev elevated VBN
interfere interfer interfere RB
interferes interfer interferes NNS
interfered interf interfered VBN
