In [1]:
import sys
from os import listdir
import os

from xml.dom.minidom import parse
import pandas as pd
import nltk

In [2]:
# Root folder of the training corpus. Override it with the DDI_DATA_DIR
# environment variable to run the notebook on another machine; the default
# keeps the location this notebook was originally written against.
DATA_ROOT = os.environ.get(
    "DDI_DATA_DIR",
    "/Users/lluccardoner/PycharmProjects/MET_AHLT_Lab_2019/data/Train",
)

# The trailing "" keeps the trailing path separator the original strings had.
datadir_drugbank = os.path.join(DATA_ROOT, "DrugBank", "")
datadir_medline = os.path.join(DATA_ROOT, "MedLine", "")

# Load data

In [3]:
def parse_XML_file(datadir):
    """Parse every XML file found directly inside ``datadir``.

    Args:
        datadir: Path of the directory to scan. A trailing path separator
            is accepted but not required.

    Returns:
        A list with one ``xml.dom.minidom`` document per ``.xml`` file,
        ordered by file name.
    """
    trees = []
    # listdir() returns entries in arbitrary order; sorting makes the row
    # order of everything built from these trees reproducible.
    for filename in sorted(listdir(datadir)):
        # Skip entries that are not XML (e.g. a macOS ".DS_Store"), which
        # would otherwise make the parser raise.
        if not filename.lower().endswith(".xml"):
            continue
        # os.path.join avoids a doubled separator when datadir ends with "/".
        trees.append(parse(os.path.join(datadir, filename)))
    return trees

def parsed_to_df(parsed):
    """Flatten parsed documents into one DataFrame row per ``pair`` element.

    Args:
        parsed: Iterable of ``xml.dom.minidom`` documents.

    Returns:
        A ``pandas.DataFrame`` with the keys ``s_id``, ``s_txt``, ``p_id``,
        ``p_ddi``, ``p_type``, ``p_e1`` and ``p_e2``. ``p_type`` is the
        string ``"null"`` unless the pair has ``ddi == "true"`` and carries
        a ``type`` attribute.
    """
    def attr(node, name):
        # Raises KeyError when the attribute is absent, like a direct lookup.
        return node.attributes[name].value

    rows = []
    for document in parsed:
        for sentence in document.getElementsByTagName("sentence"):
            sentence_id = attr(sentence, "id")
            sentence_text = attr(sentence, "text")
            for pair in sentence.getElementsByTagName("pair"):
                pair_id = attr(pair, "id")
                ddi_flag = attr(pair, "ddi")
                typed = ddi_flag == "true" and "type" in pair.attributes
                rows.append({
                    "s_id": sentence_id,
                    "s_txt": sentence_text,
                    "p_id": pair_id,
                    "p_ddi": ddi_flag,
                    "p_type": attr(pair, "type") if typed else "null",
                    "p_e1": attr(pair, "e1"),
                    "p_e2": attr(pair, "e2"),
                })

    return pd.DataFrame(rows)

def load_data(datadir, training_set):
    """Load one corpus directory into a DataFrame tagged with its name.

    Args:
        datadir: Directory passed to ``parse_XML_file``.
        training_set: Value stored in the ``training_set`` column of every
            row.

    Returns:
        The DataFrame built by ``parsed_to_df`` with the extra
        ``training_set`` column.
    """
    frame = parsed_to_df(parse_XML_file(datadir))
    frame['training_set'] = training_set
    return frame

In [4]:
# Load the DrugBank pairs and show (rows, columns).
df_drugbank = load_data(datadir=datadir_drugbank, training_set="drugbank")
df_drugbank.shape

(26005, 8)

In [5]:
# Load the MedLine pairs and show (rows, columns).
df_medline = load_data(datadir=datadir_medline, training_set="medline")
df_medline.shape

(1787, 8)

In [6]:
# Stack both corpora into a single frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same combined frame on old and new versions.
df = pd.concat([df_drugbank, df_medline], ignore_index=True)
df.shape

(27792, 8)

## Drug-Drug Interaction Pairs

In [7]:
# Preview the first five DrugBank rows.
df_drugbank.head(n=5)

Unnamed: 0,p_ddi,p_e1,p_e2,p_id,p_type,s_id,s_txt,training_set
0,True,DDI-DrugBank.d481.s0.e0,DDI-DrugBank.d481.s0.e1,DDI-DrugBank.d481.s0.p0,mechanism,DDI-DrugBank.d481.s0,"Milk, milk products, and calcium-rich foods or...",drugbank
1,True,DDI-DrugBank.d419.s0.e0,DDI-DrugBank.d419.s0.e1,DDI-DrugBank.d419.s0.p0,effect,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank
2,False,DDI-DrugBank.d419.s0.e0,DDI-DrugBank.d419.s0.e2,DDI-DrugBank.d419.s0.p1,,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank
3,False,DDI-DrugBank.d419.s0.e1,DDI-DrugBank.d419.s0.e2,DDI-DrugBank.d419.s0.p2,,DDI-DrugBank.d419.s0,The concurrent administration of allopurinol a...,drugbank
4,True,DDI-DrugBank.d419.s1.e0,DDI-DrugBank.d419.s1.e1,DDI-DrugBank.d419.s1.p0,effect,DDI-DrugBank.d419.s1,It is not known whether this potentiation of a...,drugbank


In [8]:
# Preview the first five MedLine rows.
df_medline.head(n=5)

Unnamed: 0,p_ddi,p_e1,p_e2,p_id,p_type,s_id,s_txt,training_set
0,False,DDI-MedLine.d69.s0.e0,DDI-MedLine.d69.s0.e1,DDI-MedLine.d69.s0.p0,,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
1,False,DDI-MedLine.d69.s0.e0,DDI-MedLine.d69.s0.e2,DDI-MedLine.d69.s0.p1,,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
2,False,DDI-MedLine.d69.s0.e1,DDI-MedLine.d69.s0.e2,DDI-MedLine.d69.s0.p2,,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,medline
3,False,DDI-MedLine.d69.s1.e0,DDI-MedLine.d69.s1.e1,DDI-MedLine.d69.s1.p0,,DDI-MedLine.d69.s1,The homodimeric disintegrin contortrostatin wa...,medline
4,False,DDI-MedLine.d69.s1.e0,DDI-MedLine.d69.s1.e2,DDI-MedLine.d69.s1.p1,,DDI-MedLine.d69.s1,The homodimeric disintegrin contortrostatin wa...,medline


### Total loaded rows

In [9]:
# Number of pair ids per corpus.
df.groupby("training_set")[["p_id"]].count()

Unnamed: 0_level_0,p_id
training_set,Unnamed: 1_level_1
drugbank,26005
medline,1787


In [10]:
# How many pairs are / are not interactions, per corpus.
df.groupby("training_set")["p_ddi"].value_counts()

training_set  p_ddi
drugbank      false    22216
              true      3789
medline       false     1555
              true       232
Name: p_ddi, dtype: int64

In [11]:
# Distribution of interaction types per corpus ("null" = no type recorded).
df.groupby("training_set")["p_type"].value_counts()

training_set  p_type   
drugbank      null         22217
              effect        1535
              mechanism     1257
              advise         818
              int            178
medline       null          1555
              effect         152
              mechanism       62
              int             10
              advise           8
Name: p_type, dtype: int64

In [12]:
# Summary statistics (count / unique / top / freq) of both label columns per corpus.
df[["training_set", "p_ddi", "p_type"]].groupby("training_set").describe()

Unnamed: 0_level_0,p_ddi,p_ddi,p_ddi,p_ddi,p_type,p_type,p_type,p_type
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
training_set,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
drugbank,26005,2,False,22216,26005,5,,22217
medline,1787,2,False,1555,1787,5,,1555
