In [None]:
'''
author: Irsyad Adam
'''

  
  '''
  <ns0:atc-codes>
    <ns0:atc-code code="B01AE02">
      <ns0:level code="B01AE">Direct thrombin inhibitors</ns0:level>
      <ns0:level code="B01A">ANTITHROMBOTIC AGENTS</ns0:level>
      <ns0:level code="B01">ANTITHROMBOTIC AGENTS</ns0:level>
      <ns0:level code="B">BLOOD AND BLOOD FORMING ORGANS</ns0:level>
    </ns0:atc-code>
  </ns0:atc-codes>



<ns0:categories>
    <ns0:category>
      <ns0:category>Amino Acids, Peptides, and Proteins</ns0:category>
      <ns0:mesh-id>D000602</ns0:mesh-id>
    </ns0:category>
    <ns0:category>
      <ns0:category>Anticoagulants</ns0:category>
      <ns0:mesh-id>D000925</ns0:mesh-id>
    </ns0:category>
    '''

In [1]:
import xml.etree.ElementTree as et

#path of the xml file to load (presumably the DrugBank full-database export - confirm)
filename = "fulldatabase.xml"
#parse the whole document into an in-memory ElementTree
tree = et.parse(filename)
#root element of the document; construct_csv iterates its children as drug entries
root = tree.getroot()

In [3]:
import pandas as pd
import xml.etree.ElementTree as et

#########################################################
#helper functions to extract drugbank and categories

def get_drugbank_id(child, namespace) -> str:
    '''
    gets the primary drugbank id of a drug element
    @param child is a child of root, where root = ElementTree.parse(filename).getroot()
    @param namespace is the namespace of the xml document, given by namespace = root.tag.split('}')[0]+'}'
    @returns the text of the first <drugbank-id> child that carries a 'primary' attribute
    @raises Exception if no <drugbank-id> child carries a 'primary' attribute
    '''
    for element in child.findall(namespace + 'drugbank-id'):
        #ids without the 'primary' attribute are skipped so that a primary id
        #listed after them is still found
        if 'primary' in element.attrib:
            return str(element.text)

    #give up only after every candidate has been inspected; this also covers
    #elements that have no <drugbank-id> child at all
    raise Exception('drugbank id not found')

def filter_categories(child, filter_list, namespace) -> bool:
    '''
    tells whether a drug belongs to at least one of the wanted categories
    @param child is a child of root, where root = ElementTree.parse(filename).getroot()
    @param filter_list is the list of category names we want to keep
    @param namespace is the namespace of the xml document, given by namespace = root.tag.split('}')[0]+'}'
    @returns True if any category name of the drug also appears in filter_list, else False
    '''
    category_tag = namespace + 'category'

    #collect the text of every <category> leaf nested two levels below <categories>
    drug_categories = [
        str(leaf.text)
        for block in child.findall(namespace + 'categories')
        for entry in block
        for leaf in entry
        if leaf.tag == category_tag
    ]

    #walk the shorter list and look each name up in the longer one
    #(on equal lengths the drug's own categories are the ones walked)
    if len(drug_categories) <= len(filter_list):
        shorter, longer = drug_categories, filter_list
    else:
        shorter, longer = filter_list, drug_categories

    for name in shorter:
        if name in longer:
            return True
    return False



######################################################################################
#helper functions to grab uniprot identification numbers


def _collect_uniprot_ids(child, namespace, group_tag) -> list:
    '''
    shared walker behind the get_uniprot_* helpers
    @param child is the child of root
    @param namespace is the namespace prefix of the xml document
    @param group_tag is the container tag without namespace: 'targets', 'enzymes', 'carriers' or 'transporters'
    @returns a list, with list[0] holding the text of every <id> element and list[1] holding the 'id'
             attribute of every <polypeptide> element found directly under the container's entries.
             The two lists are filled independently, so they are not guaranteed to have the same length.
    '''
    id_tag = namespace + 'id'
    polypeptide_tag = namespace + 'polypeptide'

    entry_ids = []
    polypeptide_ids = []
    #container -> entry (e.g. <target>) -> fields of that entry
    for container in child.findall(namespace + group_tag):
        for entry in container:
            for field in entry:
                if field.tag == id_tag:
                    entry_ids.append(str(field.text))
                elif field.tag == polypeptide_tag:
                    polypeptide_ids.append(str(field.attrib['id']))
    return [entry_ids, polypeptide_ids]


def get_uniprot_targets(child, namespace) -> list:
    '''
    get uniprot targets
    @param child is the child of root
    @param namespace is namespace
    @returns a list, with list[0] being the <id> texts found under <targets>, and list[1] being
             the 'id' attributes of the <polypeptide> elements found under <targets>
    '''
    return _collect_uniprot_ids(child, namespace, 'targets')


def get_uniprot_enzymes(child, namespace) -> list:
    '''
    get uniprot enzymes
    @param child is the child of root
    @param namespace is namespace
    @returns a list, with list[0] being the <id> texts found under <enzymes>, and list[1] being
             the 'id' attributes of the <polypeptide> elements found under <enzymes>
    '''
    return _collect_uniprot_ids(child, namespace, 'enzymes')


def get_uniprot_carriers(child, namespace) -> list:
    '''
    get uniprot carriers
    @param child is the child of root
    @param namespace is namespace
    @returns a list, with list[0] being the <id> texts found under <carriers>, and list[1] being
             the 'id' attributes of the <polypeptide> elements found under <carriers>
    '''
    return _collect_uniprot_ids(child, namespace, 'carriers')


def get_uniprot_transporters(child, namespace) -> list:
    '''
    get uniprot transporters
    @param child is the child of root
    @param namespace is namespace
    @returns a list, with list[0] being the <id> texts found under <transporters>, and list[1] being
             the 'id' attributes of the <polypeptide> elements found under <transporters>
    '''
    return _collect_uniprot_ids(child, namespace, 'transporters')

######################################################################


def construct_csv(root, filter_list=None) -> pd.DataFrame:
    '''
    builds a dataframe of the drugs that belong to the wanted categories
    (despite the name, nothing is written to disk here)
    @param root is the root, given by ElementTree.parse(filename).getroot()
    @param filter_list is an optional list of category names to keep; when None the
           default list of drug categories defined in the body is used
    @returns a df with one row per kept drug and the columns drugbank_id, uniprotID and
             uniprot_identification_number; the last two columns hold one list per drug,
             concatenated in the order targets, enzymes, carriers, transporters
    '''
    #separate the namespace; a root tag without a '{uri}' prefix means no namespace at all
    namespace = root.tag.split('}')[0] + '}' if root.tag.startswith('{') else ''

    #default categories to keep
    if filter_list is None:
        filter_list = ['Anticoagulants', 'Antiplatelet Agents and Dual Antiplatelet Therapy', 'ACE Inhibitors', 'Angiotensin II Receptor Blockers', 'Angiotensin Receptor-Neprilysin Inhibitors',
                       'Beta Blockers', 'Calcium Channel Blockers', 'Cholesterol-lowering Medications', 'Digitalis Preparations', 'Diuretics', 'Vasodilators']

    #the order of this tuple fixes the order of the ids inside each row
    extractors = (get_uniprot_targets, get_uniprot_enzymes, get_uniprot_carriers, get_uniprot_transporters)

    #one entry per kept drug
    drugbank_id = []
    uniprot_identification_number = []
    uniprot_ID = []

    #iterate through the drugs
    for child in root:

        #skip drugs that are in none of the wanted categories
        if not filter_categories(child, filter_list, namespace):
            continue

        drugbank_id.append(get_drugbank_id(child, namespace))

        #each extractor walks the drug element, so call it once and reuse both halves
        identification_numbers = []
        polypeptide_ids = []
        for extract in extractors:
            found = extract(child, namespace)
            identification_numbers.extend(found[0])
            polypeptide_ids.extend(found[1])

        uniprot_identification_number.append(identification_numbers)
        uniprot_ID.append(polypeptide_ids)

    #assemble the dataframe
    data = {'drugbank_id': drugbank_id, 'uniprotID': uniprot_ID, 'uniprot_identification_number': uniprot_identification_number}

    return pd.DataFrame(data)

def get_uniID(df) -> list:
    '''
    flattens the per-drug protein lists of the df into a single list
    @param df is the pd.DataFrame; only its 'uniprotID' column is read
    @return is a flat list with every element of every entry of that column, in order (repeats are kept)
    '''
    return [protein for per_drug in df['uniprotID'] for protein in per_drug]

def txt_to_list(filepath) -> list:
    '''
    returns a list with newlines as delimiter
    @param filepath is the file name
    @returns a list with one string per line of the file, with the newline characters removed
             (blank lines are kept as empty strings)
    '''
    #the with-block closes the file even if reading fails
    with open(filepath, 'r') as handle:
        return [line.strip('\n') for line in handle]



In [4]:
#build the dataframe of the filtered drugs (construct_csv returns a pd.DataFrame; no csv file is written here)
df = construct_csv(root)
df

Unnamed: 0,drugbank_id,uniprotID,uniprot_identification_number
0,DB00001,[P00734],[BE0000048]
1,DB00006,"[P00734, P05164]","[BE0000048, BE0001075]"
2,DB00009,"[P00747, P02671, Q03405, P05121]","[BE0000211, BE0000538, BE0000717, BE0000240]"
3,DB00013,"[P00747, Q03405, P05121, P05120, P05154, P9816...","[BE0000211, BE0000717, BE0000240, BE0000969, B..."
4,DB00015,"[P00747, P02671, P05121]","[BE0000211, BE0000538, BE0000240]"
...,...,...,...
231,DB15366,[],[]
232,DB15536,[],[]
233,DB15861,[],[]
234,DB15880,[],[]


In [5]:
#flatten every drug's uniprotID list into one list (repeats are kept) and show how many entries it has
unid = get_uniID(df)
len(unid)

1446

In [6]:
#load the unified cardiac proteome, one entry per line of the text file, and show its size
#note: the variable is spelled "protiome"; the cell that filters the df refers to it by this exact name
unified_cardiac_protiome = txt_to_list('unified_cardiac_proteome.txt')
len(unified_cardiac_protiome)

25966

In [8]:
#keep, for every drug, only the proteins that also appear in the unified cardiac proteome
#and add them to the df as a new column

#a set makes each membership check constant-time instead of a scan over the whole proteome list
cardiac_proteome_lookup = set(unified_cardiac_protiome)

#iterate over the column itself, so the result does not depend on the df having a 0..n-1 index;
#the order of the proteins inside each row is preserved
df_column = [
    [protein for protein in drug_proteins if protein in cardiac_proteome_lookup]
    for drug_proteins in df['uniprotID']
]

df['cardiac proteins'] = df_column
df


Unnamed: 0,drugbank_id,uniprotID,uniprot_identification_number,cardiac proteins
0,DB00001,[P00734],[BE0000048],[P00734]
1,DB00006,"[P00734, P05164]","[BE0000048, BE0001075]","[P00734, P05164]"
2,DB00009,"[P00747, P02671, Q03405, P05121]","[BE0000211, BE0000538, BE0000717, BE0000240]","[P00747, P02671]"
3,DB00013,"[P00747, Q03405, P05121, P05120, P05154, P9816...","[BE0000211, BE0000717, BE0000240, BE0000969, B...","[P00747, P05120, P14543]"
4,DB00015,"[P00747, P02671, P05121]","[BE0000211, BE0000538, BE0000240]","[P00747, P02671]"
...,...,...,...,...
231,DB15366,[],[],[]
232,DB15536,[],[],[]
233,DB15861,[],[],[]
234,DB15880,[],[],[]
