Author: Irsyad Adam

In [1]:
import pandas as pd
from tqdm import tqdm
import requests

In [2]:
def extract_pmid_from_unid(unid) -> list:
    """
    downloads the flat-text uniprot entry of an id and grabs the pmids listed in it
    @param unid is the uniprot id whose entry is going to be downloaded
    @return is a list with the pmid strings of the entry, EXCLUDING those whose
            preceding lines mention NUCLEOTIDE SEQUENCE; the list is empty when the
            request does not come back with status 200
    """
    #number of lines above an RX line that are searched for the exclusion marker
    lookback = 3

    #get the url
    url = 'https://www.uniprot.org/uniprot/' + unid + '.txt'

    #check the response
    response = requests.get(url=url)

    #if not successful, report the status and hand back an empty result
    if response.status_code != 200:
        print('Error, Status Code: %s' % response.status_code)
        return []

    #new line delimiter
    lines = response.text.splitlines()

    pmid_list = []
    for i, line in enumerate(lines):
        #RX is the line code that carries pmids; match it at the START of the line so
        #that other lines which merely contain the letters 'RX' are never parsed
        if not line.startswith('RX'):
            continue

        #clamp the window so an RX line near the top cannot wrap around and
        #inspect the last lines of the entry through negative indices
        window = lines[max(0, i - lookback):i]
        if any('NUCLEOTIDE SEQUENCE' in previous for previous in window):
            continue

        #an RX line may hold several 'Key=value;' tokens, only PubMed ones are kept
        for token in line.split()[1:]:
            if token.startswith('PubMed='):
                pmid_list.append(token[len('PubMed='):].rstrip(';'))

    return pmid_list

def unid_pmid_to_df(io = "edge_list.csv") -> pd.DataFrame:
    """
    takes a csv file, gets the uniprot id column, and collects the pmids that
    extract_pmid_from_unid returns for every distinct uniprot id
    @param io is the csv file to get parsed (anything pd.read_csv accepts); it must
              have a UNIPROT_ID column
    @return df is the df with one row per distinct uniprot id: the id in UNIPROT_ID
               and the list returned for it in PMID
    """
    df = pd.read_csv(io)

    #distinct identifiers in order of first appearance, so the row order is the same
    #on every run; missing values are dropped because they cannot be turned into a url
    unid = df["UNIPROT_ID"].dropna().unique().tolist()
    print("---Import Completed---", flush = True)

    #one lookup per identifier, kept in the same order as unid
    pmid_list = [
        extract_pmid_from_unid(element)
        for element in tqdm(unid, desc = "Extracting PMIDs: ")
    ]

    #print
    print("Done", flush = True)
    return pd.DataFrame({"UNIPROT_ID" : unid, "PMID" : pmid_list})

In [3]:
import xml.etree.ElementTree as et

def create_root(io) -> et.Element:
    """
    parses an xml document and returns its root element
    @param io is the filepath (or an open file object) of the xml to parse
    @returns the root element associated with the xml
    """
    tree = et.parse(io)
    return tree.getroot()

def get_child_pmids(child, namespace) -> list:
    """
    given a drug, gets all pmids associated with the drug
    @param child is the child of root (iterated through the et.root)
    @param namespace is the '{uri}' prefix that is prepended to every tag name
    @returns a list of the non-empty pmid strings found under the child's
             general-references
    """
    pubmed_tag = namespace + "pubmed-id"
    pmids = []
    for references in child.findall(namespace + "general-references"):
        #levels walked here: general-references -> group -> entry -> field
        for group in references:
            for entry in group:
                for field in entry:
                    if field.tag != pubmed_tag:
                        continue
                    #an empty <pubmed-id/> has text None; skip it so that the
                    #result only ever holds real id strings
                    text = (field.text or "").strip()
                    if text:
                        pmids.append(text)

    return pmids


def get_dbID(child, namespace) -> str:
    """
    gets primary dbid from a child of xml root
    @param child is the child of xml root
    @param namespace is the '{uri}' prefix that is prepended to the tag name
    @returns a string: the text of the first drugbank-id element that carries a
             'primary' attribute
    @raises Exception when none of the child's drugbank-id elements is marked primary
            (this includes a child that has no drugbank-id element at all)
    """
    #look at every id tag, not just the first one
    for element in child.findall(namespace + 'drugbank-id'):
        #use primary drugbank id
        if 'primary' in element.attrib:
            return str(element.text)

    #only give up once all id tags have been inspected
    raise Exception('drugbank id not found')

def find_all_drugbank_pmids(io = "drugbank_database.xml", csv = "edge_list.csv") -> pd.DataFrame:
    """
    finds all pmids in drugbank given drugbank ids from the csv
    @param io is the filepath of the full drugbank xml
    @param csv is the csv file (anything pd.read_csv accepts) with a DRUGBANK_ID column
    @returns a df with one row per distinct drugbank id of the csv, in order of first
             appearance: the id in DRUGBANK_ID and, in PMID, the list of pmids found
             for that id in the xml (an empty list when the id is not in the xml)
    """
    print("---Importing Data---", flush = True)
    #read the csv
    df = pd.read_csv(csv)
    #distinct ids in order of first appearance; missing values are dropped
    dbid_list = df["DRUGBANK_ID"].dropna().unique().tolist()
    #set copy for constant-time membership tests while scanning the xml
    wanted = set(dbid_list)
    print("Done")

    print("----Importing XML---", flush = True)
    #iterate over the xml
    root = create_root(io)
    #seperate namespace: '{uri}' prefix of the root tag, or nothing when the
    #root tag is not namespace-qualified
    namespace = root.tag.split('}')[0] + '}' if '}' in root.tag else ''
    print("Done", flush = True)

    print("---Iterating Through Root---")
    #key every pmid list by the id it was read for, so ids and pmids cannot get
    #paired up in different orders when the df is built
    pmids_by_dbid = {}
    for child in tqdm(root, desc = "Searching XML"):
        dbid = get_dbID(child, namespace)
        if dbid in wanted:
            pmids_by_dbid.setdefault(dbid, []).extend(get_child_pmids(child, namespace))
    print("Done", flush = True)

    #line the pmid lists up with the csv ids
    pmid_list = [pmids_by_dbid.get(dbid, []) for dbid in dbid_list]
    return pd.DataFrame({"DRUGBANK_ID" : dbid_list, "PMID" : pmid_list})


In [4]:
# Build the uniprot-id -> pmid table from the default edge_list.csv (one web request
# per distinct id, hence the long runtime), save it as csv without the index column,
# and leave the frame as the last expression so the notebook displays it.
unid_pmid_df = unid_pmid_to_df()
unid_pmid_df.to_csv("edge_list_unid_and_pmid.csv", index = False)
unid_pmid_df

---Import Completed---


Extracting PMIDs: 100%|██████████| 828/828 [11:13<00:00,  1.23it/s]

Done





In [5]:
#runtime ~260 sec
# Build the drugbank-id -> pmid table from the default drugbank_database.xml and
# edge_list.csv, save it as csv without the index column, and leave the frame as the
# last expression so the notebook displays it.
dbid_pmid_df = find_all_drugbank_pmids()
dbid_pmid_df.to_csv("edge_list_dbid_and_pmid.csv", index = False)
dbid_pmid_df

---Importing Data---
Done
----Importing XML---
Done
---Iterating Through Root---


Searching XML: 100%|██████████| 14315/14315 [00:03<00:00, 4721.16it/s]

Done



