In [1]:
import pandas as pd

In [2]:
# Open and read the XML file as a string

def get_xml_text(path:str)-> str: 
    with open(path, "r", encoding="utf-8") as f:
        xml_content = f.read()
    return xml_content

path = "/home/u111169/wrkdir/mgh-project/datasets/drugbank_data/full_database.xml"

xml_content = get_xml_text(path)

In [3]:
import re

pattern = r'(<drug\s+type="[^"]+"\s+created="[^"]+"\s+updated="[^"]+">)'


def get_each_drug_from_xml(xml_content , pattern ):
    matches = list(re.finditer(pattern, xml_content))

    # Extract all <drug> sections
    drug_sections = []
    for i in range(len(matches)):
        start_pos = matches[i].start()
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(xml_content)

        # Extract the content between this match and the next one
        drug_sections.append(xml_content[start_pos:end_pos])
    return drug_sections


In [4]:
drug_sections = get_each_drug_from_xml(xml_content , pattern )

In [15]:
import re 
def extract_drugbank_id(text): 
    pattern = r'(<drug\s+type="[^"]+"\s+created="[^"]+"\s+updated="[^"]+">)\s*<drugbank-id primary="true">([^<]+)</drugbank-id>' 
    match = re.search(pattern, text) 
    if match: 
        return match.group(2) 
    return None # Example usage: sample_text = '<drug type="biotech" created="2023-01-01" updated="2024-01-01">\n <drugbank-id primary="true">DB12345</drugbank-id>' result = extract_drugbank_id(sample_text) print(result) # Output: DB12345

In [16]:
extract_drugbank_id(drug_sections[555])

'DB00570'

In [17]:
import re

def extract_drug_type(text):
    pattern = r'(<drug\s+type="([^"]+)"\s+created="[^"]+"\s+updated="[^"]+">)'
    match = re.search(pattern, text)
    if match:
        drug_type = match.group(2)
        if drug_type in ["biotech", "small molecule"]:
            return drug_type
    return None

In [18]:
extract_drug_type(drug_sections[55])

'biotech'

In [19]:
count = drug_sections[17429].count("<statte/>")
count

0

In [20]:
columns = ['state',
 'groups',
 'general-references',
 'classification',
 'salts',
 'synonyms',
 'products',
 'international-brands',
 'mixtures',
 'packagers',
 'manufacturers',
 'prices',
 'categories',
 'affected-organisms',
 'dosages',
 'atc-codes',
 'ahfs-codes',
 'pdb-entries',
 'patents',
 'food-interactions',
 'drug-interactions',
 'sequences',
 'calculated-properties',
 'experimental-properties',
 'external-identifiers',
 'external-links',
 'pathways',
 'reactions',
 'snp-effects',
 'snp-adverse-drug-reactions',
 'targets',
 'enzymes',
 'carriers',
 'transporters']


# Define the main and state patterns
main_first = r'(<drug\s+type="([^"]+)"\s+created="[^"]+"\s+updated="[^"]+">)'
state_first = r'<state>(.*?)</state>'
op_first = '<state/>'

main_second =  r'<general-references>(.*?)</general-references>'
op_main_second = '<general-references/>'
state_second = r'<classification>(.*?)</classification>'
op_state_second = '<classification/>'


main_third = r'<pdb-entries>(.*?)</pdb-entries>'
op_main_third = '<pdb-entries/>'
state_third = r'<patents>(.*?)</patents>'
op_state_third = '<patents/>'

In [21]:
def extract_text_between_main(main ,state,op , text):
    pattern_one = re.compile(state, re.DOTALL)
    pattern_two = re.compile(op, re.DOTALL)
    if pattern_one:
        # Use a regex pattern to find text between the main and state
        pattern = re.compile(f'{main}.*?{state}', re.DOTALL)
        match = pattern.search(text)
    
        # If a match is found, extract the text between the main and state
        if match:
            return match.group(0)
        else:
            return None
    elif pattern_two:
        # Use a regex pattern to find text between the main and state
        pattern = re.compile(f'{main}.*?{op}', re.DOTALL)
        match = pattern.search(text)
    
        # If a match is found, extract the text between the main and state
        if match:
            return match.group(0)
        else:
            return None 
    else: 
        return None 

def extract_text_between(main,op_main ,state ,op_state , text):
    pattern_main = re.compile(state, re.DOTALL)
    pattern_op_main = re.compile(op_main, re.DOTALL)
    pattern_state = re.compile(state, re.DOTALL)
    pattern_op_state = re.compile(op_state, re.DOTALL)
    
    if pattern_main:
        if pattern_state:
            # Use a regex pattern to find text between the main and state
            pattern = re.compile(f'{pattern_main}.*?{pattern_state}', re.DOTALL)
            match = pattern.search(text)

            # If a match is found, extract the text between the main and state
            if match:
                return match.group(0)
            else:
                return None
        elif pattern_op_state: 
            # Use a regex pattern to find text between the main and state
            pattern = re.compile(f'{pattern_main}.*?{pattern_op_state}', re.DOTALL)
            match = pattern.search(text)

            # If a match is found, extract the text between the main and state
            if match:
                return match.group(0)
            else:
                return None
        else: 
            return None
    elif pattern_op_main:
        if pattern_state:
            # Use a regex pattern to find text between the main and state
            pattern = re.compile(f'{pattern_op_main}.*?{pattern_state}', re.DOTALL)
            match = pattern.search(text)

            # If a match is found, extract the text between the main and state
            if match:
                return match.group(0)
            else:
                return None
        elif pattern_op_state: 
            # Use a regex pattern to find text between the main and state
            pattern = re.compile(f'{pattern_op_main}.*?{pattern_op_state}', re.DOTALL)
            match = pattern.search(text)

            # If a match is found, extract the text between the main and state
            if match:
                return match.group(0)
            else:
                return None
        else: 
            return None
    else: 
        return None 

In [24]:
def get_df_for_each_drug(text):

    d = {}
    for i in columns: 
        pattern = f"<{i}>(.*?)</{i}>"
        pattern = re.compile(pattern, re.DOTALL)
        d[i] = [pattern.findall(text)]
    df = pd.DataFrame(d)
    df["intro"] = [str(extract_text_between_main(main_first , state_first,op_first , text))]
    df["tox_meta_molinform"] = [str(extract_text_between(main_second,op_main_second ,state_second ,op_state_second , text))]
    df["tox_meta_molinform"] = [str(extract_text_between(main_third,op_main_third ,state_third ,op_state_third , text))]
    df["drug_bank_id"] = [extract_drugbank_id(text)]
    df["drug_type"] = [extract_drug_type(text)]
    return df

In [25]:

def get_one_df_for_all_drugs(drug_sections):
    l = [] 
    for drug in drug_sections: 
        l.append(get_df_for_each_drug(drug))
    final_df = pd.concat(l , axis=0)
    return final_df

In [26]:
df = get_one_df_for_all_drugs(drug_sections)

In [38]:
df.to_csv("drugbank_df.csv")#, index=False

In [39]:
df_small = df[df["drug_type"]=="small molecule"]

In [40]:
df_small

Unnamed: 0,state,groups,general-references,classification,salts,synonyms,products,international-brands,mixtures,packagers,...,snp-effects,snp-adverse-drug-reactions,targets,enzymes,carriers,transporters,intro,tox_meta_molinform,drug_bank_id,drug_type
0,[solid],[\n <group>approved</group>\n <group>inv...,[\n <articles>\n <article>\n <r...,[\n <description>This compound belongs to t...,[],"[\n <synonym language=""english"" coder=""inn/...",[\n <product>\n <name>Angiomax</name>\...,[\n <international-brand>\n <name>Angi...,[],[\n <packager>\n <name>Ben Venue Labor...,...,[],[],"[\n <target position=""1"">\n <id>BE0000...",[\n <uniprot-id>P00734</uniprot-id>\n ...,[],[],"<drug type=""small molecule"" created=""2005-06-1...",,DB00006,small molecule
0,[solid],[\n <group>approved</group>\n ],[\n <articles/>\n <textbooks/>\n <lin...,[\n <description>This compound belongs to t...,"[\n <salt>\n <drugbank-id primary=""tru...","[\n <synonym language=""english"" coder=""inn/...",[\n <product>\n <name>Zoladex</name>\n...,[],[],[\n <packager>\n <name>AstraZeneca Inc...,...,[],[],"[\n <target position=""1"">\n <id>BE0000...",[],[],[],"<drug type=""small molecule"" created=""2005-06-1...",,DB00014,small molecule
0,[liquid],[\n <group>approved</group>\n ],[\n <articles>\n <article>\n <r...,[\n <description>This compound belongs to t...,[],"[\n <synonym language=""english"" coder="""">Ba...",[\n <product>\n <name>Antibiotic Cream...,[\n <international-brand>\n <name>Sofr...,[\n <mixture>\n <name>Neomycin and Pol...,[\n <packager>\n <name>Johnson &amp; J...,...,[],[],"[\n <target position=""1"">\n <id>BE0000...",[],[],"[\n <transporter position=""1"">\n <id>B...","<drug type=""small molecule"" created=""2005-06-1...",,DB00027,small molecule
0,[solid],[\n <group>approved</group>\n ],[\n <articles>\n <article>\n <r...,[],"[\n <salt>\n <drugbank-id primary=""tru...","[\n <synonym language=""english"" coder="""">1-...",[\n <product>\n <name>Apo-desmopressin...,[\n <international-brand>\n <name>Adiu...,[],[\n <packager>\n <name>Amerisource Hea...,...,[],[],"[\n <target position=""1"">\n <id>BE0000...","[\n <enzyme position=""1"">\n <id>BE0000...",[],[],"<drug type=""small molecule"" created=""2005-06-1...",,DB00035,small molecule
0,[solid],[\n <group>approved</group>\n <group>inv...,[\n <articles/>\n <textbooks/>\n <lin...,[\n <description>This compound belongs to t...,"[\n <salt>\n <drugbank-id primary=""tru...","[\n <synonym language=""english"" coder=""inn/...",[\n <product>\n <name>Cetrorelix</name...,[],[],[\n <packager>\n <name>Baxter Internat...,...,[],[],"[\n <target position=""1"">\n <id>BE0000...",[],[],[],"<drug type=""small molecule"" created=""2005-06-1...",,DB00050,small molecule
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,[],[\n <group>investigational</group>\n ],[\n <articles/>\n <textbooks/>\n <lin...,[],[],"[\n <synonym language=""english"" coder="""">TD...",[],[],[],[],...,[],[],[],[],[],[],,,DB19450,small molecule
0,[],[\n <group>investigational</group>\n ],[\n <articles/>\n <textbooks/>\n <lin...,[],[],[],[],[],[],[],...,[],[],[],[],[],[],,,DB19451,small molecule
0,[],[\n <group>investigational</group>\n ],[\n <articles/>\n <textbooks/>\n <lin...,[],[],"[\n <synonym language=""english"" coder="""">9-...",[],[],[],[],...,[],[],[],[],[],[],,,DB19454,small molecule
0,[],[\n <group>investigational</group>\n ],[\n <articles/>\n <textbooks/>\n <lin...,[],[],"[\n <synonym language=""english"" coder="""">HE...",[],[],[],[],...,[],[],[],[],[],[],,,DB19455,small molecule


In [None]:
from huggingface_hub import HfApi, Repository, notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
api = HfApi()

repo_name = "drugbank_data_csv"  # Choose a unique name
repo_url = api.create_repo(repo_name)

repo = Repository(
    local_dir="/content/drive/MyDrive/Papers/Original_Papers/Llm drug prediction/new_dugbank",
    #clone_url=repo_url,
    private=True  # Set to False if you want it public
)

api.upload_folder(
    folder_path="/content/drive/MyDrive/Papers/Original_Papers/Llm drug prediction/new_dugbank",
    repo_id="Moreza009/drugbank_data",
    #repo_type="space",
)