In [58]:
import xml.dom.minidom
import pandas as pd
import numpy as np

In [59]:
# Parse "hsa00250.xml" into a minidom Document. The path is relative, so the
# XML file must sit in the notebook's working directory.
file = xml.dom.minidom.parse("hsa00250.xml")

### Entries

##### This script iterates through the "entry" elements in "hsa00250.xml" and creates a dataframe containing the entry's name, id, x & y coordinates and type.

In [69]:
# Build one row per <entry> element: "id" and "type" are attributes of the
# entry itself, while "name", "x" and "y" are attributes of the <graphics>
# element nested inside that entry.
model = file.documentElement
entries = model.getElementsByTagName("entry")
x_coordinates = []
y_coordinates = []
types = []
names = []
ids = []

for entry in entries:
    # Take the graphics from *this* entry instead of walking every <graphics>
    # element in the document in a separate loop. That keeps each row's
    # name/x/y paired with the right id/type even if some entry has no
    # graphics child or more than one.
    entry_graphics = entry.getElementsByTagName("graphics")
    graphic = entry_graphics[0] if entry_graphics else None

    types.append(entry.getAttribute("type"))
    ids.append(entry.getAttribute("id"))
    # An entry without graphics keeps its row (so id lookups still succeed)
    # but has no name or coordinates.
    names.append(graphic.getAttribute("name") if graphic is not None else None)
    x_coordinates.append(graphic.getAttribute("x") if graphic is not None else None)
    y_coordinates.append(graphic.getAttribute("y") if graphic is not None else None)

# Use the lists built above. The previous version referenced name_values,
# x_values, y_values and type_values, which are not defined anywhere in the
# notebook and raise NameError on a fresh kernel.
data = pd.DataFrame({"name": names, "id": ids, "x": x_coordinates, "y": y_coordinates, "type": types})
data.index = np.arange(1, len(data)+1) #start indexing from 1 instead of 0
data
  

Unnamed: 0,name,id,x,y,type
1,C00122,22,285,530,compound
2,C00042,23,284,587,compound
3,C00036,24,748,528,compound
4,C00026,25,680,587,compound
5,Citrate cycle (TCA cycle),26,511,558,map
...,...,...,...,...,...
103,"RIMKLA, FAM80A, NAAGS, NAAGS-II, NAAGS2",177,745,221,gene
104,Glyoxylate and dicarboxylate metabolism,178,484,973,map
105,Monobactam biosynthesis,181,345,189,map
106,Arginine biosynthesis,182,850,894,map


##### The dataframe is exported as .json and copied into index.html (currently manually, will attempt to automate in future). The information in the .json file is used in the javascript file to place shapes at every entry's location, pick a colour for the shape based on the entry's "type" and place text with the entry's "name" next to it on the canvas.

In [62]:
# Write the entries table to ./export.json using the "table" orientation.
# Because a path is supplied, to_json() writes the file and returns None, so
# `dataframe` is bound to None rather than to a DataFrame.
dataframe = data.to_json('./export.json', orient='table')

### Relationships

##### The script below iterates through the "relation" elements of "hsa00250.xml" and stores the "entry1" and "entry2" attributes in lists. Two for loops iterate through the entry1 & entry2 lists, locate the id in the above dataframe ("data") and append the coordinates to lists. A dataframe is then created ("relationships_data") containing the start and end ids and the corresponding x & y coordinates. Does not run without running the previous cells first.

In [70]:
# Collect the two endpoint ids of every <relation> element, then attach the
# coordinates of each endpoint from the entries table ("data").
relationships = model.getElementsByTagName("relation")
relations_start = []
relations_end = []
start_x = []
start_y = []
end_x = []
end_y = []

for relation in relationships:
    relations_start.append(relation.getAttribute("entry1"))
    relations_end.append(relation.getAttribute("entry2"))

# Build the id -> (x, y) table once instead of re-scanning "data" twice for
# every endpoint. setdefault keeps the first row seen for an id, matching the
# earlier ".values[0]" behaviour.
coordinates_by_id = {}
for entry_id, entry_x, entry_y in zip(data["id"], data["x"], data["y"]):
    coordinates_by_id.setdefault(entry_id, (entry_x, entry_y))

# An endpoint id that is missing from "data" raises KeyError naming that id.
for entry1 in relations_start:
    entry_x, entry_y = coordinates_by_id[entry1]
    start_x.append(entry_x)
    start_y.append(entry_y)

for entry2 in relations_end:
    entry_x, entry_y = coordinates_by_id[entry2]
    end_x.append(entry_x)
    end_y.append(entry_y)

# Use the lists built above. The previous version referenced start_values and
# end_values, which are not defined anywhere in the notebook and raise
# NameError on a fresh kernel.
relationships_data = pd.DataFrame({"start": relations_start, "start_x": start_x, "start_y": start_y, "end": relations_end, "end_x": end_x, "end_y": end_y})
relationships_data.index = np.arange(1, len(relationships_data)+1) #start indexing from 1 instead of 0
relationships_data

Unnamed: 0,start,start_x,start_y,end,end_x,end_y
1,68,143,227,63,55,426
2,68,143,227,61,143,395
3,75,142,597,61,143,395
4,75,142,597,63,55,426
5,58,416,442,59,359,442
...,...,...,...,...,...,...
290,178,484,973,105,763,758
291,178,484,973,102,763,716
292,178,484,973,93,551,672
293,178,484,973,99,542,719


##### The dataframe is exported to a .json file which is copied into the index.html file. The information is then used to put arrows between related nodes on the canvas. 

In [116]:
# Write the relations table to ./relations_export.json using the "table"
# orientation. to_json() returns None when given a path, so
# `relationships_dataframe` is bound to None rather than to a DataFrame.
relationships_dataframe = relationships_data.to_json('./relations_export.json', orient='table')

### Reactions

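##### The script below builds a list of dictionaries from the "reaction" elements and their children in "hsa00250.xml", with one dictionary per reaction–product pair. This was necessary because some reactions have multiple products. Only the first substrate of each reaction is recorded.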

In [71]:
reaction_elements = model.getElementsByTagName("reaction")

# One record per (reaction, product) pair. The substrate is always the first
# <substrate> child of the reaction; it is looked up inside the record so that
# a reaction without products never touches its substrate list.
reaction_dataframe = [
    {
        "reaction_id": str(reaction.getAttribute("id")),
        "substrate_id": str(reaction.getElementsByTagName("substrate")[0].getAttribute("id")),
        "product_id": str(product.getAttribute("id")),
    }
    for reaction in reaction_elements
    for product in reaction.getElementsByTagName("product")
]

reaction_dataframe

[{'reaction_id': '162', 'substrate_id': '32', 'product_id': '28'},
 {'reaction_id': '33', 'substrate_id': '31', 'product_id': '24'},
 {'reaction_id': '35', 'substrate_id': '28', 'product_id': '29'},
 {'reaction_id': '38', 'substrate_id': '29', 'product_id': '28'},
 {'reaction_id': '44', 'substrate_id': '28', 'product_id': '32'},
 {'reaction_id': '47', 'substrate_id': '28', 'product_id': '24'},
 {'reaction_id': '49', 'substrate_id': '28', 'product_id': '24'},
 {'reaction_id': '50', 'substrate_id': '30', 'product_id': '24'},
 {'reaction_id': '55', 'substrate_id': '28', 'product_id': '54'},
 {'reaction_id': '56', 'substrate_id': '28', 'product_id': '53'},
 {'reaction_id': '58', 'substrate_id': '54', 'product_id': '22'},
 {'reaction_id': '59', 'substrate_id': '53', 'product_id': '22'},
 {'reaction_id': '61', 'substrate_id': '27', 'product_id': '60'},
 {'reaction_id': '63', 'substrate_id': '27', 'product_id': '60'},
 {'reaction_id': '71', 'substrate_id': '28', 'product_id': '70'},
 {'reacti

##### The script below iterates through the above list of dictionaries and matches each substrate and product id with its coordinates in the first dataframe ("data"). The information is then put into another dataframe ("reaction_coordinates_dataframe").

In [73]:
def _first_xy(entry_id):
    """Return (x, y) from the first row of `data` whose "id" equals entry_id."""
    matches = data["id"] == entry_id
    return data.loc[matches, "x"].values[0], data.loc[matches, "y"].values[0]


# Ids in the same order as the reaction records.
substrate_ids = [item.get("substrate_id") for item in reaction_dataframe]
product_ids = [item.get("product_id") for item in reaction_dataframe]

# Resolve every id to its coordinates, then split the pairs into columns.
substrate_xy = [_first_xy(entry_id) for entry_id in substrate_ids]
product_xy = [_first_xy(entry_id) for entry_id in product_ids]

substrate_x = [xy[0] for xy in substrate_xy]
substrate_y = [xy[1] for xy in substrate_xy]
product_x = [xy[0] for xy in product_xy]
product_y = [xy[1] for xy in product_xy]

reaction_coordinates_dataframe = pd.DataFrame(
    {
        "substrate_ids": substrate_ids,
        "substrate_x": substrate_x,
        "substrate_y": substrate_y,
        "product_ids": product_ids,
        "product_x": product_x,
        "product_y": product_y,
    }
)
# Number the rows from 1 rather than 0.
reaction_coordinates_dataframe.index = np.arange(1, len(reaction_coordinates_dataframe) + 1)
reaction_coordinates_dataframe

Unnamed: 0,substrate_ids,substrate_x,substrate_y,product_ids,product_x,product_y
1,32,680,210,28,496,306
2,31,752,411,24,748,528
3,28,496,306,29,749,306
4,29,749,306,28,496,306
5,28,496,306,32,680,210
6,28,496,306,24,748,528
7,28,496,306,24,748,528
8,30,616,455,24,748,528
9,28,496,306,54,416,395
10,28,496,306,53,359,395


##### The above dataframe is then exported as .json and copied into index.html. The javascript file then uses this information to put arrows between substrates and products on the canvas.

In [56]:
# Write the reaction coordinates table to ./reactions_export.json using the
# "table" orientation. to_json() returns None when given a path, so `x` is
# bound to None.
x = reaction_coordinates_dataframe.to_json('./reactions_export.json', orient='table')