In [1]:
import xml.dom.minidom
import pandas as pd
import numpy as np
import Bio
from Bio.KEGG import REST
import re

In [2]:
# Parse the KGML pathway file into a DOM document.
kgml_path = "hsa05415.xml"
file = xml.dom.minidom.parse(kgml_path)

### Entries

##### This script iterates through the "entry" elements in "hsa05415.xml" and creates a dataframe containing each entry's name, id, x & y coordinates and type.

In [19]:
# Build `data`: one row per <entry> element with a tidied display name, the
# entry id, the x/y coordinates of its <graphics> child and the entry type.
model = file.documentElement
graphics = model.getElementsByTagName("graphics")  # not used below; kept so the name stays defined
entries = model.getElementsByTagName("entry")
x_coordinates = []
y_coordinates = []
types = []
names = []
ids = []
new_names = []

# Read every entry together with its OWN first <graphics> child. Collecting
# entries and graphics in two separate document-wide passes only lines up when
# each entry has exactly one <graphics>; otherwise the lists differ in length
# and the DataFrame construction below fails.
for entry in entries:
    types.append(entry.getAttribute("type"))
    ids.append(entry.getAttribute("id"))
    entry_graphics = entry.getElementsByTagName("graphics")
    if len(entry_graphics) > 0:
        element = entry_graphics[0]
        x_coordinates.append(element.getAttribute("x"))
        y_coordinates.append(element.getAttribute("y"))
        names.append(element.getAttribute("name"))
    else:
        # Same value getAttribute() yields for an absent attribute.
        x_coordinates.append("")
        y_coordinates.append("")
        names.append("")

name_type = pd.DataFrame({"name": names, "type": types})

# Compound id -> display name, so each distinct compound triggers only one
# KEGG REST request even if it appears in several entries.
compound_name_cache = {}

# `row_index` is kept as the loop counter; after the loop it holds the index
# of the last entry processed.
for row_index, (row_name, row_type) in enumerate(zip(names, types)):
    if row_type == "gene":
        # Keep only the first of the comma-separated names.
        new_names.append(row_name.split(", ")[0])
    elif row_type == "compound":
        tidy_row_name = row_name.replace("...", "")
        if tidy_row_name not in compound_name_cache:
            compound = REST.kegg_get(tidy_row_name).readlines()
            # Second line of the KEGG record with the newline and ';' removed
            # and the first 12 characters dropped (presumably the "NAME" field
            # label and its padding - TODO confirm against a KEGG record).
            compound_name_row = compound[1]
            compound_name_cache[tidy_row_name] = compound_name_row.replace("\n", "").replace(";", "")[12:]
        new_names.append(compound_name_cache[tidy_row_name])
    else:
        new_names.append(row_name)

data = pd.DataFrame({"name": new_names, "id": ids, "x": x_coordinates, "y": y_coordinates, "type": types})
data.index = np.arange(1, len(data)+1) #start indexing from 1 instead of 0


In [36]:
# Inspection only: shows the value `row_index` was left with by the
# name-tidying loop in the entries cell (the index of the last row processed).
row_index


162

##### The dataframe is exported as .json and copied into index.html (currently manually, will attempt to automate in future). The information in the .json file is used in the javascript file to place shapes at every entry's location, pick a colour for the shape based on the entry's "type" and place text with the entry's "name" next to it on the canvas.

In [34]:
# Write the entries table to disk in pandas' "table" JSON layout.
# to_json() returns None when it is given a path, so `dataframe` is None here.
entries_export_path = './export_hsa05415.json'
dataframe = data.to_json(entries_export_path, orient='table')

### Relationships

##### The script below iterates through the "relation" elements of "hsa05415.xml" and stores their "entry1" and "entry2" attributes in lists. Two for loops then iterate through the entry1 & entry2 lists, locate each id in the dataframe above ("data") and append its coordinates to lists. A dataframe ("relationships_data") is then created containing the start and end ids and the corresponding x & y coordinates. This cell requires the previous cells to have been run first.

In [47]:
# Build `relationships_data`: for every <relation> element, the ids of its two
# entries ("entry1" -> "entry2") plus the x/y coordinates of each, taken from `data`.
relationships = model.getElementsByTagName("relation")
relations_start = []
relations_end = []
start_x = []
start_y = []
end_x = []
end_y = []
full_start = []  # not populated in this cell
full_end = []    # not populated in this cell

for relation in relationships:
    relations_start.append(relation.getAttribute("entry1"))
    relations_end.append(relation.getAttribute("entry2"))

# Index the coordinates by entry id once instead of scanning the whole frame
# four times per relation. setdefault keeps the FIRST row for a repeated id,
# which is the row `.values[0]` on a boolean mask would return.
coordinates_by_id = {}
for entry_id, entry_x, entry_y in zip(data["id"], data["x"], data["y"]):
    coordinates_by_id.setdefault(entry_id, (entry_x, entry_y))

# Fail with a readable message rather than an IndexError from an empty selection.
unknown_ids = sorted({rid for rid in relations_start + relations_end if rid not in coordinates_by_id})
if unknown_ids:
    raise KeyError(f"relation elements refer to entry ids missing from `data`: {unknown_ids}")

for entry1 in relations_start:
    entry1_x, entry1_y = coordinates_by_id[entry1]
    start_x.append(entry1_x)
    start_y.append(entry1_y)

for entry2 in relations_end:
    entry2_x, entry2_y = coordinates_by_id[entry2]
    end_x.append(entry2_x)
    end_y.append(entry2_y)


relationships_data = pd.DataFrame({"start": relations_start, "start_x": start_x, "start_y": start_y, "end": relations_end, "end_x": end_x, "end_y": end_y})
relationships_data.index = np.arange(1, len(relationships_data)+1) #start indexing from 1 instead of 0
relationships_data

Unnamed: 0,start,start_x,start_y,end,end_x,end_y
1,283,898,124,532,1077,92
2,532,1077,92,470,1172,120
3,283,898,124,533,1077,117
4,283,898,124,724,1140,144
5,62,705,642,44,805,575
...,...,...,...,...,...,...
67,611,1243,1219,603,1391,1277
68,611,1243,1219,604,1391,1307
69,611,1243,1219,605,1391,1328
70,44,805,575,507,495,749


##### The dataframe is exported to a .json file which is copied into the index.html file. The information is then used to put arrows between related nodes on the canvas. 

In [16]:
# Write the relations table to disk in pandas' "table" JSON layout.
# to_json() returns None when it is given a path, so `relationships_dataframe` is None here.
relations_export_path = './relations_export_hsa05415.json'
relationships_dataframe = relationships_data.to_json(relations_export_path, orient='table')

### Reactions

##### The script below creates a list of dictionaries from the "reaction" elements and their children in "hsa05415.xml", one dictionary per product. This was necessary because some reactions have multiple products.

In [18]:
# Flatten the <reaction> elements into a list of records, one per <product>:
# the reaction id, the id of the reaction's first <substrate>, and the product id.
# Despite its name, `reaction_dataframe` is a plain list of dicts.
# NOTE(review): only the first <substrate> of each reaction is recorded; any
# further substrates are not represented - confirm this is intended.
reaction_elements = model.getElementsByTagName("reaction")

reaction_dataframe = [
    {
        "reaction_id" : str(rxn.getAttribute("id")),
        "substrate_id" : str(rxn.getElementsByTagName("substrate")[0].getAttribute("id")),
        "product_id" : str(prod.getAttribute("id"))
    }
    for rxn in reaction_elements
    for prod in rxn.getElementsByTagName("product")
]

reaction_dataframe

[]

##### The script below iterates through the above list of dictionaries and matches substrate and product ids with the coordinates in the first dataframe ("data"). The information is then put into another dataframe ("reaction_coordinates_dataframe").

In [8]:
# Build `reaction_coordinates_dataframe`: for every record in
# `reaction_dataframe`, the substrate and product ids plus the x/y coordinates
# of each, taken from `data`.
substrate_x = []
substrate_y = []
product_x = []
product_y = []
substrate_ids = []
product_ids = []

# Index the coordinates by entry id once instead of scanning the whole frame
# four times per record. setdefault keeps the FIRST row for a repeated id,
# which is the row `.values[0]` on a boolean mask would return.
coordinates_by_id = {}
for entry_id, entry_x, entry_y in zip(data["id"], data["x"], data["y"]):
    coordinates_by_id.setdefault(entry_id, (entry_x, entry_y))

for item in reaction_dataframe:
    substrate_id = item.get("substrate_id")
    product_id = item.get("product_id")
    # Fail with a readable message rather than an IndexError from an empty selection.
    for needed_id in (substrate_id, product_id):
        if needed_id not in coordinates_by_id:
            raise KeyError(f"reaction record refers to entry id missing from `data`: {needed_id!r}")

    substrate_ids.append(substrate_id)
    found_x, found_y = coordinates_by_id[substrate_id]
    substrate_x.append(found_x)
    substrate_y.append(found_y)

    product_ids.append(product_id)
    found_x, found_y = coordinates_by_id[product_id]
    product_x.append(found_x)
    product_y.append(found_y)

reaction_coordinates_dataframe = pd.DataFrame({"substrate_ids": substrate_ids, "substrate_x": substrate_x, "substrate_y": substrate_y, "product_ids": product_ids, "product_x": product_x, "product_y": product_y})
reaction_coordinates_dataframe.index = np.arange(1, len(reaction_coordinates_dataframe)+1) # start indexing from 1 instead of 0
reaction_coordinates_dataframe

Unnamed: 0,substrate_ids,substrate_x,substrate_y,product_ids,product_x,product_y
1,105,378,958,45,146,958
2,105,378,958,45,146,958
3,101,550,958,105,378,958
4,101,550,958,105,378,958
5,146,378,825,103,378,868
6,103,378,868,146,378,825
7,103,378,868,106,265,868
8,107,212,911,106,265,868
9,99,627,868,102,483,868
10,96,483,736,102,483,868


##### The above dataframe is then exported as .json and copied into index.html. The javascript file then uses this information to put arrows between substrates and products on the canvas.

In [9]:
# Write the reaction coordinates to disk in pandas' "table" JSON layout.
# The file is named after the pathway this notebook parses (hsa05415), matching
# the entries and relations exports; writing to the hsa00010 file name would
# overwrite another pathway's export with this pathway's reactions.
# to_json() returns None when it is given a path, so `x` is None here.
x = reaction_coordinates_dataframe.to_json('./reactions_export_hsa05415.json', orient='table')

### JSON for D3

In [56]:
import json

##### The script below creates a dataframe and exports it to JSON in a specific format used by the D3 library

In [1]:
# Assemble the node/link structure and write it to d3_data.json.
# Requires the entries and relations cells to have been run first
# (new_names, types, ids, relations_start, relations_end).

# One node per entry: display name, entry type as the group, and entry id.
nodes = [
    {"name": node_name, "group": node_type, "id": node_id}
    for node_name, node_type, node_id in zip(new_names, types, ids)
]

# One link per relation, from entry1 to entry2, all with the same weight.
links = [
    {"source": source_id, "target": target_id, "value": 1}
    for source_id, target_id in zip(relations_start, relations_end)
]

# Copy the lists so d3_data does not alias `nodes` / `links`.
d3_data = {"nodes": list(nodes), "links": list(links)}

json_string = json.dumps(d3_data)

with open('d3_data.json', 'w', encoding='utf-8') as f:
    json.dump(d3_data, f, ensure_ascii=False, indent=4)

NameError: name 'new_names' is not defined