### Install Dependencies

All related functions are defined in the `lib/graph.py` module, which must be imported before running the graph analysis. We also assume that you have followed the instructions in the [README](../README.md) file and have installed all the required dependencies.

In [1]:
import os
import sys

# Resolve the "lib" directory under the parent of the current working
# directory and append it to sys.path. This has to happen before the
# `from graph import ...` statement below, which relies on that path entry.
libdir = os.path.join(os.path.dirname(os.getcwd()), "lib")
sys.path.append(libdir)

# Helpers used by the cells that follow.
from graph import (
    biomedgps2stat,
    make_wide_format,
    transposed_array,
    snake_case,
    gen_layout,
)

In [2]:
import os

# The project root is taken to be the parent of the current working directory.
root_dir = os.path.dirname(os.getcwd())

# Adjust this if your data lives somewhere else.
data_dir = os.path.join(root_dir, "graph_data")

# Both input tables sit side by side inside the data directory.
entity_file, relation_file = (
    os.path.join(data_dir, filename)
    for filename in ("entities.tsv", "relations.tsv")
)

### Prepare the data

In [3]:
import pandas as pd


def _order_insensitive(pair_label):
    """Sort the ":"-separated parts of *pair_label* and join them back with ":"."""
    return ":".join(sorted(pair_label.split(":")))


# Load both tab-separated tables, then derive the node and edge summaries.
entities = pd.read_csv(entity_file, sep="\t")
relations = pd.read_csv(relation_file, sep="\t")
node_stat, edge_stat = biomedgps2stat(entities, relations)

# Label every edge row as "start_entity_type:end_entity_type", then sort the
# parts so that "A:B" and "B:A" collapse into one and the same label.
edge_stat["simple_relation_type"] = (
    edge_stat["start_entity_type"] + ":" + edge_stat["end_entity_type"]
).apply(_order_insensitive)

  entities = pd.read_csv(entity_file, sep="\t")


In [4]:
# Show both summary tables as this cell's output.
node_stat, edge_stat

(           entity_type         resource  entity_count
 0              Anatomy             MESH          1315
 1              Anatomy           UBERON         15822
 2    BiologicalProcess               GO         30571
 3    BiologicalProcess         Hetionet           387
 4    CellularComponent               GO          4485
 5    CellularComponent         Hetionet            32
 6             Compound         DrugBank        265567
 7             Compound             MESH           186
 8              Disease             DOID             1
 9              Disease             MESH          3518
 10             Disease            MONDO         26582
 11                Gene           ENTREZ         93995
 12          Metabolite             HMDB        247967
 13   MolecularFunction               GO         12502
 14   MolecularFunction         Hetionet           143
 15             Pathway         Hetionet           294
 16             Pathway             KEGG          1057
 17       

### Chart 1: Number of nodes and edges in the graph

In [5]:
import plotly.graph_objects as go
import plotly.express as px

# Grand totals over all entity types and all relation types.
node_count = node_stat["entity_count"].sum()
edge_count = edge_stat["relation_count"].sum()

# A single pie trace contrasting the two totals; each slice is annotated with
# its raw value.
fig = go.Figure()
fig.add_trace(
    go.Pie(
        values=[node_count, edge_count],
        labels=["Node Count", "Edge Count"],
        textinfo="value",
    )
)
fig.update_layout(
    **gen_layout("Total Count", "Type", "Count", True),
)
fig.show()

### Chart 2: Number of nodes for each entity type

In [6]:
# Sum the entity counts for every (resource, entity type) pair.
chart2_data = node_stat.groupby(["resource", "entity_type"], as_index=False).sum()

# One bar per entity type, stacked and colored by the contributing resource.
fig2 = px.bar(
    chart2_data,
    x="entity_type",
    y="entity_count",
    color="resource",
    barmode="stack",
)
fig2.update_layout(
    **gen_layout("Node Count", "Node Type", "Count", True),
)
fig2.show()

### Chart 3: Number of nodes for each resource

In [7]:
# Aggregate per (entity type, resource) pair. Both columns have to be grouping
# keys: the chart puts "resource" on the x-axis, and a text column that is not
# a key gets aggregated by sum() (dropped or concatenated, depending on the
# pandas version) instead of surviving as a usable label.
chart3_data = node_stat.groupby(["entity_type", "resource"]).sum().reset_index()

# One bar per resource, stacked and colored by entity type.
fig3 = px.bar(
    chart3_data, x="resource", y="entity_count", color="entity_type", barmode="stack"
)
fig3.update_layout(**gen_layout("Node Count", "Resource", "Count", True))
fig3.show()

### Chart 4: Number of relations for each relation type

In [8]:
# Only "relation_count" needs summing. Narrowing to the three relevant columns
# first keeps the other text columns of edge_stat (for example
# start_entity_type / end_entity_type) out of the aggregation.
chart4_data = (
    edge_stat[["resource", "simple_relation_type", "relation_count"]]
    .groupby(["resource", "simple_relation_type"])
    .sum()
    .reset_index()
)

# One bar per simplified relation type, stacked and colored by resource.
fig4 = px.bar(
    chart4_data,
    x="simple_relation_type",
    y="relation_count",
    color="resource",
    barmode="stack",
)
fig4.update_layout(**gen_layout("Edge Count", "Relation Type", "Count", True))
fig4.show()

### Chart 5: Number of relations for each resource

In [9]:
# Keep just the columns this chart needs before aggregating.
selected_edge_stat = edge_stat[["resource", "simple_relation_type", "relation_count"]]

# Total relation count for every (simplified relation type, resource) pair.
chart5_data = selected_edge_stat.groupby(
    ["simple_relation_type", "resource"], as_index=False
).sum()

# One bar per resource, stacked and colored by simplified relation type.
fig5 = px.bar(
    chart5_data,
    x="resource",
    y="relation_count",
    color="simple_relation_type",
    barmode="stack",
)
fig5.update_layout(
    **gen_layout("Edge Count", "Resource", "Count", True),
)
fig5.show()

### Table 1: Number of nodes

In [10]:
# Table palette; the relations table cell further down reads these names too.
header_color = "grey"
row_even_color = "lightgrey"
row_odd_color = "white"

node_stat_dicts = node_stat.to_dict("records")
# NOTE(review): make_wide_format presumably returns one record per entity_type
# with one key per snake_cased resource name - inferred from how `data` is
# read below; confirm against lib/graph.py.
data = make_wide_format(node_stat_dicts, "entity_type", "resource", "entity_count")

# Distinct resources, sorted so the table columns come out in the same order on
# every run (iteration order of a set of strings is not stable across
# interpreter sessions). key=str keeps the sort safe if a label is not a string.
columns = sorted({item["resource"] for item in node_stat_dicts}, key=str)

# Build one row per record (entity type followed by the count for each
# resource, 0 when absent), then hand the rows to transposed_array so they can
# be passed as the table's cell values.
cell_values = transposed_array(
    [
        [item["entity_type"]] + [item.get(snake_case(column), 0) for column in columns]
        for item in data
    ]
)

# Alternate the fill color row by row.
row_num = len(cell_values[0])
colors = [row_even_color if i % 2 == 0 else row_odd_color for i in range(row_num)]

table = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Node Type"] + columns,
                align="center",
                line=dict(width=1, color="black"),
                fill=dict(color=header_color),
                font=dict(family="Arial", size=12, color="white"),
            ),
            cells=dict(
                values=cell_values,
                align="center",
                line=dict(color="black", width=1),
                fill=dict(color=[colors]),
                font=dict(family="Arial", size=11, color="black"),
                height=30,
            ),
        )
    ]
)

table.update_layout(title="Node Count")
table.show()

### Table 2: Number of relations

In [11]:
# Total relation count per (simplified relation type, resource) pair. Only the
# three relevant columns are kept so that the other text columns of edge_stat
# (for example start_entity_type / end_entity_type) are not aggregated.
grouped_edge_stat = (
    edge_stat[["simple_relation_type", "resource", "relation_count"]]
    .groupby(["simple_relation_type", "resource"])
    .sum()
    .reset_index()
)
edge_stat_dicts = grouped_edge_stat.to_dict("records")
# NOTE(review): make_wide_format presumably returns one record per
# simple_relation_type with one key per snake_cased resource name - inferred
# from how `data` is read below; confirm against lib/graph.py.
data = make_wide_format(
    edge_stat_dicts, "simple_relation_type", "resource", "relation_count"
)

# Distinct resources, sorted so the table columns come out in the same order on
# every run (iteration order of a set of strings is not stable across
# interpreter sessions). key=str keeps the sort safe if a label is not a string.
columns = sorted({item["resource"] for item in edge_stat_dicts}, key=str)

# Build one row per record (relation type followed by the count for each
# resource, 0 when absent), then hand the rows to transposed_array so they can
# be passed as the table's cell values.
cell_values = transposed_array(
    [
        [item["simple_relation_type"]]
        + [item.get(snake_case(column), 0) for column in columns]
        for item in data
    ]
)

# Alternate the fill color row by row; the palette names (header_color,
# row_even_color, row_odd_color) come from the node table cell.
row_num = len(cell_values[0])
colors = [row_even_color if i % 2 == 0 else row_odd_color for i in range(row_num)]

table2 = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Relation Type"] + columns,
                align="center",
                line=dict(width=1, color="black"),
                fill=dict(color=header_color),
                font=dict(family="Arial", size=12, color="white"),
            ),
            cells=dict(
                values=cell_values,
                align="center",
                line=dict(color="black", width=1),
                fill=dict(color=[colors]),
                font=dict(family="Arial", size=11, color="black"),
                height=30,
            ),
        )
    ]
)

table2.update_layout(title="Edge Count")
table2.show()