In [1]:
import os
import sys

# Make the project's helper package importable: it lives in "lib" under the
# parent of the current working directory.
rootdir = os.path.abspath(os.pardir)
sys.path.append(os.path.join(rootdir, "lib"))

from graph import (
    create_graph,
    remove_nodes,
    gen_cytoscape,
    search_subgraph,
    load_relations,
    get_relation_types,
    group_relations,
)

## Show statistics of the graph

In [14]:
import plotly.express as px

# Load the relations of the base dataset and derive the summary tables
# from them with the project helpers.
dataset_name = "drkg"
relations = load_relations(dataset_name)
relations_types = get_relation_types(relations)
grouped_df = group_relations(relations)

# Bar chart of the `count` column per `source_target` value, with one
# colour per `resource`.
fig2 = px.bar(
    data_frame=grouped_df,
    x="source_target",
    y="count",
    color="resource",
    title="Count of Rows by Source-Target and Resource",
)

# Render with the VS Code renderer.
fig2.show(renderer="vscode")

## Create a subgraph

You can create a subgraph by selecting a subset of nodes and edges from the original graph. The subgraph keeps all the properties of the original graph. We assume each model has a unique dataset name and a related directory structure, so the dataset name can be used to create a subgraph.

In [14]:
# Name of the dataset directory to read from -- adjust it to your own setup.
dataset_name = "drkg-hsdn-custom-malacards-filtered-all"

# Entity types handed to create_graph. Keep the order stable: a later cell
# builds output file names from the first letter of each entry.
# A smaller selection used previously: ["Disease", "Gene", "Compound", "Symptom"]
allowd_types = [
    "Gene", "Compound", "Disease", "Symptom", "Pathway",
    "Anatomy", "Metabolite",
    "MolecularFunction", "BiologicalProcess", "CellularComponent",
]

# relations.tsv is read from <cwd>/<dataset_name>/data, entities.tsv from
# <parent of cwd>/graph_data.
rootdir = os.path.dirname(os.getcwd())
datadir = os.path.join(os.getcwd(), dataset_name, "data")
entity_file = os.path.join(rootdir, "graph_data", "entities.tsv")
relation_file = os.path.join(datadir, "relations.tsv")

# Build the graph from the relation and entity files.
G = create_graph(relation_file, entity_file, allowd_types)

#### Create a subgraph by extracting nodes with paths <= n_hops from the start node

In [27]:
# Start node as an (id, type) tuple. Other start nodes that have been used:
#   ME/CFS      -> ('MONDO:0005404', 'Disease')
#   Lung Cancer -> ('MONDO:0008903', 'Disease')
# Currently selected: ME/CFS
start_node = ('MESH:D015673', 'Disease')
n_hops = 1

# Passed to remove_nodes as the node types to remove, e.g. ["Disease"].
types_to_remove = []

# Output directory. Create it up front: neither DataFrame.to_csv nor a plain
# file write creates missing parent directories, so on a fresh dataset
# directory the export below would otherwise fail.
pathdir = os.path.join(datadir, 'paths')
os.makedirs(pathdir, exist_ok=True)

# File prefix: <start node id>_<n_hops>_<first letter of every allowed type>,
# all lower-case, with ":" in the node id replaced by "_".
node_slug = start_node[0].replace(":", "_").lower()
type_initials = "".join(t[0] for t in allowd_types).lower()
file_prefix = f'{node_slug}_{n_hops}_{type_initials}'

# Extract the subgraph around the start node and drop duplicated edges.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
formatted_df = remove_nodes(G, start_node, n_hops, types_to_remove)
formatted_df = formatted_df.drop_duplicates(
    subset=['source_id', 'source_type', 'target_id', 'target_type', 'relation_type']
)
print("Number of edges in the subgraph after removing duplicates: ", formatted_df.shape[0])

# Export the edge table as TSV ...
df_file = os.path.join(pathdir, f'{file_prefix}_subgraph.tsv')
formatted_df.to_csv(df_file, index=False, sep='\t')

# ... and as an XGMML network file via gen_cytoscape.
xgmml_file = os.path.join(pathdir, f'{file_prefix}_network.xgmml')
gen_cytoscape(formatted_df, xgmml_file, allowd_types)

Number of nodes in the subgraph:  87931
Number of edges in the subgraph:  4369170
Number of nodes to remove:  87438
Number of nodes in the subgraph after removing nodes:  493
Number of edges in the subgraph:  4910
Number of edges in the subgraph after removing duplicates:  4910


### Visualize a subgraph which contains two specified nodes

In [28]:
# The two nodes, each an (id, type) tuple, that the subgraph must contain.
first_node = ("MESH:D015673", "Disease")
second_node = ("DrugBank:DB00783", "Compound")
n_hops = 2

subgraph_df = search_subgraph(G, first_node, second_node, n_hops)

# Output directory. Create it if missing so the exports below cannot fail
# on a non-existent path.
pathdir = os.path.join(datadir, "paths")
os.makedirs(pathdir, exist_ok=True)

# File prefix: <first node id>_<n_hops>_<second node id>, lower-case,
# with ":" replaced by "_".
first_slug = first_node[0].replace(":", "_").lower()
second_slug = second_node[0].replace(":", "_").lower()
file_prefix = f"{first_slug}_{n_hops}_{second_slug}"

df_file = os.path.join(pathdir, f"{file_prefix}_paths.tsv")
subgraph_df.to_csv(df_file, index=False, sep="\t")

# Export the network for THIS cell's result. The previous version passed
# `formatted_df`, a leftover from the n-hop extraction cell, so the file
# did not describe the two-node subgraph and the cell broke on a kernel
# where that earlier cell had not been run.
xgmml_file = os.path.join(pathdir, f"{file_prefix}_network.xgmml")
gen_cytoscape(subgraph_df, xgmml_file, allowd_types)