### Plots

In [1]:
import os
import pandas as pd

dataset = "drkg-hsdn"
data_dir = os.path.join(os.getcwd(), dataset, "data")
relation_file = os.path.join(data_dir, "relations_hrt.tsv")
relations = pd.read_csv(relation_file, sep="\t", names=["source_id", "relation_type", "target_id"], dtype=str)
list(set(relations["relation_type"].to_list()))

['GNBR::D::Gene:Disease',
 'GNBR::H::Gene:Gene',
 'GNBR::Pr::Compound:Disease',
 'DRUGBANK::carrier::Compound:Gene',
 'GNBR::I::Gene:Gene',
 'Hetionet::AuG::Anatomy:Gene',
 'Hetionet::CtD::Compound:Disease',
 'Hetionet::DdG::Disease:Gene',
 'STRING::OTHER::Gene:Gene',
 'GNBR::U::Gene:Disease',
 'GNBR::Z::Compound:Gene',
 'bioarx::Coronavirus_ass_host_gene::Disease:Gene',
 'Hetionet::CcSE::Compound:SideEffect',
 'GNBR::T::Compound:Disease',
 'DRUGBANK::target::Compound:Gene',
 'Hetionet::AeG::Anatomy:Gene',
 'GNBR::B::Gene:Gene',
 'GNBR::Y::Gene:Disease',
 'Hetionet::DrD::Disease:Disease',
 'DRUGBANK::x-atc::Compound:Atc',
 'Hetionet::GiG::Gene:Gene',
 'bioarx::DrugHumGen:Compound:Gene',
 'STRING::BINDING::Gene:Gene',
 'DGIDB::OTHER::Gene:Compound',
 'DGIDB::BLOCKER::Gene:Compound',
 'GNBR::Sa::Compound:Disease',
 'Hetionet::AdG::Anatomy:Gene',
 'bioarx::HumGenHumGen:Gene:Gene',
 'INTACT::PHOSPHORYLATION REACTION::Gene:Gene',
 'Hetionet::CpD::Compound:Disease',
 'GNBR::V+::Gene:Gene',
 

In [2]:
import pandas as pd
import plotly.express as px

df = pd.DataFrame()
df["resource"] = relations["relation_type"].apply(lambda x: x.split("::")[0])
df["source_type"] = relations["source_id"].apply(lambda x: x.split(":")[0])
df["target_type"] = relations["target_id"].apply(lambda x: x.split(":")[0])
df["source_target"] = df["source_type"] + ":" + df["target_type"]

# source_type:target_type might be same with target_type:source_type, merge them
df["source_target"] = df["source_target"].apply(lambda x: x.split(":")[0] + ":" + x.split(":")[1] if x.split(":")[0] > x.split(":")[1] else x.split(":")[1] + ":" + x.split(":")[0])

# Plot only the rows where matched source and target types are in the list
# filtered_df = df[df["source_type"].isin(["Disease", "Gene", "Compound", "Symptom", "Pathway"]) & df["target_type"].isin(["Disease", "Gene", "Compound", "Symptom", "Pathway"])]

# Group the data by 'label' and 'resource' and count the rows
# grouped_df = filtered_df.groupby(['source_target', "resource"]).size().reset_index(name='count')

# resource = ['bioarx', 'DGIDB', 'DRUGBANK', 'GNBR', 'Hetionet', 'INTACT', 'STRING', 'HSDN']
# # Please specify the colors in the same order as the resource list
# colors = [
#     "#FF0000",  # Red
#     "#008000",  # Green
#     "#0000FF",  # Blue
#     "#FFFF00",  # Yellow
#     "#800080",  # Purple
#     "#FFA500",  # Orange
#     "#FFC0CB",  # Pink
#     "#00FFFF"   # Cyan
# ]

# # Generate a list of colors for each resource
# colors = grouped_df['resource'].apply(lambda x: colors[resource.index(x)])

# # Create the bar chart using Plotly Express
# fig2 = px.bar(grouped_df, x='source_target', y='count', color='resource', title='Count of Rows by Source-Target and Resource', color_discrete_sequence=colors)

# Plot all
grouped_df = df.groupby(['source_target', "resource"]).size().reset_index(name='count')
fig2 = px.bar(grouped_df, x='source_target', y='count', color='resource', title='Count of Rows by Source-Target and Resource')

# Show the plot
fig2.show(renderer='vscode')

### Visualize a subgraph with a specified subset of relations

### Get all paths with length <= 3 and one node as a start point.

> Prompt: 
> If I have a file which contains the following columns: source_id, source_type, target_id, target_type, relation_type. and any node will be treated as a uniq node, if its id:type is different from others. I would like to use one specified node as a start point and get a subgraph which all nodes linked with it and the length of paths <= 3, how to do it? In the meanwhile, please save the paths as a file which contains five columns: source_id, source_type, relation_type, target_id, target_type.

#### Create a subgraph by extracting nodes with paths <= max_length from the start node

In [8]:
import pandas as pd
import networkx as nx

dataset_name = "drkg-hsdn"
rootdir = os.path.dirname(os.getcwd())
total_entity_file = os.path.join(rootdir, "graph_data", "entities.tsv")

datadir = os.path.join(os.getcwd(), dataset_name, "data")

# Specify the start node as a tuple (id, type)
# ME/CFS
# start_node = ('MONDO:0005404', 'Disease')

# ME/CFS
start_node = ('MESH:D015673', 'Disease')

# Lung Cancer
# start_node = ('MONDO:0008903', 'Disease')

max_length = 2

allowd_types = ["Disease", "Gene", "Compound", "Symptom"]
# types_to_remove = ["Disease"]
types_to_remove = []
which_layer_to_remove = 1

# allowd_types = ["Gene", "Compound", "Disease", "Symptom", "Pathway", "Anatomy", "Metabolite", "MolecularFunction", "BiologicalProcess", "CellularComponent"]
colors = ["#e60049", "#0bb4ff", "#50e991", "#e6d800", "#9b19f5", "#ffa300", "#dc0ab4", "#b3d4ff", "#00bfa0", "#ff6e00"]

node_type_colors = {}
for node_type, color in zip(allowd_types, colors):
    node_type_colors[node_type] = color

# Read the data from the file into a DataFrame
df = pd.read_csv(os.path.join(datadir, 'relations.tsv'), sep='\t', dtype=str)

entity_file = os.path.join(datadir, 'entities.tsv')
if os.path.exists(entity_file):
    entites = pd.read_csv(entity_file, sep='\t', dtype=str)

    # Join the df and the entites to get the label of each node and add name field from the entites to the df
    df = df.merge(entites[['id', 'name', 'label']], left_on=['source_id', 'source_type'], right_on=['id', 'label'], how='left')
    df = df.rename(columns={'name': 'source_name'})

    df = df.merge(entites[['id', 'name', 'label']], left_on=['target_id', 'target_type'], right_on=['id', 'label'], how='left')
    df = df.rename(columns={'name': 'target_name'})
elif os.path.exists(total_entity_file):
    # TODO: Add the name field from the all entities where the id matches the source_id and target_id
    df["source_name"] = df["source_id"]
    df["target_name"] = df["target_id"]
    pass

# Create a directed graph to represent the relationships
G = nx.DiGraph()

# Add nodes and edges to the graph
for _, row in df.iterrows():
    source_id = row['source_id']
    source_type = row['source_type']
    source_name = row['source_name']
    target_id = row['target_id']
    target_type = row['target_type']
    target_name = row['target_name']
    relation_type = row['relation_type']

    if source_type not in allowd_types or target_type not in allowd_types:
        continue

    # Add nodes for source and target with node type as an attribute
    G.add_node((source_id, source_type), source_name=source_name, node_type=source_type)
    G.add_node((target_id, target_type), target_name=target_name, node_type=target_type)

    # Add directed edge from source to target
    G.add_edge((source_id, source_type), (target_id, target_type), relation=relation_type, source_name=source_name, target_name=target_name)

subgraph_nodes = nx.single_source_shortest_path_length(G, start_node, cutoff=max_length)


In [9]:
print(subgraph_nodes)
# Iterate through the nodes and create a list of nodes to remove
# Prompt: How to remove all nodes which are disease type and are greater than a specific layer in a path?
nodes_to_remove = []
for node, layer in subgraph_nodes.items():
    if G.nodes[node]["node_type"] in types_to_remove and layer > which_layer_to_remove:
        print("Remove node: ", node, " with layer: ", layer)
        nodes_to_remove.append(node)

G_copy = G.copy()
# Remove the nodes from the graph
G_copy.remove_nodes_from(nodes_to_remove)

# Extract the subgraph from the original graph
subgraph = G_copy.subgraph(subgraph_nodes.keys())

# Create a list to store the paths in the desired format
formatted_paths = []

# Iterate through the edges in the subgraph and store them
for edge in subgraph.edges(data=True):
    source_node = edge[0]
    target_node = edge[1]
    relation_type = edge[2]['relation']
    source_name = edge[2]['source_name']
    target_name = edge[2]['target_name']

    formatted_paths.append((source_node[0], source_node[1], relation_type, target_node[0], target_node[1], source_name, target_name))

# Create a DataFrame from the formatted paths
formatted_df = pd.DataFrame(formatted_paths, columns=['source_id', 'source_type', 'relation_type', 'target_id', 'target_type', 'source_name', 'target_name'])

{('MESH:D015673', 'Disease'): 0, ('MESH:D001247', 'Symptom'): 1, ('MESH:D005334', 'Symptom'): 1, ('MESH:D005335', 'Symptom'): 1, ('MESH:D007035', 'Symptom'): 1, ('MESH:D001835', 'Symptom'): 1, ('MESH:D015430', 'Symptom'): 1, ('MESH:D050177', 'Symptom'): 1, ('MESH:D009765', 'Symptom'): 1, ('MESH:D002303', 'Symptom'): 1, ('MESH:D005221', 'Symptom'): 1, ('MESH:D005222', 'Symptom'): 1, ('MESH:D051346', 'Symptom'): 1, ('MESH:D020820', 'Symptom'): 1, ('MESH:D006948', 'Symptom'): 1, ('MESH:D011595', 'Symptom'): 1, ('MESH:D014202', 'Symptom'): 1, ('MESH:D020233', 'Symptom'): 1, ('MESH:D019954', 'Symptom'): 1, ('MESH:D007806', 'Symptom'): 1, ('MESH:D004410', 'Symptom'): 1, ('MESH:D003221', 'Symptom'): 1, ('MESH:D013575', 'Symptom'): 1, ('MESH:D019462', 'Symptom'): 1, ('MESH:D008569', 'Symptom'): 1, ('MESH:D010468', 'Symptom'): 1, ('MESH:D001308', 'Symptom'): 1, ('MESH:D011596', 'Symptom'): 1, ('MESH:D018908', 'Symptom'): 1, ('MESH:D009133', 'Symptom'): 1, ('MESH:D013035', 'Symptom'): 1, ('MESH:

#### Save the DataFrame to a CSV file

In [10]:
pathdir = os.path.join(datadir, 'paths')
if not os.path.exists(pathdir):
    os.makedirs(pathdir)

# Extract the first character of each type and join them to create a string
allowd_types_str = ''.join([x[0] for x in allowd_types])
prefix = f'{start_node[0].replace(":", "_").lower()}_{max_length}_{allowd_types_str.lower()}'
pathfile = os.path.join(pathdir, f'{prefix}_subgraph.tsv')
formatted_df.to_csv(pathfile, index=False, sep='\t')

#### Convert the df to a file which is compatible with cytoscape

Prompt: I have a data frame which contains seven columns: 'source_id', 'source_type', 'source_name', 'target_id', 'target_type', 'target_name', 'relation_type', how to convert the data frame into a xgmml file.

In [11]:
nodes_df = formatted_df[['source_id', 'source_name', 'source_type']].rename(
    columns={'source_id': 'id', 'source_name': 'name', 'source_type': 'type'}
)
nodes_df = pd.concat([nodes_df, formatted_df[['target_id', 'target_name', 'target_type']].rename(
    columns={'target_id': 'id', 'target_name': 'name', 'target_type': 'type'}
)], axis=0)
nodes_df = nodes_df.drop_duplicates(subset=['id', 'type'])

edges_df = formatted_df[['source_id', 'target_id', 'relation_type']].rename(
    columns={'source_id': 'source', 'target_id': 'target', 'relation_type': 'label'}
)

# Create an XGMML template with additional attributes
xgmml_template = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!DOCTYPE graph PUBLIC "-//John Punin//DTD graph description//EN" "http://www.cs.rpi.edu/~puninj/XGMML/xgmml.dtd">
<graph label="Knowledge Graph" directed="1">
  {nodes}
  {edges}
</graph>
'''

# Create node and edge elements with additional attributes
node_lst = [
    f'''
    <node id="{id}" label="{name}">
        <att name="id" type="string" value="{id}" />
        <att name="name" type="string" value="{name}" />
        <att name="type" type="string" value="{type}" />
        <att name="node_shape" type="string" value="ellipse" />
        <att name="node_color" type="string" value="{node_type_colors[type]}" />
    </node>
    '''
    for id, name, type in zip(nodes_df['id'], nodes_df['name'], nodes_df['type'])
]
nodes = '\n'.join(node_lst)

edge_lst = [
    f'''
    <edge source="{source}" target="{target}" label="{label}" cy:directed="1">
        <att name="relation_type" type="string" value="{label}" />
        <att name="shared name" value="{label}" type="string"
        cy:type="String" />
        <att name="shared interaction" value="" type="string" cy:type="String" />
        <att name="name" value="{label}" type="string" cy:type="String" />
        <att name="selected" value="0" type="boolean" cy:type="Boolean" />
        <att name="interaction" value="" type="string" cy:type="String" />
        <att name="relation_type" value="{label}" type="string"
        cy:type="String" />
    </edge>
    '''
    for source, target, label in zip(edges_df['source'], edges_df['target'], edges_df['label'])
]
edges = '\n'.join(edge_lst)

# Populate the XGMML template
xgmml_content = xgmml_template.format(nodes=nodes, edges=edges)

# Save XGMML content to a file
xgmml_file = os.path.join(pathdir, f'{prefix}_network.xgmml')
with open(xgmml_file, 'w') as f:
    f.write(xgmml_content)

: 