# For ME/CFS & Long COVID

NOTE: All drugs and symptoms in the benchmarks directory are curated from publications manually. The curated data is then used to generate the subgraph for testing in this notebook.

## Dependencies [Must be run first]

In [1]:
import os
rootdir = os.path.dirname(os.getcwd())
print(rootdir)

/data/biomedgps-data


## Step 1: Filter Entities

### Read in the data

In [2]:
import pandas as pd

def filter_entities(entity_df, entity_dict_lst):
    """
    Filter entities from entity_df based on entity_dict_lst.
    """
    entity_types = list(set([item.get("node_type") for item in entity_dict_lst]))

    # Save all entities which label is not in entity_types
    remaining_entities = entity_df[~entity_df['label'].isin(entity_types)]
    matched_entities = entity_df[entity_df['label'].isin(entity_types)]

    # Save all entities which label is in entity_types and id is in entity_dict_lst
    results = pd.DataFrame()
    for entity_type in entity_types:
        entities = [item.get("node_id") for item in entity_dict_lst if item.get("node_type") == entity_type]
        matched_entities = matched_entities[matched_entities['id'].isin(entities) & matched_entities['label'].isin([entity_type])]
        results = pd.concat([results, matched_entities], axis=0)

    # Concat remaining_entities and results
    entity_df = pd.concat([remaining_entities, results], axis=0)

    return entity_df

entity_file = os.path.join(rootdir, "graph_data/entities.tsv")
entity_dict_file = os.path.join(rootdir, "benchmarks/drug_symptoms_subset.csv")

data = pd.read_csv(entity_file, sep="\t", dtype=str)
print(data.shape)

(825159, 9)


### Get the filtered entities

In [3]:
entity_dict = pd.read_csv(entity_dict_file, sep=",", dtype=str)
entity_dict_lst = entity_dict.to_dict(orient="records")

# Only keep all drug entities
entity_dict_lst = [item for item in entity_dict_lst if item.get("node_type") == "Compound"]

entities_subset = filter_entities(data, entity_dict_lst)
print(entities_subset.shape)

# Remove all metabolite entities
entities_subset = entities_subset[~entities_subset['label'].isin(["Metabolite"])]
print(entities_subset.shape)

# Remove all disease entities which are not MonDO.
disease_entities = entities_subset[entities_subset['label'].isin(["Disease"]) & entities_subset['id'].str.contains("MONDO:")]
entities_subset = entities_subset[~entities_subset['label'].isin(["Disease"])]
entities_subset = pd.concat([entities_subset, disease_entities], axis=0)
print(entities_subset.shape)

# Remove all gene entities which are not a human gene or a mouse gene.
gene_entities = entities_subset[entities_subset['label'].isin(["Gene"])]
gene_entities = gene_entities[gene_entities['taxid'].isin(["9606", "10090"])]
entities_subset = entities_subset[~entities_subset['label'].isin(["Gene"])]
entities_subset = pd.concat([entities_subset, gene_entities], axis=0)
print(entities_subset.shape)

# Remove all anatomy entities
entities_subset = entities_subset[~entities_subset['label'].isin(["Anatomy"])]
print(entities_subset.shape)

(558166, 9)
(310199, 9)
(306119, 9)
(295492, 9)
(277809, 9)


### Save the filtered entities to a file

In [4]:
entities_subset.to_csv(os.path.join(rootdir, "models/biomedgps/data/entities.tsv"), sep="\t", index=False)

### Grouped bar plots

In [5]:
import plotly.express as px

# Group the data by 'label' and 'resource' and count the rows
grouped_df = entities_subset.groupby(['label', 'resource']).size().reset_index(name='count')

# Create the bar chart using Plotly Express
fig1 = px.bar(grouped_df, x='label', y='count', color='resource', title='Count of Rows by Label and Resource')

# Show the plot
fig1.show(renderer='vscode')

## Step 2: Filter Relations

### Read in the data

In [6]:
import pandas as pd

relations = pd.read_csv(os.path.join(rootdir, "graph_data/relations.tsv"), sep="\t", dtype=str)
print(relations.shape)

(50625648, 12)


### Get a subset of the data

In [7]:


# Only keep relations which are in entities_subset
relations_subset = relations[relations['source_id'].isin(entities_subset['id']) & relations['target_id'].isin(entities_subset['id'])]

# Remove all relations coming from the CTD database
relations_subset = relations_subset[~relations_subset['resource'].isin(["CTD"])]
print(relations_subset.shape)

(4592639, 12)


### Save the subset of relations

In [8]:
relations_subset.to_csv(os.path.join(rootdir, "models/biomedgps/data/relations.tsv"), sep="\t", index=False)

### Grouped bar plots

In [9]:
import plotly.express as px

plot_data = relations_subset[relations_subset["source_type"].isin(["Disease", "Gene", "Compound", "Symptom", "Pathway"]) & relations_subset["target_type"].isin(["Disease", "Gene", "Compound", "Symptom", "Pathway"])]

plot_data["source_target"] = plot_data["source_type"] + ":" + plot_data["target_type"]

# source_type:target_type might be same with target_type:source_type, merge them
plot_data["source_target"] = plot_data["source_target"].apply(lambda x: x.split(":")[0] + ":" + x.split(":")[1] if x.split(":")[0] > x.split(":")[1] else x.split(":")[1] + ":" + x.split(":")[0])

# Group the data by 'label' and 'resource' and count the rows
grouped_df = plot_data.groupby(['source_target', "resource"]).size().reset_index(name='count')

# resource = ['bioarx', 'DGIDB', 'DRUGBANK', 'GNBR', 'Hetionet', 'INTACT', 'STRING', 'PrimeKG', 'HSDN']
# Please specify the colors in the same order as the resource list
colors = ["#e60049", "#0bb4ff", "#50e991", "#e6d800", "#9b19f5", "#ffa300", "#dc0ab4", "#b3d4ff", "#00bfa0"]

# Show the total number of each group on the plot

# Create the bar chart using Plotly Express
# fig2 = px.bar(grouped_df, x='source_target', y='count', color='resource', title='Count of Rows by Source-Target and Resource')
fig2 = px.bar(grouped_df, x='source_target', y='count', color='resource', title='Count of Rows by Source-Target and Resource', color_discrete_sequence=colors, text='count')

# Show the plot
fig2.show(renderer='vscode')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
# Create the bar chart using Plotly Express
fig3 = px.bar(grouped_df, x='resource', y='count', color='source_target', title='Count of Rows by Source-Target and Resource', text='count')

# Show the plot
fig3.show(renderer='vscode')

### Visualize a subgraph with a specified subset of relations

### Get all paths with length <= 3 and one node as a start point.

> Prompt: 
> If I have a file which contains the following columns: source_id, source_type, target_id, target_type, relation_type. and any node will be treated as a uniq node, if its id:type is different from others. I would like to use one specified node as a start point and get a subgraph which all nodes linked with it and the length of paths <= 3, how to do it? In the meanwhile, please save the paths as a file which contains five columns: source_id, source_type, relation_type, target_id, target_type.

#### Create a subgraph by extracting nodes with paths <= max_length from the start node

In [51]:
import pandas as pd
import networkx as nx

# Specify the start node as a tuple (id, type)
# ME/CFS
start_node = ('MONDO:0005404', 'Disease')

# Lung Cancer
# start_node = ('MONDO:0008903', 'Disease')

max_length = 2

allowd_types = ["Disease", "Gene", "Compound", "Symptom"]
types_to_remove = ["Disease"]
which_layer_to_remove = 1

# allowd_types = ["Gene", "Compound", "Disease", "Symptom", "Pathway", "Anatomy", "Metabolite", "MolecularFunction", "BiologicalProcess", "CellularComponent"]
colors = ["#e60049", "#0bb4ff", "#50e991", "#e6d800", "#9b19f5", "#ffa300", "#dc0ab4", "#b3d4ff", "#00bfa0", "#ff6e00"]

node_type_colors = {}
for node_type, color in zip(allowd_types, colors):
    node_type_colors[node_type] = color

# Read the data from the file into a DataFrame
df = pd.read_csv(os.path.join(rootdir, 'models/biomedgps/data/relations.tsv'), sep='\t', dtype=str)

entites = pd.read_csv(os.path.join(rootdir, 'models/biomedgps/data/entities.tsv'), sep='\t', dtype=str)

# Join the df and the entites to get the label of each node and add name field from the entites to the df
df = df.merge(entites[['id', 'name', 'label']], left_on=['source_id', 'source_type'], right_on=['id', 'label'], how='left')
df = df.rename(columns={'name': 'source_name'})

df = df.merge(entites[['id', 'name', 'label']], left_on=['target_id', 'target_type'], right_on=['id', 'label'], how='left')
df = df.rename(columns={'name': 'target_name'})

# Create a directed graph to represent the relationships
G = nx.DiGraph()

# Add nodes and edges to the graph
for _, row in df.iterrows():
    source_id = row['source_id']
    source_type = row['source_type']
    source_name = row['source_name']
    target_id = row['target_id']
    target_type = row['target_type']
    target_name = row['target_name']
    relation_type = row['relation_type']

    if source_type not in allowd_types or target_type not in allowd_types:
        continue

    # Add nodes for source and target with node type as an attribute
    G.add_node((source_id, source_type), source_name=source_name, node_type=source_type)
    G.add_node((target_id, target_type), target_name=target_name, node_type=target_type)

    # Add directed edge from source to target
    G.add_edge((source_id, source_type), (target_id, target_type), relation=relation_type, source_name=source_name, target_name=target_name)

subgraph_nodes = nx.single_source_shortest_path_length(G, start_node, cutoff=max_length)


In [53]:
print(subgraph_nodes)
# Iterate through the nodes and create a list of nodes to remove
# Prompt: How to remove all nodes which are disease type and are greater than a specific layer in a path?
nodes_to_remove = []
for node, layer in subgraph_nodes.items():
    if G.nodes[node]["node_type"] in types_to_remove and layer > which_layer_to_remove:
        print("Remove node: ", node, " with layer: ", layer)
        nodes_to_remove.append(node)

G_copy = G.copy()
# Remove the nodes from the graph
G_copy.remove_nodes_from(nodes_to_remove)

# Extract the subgraph from the original graph
subgraph = G_copy.subgraph(subgraph_nodes.keys())

# Create a list to store the paths in the desired format
formatted_paths = []

# Iterate through the edges in the subgraph and store them
for edge in subgraph.edges(data=True):
    source_node = edge[0]
    target_node = edge[1]
    relation_type = edge[2]['relation']
    source_name = edge[2]['source_name']
    target_name = edge[2]['target_name']

    formatted_paths.append((source_node[0], source_node[1], relation_type, target_node[0], target_node[1], source_name, target_name))

# Create a DataFrame from the formatted paths
formatted_df = pd.DataFrame(formatted_paths, columns=['source_id', 'source_type', 'relation_type', 'target_id', 'target_type', 'source_name', 'target_name'])

{('MONDO:0005404', 'Disease'): 0, ('MONDO:0002254', 'Disease'): 1, ('MONDO:0005218', 'Disease'): 1, ('MONDO:0008856', 'Disease'): 1, ('MONDO:0010296', 'Disease'): 1, ('MONDO:0010386', 'Disease'): 1, ('MONDO:0010504', 'Disease'): 1, ('MONDO:0012426', 'Disease'): 1, ('MONDO:0013427', 'Disease'): 1, ('MONDO:0013500', 'Disease'): 1, ('MONDO:0013953', 'Disease'): 1, ('MONDO:0014222', 'Disease'): 1, ('MONDO:0014278', 'Disease'): 1, ('MONDO:0014280', 'Disease'): 1, ('MONDO:0014453', 'Disease'): 1, ('MONDO:0014491', 'Disease'): 1, ('MONDO:0014597', 'Disease'): 1, ('MONDO:0014727', 'Disease'): 1, ('MONDO:0014981', 'Disease'): 1, ('MONDO:0019117', 'Disease'): 1, ('MONDO:0020849', 'Disease'): 1, ('MONDO:0021094', 'Disease'): 1, ('MONDO:0030013', 'Disease'): 1, ('MONDO:0030858', 'Disease'): 1, ('MONDO:0030898', 'Disease'): 1, ('MONDO:0030973', 'Disease'): 1, ('MONDO:0030981', 'Disease'): 1, ('MONDO:0032599', 'Disease'): 1, ('MONDO:0032723', 'Disease'): 1, ('MONDO:0032763', 'Disease'): 1, ('MONDO:0

#### Save the DataFrame to a CSV file

In [54]:
pathdir = os.path.join(rootdir, 'models/biomedgps/paths')
if not os.path.exists(pathdir):
    os.makedirs(pathdir)

# Extract the first character of each type and join them to create a string
allowd_types_str = ''.join([x[0] for x in allowd_types])
prefix = f'{start_node[0].replace(":", "_").lower()}_{max_length}_{allowd_types_str.lower()}'
pathfile = os.path.join(pathdir, f'{prefix}_subgraph.tsv')
formatted_df.to_csv(pathfile, index=False, sep='\t')

#### Convert the df to a file which is compatible with cytoscape

Prompt: I have a data frame which contains seven columns: 'source_id', 'source_type', 'source_name', 'target_id', 'target_type', 'target_name', 'relation_type', how to convert the data frame into a xgmml file.

In [56]:
nodes_df = formatted_df[['source_id', 'source_name', 'source_type']].rename(
    columns={'source_id': 'id', 'source_name': 'name', 'source_type': 'type'}
)
nodes_df = pd.concat([nodes_df, formatted_df[['target_id', 'target_name', 'target_type']].rename(
    columns={'target_id': 'id', 'target_name': 'name', 'target_type': 'type'}
)], axis=0)
nodes_df = nodes_df.drop_duplicates(subset=['id', 'type'])

edges_df = formatted_df[['source_id', 'target_id', 'relation_type']].rename(
    columns={'source_id': 'source', 'target_id': 'target', 'relation_type': 'label'}
)

# Create an XGMML template with additional attributes
xgmml_template = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!DOCTYPE graph PUBLIC "-//John Punin//DTD graph description//EN" "http://www.cs.rpi.edu/~puninj/XGMML/xgmml.dtd">
<graph label="Knowledge Graph" directed="1">
  {nodes}
  {edges}
</graph>
'''

# Create node and edge elements with additional attributes
node_lst = [
    f'''
    <node id="{id}" label="{name}">
        <att name="id" type="string" value="{id}" />
        <att name="name" type="string" value="{name}" />
        <att name="type" type="string" value="{type}" />
        <att name="node_shape" type="string" value="ellipse" />
        <att name="node_color" type="string" value="{node_type_colors[type]}" />
    </node>
    '''
    for id, name, type in zip(nodes_df['id'], nodes_df['name'], nodes_df['type'])
]
nodes = '\n'.join(node_lst)

edge_lst = [
    f'''
    <edge source="{source}" target="{target}" label="{label}" cy:directed="1">
        <att name="relation_type" type="string" value="{label}" />
        <att name="shared name" value="{label}" type="string"
        cy:type="String" />
        <att name="shared interaction" value="" type="string" cy:type="String" />
        <att name="name" value="{label}" type="string" cy:type="String" />
        <att name="selected" value="0" type="boolean" cy:type="Boolean" />
        <att name="interaction" value="" type="string" cy:type="String" />
        <att name="relation_type" value="{label}" type="string"
        cy:type="String" />
    </edge>
    '''
    for source, target, label in zip(edges_df['source'], edges_df['target'], edges_df['label'])
]
edges = '\n'.join(edge_lst)

# Populate the XGMML template
xgmml_content = xgmml_template.format(nodes=nodes, edges=edges)

# Save XGMML content to a file
xgmml_file = os.path.join(pathdir, f'{prefix}_network.xgmml')
with open(xgmml_file, 'w') as f:
    f.write(xgmml_content)