# For ME/CFS & Long COVID

NOTE: All drugs and symptoms in the benchmarks directory were manually curated from publications. This notebook uses the curated data to generate the subgraph for testing.

## Filter Entities

In [50]:
import pandas as pd

def filter_entities(entity_df, entity_dict_lst):
    """
    Restrict selected entity types in entity_df to an allow-list of ids.

    Rows whose 'label' never appears as a "node_type" in entity_dict_lst are
    kept untouched. Rows whose 'label' does appear are kept only when their
    ('label', 'id') pair matches a ("node_type", "node_id") pair in
    entity_dict_lst.

    Args:
        entity_df: DataFrame with at least the columns 'label' and 'id'.
        entity_dict_lst: List of dicts read through the keys "node_type" and
            "node_id".

    Returns:
        A new DataFrame holding the untouched rows first, followed by the
        allowed rows of each restricted type (types in order of first
        appearance in entity_dict_lst). entity_df itself is not modified.
    """
    # Collect the allowed ids per entity type. A dict keeps first-seen order,
    # so the output row order is deterministic (a set of types is not).
    allowed_ids_by_type = {}
    for item in entity_dict_lst:
        allowed_ids_by_type.setdefault(item.get("node_type"), []).append(item.get("node_id"))

    # Rows whose label is not restricted at all pass through unchanged.
    is_restricted = entity_df['label'].isin(list(allowed_ids_by_type))
    remaining_entities = entity_df[~is_restricted]
    candidates = entity_df[is_restricted]

    # Always select from the full candidate pool: narrowing the pool while
    # looping would leave nothing to match for every type after the first.
    kept_per_type = [
        candidates[candidates['label'].isin([entity_type]) & candidates['id'].isin(entity_ids)]
        for entity_type, entity_ids in allowed_ids_by_type.items()
    ]

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat([remaining_entities, *kept_per_type], axis=0)

In [51]:
import os
# The repository root is taken to be the parent of the current working directory.
rootdir = os.path.split(os.getcwd())[0]
print(rootdir)

/Users/jy006/Documents/Code/biomedgps-data


In [52]:
# Input files, both resolved against rootdir: the full entity table and the
# curated drug/symptom benchmark list.
entity_file, entity_dict_file = (
    os.path.join(rootdir, relative_path)
    for relative_path in ("graph_data/entities.tsv", "benchmarks/drug_symptoms_subset.csv")
)

In [53]:
# Load the tab-separated entity table, reading every column as a string.
data = pd.read_table(entity_file, dtype=str)
print(data.shape)

(825159, 9)


In [54]:
# Load the curated benchmark nodes and keep only the compound (drug) records.
entity_dict = pd.read_csv(entity_dict_file, sep=",", dtype=str)
entity_dict_lst = [
    record
    for record in entity_dict.to_dict(orient="records")
    if record.get("node_type") == "Compound"
]

# Restrict the entity table to the curated compounds; other labels pass through.
entities_subset = filter_entities(data, entity_dict_lst)
print(entities_subset.shape)

# Drop every metabolite entity.
entities_subset = entities_subset.loc[lambda df: ~df['label'].isin(["Metabolite"])]
print(entities_subset.shape)

# Of the disease entities, keep only those whose id contains "MONDO:".
# The surviving diseases are appended after all non-disease rows.
disease_entities = entities_subset.loc[
    lambda df: df['label'].isin(["Disease"]) & df['id'].str.contains("MONDO:")
]
entities_subset = pd.concat(
    [entities_subset.loc[lambda df: ~df['label'].isin(["Disease"])], disease_entities],
    axis=0,
)
print(entities_subset.shape)

# Of the gene entities, keep only those with taxid 9606 (human) or 10090 (mouse).
# The surviving genes are appended after all non-gene rows.
gene_entities = entities_subset.loc[
    lambda df: df['label'].isin(["Gene"]) & df['taxid'].isin(["9606", "10090"])
]
entities_subset = pd.concat(
    [entities_subset.loc[lambda df: ~df['label'].isin(["Gene"])], gene_entities],
    axis=0,
)
print(entities_subset.shape)

# Drop every anatomy entity.
entities_subset = entities_subset.loc[lambda df: ~df['label'].isin(["Anatomy"])]
print(entities_subset.shape)

(558166, 9)
(310199, 9)
(306119, 9)
(295492, 9)
(277809, 9)


In [55]:
import plotly.express as px

# Count the entities for every (label, resource) pair.
grouped_df = (
    entities_subset
    .groupby(['label', 'resource'])
    .size()
    .reset_index(name='count')
)

# One bar per label, coloured by the resource the entities come from.
fig1 = px.bar(
    grouped_df,
    x='label',
    y='count',
    color='resource',
    title='Count of Rows by Label and Resource',
)

# Render the figure with the VS Code renderer.
fig1.show(renderer='vscode')

## Filter Relations

In [56]:
# Load the tab-separated relation table, reading every column as a string.
relations = pd.read_table(os.path.join(rootdir, "graph_data/relations.tsv"), dtype=str)
print(relations.shape)

(50625648, 12)


In [57]:
# Keep a relation only when both of its endpoints are still in entities_subset,
# then drop everything that comes from the CTD database.
# NOTE(review): endpoints are matched on id alone, not on (type, id) — confirm
# that ids are unique across entity types.
relations_subset = (
    relations
    .loc[lambda df: df['source_id'].isin(entities_subset['id']) & df['target_id'].isin(entities_subset['id'])]
    .loc[lambda df: ~df['resource'].isin(["CTD"])]
)
print(relations_subset.shape)

(4592639, 12)


In [58]:
import plotly.express as px

# Add the "source_type:target_type" key with assign(), which returns a new
# frame. Setting a column directly on the filtered slice emits
# SettingWithCopyWarning on pandas versions without copy-on-write.
# The column still ends up on relations_subset, so later cells see the same frame.
relations_subset = relations_subset.assign(
    source_target=relations_subset["source_type"] + ":" + relations_subset["target_type"]
)

# Count the relations for every (source_target, resource) pair.
grouped_df = relations_subset.groupby(['source_target', "resource"]).size().reset_index(name='count')

# One bar per source-target pair, coloured by the resource the relations come from.
fig2 = px.bar(grouped_df, x='source_target', y='count', color='resource', title='Count of Rows by Source-Target and Resource')

# Render the figure with the VS Code renderer.
fig2.show(renderer='vscode')

In [59]:
# Re-plot the grouped counts with one bar per resource, coloured by source-target pair.
fig3 = px.bar(
    grouped_df,
    x='resource',
    y='count',
    color='source_target',
    title='Count of Rows by Source-Target and Resource',
)

# Render the figure with the VS Code renderer.
fig3.show(renderer='vscode')

## Save the subset of entities and relations

In [60]:
# Write both subsets as tab-separated files, without the index column.
# NOTE(review): relations_subset also carries the "source_target" helper column
# that was added for plotting, so it is written to relations.tsv as well —
# confirm that this is intended.
for frame, relative_path in (
    (entities_subset, "benchmarks/entities.tsv"),
    (relations_subset, "benchmarks/relations.tsv"),
):
    frame.to_csv(os.path.join(rootdir, relative_path), sep="\t", index=False)