## Prepare a full knowledge graph dataset


We need to prepare the training, validation and external test datasets. We will use the training dataset to train the model and the test dataset to evaluate the model for all KGE models.

### [Require to Modify According to Your Situation] Prepare all relation files

We design a strategy to allow users to integrate their expected relation files for different purposes. Such as you might want to include the `malacards_mecfs` dataset when you want to train a model for ME/CFS.

In [4]:
import os

root_dir = os.path.dirname(os.getcwd())

# ---------- Parameters [Must be modified based on your situation] ----------
# dataset_name = "biomedgps-full-v20240127"
dataset_name = "biomedgps"
dataset_version = "v20241115"
skip_rows_not_in_entity_file = True
# The directory names must be consistent with the subdirectories in the formatted_relations folder.
blacklist_databases = ["ctd"]

# It's an optional parameter, if you don't want to split the dataset, you can ignore it.
split_ratio = 0.8

# Which column will be kept in the final formatted file
relation_type_column = "formatted_relation_type"

# Which file will be used to format the relation types
relation_type_file = "relation_types.tsv"

In [6]:
# ---------- Load data ----------
graph_data_dir = os.path.join(root_dir, "graph_data")
formatted_relation_dir = os.path.join(graph_data_dir, "formatted_relations")

files = []
for dir in os.listdir(formatted_relation_dir):
    for file in os.listdir(os.path.join(formatted_relation_dir, dir)):
        if file.endswith(".tsv") and file.startswith("formatted_") and dir not in blacklist_databases:
            files.append(os.path.join(formatted_relation_dir, dir, file))
files = sorted(files)

print("Merging the following files:")
print("\n".join(files))

entity_file = os.path.join(graph_data_dir, "entities.tsv")
print("Number of entities: {}".format(len(open(entity_file).readlines())))

Merging the following files:
/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/biosnap_disease/formatted_customdb.tsv
/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/biosnap_phenotype/formatted_customdb.tsv
/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/cbcg/formatted_customdb.tsv
/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/customdb/formatted_customdb.tsv
/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/drkg/formatted_drkg.tsv
/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/hsdn/formatted_hsdn.tsv
/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/primekg/formatted_primekg.tsv
/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/ttd/formatted_customdb.tsv
Number of entities: 926803


### Dependencies

In [2]:
import os
import sys

lib_dir = os.path.join(os.path.dirname(os.getcwd()), "lib")

print("Adding {} to sys.path".format(lib_dir))
sys.path.append(lib_dir)

Adding /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/lib to sys.path


### Metadata for all relation files

To save the essential information of all relation files for reproducibility.

In [3]:
from metadata import DatasetMetadata, check_repo_clean

# For getting the correct commit id, we need to check if the repo is clean. If not, you should commit your changes first.
check_repo_clean(file_suffix = ".py", raise_error=False)
repo_commit_id = os.popen("git rev-parse HEAD").read().strip()
repo_path = os.popen("git config --get remote.origin.url").read().strip()
outputdir = os.path.join(root_dir, "datasets", f"{dataset_name}-{dataset_version}-{repo_commit_id[:6]}")
os.makedirs(outputdir, exist_ok=True)

dataset_metadata = DatasetMetadata(
    repo_commit_id=repo_commit_id,
    repo_path=repo_path,
    dataset_name=dataset_name,
    dataset_version=dataset_version,
    data_files=files,
    metadata=None,
)

dataset_metadata.to_json(os.path.join(outputdir, "metadata.json"))

### Merge all relation files into one file

In [4]:
import os
import subprocess
import pandas as pd
import tempfile

temp_dir = tempfile.mkdtemp()

args = ["python3", os.path.join(lib_dir, "data.py"), "merge-files"]

for f in files:
    args.extend(["--input", f])

kg_file = os.path.join(temp_dir, "knowledge_graph.tsv")
annotated_kg_file = os.path.join(temp_dir, "annotated_knowledge_graph.tsv")
args.extend(["--output", kg_file])

print("Running: {}".format(" ".join(args)))
args_str = " ".join(args)
!{args_str}

if os.path.exists(kg_file):
    df = pd.read_csv(kg_file, sep="\t")
    source_ids = df[["source_id", "source_type"]].drop_duplicates()
    target_ids = df[["target_id", "target_type"]].drop_duplicates()
    ids = pd.concat([source_ids, target_ids]).drop_duplicates()
    print("Number of unique entity ids: {}".format(len(ids)))
    print("Number of deduplicated relations: {}".format(len(df.drop_duplicates())))

Running: python3 /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/lib/data.py merge-files --input /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/ctd/formatted_ctd.tsv --input /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/drkg/formatted_drkg.tsv --input /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/hsdn/formatted_hsdn.tsv --input /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/primekg/formatted_primekg.tsv --input /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/relations/customdb/formatted_malacards_mecfs.tsv --input /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/relations/customdb/formatted_customdb_v20240329.tsv --input /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/relations/customdb/formatted_treatme_survey_compounds.tsv --input /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/gr

### [Optional] Filter out the relations that are not matched with our requirements

We can follow the results generated by the graph_analysis.ipynb to decide which relations should be kept.

In [5]:
# The kg_file variable might have been updated by the following cells, so we put the definition here for avoiding to load the wrong file
kg_file = os.path.join(temp_dir, "knowledge_graph.tsv")
df = pd.read_csv(kg_file, sep="\t")
df

Unnamed: 0,relation_type,resource,pmids,key_sentence,source_id,source_type,target_id,target_type,source_name,target_name
0,CTD::decreases^expression::Compound:Gene,CTD,,,MESH:C000611729,Compound,ENTREZ:1,Gene,"ammonium 2,3,3,3-tetrafluoro-2-(heptafluoropro...",A1BG
1,CTD::decreases^expression::Compound:Gene,CTD,,,MESH:C000944,Compound,ENTREZ:1,Gene,dicrotophos,A1BG
2,CTD::increases^expression::Compound:Gene,CTD,,,MESH:C005556,Compound,ENTREZ:1,Gene,propionaldehyde,A1BG
3,CTD::decreases^expression::Compound:Gene,CTD,,,MESH:C006253,Compound,ENTREZ:1,Gene,pirinixic acid,A1BG
4,CTD::increases^expression::Compound:Gene,CTD,,,MESH:C006253,Compound,ENTREZ:1,Gene,pirinixic acid,A1BG
...,...,...,...,...,...,...,...,...,...,...
47498664,HSDN::has_symptom::Disease:Symptom,TreatME,,,MONDO:0100233,Disease,MedDRA:10028817,Symptom,long COVID-19,
47498665,HSDN::has_symptom::Disease:Symptom,TreatME,,,MONDO:0100233,Disease,MedDRA:10041367,Symptom,long COVID-19,
47498666,HSDN::has_symptom::Disease:Symptom,TreatME,,,MONDO:0100233,Disease,MedDRA:10033386,Symptom,long COVID-19,
47498667,HSDN::has_symptom::Disease:Symptom,TreatME,,,MONDO:0100233,Disease,MedDRA:10050007,Symptom,long COVID-19,


In [6]:
print("Number of relations: {}".format(len(df)))
ignore_relation_types = [
    # Virus gene relations are not useful for our use case.
    "bioarx::Coronavirus_ass_host_gene::Disease:Gene",
    "bioarx::Covid2_acc_host_gene::Disease:Gene",
    "bioarx::DrugHumGen::Compound:Gene",
    "bioarx::DrugVirGen::Compound:Gene",
    "bioarx::HumGenHumGen::Gene:Gene",
    "bioarx::VirGenHumGen::Gene:Gene",
    # We don't like associated_with relation type.
    "PrimeKG::associated_with::Disease:Gene",
    "PrimeKG::associated_with::Gene:Disease",
    "PrimeKG::associated_with::Gene:Symptom",
    "PrimeKG::associated_with::Symptom:Gene",
    # We don't like ontology tree
    "PrimeKG::parent-child::Anatomy:Anatomy",
    "PrimeKG::parent-child::BiologicalProcess:BiologicalProcess",
    "PrimeKG::parent-child::CellularComponent:CellularComponent",
    "PrimeKG::parent-child::Disease:Disease",
    "PrimeKG::parent-child::MolecularFunction:MolecularFunction",
    "PrimeKG::parent-child::Pathway:Pathway",
    "PrimeKG::parent-child::Symptom:Symptom",
]

df = df[~df["relation_type"].isin(ignore_relation_types)]
print("Number of relations after removed ignore relation_types: {}".format(len(df)))

relation_type_map = pd.read_csv(
    os.path.join(graph_data_dir, "relation_types.tsv"), sep="\t"
)

relation_types = relation_type_map["relation_type"].tolist()
df = df[df["relation_type"].isin(relation_types)]
print("Number of relations after removed unknown relation_types: {}".format(len(df)))
kg_file_ignore_relation_types_filtered = os.path.join(
    temp_dir, "knowledge_graph_ignore_relation_types_filtered.tsv"
)
df = df.merge(relation_type_map[["relation_type", "formatted_relation_type"]], on="relation_type", how="left")

ignore_formatted_relation_types = [
    # There are too much relations in this relation type, but they might not useful.
    "BioMedGPS::Interaction::Compound:Compound",
    # We don't like associated_with relation type.
    "BioMedGPS::AssociatedWith::Gene:Gene",
]
df = df[~df["formatted_relation_type"].isin(ignore_formatted_relation_types)]
print("Number of relations after removed ignore formatted_relation_types: {}".format(len(df)))

df.to_csv(kg_file_ignore_relation_types_filtered, sep="\t", index=False)
kg_file = kg_file_ignore_relation_types_filtered
kg_file

Number of relations: 47498669
Number of relations after removed ignore relation_types: 46619502
Number of relations after removed unknown relation_types: 15070186
Number of relations after removed ignore formatted_relation_types: 10549943


'/var/folders/4s/d4nr1sg91ps1k3qz00h28w_r0000gp/T/tmpectx_yk8/knowledge_graph_ignore_relation_types_filtered.tsv'

### [Optional] Map all mouse genes to human genes as much as possible

In [7]:
## Number of Mouse / Rat / Human Genes
entities = pd.read_csv(entity_file, sep="\t")
genes = entities[entities["label"] == "Gene"]
mouse_genes = genes[genes["taxid"] == 10090]
rat_genes = genes[genes["taxid"] == 10116]
human_genes = genes[genes["taxid"] == 9606]

print("Number of Entities: ", len(mouse_genes), len(rat_genes), len(human_genes))
knowledge_graph = pd.read_csv(kg_file, sep="\t")
mouse_relations = knowledge_graph[
    knowledge_graph["source_id"].isin(mouse_genes["id"])
    | knowledge_graph["target_id"].isin(mouse_genes["id"])
]

human_relations = knowledge_graph[
    knowledge_graph["source_id"].isin(human_genes["id"])
    | knowledge_graph["target_id"].isin(human_genes["id"])
]

print(f"Number of mouse gene relations: {len(mouse_relations)}, Number of human gene relations: {len(human_relations)}")

human_mouse_gene_mappings = pd.read_csv(
    os.path.join(graph_data_dir, "mapping", "human_mouse_gene_mappings.tsv"), sep="\t"
)
# NOTE: There might be multiple mappings for a single mouse gene, we will use the first mapping for now. such as PTCD1[ENTREZ:26024] and ATP5MF-PTCD1[ENTREZ:100526740] have the same mouse gene mapping. Ptcd1[ENTREZ: 71799]. 
# human_mouse_gene_map["ENTREZ:71799"]
human_mouse_gene_map = dict(
    zip(
        human_mouse_gene_mappings["entrez_id_mouse"],
        human_mouse_gene_mappings["entrez_id_human"],
    )
)

  entities = pd.read_csv(entity_file, sep="\t")


Number of Entities:  50015 0 44000
Number of mouse gene relations: 110419, Number of human gene relations: 9990154


In [8]:
# We don't like mouse genes, let's convert them to human genes. If a mouse gene doesn't have a human gene mapping, we will keep the mouse gene. So the users can see that the gene is a mouse gene.
# Convert the mouse_genes["id"] Series to a set for faster lookup
mouse_gene_ids = set(mouse_genes["id"].values)

# Vectorized operation for source_id
knowledge_graph["source_id"] = knowledge_graph["source_id"].map(
    lambda x: human_mouse_gene_map.get(x, x) if x in mouse_gene_ids else x
)

# Vectorized operation for target_id
knowledge_graph["target_id"] = knowledge_graph["target_id"].map(
    lambda x: human_mouse_gene_map.get(x, x) if x in mouse_gene_ids else x
)

# Check whether the conversion is successful
mouse_relations = knowledge_graph[
    knowledge_graph["source_id"].isin(mouse_genes["id"])
    | knowledge_graph["target_id"].isin(mouse_genes["id"])
]

human_relations = knowledge_graph[
    knowledge_graph["source_id"].isin(human_genes["id"])
    | knowledge_graph["target_id"].isin(human_genes["id"])
]

# We cannot use the pattern below because some gene names don't follow the pattern. for example, "Bdnf" is used as a human gene in GNBR database.
# pattern = r"^[A-Z][a-z]+$"
# not_matched_genes = knowledge_graph[
#     ((knowledge_graph["source_type"] == "Gene") & knowledge_graph["source_name"].str.match(pattern, na=False)) |
#     ((knowledge_graph["target_type"] == "Gene") & knowledge_graph["target_name"].str.match(pattern, na=False))
# ]
# not_matched_genes[
#     (not_matched_genes["source_id"] == "ENTREZ:627")
#     | (not_matched_genes["target_id"] == "ENTREZ:627")
# ]

not_matched_genes = knowledge_graph[
    knowledge_graph["source_id"].isin(mouse_genes["id"])
    | knowledge_graph["target_id"].isin(mouse_genes["id"])
]

# Expected: 0, xxx
print(len(mouse_relations), len(human_relations), len(not_matched_genes))

# Write the knowledge graph to a file
kg_file_mouse_converted = os.path.join(temp_dir, "knowledge_graph_mouse_converted.tsv")
knowledge_graph.to_csv(kg_file_mouse_converted, sep="\t", index=False)
kg_file = kg_file_mouse_converted

17284 10083075 17284


### [Optional] Format all relation types

In [None]:
if relation_type_file and os.path.exists(relation_type_file):
    relation_types = pd.read_csv(relation_type_file, sep="\t")

    print("Number of relation types: {}".format(len(relation_types)))

    # Read the kg file
    knowledge_graph = pd.read_csv(kg_file, sep="\t")

    # Format the relation types
    ## Remove the formatted_relation_type column if it exists
    if "formatted_relation_type" in knowledge_graph.columns:
        knowledge_graph = knowledge_graph.drop(columns=["formatted_relation_type"])
    knowledge_graph = knowledge_graph.merge(relation_types[["relation_type", "formatted_relation_type"]], on="relation_type", how="left")

    invalid_knowledge_graph = knowledge_graph[knowledge_graph["formatted_relation_type"].isna()]
    print("Number of invalid knowledge graph: {}".format(len(invalid_knowledge_graph)))

    invalid_knowledge_graph_file = os.path.join(temp_dir, "invalid_knowledge_graph.tsv")
    invalid_knowledge_graph.to_csv(invalid_knowledge_graph_file, sep="\t", index=False)
    print("Please check the invalid knowledge graph file: {}".format(invalid_knowledge_graph_file))

    kg_file_relation_types_formatted = os.path.join(temp_dir, "knowledge_graph_relation_types_formatted.tsv")
    knowledge_graph.to_csv(kg_file_relation_types_formatted, sep="\t", index=False)
    kg_file = kg_file_relation_types_formatted

### Annotate the knowledge graph with the entities

In [None]:
args = [
    "python3",
    os.path.join(os.path.dirname(lib_dir), "graph_data", "scripts", "annotate_relations.py"),
    "--entity-file",
    entity_file,
    "--relation-file",
    kg_file,
    "--output-dir",
    os.path.dirname(kg_file),
    "--strict-mode" if skip_rows_not_in_entity_file else "",
]

print("Running: {}".format(" ".join(args)))
args_str = " ".join(args)
!{args_str}
print("File written to: {}".format(annotated_kg_file))

### Copy all files to the dataset folder

In [None]:
os.makedirs(outputdir, exist_ok=True)

files = [
    (entity_file, os.path.join(outputdir, "annotated_entities.tsv")),
    (kg_file, os.path.join(outputdir, "knowledge_graph.tsv")),
    (annotated_kg_file, os.path.join(outputdir, "annotated_knowledge_graph.tsv")),
]

for f, output_file in files:
    subprocess.check_output(["cp", f, output_file])

print("Please found all files in {}".format(outputdir))