In [3]:
import pickle

from anngel.datasets.openbiolink import OpenBioLinkDataset

import pandas as pd
import numpy as np

from pathlib import Path

# Directory handed to the dataset class (as out_dir / "openbiolink").
out_dir = Path("tmp")
out_dir.mkdir(exist_ok=True)

# Export the graph returned by to_networkx() and its to_undirected() variant
# as pickles, then drop the in-memory graph since it is not used afterwards.
ds = OpenBioLinkDataset(out_dir / "openbiolink")
graph = ds.to_networkx()
with open("graph_obl_directed.pkl", "wb") as f:
    pickle.dump(graph, f)
with open("graph_obl_undirected.pkl", "wb") as f:
    pickle.dump(graph.to_undirected(), f)
del graph

# --- irr_gene: rows whose "IR list" column equals "IR_True" ------------------
df = pd.read_csv("../../data/full_node_info_train_val_test.csv", sep=";")
df["irr_gene"] = df["IR list"] == "IR_True"
# NOTE(review): this is a left merge, so nodes absent from the CSV end up with
# NaN (not False) in irr_gene, and duplicated "Node ID" rows in the CSV would
# duplicate node rows. Later cells take positional indices of df, so presumably
# the row order/count must match ds.get_nodes() — confirm.
df = (
    ds.get_nodes()
    .merge(
        df[["Node ID", "irr_gene"]], how="left", left_on="node_id", right_on="Node ID"
    )
    .drop(columns=["Node ID"])
)

# --- irr_disease: node ids derived from the disease URL list ------------------
irr_diseases_df = pd.read_csv(
    "../../data/IR_related_diseases_DOID_533.tsv", sep="\t", header=None
)
irr_diseases_df.columns = ["Disease", "URL"]
# node_id is the last "/"-separated URL segment with every "_" replaced by ":".
# Vectorised .str operations are used instead of an element-wise lambda so a
# missing URL produces NaN rather than raising a TypeError.
irr_diseases_df["node_id"] = (
    irr_diseases_df["URL"]
    .str.split("/")
    .str[-1]
    .str.replace("_", ":", regex=False)
)
irr_diseases_df.to_csv("ir_related_diseases.csv", index=False)
df["irr_disease"] = df["node_id"].isin(irr_diseases_df["node_id"])

# --- irr_pathway: second ":"-separated part of node_id vs. pathway ids --------
irr_pathways_df = pd.read_csv("../../data/IRrelatedPathways", sep="\t")
# .str[1] yields NaN for a node id without ":" (the former lambda raised an
# IndexError there); dropna() on the reference ids keeps such NaN rows from
# ever matching, so they are simply flagged False.
df["irr_pathway"] = (
    df["node_id"]
    .str.split(":")
    .str[1]
    .isin(irr_pathways_df["Pathway identifier"].dropna())
)
df.to_parquet("openbiolink_df.parquet", index=False)

Found 0 files in local directory "tmp/s3_files".


HQ_DIR.zip: 109MB [00:40, 2.67MB/s]                              
100%|██████████| 27/27 [00:13<00:00,  1.97it/s]


No local copy of file "full_node_info_train_val_test.csv" available.
Remote copy available. Trying to download...
No local copy of file "IR_related_diseases_DOID_533.tsv" available.
Remote copy available. Trying to download...
No local copy of file "IRrelatedPathways" available.
Remote copy available. Trying to download...
No local copy of file "openbiolink_df.parquet" available.
Remote copy available. Trying to download...


In [4]:
# Positional row indices (into df) of the flagged pathways and of all GENE nodes.
irr_pathway_idcs = np.where(df["irr_pathway"])[0]
gene_idcs = np.where(df["node_type"] == "GENE")[0]

# Full cross product of (gene index, pathway index). It is released right
# after being written because nothing below needs the materialised list.
pairs_pathway = [(a, b) for a in gene_idcs for b in irr_pathway_idcs]
with open("pairs_obl_pathway.pkl", "wb") as f:
    pickle.dump(pairs_pathway, f)
del pairs_pathway

# np.where returns indices already sorted and unique, so the distinct sources
# and targets of the cross product are the index arrays themselves; there is no
# need to rebuild them from a 2-D copy of the pair list (which also raised an
# IndexError when the product was empty). With no targets the product has no
# pairs at all, hence no sources either.
targets = irr_pathway_idcs
sources = gene_idcs if targets.size else gene_idcs[:0]

# One entry per gene: (gene index, array of all pathway indices).
input_pairs_pathway = [(g_idx, targets) for g_idx in sources]
with open("input_pairs_obl_pathway.pkl", "wb") as f:
    pickle.dump(input_pairs_pathway, f)
del input_pairs_pathway

In [5]:
# Positional row indices (into df) of all GENE nodes and of the flagged diseases.
gene_idcs = np.where(df["node_type"] == "GENE")[0]
irr_disease_idcs = np.where(df["irr_disease"])[0]

# Full cross product of (gene index, disease index). It is released right
# after being written because nothing below needs the materialised list.
pairs_disease = [(a, b) for a in gene_idcs for b in irr_disease_idcs]
with open("pairs_obl_disease.pkl", "wb") as f:
    pickle.dump(pairs_disease, f)
del pairs_disease

# np.where returns indices already sorted and unique, so the distinct sources
# and targets of the cross product are the index arrays themselves; there is no
# need to rebuild them from a 2-D copy of the pair list (which also raised an
# IndexError when the product was empty). With no targets the product has no
# pairs at all, hence no sources either.
targets = irr_disease_idcs
sources = gene_idcs if targets.size else gene_idcs[:0]

# One entry per gene: (gene index, array of all disease indices).
input_pairs_disease = [(g_idx, targets) for g_idx in sources]
with open("input_pairs_obl_disease.pkl", "wb") as f:
    pickle.dump(input_pairs_disease, f)
del input_pairs_disease

In [6]:
# Export the dataset's node table as CSV, without the index column.
nodes_table = ds.get_nodes()
nodes_table.to_csv("nodes_obl.csv", index=False)