In [3]:
import pickle

from anngel.datalib.s3datastore import S3DataStore
from anngel.datasets.openbiolink import OpenBioLinkDataset

import pandas as pd
import numpy as np

from pathlib import Path

# Local scratch directory used for everything this notebook reads or writes.
out_dir = Path("tmp")
out_dir.mkdir(exist_ok=True)

# Handle on the project's S3 location; files are mirrored locally under
# `tmp/s3_files`. All three credential arguments are passed as None.
# NOTE(review): presumably S3DataStore then falls back to ambient AWS
# credentials -- confirm against its implementation.
data_store = S3DataStore(
    bucket_name="nnedl-core-prd-eu-central-1-curated",
    prefix="compbiosandbox/Projects/NNRCO_KG/Sprint_3_3_IR_Paper/",
    local_directory=out_dir.joinpath("s3_files"),
    aws_access_key_id=None,
    aws_secret_access_key=None,
    aws_session_token=None,
)

# Build the OpenBioLink dataset rooted at `tmp/openbiolink` and convert it to a
# NetworkX graph.
ds = OpenBioLinkDataset(out_dir / "openbiolink")
graph = ds.to_networkx()

# Pickle the graph exactly as returned by the dataset (stored under the
# "directed" file name).
graph_directed_dsf = data_store.get_file(
    "graph_obl_directed.pkl", new_if_not_exist=True
)
with open(graph_directed_dsf.local_path, mode="wb") as fh:
    pickle.dump(graph, fh)
# graph_directed_dsf.upload()

# Pickle an undirected copy of the same graph.
graph_undirected_dsf = data_store.get_file(
    "graph_obl_undirected.pkl", new_if_not_exist=True
)
with open(graph_undirected_dsf.local_path, mode="wb") as fh:
    pickle.dump(graph.to_undirected(), fh)
# graph_undirected_dsf.upload()

# The graph is not needed past this point; drop the reference so the kernel
# can reclaim the memory.
del graph

# Load the curated per-node table (semicolon-separated) and derive a boolean
# flag that is True where the "IR list" column equals "IR_True".
train_test_dsf = data_store.get_file("full_node_info_train_val_test.csv")
gene_labels = pd.read_csv(train_test_dsf.local_path, sep=";")
gene_labels["irr_gene"] = gene_labels["IR list"] == "IR_True"

# Attach the flag to the full node table. This is a left join, so every node
# row is kept and nodes absent from the curated table get a missing value in
# `irr_gene` rather than False.
# NOTE(review): if "Node ID" is not unique in the CSV the join would duplicate
# node rows -- confirm the file has one row per node.
df = (
    ds.get_nodes()
    .merge(
        gene_labels[["Node ID", "irr_gene"]],
        how="left",
        left_on="node_id",
        right_on="Node ID",
    )
    .drop(columns=["Node ID"])
)

# Load the header-less, tab-separated list of IR-related diseases and name its
# two columns.
irr_diseases_dsf = data_store.get_file("IR_related_diseases_DOID_533.tsv")
irr_diseases_df = pd.read_csv(irr_diseases_dsf.local_path, sep="\t", header=None)
irr_diseases_df.columns = ["Disease", "URL"]

# Derive a node id from each URL: take the last path segment and replace "_"
# with ":" (e.g. ".../DOID_533" -> "DOID:533"). The vectorised `.str`
# accessors propagate missing URLs as missing values, where a row-wise lambda
# would raise on them.
irr_diseases_df["node_id"] = (
    irr_diseases_df["URL"]
    .str.split("/")
    .str[-1]
    .str.replace("_", ":", regex=False)
)

# Persist the disease table together with the derived node ids.
irr_diseases_out_dsf = data_store.get_file(
    "ir_related_diseases.csv", new_if_not_exist=True
)
irr_diseases_df.to_csv(irr_diseases_out_dsf.local_path, index=False)
# irr_diseases_out_dsf.upload()

# Flag every node whose id appears in the IR-related disease list.
df["irr_disease"] = df["node_id"].isin(irr_diseases_df["node_id"])

# Load the tab-separated table of IR-related pathways.
irr_pathways_dsf = data_store.get_file("IRrelatedPathways")
irr_pathways_df = pd.read_csv(irr_pathways_dsf.local_path, sep="\t")

# Flag nodes whose id component after the first ":" is a listed pathway
# identifier. `.str[1]` yields a missing value (instead of raising) for ids
# that contain no ":"; dropping missing identifiers from the reference list
# keeps such rows from matching, so they are flagged False.
# NOTE(review): the match ignores the id prefix, so a node of any type with a
# matching suffix is flagged -- confirm that is intended.
df["irr_pathway"] = (
    df["node_id"]
    .str.split(":")
    .str[1]
    .isin(irr_pathways_df["Pathway identifier"].dropna())
)

# Persist the annotated node table.
obl_df_dfs = data_store.get_file("openbiolink_df.parquet", new_if_not_exist=True)
df.to_parquet(obl_df_dfs.local_path, index=False)
# obl_df_dfs.upload()

Found 0 files in local directory "tmp/s3_files".


HQ_DIR.zip: 109MB [00:40, 2.67MB/s]                              
100%|██████████| 27/27 [00:13<00:00,  1.97it/s]


No local copy of file "full_node_info_train_val_test.csv" available.
Remote copy available. Trying to download...
No local copy of file "IR_related_diseases_DOID_533.tsv" available.
Remote copy available. Trying to download...
No local copy of file "IRrelatedPathways" available.
Remote copy available. Trying to download...
No local copy of file "openbiolink_df.parquet" available.
Remote copy available. Trying to download...


In [4]:
# Positional row indices (into `df`) of IR-related pathway nodes and of gene
# nodes.
irr_pathway_idcs = np.where(df["irr_pathway"])[0]
gene_idcs = np.where(df["node_type"] == "GENE")[0]

# Every (gene, IR-related pathway) combination, stored as a flat list of
# index pairs.
pairs_pathway = [(a, b) for a in gene_idcs for b in irr_pathway_idcs]
pairs_pathway_dsf = data_store.get_file("pairs_obl_pathway.pkl", new_if_not_exist=True)
with open(pairs_pathway_dsf.local_path, "wb") as f:
    pickle.dump(pairs_pathway, f)
# pairs_pathway_dsf.upload()

# Grouped form: one (gene index, all target indices) entry per gene.
# The pair list is the full Cartesian product, so its distinct first/second
# components are exactly the two index vectors above. Taking them directly
# avoids converting the whole pair list to an array twice, and avoids the
# IndexError that 2-D slicing raises when there are no pairs at all.
if pairs_pathway:
    sources = np.unique(gene_idcs)
    targets = np.unique(irr_pathway_idcs)
else:
    sources = np.array([], dtype=gene_idcs.dtype)
    targets = np.array([], dtype=irr_pathway_idcs.dtype)
input_pairs_pathway = [(g_idx, targets) for g_idx in sources]
input_pairs_pathway_dsf = data_store.get_file(
    "input_pairs_obl_pathway.pkl", new_if_not_exist=True
)
with open(input_pairs_pathway_dsf.local_path, "wb") as f:
    pickle.dump(input_pairs_pathway, f)
# input_pairs_pathway_dsf.upload()

# Both lists are already on disk; drop them to release kernel memory.
del pairs_pathway
del input_pairs_pathway


In [5]:
# Positional row indices (into `df`) of gene nodes and of IR-related disease
# nodes.
gene_idcs = np.where(df["node_type"] == "GENE")[0]
irr_disease_idcs = np.where(df["irr_disease"])[0]

# Every (gene, IR-related disease) combination, stored as a flat list of
# index pairs.
pairs_disease = [(a, b) for a in gene_idcs for b in irr_disease_idcs]
pairs_disease_dsf = data_store.get_file("pairs_obl_disease.pkl", new_if_not_exist=True)
with open(pairs_disease_dsf.local_path, "wb") as f:
    pickle.dump(pairs_disease, f)
# pairs_disease_dsf.upload()

# Grouped form: one (gene index, all target indices) entry per gene.
# The pair list is the full Cartesian product, so its distinct first/second
# components are exactly the two index vectors above. Taking them directly
# avoids converting the whole pair list to an array twice, and avoids the
# IndexError that 2-D slicing raises when there are no pairs at all.
if pairs_disease:
    sources = np.unique(gene_idcs)
    targets = np.unique(irr_disease_idcs)
else:
    sources = np.array([], dtype=gene_idcs.dtype)
    targets = np.array([], dtype=irr_disease_idcs.dtype)
input_pairs_disease = [(g_idx, targets) for g_idx in sources]
input_pairs_disease_dsf = data_store.get_file(
    "input_pairs_obl_disease.pkl", new_if_not_exist=True
)
with open(input_pairs_disease_dsf.local_path, "wb") as f:
    pickle.dump(input_pairs_disease, f)
# input_pairs_disease_dsf.upload()

# Both lists are already on disk; drop them to release kernel memory.
del pairs_disease
del input_pairs_disease


In [6]:
# Export the dataset's unannotated node table (as returned by `ds.get_nodes()`)
# to CSV, without the index column.
nodes_df_dfs = data_store.get_file("nodes_obl.csv", new_if_not_exist=True)
nodes_table = ds.get_nodes()
nodes_table.to_csv(nodes_df_dfs.local_path, index=False)