In [1]:
import pickle

from anngel.datalib.s3datastore import S3DataStore
from anngel.datasets.openbiolink import OpenBioLinkDataset
from anngel.datasets.custom_dataset import CustomDataset

import pandas as pd
import numpy as np
import pandas as pd

from pathlib import Path

# --- Configuration -----------------------------------------------------------
# CSV files holding gene ids; only TRAIN_IDS_FILE is read by the active code in
# this cell, the other two appear in commented-out lines only.
TRAIN_IDS_FILE = "../MBVK_workflows/editedOpenBioLink/train_genes_id.csv"
VAL_IDS_FILE = "../MBVK_workflows/editedOpenBioLink/val_genes_id.csv"
TEST_IDS_FILE = "../MBVK_workflows/editedOpenBioLink/test_genes_id.csv"

# Local scratch directory; everything written by this notebook lives below it.
out_dir = Path("tmp")
out_dir.mkdir(exist_ok=True)

# S3-backed file store. All three credential arguments are passed as None.
# NOTE(review): presumably the store then resolves credentials on its own
# (environment / instance role) - confirm against S3DataStore.
data_store = S3DataStore(
    bucket_name="nnedl-core-prd-eu-central-1-curated",
    prefix="compbiosandbox/Projects/NNRCO_KG/Sprint_3_3_IR_Paper/",
    local_directory=out_dir / "s3_files",
    aws_access_key_id=None,
    aws_secret_access_key=None,
    aws_session_token=None,
)

# Base dataset, rooted at tmp/openbiolink.
ds = OpenBioLinkDataset(out_dir / "openbiolink")

# Target node of every edge added below.
# NOTE(review): presumably the HPO term for insulin resistance - confirm.
irr_node = "HP:0000855"

edges = ds.get_edges()
nodes = ds.get_nodes()
genes = nodes.query('node_type == "GENE"')['node_id'].values

# Genes that already have an edge pointing at `irr_node` in the base dataset.
small_irr_genes = edges.query(
    'target_node_id == @irr_node and source_node_id.isin(@genes)'
)['source_node_id'].values

# Gene ids taken from the first data column of the id file(s). Only the
# training file is active; validation/test can be re-enabled below.
large_irr_genes = pd.concat([
    pd.read_csv(TRAIN_IDS_FILE, index_col=0),
    # pd.read_csv(VAL_IDS_FILE, index_col=0),
    # pd.read_csv(TEST_IDS_FILE, index_col=0),
]).iloc[:,0].values

# Ids that are listed in the file(s) but not yet linked to `irr_node`.
# pd.unique drops repeated ids (keeping first-seen order) so that each gene
# contributes at most one new edge.
missing_irr_genes = pd.unique(
    large_irr_genes[~np.isin(large_irr_genes, small_irr_genes)]
)

# One new edge per missing gene. The 5-tuple is matched positionally to
# `edges.columns`.
# NOTE(review): assumes the column order is (source, edge type, target,
# <numeric/quality field>, source db) - verify against the edges schema.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
new_edges = pd.DataFrame.from_records(
    [
        (gene, 'GENE_PHENOTYPE', irr_node, np.nan, 'CUSTOM')
        for gene in missing_irr_genes
    ],
    columns=edges.columns,
)

# ignore_index avoids duplicated row labels in the combined frame.
edges_enriched = pd.concat((edges, new_edges), ignore_index=True)
nodes_enriched = nodes  # the node table is unchanged; only edges are added

# Persist the enriched tables and reload them as a CustomDataset.
# `ds` is rebound here: from this point on it refers to the enriched dataset.
ds_name = 'enriched_openbiolink'
(out_dir / ds_name).mkdir(exist_ok=True, parents=True)
edges_enriched.to_csv(out_dir / ds_name / 'edges.tsv', index=False, sep='\t')
nodes_enriched.to_csv(out_dir / ds_name / 'nodes.tsv', index=False, sep='\t')
ds = CustomDataset(out_dir, ds_name)

Found 12 files in local directory "tmp/s3_files".


In [2]:
# Build the networkx graph of the dataset and pickle two variants of it into
# the data store's local directory. Uploads stay disabled (commented out).
graph = ds.to_networkx()

# Variant 1: the graph exactly as returned by `to_networkx()`.
graph_directed_dsf = data_store.get_file(
    "graph_obl_enriched_directed.pkl",
    new_if_not_exist=True,
)
with open(graph_directed_dsf.local_path, "wb") as directed_fh:
    pickle.dump(graph, directed_fh)
# graph_directed_dsf.upload()

# Variant 2: the undirected view of the same graph.
graph_undirected_dsf = data_store.get_file(
    "graph_obl_enriched_undirected.pkl",
    new_if_not_exist=True,
)
with open(graph_undirected_dsf.local_path, "wb") as undirected_fh:
    undirected_graph = graph.to_undirected()
    pickle.dump(undirected_graph, undirected_fh)
# graph_undirected_dsf.upload()

# Drop both graph objects so they do not stay resident in the kernel.
del graph, undirected_graph

# Node table of the current dataset; the boolean annotation columns are added
# to this frame.
df = ds.get_nodes()

# Ids of all GENE nodes, taken from the frame loaded above instead of loading
# the node table a second time.
genes = df.query('node_type == "GENE"')['node_id'].values

# Genes with an edge pointing at `irr_node`.
irr_genes = ds.get_edges().query(
    'target_node_id == @irr_node and source_node_id.isin(@genes)'
)['source_node_id'].values

# True for every node whose id is one of those genes, False otherwise.
df['irr_gene'] = df['node_id'].isin(irr_genes)

# Load the disease list: a header-less, tab-separated file whose two columns
# are named Disease and URL below.
irr_diseases_dsf = data_store.get_file("IR_related_diseases_DOID_533.tsv")
irr_diseases_df = pd.read_csv(irr_diseases_dsf.local_path, sep="\t", header=None)
irr_diseases_df.columns = ["Disease", "URL"]

# Derive the node id from the last path segment of the URL, replacing "_" with
# ":". The vectorised string accessors leave a missing URL as NaN instead of
# raising inside a Python-level lambda.
irr_diseases_df["node_id"] = (
    irr_diseases_df["URL"]
    .str.rsplit("/", n=1)
    .str[-1]
    .str.replace("_", ":", regex=False)
)

# Keep a CSV copy of the parsed table next to the other local files.
irr_diseases_out_dsf = data_store.get_file(
    "ir_related_diseases.csv", new_if_not_exist=True
)
irr_diseases_df.to_csv(irr_diseases_out_dsf.local_path, index=False)
# irr_diseases_out_dsf.upload()

# True for every node whose id appears in the disease list.
df["irr_disease"] = df["node_id"].isin(irr_diseases_df["node_id"])

# Load the pathway list (tab-separated, with a header row).
irr_pathways_dsf = data_store.get_file("IRrelatedPathways")
irr_pathways_df = pd.read_csv(irr_pathways_dsf.local_path, sep="\t")

# A node is flagged when the second ":"-separated part of its id is listed in
# the "Pathway identifier" column. `.str[1]` yields NaN for ids without a ":"
# (or missing ids), so such nodes are simply flagged False instead of raising
# inside a Python-level lambda.
df["irr_pathway"] = (
    df["node_id"]
    .str.split(":")
    .str[1]
    .isin(irr_pathways_df["Pathway identifier"])
)

# Persist the annotated node table as parquet in the local store directory.
obl_df_dsf = data_store.get_file("enriched_openbiolink_df.parquet", new_if_not_exist=True)
df.to_parquet(obl_df_dsf.local_path, index=False)
# obl_df_dsf.upload()

  edges_df = pd.read_csv(self.edges_file, sep="\t")


Only local copy available.
Only local copy available.


  edges_df = pd.read_csv(self.edges_file, sep="\t")


Only local copy available.
Only local copy available.


In [3]:
# Positional row indices (into `df`) of the flagged pathway nodes and of all
# GENE nodes.
irr_pathway_idcs = np.where(df["irr_pathway"])[0]
gene_idcs = np.where(df["node_type"] == "GENE")[0]

# Every (gene, pathway) index combination.
pairs_pathway = [(a, b) for a in gene_idcs for b in irr_pathway_idcs]
pairs_pathway_dsf = data_store.get_file("pairs_obl_enriched_pathway.pkl", new_if_not_exist=True)
with open(pairs_pathway_dsf.local_path, "wb") as f:
    pickle.dump(pairs_pathway, f)
# pairs_pathway_dsf.upload()

# Distinct sources/targets occurring in `pairs_pathway`. Because the pairs are
# the full Cartesian product and np.where returns strictly increasing indices,
# these are the index arrays themselves whenever at least one pair exists.
# Reading them directly avoids converting the whole pair list to an array
# twice, and the empty slices cover the case of no pairs, where indexing
# `np.array([])[:, 0]` would raise IndexError.
sources = gene_idcs if pairs_pathway else gene_idcs[:0]
targets = irr_pathway_idcs if pairs_pathway else irr_pathway_idcs[:0]

# One entry per source gene, each paired with the complete target array.
input_pairs_pathway = [(g_idx, targets) for g_idx in sources]
input_pairs_pathway_dsf = data_store.get_file(
    "input_pairs_obl_enriched_pathway.pkl", new_if_not_exist=True
)
with open(input_pairs_pathway_dsf.local_path, "wb") as f:
    pickle.dump(input_pairs_pathway, f)
# input_pairs_pathway_dsf.upload()

# The pair lists can be large; release them once written.
del pairs_pathway
del input_pairs_pathway


Only local copy available.
Only local copy available.


In [4]:
# Positional row indices (into `df`) of all GENE nodes and of the flagged
# disease nodes.
gene_idcs = np.where(df["node_type"] == "GENE")[0]
irr_disease_idcs = np.where(df["irr_disease"])[0]

# Every (gene, disease) index combination.
pairs_disease = [(a, b) for a in gene_idcs for b in irr_disease_idcs]
pairs_disease_dsf = data_store.get_file("pairs_obl_enriched_disease.pkl", new_if_not_exist=True)
with open(pairs_disease_dsf.local_path, "wb") as f:
    pickle.dump(pairs_disease, f)
# pairs_disease_dsf.upload()

# Distinct sources/targets occurring in `pairs_disease`. Because the pairs are
# the full Cartesian product and np.where returns strictly increasing indices,
# these are the index arrays themselves whenever at least one pair exists.
# Reading them directly avoids converting the whole pair list to an array
# twice, and the empty slices cover the case of no pairs, where indexing
# `np.array([])[:, 0]` would raise IndexError.
sources = gene_idcs if pairs_disease else gene_idcs[:0]
targets = irr_disease_idcs if pairs_disease else irr_disease_idcs[:0]

# One entry per source gene, each paired with the complete target array.
input_pairs_disease = [(g_idx, targets) for g_idx in sources]
input_pairs_disease_dsf = data_store.get_file(
    "input_pairs_obl_enriched_disease.pkl", new_if_not_exist=True
)
with open(input_pairs_disease_dsf.local_path, "wb") as f:
    pickle.dump(input_pairs_disease, f)
# input_pairs_disease_dsf.upload()

# The pair lists can be large; release them once written.
del pairs_disease
del input_pairs_disease


Only local copy available.
Only local copy available.


In [5]:
# Write the dataset's node table as CSV (without the index column) to the
# local path of the "nodes_obl_enriched.csv" store entry.
nodes_df_dfs = data_store.get_file(
    "nodes_obl_enriched.csv",
    new_if_not_exist=True,
)
ds.get_nodes().to_csv(
    nodes_df_dfs.local_path,
    index=False,
)

Only local copy available.
