In [2]:
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Directory holding the pickled pair lists (`pairs_obl*.pkl`, `input_pairs_obl*.pkl`).
PAIRS_DIR = "."
# Raw nodewise metric CSVs are read from here.
NODEWISE_DIR = "2_outputs/nodewise/"
# Raw pairwise metric CSVs (`*_disease.csv`, `*_pathway.csv`) are read from here.
PAIRWISE_DIR = "2_outputs/pairwise/"
# Destination for the reshaped pairwise metric tables (created if missing).
PAIRWISE_PROCESSED_DIR = "./3_outputs/pairwise_processed/"
# Destination for the re-saved nodewise metric tables (created if missing).
NODEWISE_PROCESSED_DIR = "./3_outputs/nodewise_processed/"
# NOTE(review): the last cell also uses XY_OUTPUT_DIR and REF_DF_OUT_FILE, which
# are not defined in the cells visible here — confirm where they are set.

## Prepare Pairwise Metrics Files

In [3]:
def reshape_pairwise_metric(
    pairs_file,
    metric_file,
    out_file,
):
    """Reshape a flat pairwise metric file into a labelled 2-D table.

    The sorted unique first elements of the pickled pairs become the row
    labels and the sorted unique second elements become the column labels.
    The values read from ``metric_file`` are laid out row-major on that grid
    and written to ``out_file`` as CSV.

    Parameters
    ----------
    pairs_file : path-like
        Pickle file containing an iterable of pairs; only ``p[0]`` and
        ``p[1]`` of each pair are used.
    metric_file : path-like
        CSV file with the metric values. It is read with pandas' default
        header handling, so the first line is treated as a header.
    out_file : path-like
        Destination CSV for the reshaped table.

    Raises
    ------
    ValueError
        If the number of values in ``metric_file`` does not equal
        ``n_rows * n_cols`` of the pair grid.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at pair files produced by this pipeline.
    with open(pairs_file, "rb") as f:
        pairs = pickle.load(f)

    p0s = np.unique([p[0] for p in pairs])
    p1s = np.unique([p[1] for p in pairs])

    val_df = pd.read_csv(metric_file)

    # Fail with a message that names the file and both sizes instead of the
    # bare numpy "cannot reshape array" error.
    expected = len(p0s) * len(p1s)
    if val_df.size != expected:
        raise ValueError(
            f"{metric_file}: expected {len(p0s)} x {len(p1s)} = {expected} "
            f"values for the pair grid, found {val_df.size}"
        )

    # NOTE(review): assumes the values are ordered row-major over the sorted
    # (p0, p1) grid — confirm against the code that wrote metric_file.
    metric_df = pd.DataFrame(
        val_df.values.reshape((len(p0s), len(p1s))), columns=p1s, index=p0s
    )
    metric_df.to_csv(out_file)


def reshape_pairwise_metric2(
    pairs_file,
    metric_file,
    out_file,
):
    """Attach row and column labels to an already 2-D metric file.

    Row labels are the sorted unique first elements of the pickled pairs.
    Column labels are taken from the second element of the first pair
    (presumably a list of target ids shared by every pair — TODO confirm).
    ``metric_file`` is read as a header-less CSV, relabelled, and written to
    ``out_file``.
    """
    with open(pairs_file, "rb") as handle:
        pair_list = pickle.load(handle)

    row_labels = np.unique([entry[0] for entry in pair_list])
    col_labels = pair_list[0][1]

    # Rows are labelled first, then columns.
    (
        pd.read_csv(metric_file, header=None)
        .set_axis(row_labels, axis=0)
        .set_axis(col_labels, axis=1)
        .to_csv(out_file)
    )

In [4]:
# Re-save every nodewise metric CSV into the processed directory, going through
# pandas so the output is written with pandas' default CSV settings.
metric_input_dir = Path(NODEWISE_DIR)
out_dir = Path(NODEWISE_PROCESSED_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

metric_files = list(metric_input_dir.glob("*.csv"))

for metric_file in tqdm(metric_files):
    pd.read_csv(metric_file).to_csv(out_dir / metric_file.name)

100%|██████████| 9/9 [00:08<00:00,  1.11it/s]


In [5]:
# Reshape every pairwise metric file into a labelled table, one pass per
# target-type suffix.
metric_input_dir = Path(PAIRWISE_DIR)
pairs_dir = Path(PAIRS_DIR)
out_dir = Path(PAIRWISE_PROCESSED_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

for suffix in ("_disease", "_pathway"):
    metric_files = list(metric_input_dir.glob(f"*{suffix}.csv"))

    for metric_file in tqdm(metric_files):
        # Personalized PageRank files use the other reshaper and the
        # "input_pairs" pickle; every other metric uses the "pairs" pickle.
        is_ppr = metric_file.name.startswith("personalized_pagerank")
        fun = reshape_pairwise_metric2 if is_ppr else reshape_pairwise_metric
        pairs_name = (
            f"input_pairs_obl{suffix}.pkl" if is_ppr else f"pairs_obl{suffix}.pkl"
        )
        fun(
            pairs_file=pairs_dir / pairs_name,
            metric_file=metric_file,
            out_file=out_dir / metric_file.name,
        )

100%|██████████| 8/8 [02:02<00:00, 15.35s/it]
100%|██████████| 8/8 [09:03<00:00, 67.92s/it]


In [1]:
# Build the train/val/test output directories, oversample the training indices,
# and write the label files (y.csv) for each split.
#
# NOTE(review): XY_OUTPUT_DIR, REF_DF_OUT_FILE, OpenBioLinkDataset and
# RandomOverSampler are not defined or imported in the cells visible here —
# confirm they are provided before this cell runs on a fresh kernel.
sampling_strategy = 1.0
# Fixed seed so the oversampled training indices are the same on every run.
random_state = 42

pp_dir = Path(PAIRWISE_PROCESSED_DIR)
nd_dir = Path(NODEWISE_PROCESSED_DIR)
xy_dir = Path(XY_OUTPUT_DIR)
xy_dir.mkdir(exist_ok=True, parents=True)
train_dir = xy_dir / "train"
train_dir.mkdir(exist_ok=True, parents=True)
val_dir = xy_dir / "val"
val_dir.mkdir(exist_ok=True, parents=True)
test_dir = xy_dir / "test"
test_dir.mkdir(exist_ok=True, parents=True)

nd_files = list(nd_dir.glob("*.csv"))
pp_files_d = list(pp_dir.glob("*_disease.csv"))
pp_files_p = list(pp_dir.glob("*_pathway.csv"))

ds = OpenBioLinkDataset("./tmp/openbiolink/")

ref_df = pd.read_csv(REF_DF_OUT_FILE, index_col=0)

# Oversample on the index labels (as a single-column feature matrix) so the
# resampled rows can be looked up again in ref_df and in the metric tables.
idcs_train = ref_df.query("train").index.values
y = ref_df.query("train")["irr"].values
sampler = RandomOverSampler(
    sampling_strategy=sampling_strategy, random_state=random_state
)
idcs_train_rs, y_rs = sampler.fit_resample(idcs_train.reshape(-1, 1), y=y)
idcs_train_rs = idcs_train_rs.flatten()

idcs_val = ref_df.query("val").index.values
idcs_test = ref_df.query("test").index.values

y_train = ref_df.loc[idcs_train_rs]["irr"].values
y_val = ref_df.loc[idcs_val]["irr"].values
y_test = ref_df.loc[idcs_test]["irr"].values

y_train_df = pd.DataFrame(dict(irr=y_train), index=idcs_train_rs)
y_val_df = pd.DataFrame(dict(irr=y_val), index=idcs_val)
y_test_df = pd.DataFrame(dict(irr=y_test), index=idcs_test)
y_train_df.to_csv(train_dir / "y.csv")
y_val_df.to_csv(val_dir / "y.csv")
y_test_df.to_csv(test_dir / "y.csv")


def select_gene(metric_df):
    """Return the rows of ``metric_df`` whose index label is a GENE node.

    The GENE labels come from the module-level dataset ``ds``: the index of
    the rows of ``ds.get_nodes()`` where ``node_type == 'GENE'``.
    """
    gene_labels = ds.get_nodes().query("node_type == 'GENE'").index.values
    is_gene_row = metric_df.index.isin(gene_labels)
    return metric_df[is_gene_row]


# Pair each split's row labels with the directory that receives its files.
split_targets = (
    (idcs_train_rs, train_dir),
    (idcs_val, val_dir),
    (idcs_test, test_dir),
)

# Pairwise metrics (disease, then pathway) first, nodewise metrics last.
for metric_path in tqdm([*pp_files_d, *pp_files_p, *nd_files]):
    # The output name is the part of the file name before the first dot.
    metric_name = metric_path.name.split(".")[0]
    metric_table = pd.read_csv(metric_path, index_col=0)

    for split_idcs, split_dir in split_targets:
        metric_table.loc[split_idcs].to_csv(split_dir / f"{metric_name}.csv")

NameError: name 'Path' is not defined