# Prepare Input for Machine Learning

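Prepare the feature matrices X and the target vectors y for the train, validation, and test splits. The minority class is randomly oversampled in the training split only; the validation and test splits keep their original class balance.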

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from tqdm import tqdm

from pykeen.datasets import OpenBioLink

from anngel.datasets.openbiolink import OpenBioLinkDataset

from pykeen.datasets import OpenBioLink

In [2]:
# Single seed for the notebook. Seeding NumPy's legacy global random state
# here only makes later cells deterministic when the notebook is executed
# top to bottom on a fresh kernel.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [3]:
# ---- Inputs ----------------------------------------------------------------
# Location handed to OpenBioLinkDataset.
DS_PATH = "./tmp/openbiolink/"

# CSV files listing the gene IDs that belong to each predefined split.
TRAIN_IDS_FILE = "./data/train_obl.csv"
VAL_IDS_FILE = "../../data/editedOpenBioLink/val_genes_id.csv"
TEST_IDS_FILE = "../../data/editedOpenBioLink/test_genes_id.csv"

# Directories that are globbed for per-metric CSV files further down
# (presumably written by an earlier pipeline step -- TODO confirm).
PAIRWISE_PROCESSED_DIR = Path("./3_outputs/pairwise_processed/")
NODEWISE_PROCESSED_DIR = Path("./3_outputs/nodewise_processed/")

# ---- Outputs ---------------------------------------------------------------
OUT_DIR = Path("./4_outputs/small_DS/")
REF_DF_OUT_FILE = OUT_DIR.joinpath("ref_df.csv")
XY_OUTPUT_DIR = OUT_DIR.joinpath("XY")

# Make sure both output directories exist before anything is written.
for _required_dir in (OUT_DIR, XY_OUTPUT_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

## Prepare X and y files

In [4]:
# Build the reference table `ref_df`: one row per GENE node with boolean
# train/val/test membership flags and the `irr` label, then write it to
# REF_DF_OUT_FILE.

# PyKEEN dataset, used here only for its entity <-> id mappings.
obl = OpenBioLink()

id_to_entity = {i: e for e, i in obl.entity_to_id.items()}
entity_to_id = obl.entity_to_id

ds = OpenBioLinkDataset(DS_PATH)

# Gene IDs of the predefined splits. The train file is read without a header
# row, whereas the val/test files carry an index column that is dropped.
train_ids = pd.read_csv(TRAIN_IDS_FILE, header=None).iloc[:, 0].values
val_ids = pd.read_csv(VAL_IDS_FILE, index_col=0).iloc[:, 0].values
test_ids = pd.read_csv(TEST_IDS_FILE, index_col=0).iloc[:, 0].values

gene_df = ds.get_nodes().query('node_type == "GENE"')

# Row labels (gene_df index values) of the genes listed in each ID file.
train_idcs = gene_df.index[gene_df["node_id"].isin(train_ids)].values
test_idcs = gene_df.index[gene_df["node_id"].isin(test_ids)].values
val_idcs = gene_df.index[gene_df["node_id"].isin(val_ids)].values

# Every gene that appears in none of the ID files forms the "nirr" pool.
nirr_idcs = gene_df.index[
    ~gene_df["node_id"].isin(np.r_[train_ids, val_ids, test_ids])
].values

# Split the nirr pool 60/20/20 into train/val/test. The seed is passed
# explicitly: with random_state=None scikit-learn draws from NumPy's global
# random state, so re-running this cell (or running cells out of order) would
# silently produce a different split and overwrite ref_df.csv with it.
nirr_idcs_train, nirr_idcs_val = train_test_split(
    nirr_idcs, test_size=0.40, random_state=RANDOM_SEED
)
nirr_idcs_val, nirr_idcs_test = train_test_split(
    nirr_idcs_val, test_size=0.50, random_state=RANDOM_SEED
)

# Genes taken from the ID files are the ones flagged irr=True below; this must
# be captured before the nirr genes are appended to the split index arrays.
irr_idcs = np.r_[train_idcs, val_idcs, test_idcs]
train_idcs = np.r_[train_idcs, nirr_idcs_train]
val_idcs = np.r_[val_idcs, nirr_idcs_val]
test_idcs = np.r_[test_idcs, nirr_idcs_test]

# NOTE(review): if the three ID files overlap, a gene ends up in more than one
# split. Nothing here checks for that -- consider asserting disjointness.
ref_df = gene_df[["node_id"]].copy()
ref_df["train"] = ref_df.index.isin(train_idcs)
ref_df["val"] = ref_df.index.isin(val_idcs)
ref_df["test"] = ref_df.index.isin(test_idcs)
ref_df["irr"] = ref_df.index.isin(irr_idcs)

ref_df.to_csv(REF_DF_OUT_FILE, index=True)

You're trying to map triples with 2052 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 2047 from 183011 triples were filtered out


In [5]:
# Write the target vectors (y.csv) for the train/val/test splits. Only the
# training split is oversampled; val and test keep their original rows.

# Passed to RandomOverSampler as the desired minority/majority ratio after
# resampling; 1.0 requests equally sized classes.
sampling_strategy = 1.0

train_dir = XY_OUTPUT_DIR / "train"
val_dir = XY_OUTPUT_DIR / "val"
test_dir = XY_OUTPUT_DIR / "test"
train_dir.mkdir(exist_ok=True, parents=True)
val_dir.mkdir(exist_ok=True, parents=True)
test_dir.mkdir(exist_ok=True, parents=True)

# Per-metric feature files that the export loop further down consumes.
nd_files = list(NODEWISE_PROCESSED_DIR.glob("*.csv"))
pp_files_d = list(PAIRWISE_PROCESSED_DIR.glob("*_disease.csv"))
pp_files_p = list(PAIRWISE_PROCESSED_DIR.glob("*_pathway.csv"))

# Use the configured dataset location instead of repeating the literal path.
ds = OpenBioLinkDataset(DS_PATH)

ref_df = pd.read_csv(REF_DF_OUT_FILE, index_col=0)

# Oversample the training row labels rather than a feature matrix: the
# resampled labels (with repeats) are later used to select rows from every
# feature file, which keeps X and y aligned across files.
idcs_train = ref_df.query("train").index.values
y = ref_df.query("train")["irr"].values
# The seed is passed explicitly so that re-running this cell draws the same
# duplicates instead of depending on NumPy's global random state.
sampler = RandomOverSampler(
    sampling_strategy=sampling_strategy, random_state=RANDOM_SEED
)
# fit_resample expects a 2-D X, hence the reshape to a single column.
idcs_train_rs, y_rs = sampler.fit_resample(idcs_train.reshape(-1, 1), y=y)
idcs_train_rs = idcs_train_rs.flatten()

idcs_val = ref_df.query("val").index.values
idcs_test = ref_df.query("test").index.values

# Labels are looked up again from ref_df by row label, so the order of each y
# matches the order of the corresponding index array.
y_train = ref_df.loc[idcs_train_rs]["irr"].values
y_val = ref_df.loc[idcs_val]["irr"].values
y_test = ref_df.loc[idcs_test]["irr"].values

y_train_df = pd.DataFrame(dict(irr=y_train), index=idcs_train_rs)
y_val_df = pd.DataFrame(dict(irr=y_val), index=idcs_val)
y_test_df = pd.DataFrame(dict(irr=y_test), index=idcs_test)
y_train_df.to_csv(train_dir / "y.csv")
y_val_df.to_csv(val_dir / "y.csv")
y_test_df.to_csv(test_dir / "y.csv")

In [6]:
def select_gene(metric_df):
    """Keep only the rows of ``metric_df`` that belong to GENE nodes.

    The GENE row labels are taken from the module-level dataset ``ds``;
    rows of ``metric_df`` whose index value is not among them are dropped.
    """
    gene_labels = ds.get_nodes().query("node_type == 'GENE'").index.values
    is_gene_row = metric_df.index.isin(gene_labels)
    return metric_df[is_gene_row]


# Pair each split's row labels with the directory its feature files go to.
split_targets = (
    (idcs_train_rs, train_dir),
    (idcs_val, val_dir),
    (idcs_test, test_dir),
)

# For every metric file, write one CSV per split containing only that split's
# rows. Training labels may repeat (oversampling), so rows repeat accordingly.
for metric_file in tqdm(pp_files_d + pp_files_p + nd_files):
    # Output name is the part of the file name before the first dot.
    metric_name = metric_file.name.split(".")[0]
    metric_df = pd.read_csv(metric_file, index_col=0)

    for split_idcs, split_dir in split_targets:
        split_rows = metric_df.loc[split_idcs]
        split_rows.to_csv(split_dir / f"{metric_name}.csv")

100%|██████████| 25/25 [04:44<00:00, 11.38s/it]
