In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import pykeen
from pykeen.triples import TriplesFactory

In [2]:
# Seed NumPy's global random generator once, up front, so that code drawing
# from it further down produces the same numbers on every fresh run.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [3]:
# Folder that receives every file written by this notebook. It is created
# together with any missing parents; an already existing folder is left as is.
out_dir = Path('1_outputs', 'large_small_DS')
out_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Read the labelled triples from the comma-separated file as strings and hand
# them to PyKEEN, which assigns an integer id to every entity label.
het_triples = np.loadtxt(
    './data/Hetionet_training_small.csv',
    dtype=str,
    delimiter=',',
)
het = TriplesFactory.from_labeled_triples(het_triples)

# Label -> id mapping as built by the factory, plus its inverse (id -> label).
entity_to_id = het.entity_to_id
id_to_entity = {idx: label for label, idx in entity_to_id.items()}

In [5]:
# Gene lists for the validation and test splits (editedOpenBioLink workflow).
VAL_IDS_FILE = '../../MBVK_workflows/editedOpenBioLink/val_genes_id.csv'
TEST_IDS_FILE = '../../MBVK_workflows/editedOpenBioLink/test_genes_id.csv'

# The ID files prefix genes with 'NCBIGENE:'; the prefix is rewritten to
# 'Gene::' so the labels can be looked up in `entity_to_id`.
pat = 'NCBIGENE:'
repl = 'Gene::'

# `regex=False` makes the prefix swap a literal substitution regardless of the
# installed pandas version (the default of `regex` changed between releases).
# Both files carry an index column; the gene labels are in the first data column.
val_entities = (
    pd.read_csv(VAL_IDS_FILE, index_col=0)
    .iloc[:, 0]
    .str.replace(pat, repl, regex=False)
    .values
)
test_entities = (
    pd.read_csv(TEST_IDS_FILE, index_col=0)
    .iloc[:, 0]
    .str.replace(pat, repl, regex=False)
    .values
)

# Translate labels to integer entity ids. A label that is missing from the
# graph raises KeyError here, which surfaces mismatches early.
val_ids = np.array([entity_to_id[i] for i in val_entities])
test_ids = np.array([entity_to_id[i] for i in test_entities])

# Training genes are read from the header-less train_obl.csv (first column);
# the editedOpenBioLink train_genes_id.csv list is not used in this notebook.
train_entities_obl = (
    pd.read_csv('../data/train_obl.csv', header=None)
    .iloc[:, 0]
    .str.replace(pat, repl, regex=False)
    .values
)
train_ids = np.array([entity_to_id[i] for i in train_entities_obl])

In [6]:
# Keep only the entities whose label starts with the gene prefix (`repl`),
# then look up the integer id of each of them in the same order.
gene_entities = np.array([label for label in entity_to_id if label.startswith(repl)])
gene_ids = np.array([entity_to_id[label] for label in gene_entities])

# Persist both arrays; they are index-aligned with each other.
np.save(out_dir / 'gene_ids.npy', gene_ids)
np.save(out_dir / 'gene_entities.npy', gene_entities)

In [7]:
# Boolean label per entity id: True for every id that appears in the train,
# validation or test gene list, False for everything else.
y = np.full(len(id_to_entity), False, dtype=bool)
y[np.concatenate((train_ids, val_ids, test_ids))] = True

np.save(out_dir / 'y.npy', y)

In [8]:
# All entity ids known to the factory.
ids = np.array(list(id_to_entity))
# Ids listed in any of the train / val / test gene files ...
irr_ids = np.concatenate((train_ids, val_ids, test_ids))
# ... and every remaining id, in the order they occur in `ids`.
nirr_ids = ids[np.isin(ids, irr_ids, invert=True)]

for_saving = out_dir  # same target folder for all three arrays
np.save(for_saving / 'ids.npy', ids)
np.save(for_saving / 'irr_ids.npy', irr_ids)
np.save(for_saving / 'nirr_ids.npy', nirr_ids)

In [9]:
from sklearn.model_selection import train_test_split

# Use a dedicated RandomState seeded with RANDOM_SEED instead of the global
# NumPy RNG, so re-running this cell (or running cells out of order) always
# yields the same partition.
split_rng = np.random.RandomState(RANDOM_SEED)

# 60 / 20 / 20 partition of the ids that are in none of the gene lists:
# first hold out 40 %, then cut the hold-out in half for validation and test.
nirr_ids_train, nirr_ids_holdout = train_test_split(
    nirr_ids, test_size=0.4, random_state=split_rng
)
nirr_ids_val, nirr_ids_test = train_test_split(
    nirr_ids_holdout, test_size=0.5, random_state=split_rng
)

# Final splits: the randomly assigned ids plus the ids from the matching
# predefined gene list.
all_ids_train = np.r_[nirr_ids_train, train_ids]
all_ids_val = np.r_[nirr_ids_val, val_ids]
all_ids_test = np.r_[nirr_ids_test, test_ids]

In [10]:
# Reference table with one row per gene entity, indexed by its entity id.
df = pd.DataFrame(index=gene_ids)
df['id'] = gene_ids
df['entity'] = [id_to_entity[i] for i in gene_ids]
# True when the gene appears in the train, validation or test gene list.
df['y'] = y[gene_ids]

# Split membership flags. Each column is filled from the id set of the same
# name ('test' from all_ids_test, 'val' from all_ids_val); the column order
# of the written CSV stays id, entity, y, train, test, val.
df['train'] = df['id'].isin(all_ids_train)
df['test'] = df['id'].isin(all_ids_test)
df['val'] = df['id'].isin(all_ids_val)

df.to_csv(out_dir / 'ref_df.csv', index=True)

In [13]:
# Stack of RotatE entity embeddings, one 2-D matrix per trained model.
embeddings = np.load('./data/RotatE/large/entity_embedding_RotatE_10models.npy')
print(embeddings.shape)

# Write one feature matrix per model: the real and imaginary parts of the
# embedding are placed side by side along the last axis.
for model_idx, emb in enumerate(embeddings):
    target = out_dir / f'RotatE_X_{model_idx}.npy'
    print(target)
    X = np.concatenate((emb.real, emb.imag), axis=-1)
    np.save(target, X)

(10, 45159, 200)
1_outputs/large_small_DS/RotatE_X_0.npy
1_outputs/large_small_DS/RotatE_X_1.npy
1_outputs/large_small_DS/RotatE_X_2.npy
1_outputs/large_small_DS/RotatE_X_3.npy
1_outputs/large_small_DS/RotatE_X_4.npy
1_outputs/large_small_DS/RotatE_X_5.npy
1_outputs/large_small_DS/RotatE_X_6.npy
1_outputs/large_small_DS/RotatE_X_7.npy
1_outputs/large_small_DS/RotatE_X_8.npy
1_outputs/large_small_DS/RotatE_X_9.npy


In [14]:
# Stack of TransE entity embeddings, one matrix per trained model.
embeddings = np.load('./data/TransE/large/entity_embedding_TransE_10models.npy')
print(embeddings.shape)

# Write each model's embedding matrix unchanged to its own file.
for model_idx, emb in enumerate(embeddings):
    target = out_dir / f'TransE_X_{model_idx}.npy'
    print(target)
    X = emb
    np.save(target, X)

(10, 45159, 200)
1_outputs/large_small_DS/TransE_X_0.npy
1_outputs/large_small_DS/TransE_X_1.npy
1_outputs/large_small_DS/TransE_X_2.npy
1_outputs/large_small_DS/TransE_X_3.npy
1_outputs/large_small_DS/TransE_X_4.npy
1_outputs/large_small_DS/TransE_X_5.npy
1_outputs/large_small_DS/TransE_X_6.npy
1_outputs/large_small_DS/TransE_X_7.npy
1_outputs/large_small_DS/TransE_X_8.npy
1_outputs/large_small_DS/TransE_X_9.npy


In [11]:
# Stack of CompGCN entity embeddings, one matrix per trained model.
embeddings = np.load('./data/CompGCN/large/entity_embedding_10models.npy')
print(embeddings.shape)

# Write each model's embedding matrix unchanged to its own file.
for model_idx, emb in enumerate(embeddings):
    target = out_dir / f'CompGCN_X_{model_idx}.npy'
    print(target)
    X = emb
    np.save(target, X)

(10, 45159, 200)
1_outputs/large_small_DS/CompGCN_X_0.npy
1_outputs/large_small_DS/CompGCN_X_1.npy
1_outputs/large_small_DS/CompGCN_X_2.npy
1_outputs/large_small_DS/CompGCN_X_3.npy
1_outputs/large_small_DS/CompGCN_X_4.npy
1_outputs/large_small_DS/CompGCN_X_5.npy
1_outputs/large_small_DS/CompGCN_X_6.npy
1_outputs/large_small_DS/CompGCN_X_7.npy
1_outputs/large_small_DS/CompGCN_X_8.npy
1_outputs/large_small_DS/CompGCN_X_9.npy
