In [2]:
import os
import pathlib

import pandas as pd
import numpy as np
import scipy
import pickle

from  MAGNN_preprocess_utils.preprocess import (
    get_column, 
    assign_index, 
    map_index_to_relation_file, 
    export_index2dat, 
    split_date,
    process_and_save_metapath_batches
)

# Input data preprocess

In [6]:
# list all file paths for the original relation .dat files
file_path = os.getcwd()
# file 1, 2: microbe-disease
file1 = os.path.join(file_path, "../data", "MAGNN_data", "disbiome_taxid_mondo.dat")
file2 = os.path.join(file_path, "../data", "MAGNN_data", "gmmad2_taxid_mondo.dat")
# file 3, 4: microbe-metabolite
file3 = os.path.join(file_path, "../data", "MAGNN_data", "gmmad2_taxid_met.dat")
file4 = os.path.join(file_path, "../data", "MAGNN_data", "hmdb_taxid_met.dat")
# file 5: metabolite-disease
file5 = os.path.join(file_path, "../data", "MAGNN_data", "hmdb_met_disease.dat")

In [7]:
# get entity columns individually
microbes1 = get_column(file1, colname1="Microbe", colname2="Disease", col="col1")
microbes2 = get_column(file2, colname1="Microbe", colname2="Disease", col="col1")
microbes3 = get_column(file3, colname1="Microbe", colname2="Metabolite", col="col1")
microbes4 = get_column(file4, colname1="Microbe", colname2="Metabolite", col="col1")
all_microbes = assign_index([microbes1, microbes2, microbes3, microbes4])

disease1 = get_column(file1, colname1="Microbe", colname2="Disease", col="col2")
disease2 = get_column(file2, colname1="Microbe", colname2="Disease", col="col2")
disease3 = get_column(file5, colname1="Metabolite", colname2="Disease", col="col2")
all_diseases = assign_index([disease1, disease2, disease3])

metabolite1 = get_column(file3, colname1="Microbe", colname2="Metabolite", col="col2")
metabolite2 = get_column(file4, colname1="Microbe", colname2="Metabolite", col="col2")
metabolite3 = get_column(file5, colname1="Metabolite", colname2="Disease", col="col1")
all_metabolites = assign_index([metabolite1, metabolite2, metabolite3])

In [8]:
# export index file to MKG_data_processing/MAGNN/data
export_index2dat(all_microbes, "data/raw/microbe_index.dat")
export_index2dat(all_metabolites, "data/raw/metabolite_index.dat")
export_index2dat(all_diseases, "data/raw/disease_index.dat")

In [9]:
# merge two relation dfs together
md_merged_df = map_index_to_relation_file([file1, file2], "Microbe", "Disease", all_microbes, all_diseases)
mm_df = map_index_to_relation_file([file3, file4], "Microbe", "Metabolite", all_microbes, all_metabolites)
metd_df = map_index_to_relation_file([file5], "Metabolite", "Disease", all_metabolites, all_diseases)

In [10]:
# export relational dfs to .dat files
export_index2dat(md_merged_df, "data/raw/microbe_disease_idx.dat")
export_index2dat(mm_df, "data/raw/microbe_metabolite_idx.dat")
export_index2dat(metd_df, "data/raw/metabolite_disease_idx.dat")

# Create adjacency matrix

In [3]:
save_prefix = "data/preprocessed/"

In [16]:
microbe_disease = pd.read_csv("data/raw/microbe_disease_idx.dat", encoding='utf-8', delimiter='\t', names=['MicrobeIdx', 'DiseaseIdx'])
microbe_metabolite = pd.read_csv('data/raw/microbe_metabolite_idx.dat', encoding='utf-8', delimiter='\t', names=['MicrobeIdx', 'MetaboliteIdx'])
metabolite_disease = pd.read_csv('data/raw/metabolite_disease_idx.dat', encoding='utf-8', delimiter='\t', names=['MetaboliteIdx', 'DiseaseIdx'])
num_microbe = 8202
num_metabolite = 23823
num_disease = 898

### Check for duplicate relationships in the data
If duplicates exist, need to make increments when creating adjacency matrix 
<br>
Microbe-Disease and Microbe-Metabolite relationships have duplicates

In [10]:
# Check for duplicates in microbe_disease data
# if duplicates=True, need to make increments when creating adjM 
duplicates_in_microbe_disease = microbe_disease[microbe_disease.duplicated()]
print(duplicates_in_microbe_disease)

# Check for duplicates in microbe_metabolite data
duplicates_in_microbe_metabolite = microbe_metabolite[microbe_metabolite.duplicated()]
print(duplicates_in_microbe_metabolite)

# Check for duplicates in metabolite_disease data
duplicates_in_metabolite_disease = metabolite_disease[metabolite_disease.duplicated()]
print(duplicates_in_metabolite_disease)

        MicrobeIdx  DiseaseIdx
5083            62          48
5278            10         115
5504            92           8
7773           153          48
8396           498         187
...            ...         ...
501973         221          94
503661         182          40
504745           7          94
505352         247          29
505647         255           8

[527 rows x 2 columns]
        MicrobeIdx  MetaboliteIdx
598604          41            586
598635          41             45
598671          41            720
598714          41            772
598737          41            795
598774          41            675
598807          41            734
598833          41            125
598844          41            116
598854          41            600
598855          49           1412
598858          41            988
598895         177            309
598913          41            260
598921         177            118
598943          41            976
598995          41        

In [17]:
# build adjacency matrix
# 0 for microbe, 1 for disease, 2 for metabolite
dim = num_microbe + num_disease + num_metabolite

type_mask = np.zeros(dim, dtype=int)
type_mask[num_microbe:num_microbe+num_disease] = 1
type_mask[num_microbe+num_disease:]=2

adjM = np.zeros((dim, dim), dtype=int)
for _, row in microbe_disease.iterrows():
    microID = row["MicrobeIdx"]
    diseaseID = num_microbe + row["DiseaseIdx"]
    # increment accounts for multiple links exist between same microbe and disease relationships
    adjM[microID, diseaseID] += 1
    adjM[diseaseID, microID] += 1
for _, row in microbe_metabolite.iterrows():
    microID = row["MicrobeIdx"]
    metID = num_microbe + num_disease + row["MetaboliteIdx"]
    adjM[microID, metID] += 1
    adjM[metID, microID] += 1
for _, row in metabolite_disease.iterrows():
    metID = num_microbe + num_disease + row["MetaboliteIdx"]
    diseaseID = num_microbe + row["DiseaseIdx"]
    adjM[metID, diseaseID] = 1
    adjM[diseaseID, metID] = 1

In [18]:
# reduce adjM by filtering out links with count less than 1
valid_microbe_idx = adjM[:num_microbe, num_microbe:num_microbe+num_disease].sum(axis=1).nonzero()[0]
num_microbe = len(valid_microbe_idx)
print(num_microbe)

valid_disease_idx = adjM[num_microbe:num_microbe+num_disease, :num_microbe].sum(axis=1).nonzero()[0]
num_disease = len(valid_disease_idx)
print(num_disease)

valid_metabolite_idx = adjM[num_microbe:num_microbe+num_disease, num_microbe+num_disease:].sum(axis=0).nonzero()[0]
num_metabolite = len(valid_metabolite_idx)
print(num_metabolite)

7167
0
0


 # Create edge metapath index array

In [12]:
# map each microbe to a list of diseases
microbe_disease_list = {i: adjM[i, num_microbe:num_microbe+num_disease].nonzero()[0] for i in range(num_microbe)}
# map each disease to a list of microbes
disease_microbe_list = {i: adjM[num_microbe+i, :num_microbe].nonzero()[0] for i in range(num_disease)}
# map each metabolite to a list of diseases
metabolite_disease_list = {i: adjM[num_microbe+num_disease+i, num_microbe:num_microbe+num_disease].nonzero()[0] for i in range(num_metabolite)}
# map each disease to a list of metabolites
disease_metabolite_list = {i: adjM[num_microbe+i, num_microbe+num_disease:num_microbe+num_disease+num_metabolite].nonzero()[0] for i in range(num_disease)}
# map each microbe to a list of metabolites
microbe_metabolite_list = {i: adjM[i, num_microbe+num_disease:num_microbe+num_disease+num_metabolite].nonzero()[0] for i in range(num_microbe)}
# map each metabolite to a list of microbes
metabolite_microbe_list = {i: adjM[num_microbe+num_disease+i, :num_microbe].nonzero()[0] for i in range(num_metabolite)}

In [14]:
microbes_list = []
for _, microbe_list in microbe_disease_list.items():
    if microbe_list.any():
        pass
    else:
        microbes_list.append(microbe_list) 
print(len(microbes_list))

1035


In [23]:
sum(len(disease_list) ** 2 for disease_list in microbe_disease_list.values())

20442111

In [24]:
sum(len(microbe_list) ** 2 for microbe_list in disease_microbe_list.values())

1509142841

In [25]:
sum(len(disease_list) ** 2 for disease_list in metabolite_disease_list.values())

92218

In [26]:
sum(len(metabolite_list) ** 2 for metabolite_list in disease_metabolite_list.values())

402996188

In [27]:
sum(len(metabolite_list) ** 2 for metabolite_list in microbe_metabolite_list.values())

437852694

In [28]:
sum(len(microbe_list) ** 2 for microbe_list in metabolite_microbe_list.values())

384861682

In [None]:
# 0-1-0 (microbe-disease-microbe)
microbe_disease_microbe = []
for disease, microbe_list in disease_microbe_list.items():
    microbe_disease_microbe.extend(
        [(microbe1, disease, microbe2) for microbe1 in microbe_list for microbe2 in microbe_list]
    )
microbe_disease_microbe = np.array(microbe_disease_microbe)
# map the disease index back to the index in the adjacency matrix
microbe_disease_microbe[:, 1] += num_microbe
sorted_index = sorted(list(range(len(microbe_disease_microbe))), 
                      key=lambda i : microbe_disease_microbe[i, [0, 2, 1]].tolist())
microbe_disease_microbe = microbe_disease_microbe[sorted_index]

In [None]:
# 0-1-0 (microbe-disease-microbe)
microbe_disease_microbe = np.array([
    (microbe1, disease, microbe2) 
    for disease, microbe_list in disease_microbe_list.items() 
    for microbe1 in microbe_list 
    for microbe2 in microbe_list
], dtype=np.int32)  
microbe_disease_microbe[:, 1] += num_microbe

microbe_disease_microbe = microbe_disease_microbe[np.lexsort((microbe_disease_microbe[:, 1], 
                                                              microbe_disease_microbe[:, 2], 
                                                              microbe_disease_microbe[:, 0]))]

peak memory: 842.33 MiB, increment: 0.00 MiB


In [None]:
# 0-2-0 (microbe-metabolite-microbe)
microbe_metabolite_microbe = np.array([
    (microbe1, metabolite, microbe2)
    for metabolite, microbe_list in metabolite_microbe_list.items()
    for microbe1 in microbe_list
    for microbe2 in microbe_list
], dtype=np.int32)
microbe_metabolite_microbe[:, 1] += num_microbe + num_disease

microbe_metabolite_microbe = microbe_metabolite_microbe[np.lexsort((
    microbe_metabolite_microbe[:, 1],
    microbe_metabolite_microbe[:, 2],
    microbe_metabolite_microbe[:, 0]
))]

In [None]:
# 1-2-1 (disease-metabolite-disease)
disease_metabolite_disease = np.array([
    (d1, metabolite, d2)
    for metabolite, disease_list in metabolite_disease_list.items()
    for d1 in disease_list
    for d2 in disease_list
], dtype=np.int32)
disease_metabolite_disease[:, [0, 2]] += num_microbe  
disease_metabolite_disease[:, 1] += num_disease     

disease_metabolite_disease = disease_metabolite_disease[np.lexsort((
    disease_metabolite_disease[:, 1],
    disease_metabolite_disease[:, 2],
    disease_metabolite_disease[:, 0]
))]

In [None]:
# 2-1-2 (metabolite-disease-metabolite) 
metabolite_disease_metabolite = np.array([
    (m1, disease, m2)
    for disease, metabolite_list in disease_metabolite_list.items()
    for m1 in metabolite_list
    for m2 in metabolite_list
], dtype=np.int32)
metabolite_disease_metabolite[:, [0, 2]] += num_microbe + num_disease  
metabolite_disease_metabolite[:, 1] -= num_disease                   

metabolite_disease_metabolite = metabolite_disease_metabolite[np.lexsort((
    metabolite_disease_metabolite[:, 1],
    metabolite_disease_metabolite[:, 2],
    metabolite_disease_metabolite[:, 0]
))]

In [None]:
# 0-1-2-1-0 (microbe-disease-metabolite-disease-microbe)
micro_d_meta_d_micro = []
for d1, meta, d2 in disease_metabolite_disease:
    if len(disease_microbe_list[d1 - num_microbe]) == 0 or len(disease_microbe_list[d2 - num_microbe]) == 0:
        continue

    candidate_microbe1_list = np.random.choice(
        disease_microbe_list[d1 - num_microbe],
        int(0.2 * len(disease_microbe_list[d1 - num_microbe])),
        replace=False
    )
    candidate_microbe2_list = np.random.choice(
        disease_microbe_list[d2 - num_microbe],
        int(0.2 * len(disease_microbe_list[d2 - num_microbe])),
        replace=False
    )
    
    micro_d_meta_d_micro.extend(
        (microbe1, d1, meta, d2, microbe2) 
        for microbe1 in candidate_microbe1_list 
        for microbe2 in candidate_microbe2_list
    )

micro_d_meta_d_micro = np.array(micro_d_meta_d_micro, dtype=np.int32)
micro_d_meta_d_micro = micro_d_meta_d_micro[np.lexsort((
    micro_d_meta_d_micro[:, 3],  # Sort by meta (3rd column)
    micro_d_meta_d_micro[:, 2],  # then by d2 (4th column)
    micro_d_meta_d_micro[:, 1],  # then by d1 (2nd column)
    micro_d_meta_d_micro[:, 4],  # then by microbe1 (1st column)
    micro_d_meta_d_micro[:, 0]   # and finally by microbe2 (5th column)
))]

In [None]:
# 1-0-2-0-1 (disease-microbe-metabolite-microbe-disease)
d_micro_meta_micro_d = []
for micro1, meta, micro2 in microbe_metabolite_microbe:
    if len(microbe_disease_list[micro1]) == 0 or len(microbe_disease_list[micro2]) == 0:
        continue

    candidate_d1_list = np.random.choice(
        microbe_disease_list[micro1],
        int(0.2 * len(microbe_disease_list[micro1])),
        replace=False
    )
    candidate_d2_list = np.random.choice(
        microbe_disease_list[micro2],
        int(0.2 * len(microbe_disease_list[micro2])),
        replace=False
    )

    d_micro_meta_micro_d.extend(
        (d1, micro1, meta, micro2, d2)
        for d1 in candidate_d1_list
        for d2 in candidate_d2_list
    )

d_micro_meta_micro_d = np.array(d_micro_meta_micro_d, dtype=np.int32)
d_micro_meta_micro_d = d_micro_meta_micro_d[np.lexsort((
    d_micro_meta_micro_d[:, 3],  
    d_micro_meta_micro_d[:, 2],  
    d_micro_meta_micro_d[:, 1],  
    d_micro_meta_micro_d[:, 4],  
    d_micro_meta_micro_d[:, 0] 
))]

In [None]:
# 0-2-1-2-0 (microbe-metabolite-disease-metabolite-microbe)
micro_meta_d_meta_micro = []
for meta1, d, meta2 in metabolite_disease_metabolite:
    if len(metabolite_microbe_list[meta1 - num_microbe - num_disease]) == 0 or len(metabolite_microbe_list[meta2 - num_microbe - num_disease]) == 0:
        continue

    candidate_micro1_list = np.random.choice(
        metabolite_microbe_list[meta1 - num_microbe - num_disease],
        int(0.2 * len(metabolite_microbe_list[meta1 - num_microbe - num_disease])),
        replace=False
    )
    candidate_micro2_list = np.random.choice(
        metabolite_microbe_list[meta2 - num_microbe - num_disease],
        int(0.2 * len(metabolite_microbe_list[meta2 - num_microbe - num_disease])),
        replace=False
    )

    micro_meta_d_meta_micro.extend(
        (micro1, meta1, d, meta2, micro2)
        for micro1 in candidate_micro1_list
        for micro2 in candidate_micro2_list
    )

micro_meta_d_meta_micro = np.array(micro_meta_d_meta_micro, dtype=np.int32)
micro_meta_d_meta_micro = micro_meta_d_meta_micro[np.lexsort((
    micro_meta_d_meta_micro[:, 3],  
    micro_meta_d_meta_micro[:, 2],  
    micro_meta_d_meta_micro[:, 1],  
    micro_meta_d_meta_micro[:, 4],  
    micro_meta_d_meta_micro[:, 0]
))]

In [None]:
# 1-0-1 (disease-microbe-disease)
disease_microbe_disease = np.array([
    (d1, microbe, d2)
    for microbe, disease_list in microbe_disease_list.items()
    for d1 in disease_list
    for d2 in disease_list
], dtype=np.int32)
disease_microbe_disease[:, [0, 2]] += num_microbe

disease_microbe_disease = disease_microbe_disease[np.lexsort((
    disease_microbe_disease[:, 1],  
    disease_microbe_disease[:, 2],  
    disease_microbe_disease[:, 0]
))]

In [None]:
# 2-0-2 (metabolite-microbe-metabolite)
metabolite_microbe_metabolite = np.array([
    (meta1, microbe, meta2)
    for microbe, metabolite_list in microbe_metabolite_list.items()
    for meta1 in metabolite_list
    for meta2 in metabolite_list
], dtype=np.int32)
metabolite_microbe_metabolite[:, [0, 2]] += num_microbe + num_disease

metabolite_microbe_metabolite = metabolite_microbe_metabolite[np.lexsort((
    metabolite_microbe_metabolite[:, 1],  
    metabolite_microbe_metabolite[:, 2],
    metabolite_microbe_metabolite[:, 0]
))]

In [None]:
# 1-2-0-2-1 (disease-metabolite-microbe-metabolite-disease)
d_meta_micro_meta_d = []
for meta1, micro, meta2 in metabolite_microbe_metabolite.items():
    if len(metabolite_disease_list[meta1 - num_microbe - num_disease]) == 0 or len(metabolite_disease_list[meta2 - num_microbe - num_disease]) == 0:
        continue

    candidate_d1_list = np.random.choice(
        metabolite_disease_list[meta1 - num_microbe - num_disease],
        int(0.2 * len(metabolite_disease_list[meta1 - num_microbe - num_disease])),
        replace=False
    )
    candidate_d2_list = np.random.choice(
        metabolite_disease_list[meta2 - num_microbe - num_disease],
        int(0.2 * len(metabolite_disease_list[meta2 - num_microbe - num_disease])),
        replace=False
    )

    d_meta_micro_meta_d.extend(
        (d1, meta1, micro, meta2, d2)
        for d1 in candidate_d1_list
        for d2 in candidate_d2_list
    )

d_meta_micro_meta_d = np.array(d_meta_micro_meta_d, dtype=np.int32)

d_meta_micro_meta_d = d_meta_micro_meta_d[np.lexsort((
    d_meta_micro_meta_d[:, 3],
    d_meta_micro_meta_d[:, 2],
    d_meta_micro_meta_d[:, 1],
    d_meta_micro_meta_d[:, 4],
    d_meta_micro_meta_d[:, 0]
))]

In [None]:
# 2-0-1-0-2 (metabolite-microbe-disease-microbe-metabolite)
meta_micro_d_micro_meta = []
for micro1, d, micro2 in microbe_metabolite_microbe.items():
    if len(microbe_metabolite_list[micro1]) == 0 or len(microbe_metabolite_list[micro2]) == 0:
        continue

    candidate_meta1_list = np.random.choice(
        microbe_metabolite_list[micro1],
        int(0.2 * len(microbe_metabolite_list[micro1])),
        replace=False
    )
    candidate_meta2_list = np.random.choice(
        microbe_metabolite_list[micro2],
        int(0.2 * len(microbe_metabolite_list[micro2])),
        replace=False
    )

    meta_micro_d_micro_meta.extend(
        (meta1, micro1, d, micro2, meta2)
        for meta1 in candidate_meta1_list
        for meta2 in candidate_meta2_list
    )

meta_micro_d_micro_meta = np.array(meta_micro_d_micro_meta, dtype=np.int32)

meta_micro_d_micro_meta = meta_micro_d_micro_meta[np.lexsort((
    meta_micro_d_micro_meta[:, 3],
    meta_micro_d_micro_meta[:, 2],
    meta_micro_d_micro_meta[:, 1],
    meta_micro_d_micro_meta[:, 4],
    meta_micro_d_micro_meta[:, 0]
))]

In [None]:
# 2-1-0-1-2 (metabolite-disease-microbe-disease-metabolite)
meta_d_micro_d_meta = []
for d1, micro, d2 in disease_microbe_disease.items():
    if len(disease_metabolite_list[d1 - num_microbe]) == 0 or len(disease_metabolite_list[d2 - num_microbe]) == 0:
        continue

    candidate_meta1_list = np.random.choice(
        disease_metabolite_list[d1 - num_microbe],
        int(0.2 * len(disease_metabolite_list[d1 - num_microbe])),
        replace=False
    )
    candidate_meta2_list = np.random.choice(
        disease_metabolite_list[d2 - num_microbe],
        int(0.2 * len(disease_metabolite_list[d2 - num_microbe])),
        replace=False
    )

    meta_d_micro_d_meta.extend(
        (meta1, d1, micro, d2, meta2)
        for meta1 in candidate_meta1_list
        for meta2 in candidate_meta2_list
    )

meta_d_micro_d_meta = np.array(meta_d_micro_d_meta, dtype=np.int32)

meta_d_micro_d_meta = meta_d_micro_d_meta[np.lexsort((
    meta_d_micro_d_meta[:, 3], 
    meta_d_micro_d_meta[:, 2],
    meta_d_micro_d_meta[:, 1],
    meta_d_micro_d_meta[:, 4],
    meta_d_micro_d_meta[:, 0]
))]

In [None]:
# adjust batch_size based on memory
batch_size = 100000

expected_metapaths = [
    (0, 1, 0), (0, 2, 0), (1, 0, 1), (2, 0, 2), (1, 2, 1), (2, 1, 2),  
    (0, 1, 2, 1, 0), (1, 0, 2, 0, 1), (0, 2, 1, 2, 0), (1, 2, 0, 2, 1), (2, 0, 1, 0, 2), (2, 1, 0, 1, 2)
]

metapath_indices_mapping = {
    (0, 1, 0): microbe_disease_microbe,
    (0, 2, 0): microbe_metabolite_microbe,
    (1, 0, 1): disease_microbe_disease,
    (2, 0, 2): metabolite_microbe_metabolite,
    (1, 2, 1): disease_metabolite_disease,
    (2, 1, 2): metabolite_disease_metabolite,
    (0, 1, 2, 1, 0): micro_d_meta_d_micro,
    (1, 0, 2, 0, 1): d_micro_meta_micro_d,
    (0, 2, 1, 2, 0): micro_meta_d_meta_micro,
    (1, 2, 0, 2, 1): d_meta_micro_meta_d,
    (2, 0, 1, 0, 2): meta_micro_d_micro_meta,
    (2, 1, 0, 1, 2): meta_d_micro_d_meta
}

target_idx_lists = [np.arange(num_microbe), np.arange(num_disease)]
offset_list = [0, num_microbe]

for i, metapath in enumerate(expected_metapaths):
    edge_metapath_idx_array = metapath_indices_mapping[metapath]
    save_dir = f"{save_prefix}{i}"
    
    if len(metapath) == 3:
        target_idx_list = target_idx_lists[0] if metapath[0] == 0 else target_idx_lists[1]
        offset = offset_list[0] if metapath[0] == 0 else offset_list[1]
    else:
        target_idx_list = target_idx_lists[0] if metapath[0] == 0 else target_idx_lists[1]
        offset = offset_list[0] if metapath[0] == 0 else offset_list[1]
    
    # Define sort columns based on metapath type
    sort_columns = (0, 2, 1) if len(metapath) == 3 else (0, 4, 1, 2, 3)

    # Process and save the metapath data in batches
    process_and_save_metapath_batches(
        metapath=metapath,
        edges=edge_metapath_idx_array,
        batch_size=batch_size,
        sort_columns=sort_columns,
        target_idx_list=target_idx_list,
        offset=offset,
        save_dir=save_dir
    )

# Save the adjacency matrix and type mask
scipy.sparse.save_npz(f"{save_prefix}adjM.npz", scipy.sparse.csr_matrix(adjM))
np.save(f"{save_prefix}node_types.npy", type_mask)

In [None]:
# # 0-1-0 (microbe-disease-microbe)
# microbe_disease_microbe = []
# for disease, microbe_list in disease_microbe_list.items():
#     microbe_disease_microbe.extend(
#         [(microbe1, disease, microbe2) for microbe1 in microbe_list for microbe2 in microbe_list]
#     )
# microbe_disease_microbe = np.array(microbe_disease_microbe)
# # map the disease index back to the index in the adjacency matrix
# microbe_disease_microbe[:, 1] += num_microbe
# sorted_index = sorted(list(range(len(microbe_disease_microbe))), 
#                       key=lambda i :microbe_disease_microbe[i, [0, 2, 1]].tolist())
# microbe_disease_microbe = microbe_disease_microbe[sorted_index]
# 
# # 0-2-0 (microbe-metabolite-microbe)
# microbe_metabolite_microbe = []
# for metabolite, microbe_list in metabolite_microbe_list.items():
#     microbe_metabolite_microbe.extend(
#         [(microbe1, metabolite, microbe2) for microbe1 in microbe_list for microbe2 in microbe_list]
#     )
# microbe_metabolite_microbe = np.array(microbe_metabolite_microbe)
# microbe_metabolite_microbe[:, 1] += num_microbe + num_disease
# sorted_index = sorted(list(range(len(microbe_metabolite_microbe))), 
#                       key=lambda i : microbe_metabolite_microbe[i, [0, 2, 1]].tolist())
# microbe_metabolite_microbe = microbe_metabolite_microbe[sorted_index]
# 
# # 1-2-1 (disease-metabolite-disease)
# disease_metabolite_disease = []
# for metabolite, disease_list in metabolite_disease_list.items():
#     disease_metabolite_disease.extend([(d1, metabolite, d2) for d1 in disease_list for d2 in disease_list])
# disease_metabolite_disease = np.array(disease_metabolite_disease)
# disease_metabolite_disease += num_microbe
# disease_metabolite_disease[:, 1] += num_disease
# sorted_index = sorted(list(range(len(disease_metabolite_disease))), 
#                       key=lambda i : disease_metabolite_disease[i, [0, 2, 1]].tolist())
# disease_metabolite_disease = disease_metabolite_disease[sorted_index]
# 
# # 2-1-2 (metabolite-disease-metabolite)
# metabolite_disease_metabolite = []
# for disease, metabolite_list in disease_metabolite_list.items():
#     metabolite_disease_metabolite.extend([(m1, disease, m2) for m1 in metabolite_list for m2 in metabolite_list])
# metabolite_disease_metabolite = np.array(metabolite_disease_metabolite)
# metabolite_disease_metabolite += num_microbe + num_disease
# metabolite_disease_metabolite[:, 1] -= num_disease
# sorted_index = sorted(list(range(len(metabolite_disease_metabolite))), 
#                       key=lambda i : metabolite_disease_metabolite[i, [0, 2, 1]].tolist())
# metabolite_disease_metabolite = metabolite_disease_metabolite[sorted_index]
# 
# # 0-1-2-1-0 (microbe-disease-metabolite-disease-microbe)
# micro_d_meta_d_micro = []
# for d1, meta, d2 in disease_metabolite_disease:
#     # filter out diseases without microbe connections
#     if len(disease_microbe_list[d1 - num_microbe]) == 0 or len(disease_microbe_list[d2 - num_microbe]) == 0:
#         continue
#     candidate_microbe1_list = np.random.choice(len(disease_microbe_list[d1 - num_microbe]), int(0.2 * len(disease_microbe_list[d1 - num_microbe])), replace=False)
#     candidate_microbe1_list = disease_microbe_list[d1 - num_microbe][candidate_microbe1_list]
#     candidate_microbe2_list = np.random.choice(len(disease_microbe_list[d2 - num_microbe]), int(0.2 * len(disease_microbe_list[d2 - num_microbe])), replace=False)
#     candidate_microbe2_list = disease_microbe_list[d2 - num_microbe][candidate_microbe2_list]
#     micro_d_meta_d_micro.extend(
#         [
#             (microbe1, d1, meta, d2, microbe2) 
#             for microbe1 in candidate_microbe1_list 
#             for microbe2 in candidate_microbe2_list
#         ]
#     )
# micro_d_meta_d_micro = np.array(micro_d_meta_d_micro)
# sorted_index = sorted(list(range(len(micro_d_meta_d_micro))), 
#                       key=lambda i : micro_d_meta_d_micro[i, [0, 4, 1, 2, 3]].tolist())
# micro_d_meta_d_micro = micro_d_meta_d_micro[sorted_index]
# 
# # 1-0-2-0-1 (disease-microbe-metabolite-microbe-disease)
# d_micro_meta_micro_d = []
# for micro1, meta, micro2 in microbe_metabolite_microbe:
#     # filter out diseases without microbe connections
#     if len(microbe_disease_list[micro1]) == 0 or len(microbe_disease_list[micro2]) == 0:
#         continue
#     candidate_d1_list = np.random.choice(len(microbe_disease_list[micro1]), 
#                                          int(0.2 * len(microbe_disease_list[micro1])), 
#                                          replace=False)
#     candidate_d1_list = microbe_disease_list[micro1][candidate_d1_list]
#     candidate_d2_list = np.random.choice(len(microbe_disease_list[micro2]), 
#                                          int(0.2 * len(microbe_disease_list[micro2])), 
#                                          replace=False)
#     candidate_d2_list = microbe_disease_list[micro2][candidate_d2_list]
#     d_micro_meta_micro_d.extend(
#         [
#             (d1, micro1, meta, micro2, d2) 
#             for d1 in candidate_d1_list 
#             for d2 in candidate_d2_list
#         ]
#     )
# d_micro_meta_micro_d = np.array(d_micro_meta_micro_d)
# sorted_index = sorted(list(range(len(d_micro_meta_micro_d))), 
#                       key=lambda i : d_micro_meta_micro_d[i, [0, 4, 1, 2, 3]].tolist())
# d_micro_meta_micro_d = d_micro_meta_micro_d[sorted_index]
# 
# # 0-2-1-2-0 (microbe-metabolite-disease-metabolite-microbe)
# micro_meta_d_meta_micro = []
# for meta1, d, meta2 in metabolite_disease_metabolite:
#     # filter out microbes without metabolite connections
#     if (len(metabolite_microbe_list[meta1 - num_microbe - num_disease]) == 0 or 
#             len(metabolite_microbe_list[meta2 - num_microbe - num_disease]) == 0):
#         continue
#     candidate_micro1_list = np.random.choice(
#         len(metabolite_microbe_list[meta1 - num_microbe - num_disease]), 
#         int(0.2 * len(metabolite_microbe_list[meta1 - num_microbe - num_disease])), 
#         replace=False
#     )
#     candidate_micro1_list = metabolite_microbe_list[meta1 - num_microbe - num_disease][candidate_micro1_list]
#     candidate_micro2_list = np.random.choice(
#         len(metabolite_microbe_list[meta2 - num_microbe - num_disease]), 
#         int(0.2 * len(metabolite_microbe_list[meta2 - num_microbe - num_disease])), 
#         replace=False
#     )
#     candidate_micro2_list = metabolite_microbe_list[meta2 - metabolite_microbe_list][candidate_micro2_list]
#     micro_meta_d_meta_micro.extend(
#         [
#             (micro1, meta1, d, meta2, micro2) 
#             for micro1 in candidate_micro1_list
#             for micro2 in candidate_micro2_list
#         ]
#     )
# micro_meta_d_meta_micro = np.array(micro_meta_d_meta_micro)
# sorted_index = sorted(list(range(len(micro_meta_d_meta_micro))), 
#                       key=lambda i : micro_meta_d_meta_micro[i, [0, 4, 1, 2, 3]].tolist())
# micro_meta_d_meta_micro = micro_meta_d_meta_micro[sorted_index]
# 
# # 1-0-1 (disease-microbe-disease)
# disease_microbe_disease = []
# for microbe, disease_list in microbe_disease_list.items():
#     disease_microbe_disease.extend([(d1, microbe, d2) for d1 in disease_list for d2 in disease_list])
# disease_microbe_disease = np.array(disease_microbe_disease)
# disease_microbe_disease[:, [0, 2]] += num_microbe
# sorted_index = sorted(list(range(len(disease_microbe_disease))), 
#                       key=lambda i :disease_microbe_disease[i, [0, 2, 1]].tolist())
# disease_microbe_disease = disease_microbe_disease[sorted_index]
# 
# # 2-0-2 (metabolite-microbe-metabolite)
# metabolite_microbe_metabolite = []
# for microbe, metabolite_list in microbe_metabolite_list.items():
#     metabolite_microbe_metabolite.extend([(meta1, microbe, meta2) for meta1 in metabolite_list for meta2 in metabolite_list])
# metabolite_microbe_metabolite = np.array(metabolite_microbe_metabolite)
# metabolite_microbe_metabolite[:, [0, 2]] += num_microbe + num_disease
# sorted_index = sorted(list(range(len(metabolite_microbe_metabolite))), 
#                       key=lambda i : metabolite_microbe_metabolite[i, [0, 2, 1]].tolist())
# metabolite_microbe_metabolite = metabolite_microbe_metabolite[sorted_index]
# 
# # 1-2-0-2-1 (disease-metabolite-microbe-metabolite-disease)
# d_meta_micro_meta_d = []
# for meta1, micro, meta2 in metabolite_microbe_metabolite.items():
#     if len(metabolite_disease_list[meta1 - num_microbe - num_disease]) == 0 or len(metabolite_disease_list[meta2 - num_microbe - num_disease]) == 0:
#         continue
#     candidate_d1_list = np.random.choice(len(metabolite_disease_list[meta1 - num_microbe - num_disease]), int(0.2 * len(metabolite_disease_list[meta1 - num_microbe - num_disease])), replace=False)
#     candidate_d1_list = metabolite_disease_list[meta1 - num_microbe - num_disease][candidate_d1_list]
#     candidate_d2_list = np.random.choice(len(metabolite_disease_list[meta2 - num_microbe - num_disease]), int(0.2 * len(metabolite_disease_list[meta2 - num_microbe - num_disease])), replace=False)
#     candidate_d2_list = metabolite_disease_list[meta2 - num_microbe - num_disease][candidate_d2_list]
#     d_meta_micro_meta_d.extend(
#         [
#             (d1, meta1, micro, meta2, d2) 
#             for d1 in candidate_d1_list 
#             for d2 in candidate_d2_list
#         ]
#     )
# d_meta_micro_meta_d = np.array(d_meta_micro_meta_d)
# sorted_index = sorted(list(range(len(d_meta_micro_meta_d))), 
#                       key=lambda i : d_meta_micro_meta_d[i, [0, 4, 1, 2, 3]].tolist())
# d_meta_micro_meta_d = d_meta_micro_meta_d[sorted_index]
# 
# # 2-0-1-0-2 (metabolite-microbe-disease-microbe-metabolite)
# meta_micro_d_micro_meta = []
# for micro1, d, micro2 in microbe_metabolite_microbe.items():
#     if len(microbe_metabolite_list[micro1]) == 0 or len(microbe_metabolite_list[micro2]) == 0:
#         continue
#     candidate_meta1_list = np.random.choice(len(microbe_metabolite_list[micro1]), int(0.2 * len(microbe_metabolite_list[micro1])), replace=False)
#     candidate_meta1_list = microbe_metabolite_list[micro1][candidate_meta1_list]
#     candidate_meta2_list = np.random.choice(len(microbe_metabolite_list[micro2]), int(0.2 * len(microbe_metabolite_list[micro2])), replace=False)
#     candidate_meta2_list = microbe_metabolite_list[micro2][candidate_meta2_list]
#     meta_micro_d_micro_meta.extend(
#         [
#             (meta1, micro1, d, micro2, meta2) 
#             for meta1 in candidate_meta1_list 
#             for meta2 in candidate_meta2_list
#         ]
#     )
# meta_micro_d_micro_meta = np.array(meta_micro_d_micro_meta)
# sorted_index = sorted(list(range(len(meta_micro_d_micro_meta))), 
#                       key=lambda i : meta_micro_d_micro_meta[i, [0, 4, 1, 2, 3]].tolist())
# meta_micro_d_micro_meta = meta_micro_d_micro_meta[sorted_index]
# 
# # 2-1-0-1-2 (metabolite-disease-microbe-disease-metabolite)
# meta_d_micro_d_meta = []
# for d1, micro, d2 in disease_microbe_disease.items():
#     if len(disease_metabolite_list[d1 - num_microbe]) == 0 or len(disease_metabolite_list[d2 - num_microbe]) == 0:
#         continue
#     candidate_meta1_list = np.random.choice(len(disease_metabolite_list[d1 - num_microbe]), int(0.2 * len(disease_metabolite_list[d1 - num_microbe])), replace=False)
#     candidate_meta1_list = disease_metabolite_list[d1 - num_microbe][candidate_meta1_list]
#     candidate_meta2_list = np.random.choice(len(disease_metabolite_list[d2 - num_microbe]), int(0.2 * len(disease_metabolite_list[d2 - num_microbe])), replace=False)
#     candidate_meta2_list = disease_metabolite_list[d2 - num_microbe][candidate_meta2_list]
#     meta_d_micro_d_meta.extend(
#         [
#             (meta1, d1, micro, d2, meta2) 
#             for meta1 in candidate_meta1_list 
#             for meta2 in candidate_meta2_list
#         ]
#     )
# meta_d_micro_d_meta = np.array(meta_d_micro_d_meta)
# sorted_index = sorted(list(range(len(meta_d_micro_d_meta))), 
#                       key=lambda i : meta_d_micro_d_meta[i, [0, 4, 1, 2, 3]].tolist())
# meta1_d1_micro_d2_meta2 = meta_d_micro_d_meta[sorted_index]

In [None]:
expected_metapaths = [(0, 1, 0), (0, 2, 0), (1, 0, 1), (2, 0, 2), (1, 2, 1), (2, 1, 2),  
                      (0, 1, 2, 1, 0), (1, 0, 2, 0, 1), (0, 2, 1, 2, 0), (1, 2, 0, 2, 1), (2-0-1-0-2),  (2-1-0-1-2)] 
# create the directories if they do not exist
for i in range(len(expected_metapaths)):
    pathlib.Path(save_prefix + '{}'.format(i)).mkdir(parents=True, exist_ok=True)

metapath_indices_mapping = {
    (0, 1, 0): microbe_disease_microbe,
    (0, 2, 0): microbe_metabolite_microbe,
    (1, 0, 1): disease_microbe_disease,
    (2, 0, 2): metabolite_microbe_metabolite,
    (1, 2, 1): disease_metabolite_disease,
    (2, 1, 2): metabolite_disease_metabolite,
    (0, 1, 2, 1, 0): micro_d_meta_d_micro,
    (1, 0, 2, 0, 1): d_micro_meta_micro_d,
    (0, 2, 1, 2, 0): micro_meta_d_meta_micro,
    (1, 2, 0, 2, 1): d_meta_micro_meta_d,
    (2, 0, 1, 0, 2): meta_micro_d_micro_meta,
    (2, 1, 0, 1, 2): meta_d_micro_d_meta
}

# write all things
target_idx_lists = [np.arange(num_microbe), np.arange(num_disease)]
offset_list = [0, num_microbe]
for i, metapaths in enumerate(expected_metapaths):
    for metapath in metapaths:
        edge_metapath_idx_array = metapath_indices_mapping[metapath]
        
        with open(save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '_idx.pickle', 'wb') as out_file:
            target_metapaths_mapping = {}
            left = 0
            right = 0
            for target_idx in target_idx_lists[i]:
                while right < len(edge_metapath_idx_array) and edge_metapath_idx_array[right, 0] == target_idx + offset_list[i]:
                    right += 1
                target_metapaths_mapping[target_idx] = edge_metapath_idx_array[left:right, ::-1]
                left = right
            pickle.dump(target_metapaths_mapping, out_file)

        #np.save(save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '_idx.npy', edge_metapath_idx_array)
        
        with open(save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '.adjlist', 'w') as out_file:
            left = 0
            right = 0
            for target_idx in target_idx_lists[i]:
                while right < len(edge_metapath_idx_array) and edge_metapath_idx_array[right, 0] == target_idx + offset_list[i]:
                    right += 1
                neighbors = edge_metapath_idx_array[left:right, -1] - offset_list[i]
                neighbors = list(map(str, neighbors))
                if len(neighbors) > 0:
                    out_file.write('{} '.format(target_idx) + ' '.join(neighbors) + '\n')
                else:
                    out_file.write('{}\n'.format(target_idx))
                left = right

# save scipy sparse adjM 
scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))
# save node type_mask
np.save(save_prefix + 'node_types.npy', type_mask)

In [None]:
# output microbe_disease.npy
microbe_disease = pd.read_csv('data/raw/microbe_disease_idx.dat', encoding='utf-8', delimiter='\t', names=['MicrobeID', 'DiseaseID'])
microbe_disease = microbe_disease[['MicrobeID', 'DiseaseID']].to_numpy()
np.save(save_prefix + 'microbe_disease.npy', microbe_disease)

## Split data into training, validation and testing sets

In [None]:
md_train, md_val, md_test = split_date(microbe_disease, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1)
# save_split_data2npz(md_train, md_val, md_test, "data/micro_disease_train_val_test_idx.npz")

In [None]:
# training: 70%, validation: 20%, testing: 10%
train_val_test_idx = np.load("data/raw/micro_disease_train_val_test_idx.npz")
train_idx = train_val_test_idx['train']
val_idx = train_val_test_idx['val']
test_idx = train_val_test_idx['test']

# reset microbe-disease index 
microbe_disease = microbe_disease.loc[train_idx].reset_index(drop=True)
microbe_disease.head()
print(f"Length of Training data: {len(microbe_disease)}")

## Output positive and negative samples for training, validation and testing sets

In [None]:
# output positive and negative samples for training, validation and testing
np.random.seed(453289)
save_prefix = 'data/preprocessed/microbe_disease_neg_pos_processed/'
num_microbe = 8202
num_disease = 898
microbe_disease = np.load('data/preprocessed/microbe_disease.npy')
train_val_test_idx = np.load('data/raw/micro_disease_train_val_test_idx.npz')
train_idx = train_val_test_idx['train']
val_idx = train_val_test_idx['val']
test_idx = train_val_test_idx['test']

neg_candidates = []
counter = 0
for i in range(num_microbe):
    for j in range(num_disease):
        if counter < len(num_disease):
            if i == num_disease[counter, 0] and j == num_disease[counter, 1]:
                counter += 1
            else:
                neg_candidates.append([i, j])
        else:
            neg_candidates.append([i, j])
neg_candidates = np.array(neg_candidates)

idx = np.random.choice(len(neg_candidates), len(val_idx) + len(test_idx), replace=False)
val_neg_candidates = neg_candidates[sorted(idx[:len(val_idx)])]
test_neg_candidates = neg_candidates[sorted(idx[len(val_idx):])]

train_microbe_disease = microbe_disease[train_idx]
train_neg_candidates = []
counter = 0
for i in range(num_microbe):
    for j in range(num_disease):
        if counter < len(train_microbe_disease):
            if i == train_microbe_disease[counter, 0] and j == train_microbe_disease[counter, 1]:
                counter += 1
            else:
                train_neg_candidates.append([i, j])
        else:
            train_neg_candidates.append([i, j])
train_neg_candidates = np.array(train_neg_candidates)

np.savez(save_prefix + 'train_val_test_neg_microbe_disease.npz',
         train_neg_user_artist=train_neg_candidates,
         val_neg_user_artist=val_neg_candidates,
         test_neg_user_artist=test_neg_candidates)
np.savez(save_prefix + 'train_val_test_pos_microbe_disease.npz',
         train_pos_user_artist=microbe_disease[train_idx],
         val_pos_user_artist=microbe_disease[val_idx],
         test_pos_user_artist=microbe_disease[test_idx])