In [1]:
import sys, os
sys.path.append('..')

# std
import numpy as np
import random as rn
import json
import time
from collections import defaultdict
from itertools import combinations
from sklearn.model_selection import KFold

# datasets
import STRING
import MINT
import bioGRID
import HuRI
import synthetic_PPI

# my lib
import PPILinkPred as pred
import genData_helper as helper
import traversalHelper as tr
import helper as hr

ModuleNotFoundError: No module named 'sklearn'

# Generate Random PPI Samples from Datasets

In [2]:
# Yeast PPI datasets (bioGRID, STRING, MINT).
# DataFrame standard: {nodeA, nodeB, type, score}; only nodeA/nodeB are used here.
# For every dataset, draw 10 random samples that each hold 50% of the PPIs
# and store the 10 samples together in one JSON file.

import_funcs = [
    bioGRID.parse_bioGRID(root="../"),
    STRING.parse_STRING(root="../"),
    MINT.parse_MINT(root="../"),
]
names = ['bioGRID', 'STRING', 'MINT']

for n, dataset_name in enumerate(names):
    # The parsers return a pair; the DataFrame is the second element.
    _, df = import_funcs[n]
    ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]

    half_size = int(len(ppi)*0.5)
    sampledPPIs = []
    for _replicate in range(10):
        sampledPPIs.append(rn.sample(ppi, half_size))

    out_path = "./sampled_datasets/{}_sampledPPIs.json".format(dataset_name)
    with open(out_path, "w") as f:
        f.write(json.dumps(sampledPPIs))

In [6]:
# Yeast PPI datasets (bioGRID, STRING, MINT).
# DataFrame standard: {nodeA, nodeB, type, score}
# Build a 5-fold cross-validation split of every dataset and save the five
# training folds (about 80% of the PPIs each) into one JSON file. Only the
# training indices are kept; the test indices are not written anywhere.
# Reminder for the precision-recall analysis: evaluate only up to the top ~20%
# of predicted PPIs, because each held-out test fold is about 20% of the data.

import_funcs = [bioGRID.parse_bioGRID(root="../"), STRING.parse_STRING(root="../"), MINT.parse_MINT(root="../")]
names = ['bioGRID', 'STRING', 'MINT']

for n in range(len(names)):
    _, df = import_funcs[n]
    ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]
    # KFold is created with its default settings, so randomise the order up front.
    rn.shuffle(ppi)
    # Convert to an array once instead of once per fold.
    ppi_arr = np.asarray(ppi)
    kf = KFold(n_splits=5)
    sampledPPIs = [ppi_arr[train_index].tolist() for train_index, _test_index in kf.split(ppi)]
    with open("./sampled_datasets/{}_5FoldCV_sampledPPIs.json".format(names[n]), "w") as f:
        f.write(json.dumps(sampledPPIs))

In [3]:
# Yeast PPI datasets, "noSelf" variants (bioGRID, STRING, MINT).
# DataFrame standard: {nodeA, nodeB, type, score}
# Each parser is given dedicated *_noSelf.pkl paths through wFile_GGI / wFile_PPI.
# NOTE(review): the outputs are labelled "noSelf" although noSelfPPI=False is
# passed -- confirm against the parsers that this flag gives the intended variant.
# For every dataset, draw 10 random samples of 50% of the PPIs and save them as JSON.

import_funcs = [
    bioGRID.parse_bioGRID(
        root="../",
        wFile_GGI='./data/parsed/BioGRID_GGI_noSelf.pkl',
        wFile_PPI='./data/parsed/BioGRID_PPI_noSelf.pkl',
        noSelfPPI=False),
    STRING.parse_STRING(
        root="../",
        wFile_GGI='./data/parsed/STRING_GGI_noSelf.pkl',
        wFile_PPI='./data/parsed/STRING_PPI_noSelf.pkl',
        noSelfPPI=False),
    MINT.parse_MINT(
        root="../",
        wFile_GGI='./data/parsed/MINT_GGI_noSelf.pkl',
        wFile_PPI='./data/parsed/MINT_PPI_noSelf.pkl',
        noSelfPPI=False),
]
names = ['bioGRID_noSelf', 'STRING_noSelf', 'MINT_noSelf']

for n, dataset_name in enumerate(names):
    _, df = import_funcs[n]
    ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]

    half_size = int(len(ppi)*0.5)
    sampledPPIs = []
    for _replicate in range(10):
        sampledPPIs.append(rn.sample(ppi, half_size))

    out_path = "./sampled_datasets/{}_sampledPPIs.json".format(dataset_name)
    with open(out_path, "w") as f:
        f.write(json.dumps(sampledPPIs))

In [7]:
# Yeast PPI datasets (bioGRID, STRING, MINT).
# DataFrame standard: {nodeA, nodeB, type, score}
# Take 10 random samples of 50% of the real PPIs, then "salt" each sample with
# non-PPIs amounting to 5%, 10% or 20% of the full dataset size. One JSON file
# (10 replicates) is written per dataset and salt level.

import_funcs = [bioGRID.parse_bioGRID(root="../"), STRING.parse_STRING(root="../"), MINT.parse_MINT(root="../")]
names = ['bioGRID', 'STRING', 'MINT']

for n in range(len(names)):
    _, df = import_funcs[n]
    ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]
    # Unsalted base samples, shared by every salt level of this dataset.
    base_samples = [rn.sample(ppi, int(len(ppi)*0.5)) for i in range(10)]
    realPPIsz = len(ppi)

    # Candidate non-PPIs: every node pair that is not among the known PPIs.
    # NOTE(review): to_dual_binary_relation presumably adds both orientations of
    # each pair so the membership test is order-independent -- confirm in traversalHelper.
    dual_ppiStr = set(tr.Helper.list_to_pathStrs(tr.Helper.to_dual_binary_relation(ppi)))
    sampleNodes = tr.Helper.binary_relation_to_node(ppi)
    nodePairs = tr.Helper.list_to_pathStrs(list(combinations(sampleNodes, 2)))
    nonPPIs = [pair for pair in nodePairs if pair not in dual_ppiStr]

    for saltSz in [5, 10, 20]:
        sampled_nonPPIs = [tr.Helper.pathStrs_to_list(
            rn.sample(nonPPIs, int(realPPIsz*(saltSz*0.01)))) for i in range(10)]
        # Always salt the clean base samples. Reassigning the base list here would
        # carry the non-PPIs of the smaller salt levels over into the larger ones.
        sampledPPIs = [base_samples[i]+sampled_nonPPIs[i] for i in range(10)]
        with open("./sampled_datasets/{}_salted{}Percent_sampledPPIs.json".format(
            names[n], saltSz), "w") as f:
            f.write(json.dumps(sampledPPIs))

In [14]:
# Synthetic PPI dataset.
# DataFrame standard: {nodeA, nodeB, type, score}; only nodeA/nodeB are used here.
# Draw 10 random samples that each hold 50% of the PPIs and save them as JSON.
# Unlike the yeast parsers, parse_synthetic's result is used directly as the DataFrame.

import_funcs = [synthetic_PPI.parse_synthetic(root="../")]
names = ['synthetic']

for n, dataset_name in enumerate(names):
    df = import_funcs[n]
    ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]

    half_size = int(len(ppi)*0.5)
    sampledPPIs = []
    for _replicate in range(10):
        sampledPPIs.append(rn.sample(ppi, half_size))

    out_path = "./sampled_datasets/{}_sampledPPIs.json".format(dataset_name)
    with open(out_path, "w") as f:
        f.write(json.dumps(sampledPPIs))

In [4]:
# Yeast PPI datasets (bioGRID, STRING, MINT).
# DataFrame standard: {nodeA, nodeB, type, score}
# Contaminated samples: for each contamination level c in {5, 10, 15, 20, 25}%,
# sample (50 - c)% of the real PPIs and add non-PPIs amounting to c% of the
# dataset size, so each sample totals about 50% of the dataset size.
# 10 replicates per dataset and level, saved as JSON.

import_funcs = [bioGRID.parse_bioGRID(root="../"), STRING.parse_STRING(root="../"), MINT.parse_MINT(root="../")]
names = ['bioGRID', 'STRING', 'MINT']

for n in range(len(names)):
    # Everything up to the non-PPI candidate list depends only on the dataset,
    # so it is computed once here rather than once per contamination level
    # (enumerating all node pairs is the expensive part).
    _, df = import_funcs[n]
    ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]
    realPPIsz = len(ppi)
    # NOTE(review): to_dual_binary_relation presumably adds both orientations of
    # each pair so the membership test is order-independent -- confirm in traversalHelper.
    dual_ppiStr = set(tr.Helper.list_to_pathStrs(tr.Helper.to_dual_binary_relation(ppi)))
    sampleNodes = tr.Helper.binary_relation_to_node(ppi)
    nodePairs = tr.Helper.list_to_pathStrs(list(combinations(sampleNodes, 2)))
    nonPPIs = [pair for pair in nodePairs if pair not in dual_ppiStr]

    for saltSz in [5,10,15,20,25]:
        real_part = [rn.sample(ppi, int(len(ppi)*(0.5-(saltSz*0.01)))) for i in range(10)]
        sampled_nonPPIs = [tr.Helper.pathStrs_to_list(
            rn.sample(nonPPIs, int(realPPIsz*(saltSz*0.01)))) for i in range(10)]
        sampledPPIs = [real_part[i]+sampled_nonPPIs[i] for i in range(10)]
        with open("./sampled_datasets/{}_contam{}Percent_sampledPPIs.json".format(
            names[n], saltSz), "w") as f:
            f.write(json.dumps(sampledPPIs))

In [2]:
# Yeast PPI datasets (bioGRID, STRING, MINT).
# DataFrame standard: {nodeA, nodeB, type, score}
# For every sample size from 95% down to 55% (in 5% steps), draw 10 random
# samples of that share of the PPIs and save them as one JSON file per
# dataset and sample size.

import_funcs = [bioGRID.parse_bioGRID(root="../"), STRING.parse_STRING(root="../"), MINT.parse_MINT(root="../")]
names = ['bioGRID', 'STRING', 'MINT']

# The PPI list depends only on the dataset, so build it once per dataset
# instead of once per (sample size, dataset) combination.
ppi_lists = []
for _, df in import_funcs:
    ppi_lists.append([list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])])

for randSz in range(95, 54, -5):
    for n in range(len(names)):
        ppi = ppi_lists[n]
        sampledPPIs = [rn.sample(ppi, int(len(ppi)*(randSz*0.01))) for i in range(10)]
        with open("./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(names[n], randSz), "w") as f:
            f.write(json.dumps(sampledPPIs))

In [8]:
# Synthetic PPI dataset.
# DataFrame standard: {nodeA, nodeB, type, score}
# For every sample size from 95% down to 55% (in 5% steps), draw 10 random
# samples of that share of the PPIs and save them as one JSON file per sample size.

# The output is named "synthetic", so the synthetic parser must be the source.
# Loading the yeast parsers here would store bioGRID samples under the
# synthetic file name.
import_funcs = [synthetic_PPI.parse_synthetic(root="../")]
names = ['synthetic']

for randSz in range(95, 54, -5):
    for n in range(len(names)):
        # parse_synthetic's result is used directly as the DataFrame (no tuple to unpack).
        df = import_funcs[n]
        ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]
        sampledPPIs = [rn.sample(ppi, int(len(ppi)*(randSz*0.01))) for i in range(10)]
        with open("./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(names[n], randSz), "w") as f:
            f.write(json.dumps(sampledPPIs))

In [3]:
# Yeast PPI datasets (bioGRID, STRING, MINT).
# For every dataset, draw 10 random sets of non-PPIs (node pairs that are not
# among the known PPIs); each set is as large as the full list of real PPIs.

import_funcs = [
    bioGRID.parse_bioGRID(root="../"),
    STRING.parse_STRING(root="../"),
    MINT.parse_MINT(root="../"),
]
names = ['bioGRID', 'STRING', 'MINT']

for n, dataset_name in enumerate(names):
    _, df = import_funcs[n]
    ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]
    realPPIsz = len(ppi)

    # Known PPIs as path strings, kept in a set for fast membership tests.
    # NOTE(review): to_dual_binary_relation presumably adds both orientations of
    # each pair -- confirm in traversalHelper.
    dual_relation = tr.Helper.to_dual_binary_relation(ppi)
    dual_ppiStr = set(tr.Helper.list_to_pathStrs(dual_relation))

    # All 2-combinations of the nodes, minus the known PPIs.
    sampleNodes = tr.Helper.binary_relation_to_node(ppi)
    nodePairs = tr.Helper.list_to_pathStrs(list(combinations(sampleNodes, 2)))
    nonPPIs = []
    for pair in nodePairs:
        if pair not in dual_ppiStr:
            nonPPIs.append(pair)

    sampled_nonPPIs = []
    for _replicate in range(10):
        sampled_nonPPIs.append(tr.Helper.pathStrs_to_list(rn.sample(nonPPIs, realPPIsz)))

    out_path = "./sampled_datasets/{}_sampled_nonPPIs.json".format(dataset_name)
    with open(out_path, "w") as f:
        f.write(json.dumps(sampled_nonPPIs))

# Run Link Prediction

In [None]:
# Link prediction on the 50% samples of the yeast datasets.
# NOTE(review): 'L3E_f1Alt' / 'L3E_f2Alt' are spelled without the "1" that the
# other L3E1_* method names carry -- confirm these are the names PPILinkPred expects.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'L3E1_f1', 'L3E1_f2', 'L3E_f1Alt', 'L3E_f2Alt', 'random']
ds_names = ['bioGRID', 'STRING', 'MINT']

for ds_name in ds_names:
    # Load the replicate samples generated for this dataset.
    sample_path = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
    with open(sample_path, "r") as f:
        samplePPIs = json.load(f)

    # Run every method on every replicate; store the run time and the predictions.
    for method in methods:
        for i, sample in enumerate(samplePPIs):
            saveFilename = "{}_{}_sample_{}".format(method, ds_name, i)
            startTime = time.time()

            # Multi-core logging cannot be displayed inside a Jupyter notebook
            # (terminal only), so it is switched off.
            predPPI, predScore = pred.multiCore_PPILinkPred(
                sample, method, coreNo=14, logging=False)
            elapsed = time.time()-startTime
            helper.write_runTime(saveFilename, elapsed)
            helper.write_resultData(predPPI, predScore, saveFilename)

In [3]:
# Link prediction on the 50% samples of the synthetic dataset.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'L3E1_f1', 'L3E1_f2', 'random']
ds_names = ['synthetic']

for ds_name in ds_names:
    # Load the replicate samples generated for this dataset.
    sample_path = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
    with open(sample_path, "r") as f:
        samplePPIs = json.load(f)

    # Run every method on every replicate; store the run time and the predictions.
    for method in methods:
        for i, sample in enumerate(samplePPIs):
            saveFilename = "{}_{}_sample_{}".format(method, ds_name, i)
            print(saveFilename)
            startTime = time.time()

            # Multi-core logging cannot be displayed inside a Jupyter notebook
            # (terminal only), so it is switched off.
            predPPI, predScore = pred.multiCore_PPILinkPred(
                sample, method, coreNo=14, logging=False)
            elapsed = time.time()-startTime
            helper.write_runTime(saveFilename, elapsed)
            helper.write_resultData(predPPI, predScore, saveFilename)

commonNeighbor_synthetic_sample_0


commonNeighbor_synthetic_sample_1


commonNeighbor_synthetic_sample_2


commonNeighbor_synthetic_sample_3


commonNeighbor_synthetic_sample_4


commonNeighbor_synthetic_sample_5


commonNeighbor_synthetic_sample_6


commonNeighbor_synthetic_sample_7


commonNeighbor_synthetic_sample_8


commonNeighbor_synthetic_sample_9


L3Normalizing_synthetic_sample_0


L3Normalizing_synthetic_sample_1


L3Normalizing_synthetic_sample_2


L3Normalizing_synthetic_sample_3


L3Normalizing_synthetic_sample_4


L3Normalizing_synthetic_sample_5


L3Normalizing_synthetic_sample_6


L3Normalizing_synthetic_sample_7


L3Normalizing_synthetic_sample_8


L3Normalizing_synthetic_sample_9


CRA_synthetic_sample_0


CRA_synthetic_sample_1


CRA_synthetic_sample_2


CRA_synthetic_sample_3


CRA_synthetic_sample_4


CRA_synthetic_sample_5


CRA_synthetic_sample_6


CRA_synthetic_sample_7


CRA_synthetic_sample_8


CRA_synthetic_sample_9


CH2_L3_synthetic_sample_0


CH2_L3_synth

In [3]:
# Link prediction with the two "Alt" L3E1 variants on the *_src dataset samples.
methods = ['L3E1_f1Alt', 'L3E1_f2Alt']
ds_names = ['IM24272_src', "Lit_BM_src", "Lit_NB_src", "HI_14_src"]

for ds_name in ds_names:
    # Load the replicate samples generated for this dataset.
    sample_path = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
    with open(sample_path, "r") as f:
        samplePPIs = json.load(f)

    # Run every method on every replicate; store the run time and the predictions.
    for method in methods:
        for i, sample in enumerate(samplePPIs):
            saveFilename = "{}_{}_sample_{}".format(method, ds_name, i)
            print(saveFilename)

            # Resume support: skip runs whose PPI output file already exists.
            already_done = os.path.exists("./linkPred_out/"+saveFilename+"_PPI.json")
            if already_done:
                continue
            startTime = time.time()

            # Multi-core logging cannot be displayed inside a Jupyter notebook
            # (terminal only), so it is switched off.
            predPPI, predScore = pred.multiCore_PPILinkPred(
                sample, method, coreNo=14, logging=False)
            elapsed = time.time()-startTime
            helper.write_runTime(saveFilename, elapsed)
            helper.write_resultData(predPPI, predScore, saveFilename)

L3E1_f1Alt_IM24272_src_sample_0


L3E1_f1Alt_IM24272_src_sample_1


L3E1_f1Alt_IM24272_src_sample_2


L3E1_f1Alt_IM24272_src_sample_3


L3E1_f1Alt_IM24272_src_sample_4


L3E1_f1Alt_IM24272_src_sample_5


L3E1_f1Alt_IM24272_src_sample_6


L3E1_f1Alt_IM24272_src_sample_7


L3E1_f1Alt_IM24272_src_sample_8


L3E1_f1Alt_IM24272_src_sample_9


L3E1_f2Alt_IM24272_src_sample_0


L3E1_f2Alt_IM24272_src_sample_1


L3E1_f2Alt_IM24272_src_sample_2


L3E1_f2Alt_IM24272_src_sample_3


L3E1_f2Alt_IM24272_src_sample_4


L3E1_f2Alt_IM24272_src_sample_5


L3E1_f2Alt_IM24272_src_sample_6


L3E1_f2Alt_IM24272_src_sample_7


L3E1_f2Alt_IM24272_src_sample_8


L3E1_f2Alt_IM24272_src_sample_9


L3E1_f1Alt_Lit_BM_src_sample_0


L3E1_f1Alt_Lit_BM_src_sample_1


L3E1_f1Alt_Lit_BM_src_sample_2


L3E1_f1Alt_Lit_BM_src_sample_3


L3E1_f1Alt_Lit_BM_src_sample_4


L3E1_f1Alt_Lit_BM_src_sample_5


L3E1_f1Alt_Lit_BM_src_sample_6


L3E1_f1Alt_Lit_BM_src_sample_7


L3E1_f1Alt_Lit_BM_src_sample_8


L3E1_f1Alt_Lit_BM_src_s

In [8]:
# Link prediction on the 50% samples of the bioGRID "noSelf" dataset.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'L3E1_f1', 'L3E1_f2', 'random']
ds_names = ['bioGRID_noSelf']

for ds_name in ds_names:
    # Load the replicate samples generated for this dataset.
    sample_path = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
    with open(sample_path, "r") as f:
        samplePPIs = json.load(f)

    # Run every method on every replicate; store the run time and the predictions.
    for method in methods:
        for i, sample in enumerate(samplePPIs):
            saveFilename = "{}_{}_sample_{}".format(method, ds_name, i)
            print(saveFilename)

            # Resume support: skip runs whose PPI output file already exists.
            already_done = os.path.exists("./linkPred_out/"+saveFilename+"_PPI.json")
            if already_done:
                continue
            startTime = time.time()

            # Multi-core logging cannot be displayed inside a Jupyter notebook
            # (terminal only), so it is switched off.
            predPPI, predScore = pred.multiCore_PPILinkPred(
                sample, method, coreNo=14, logging=False)
            elapsed = time.time()-startTime
            helper.write_runTime(saveFilename, elapsed)
            helper.write_resultData(predPPI, predScore, saveFilename)

commonNeighbor_bioGRID_noSelf_sample_0
commonNeighbor_bioGRID_noSelf_sample_1
commonNeighbor_bioGRID_noSelf_sample_2
commonNeighbor_bioGRID_noSelf_sample_3
commonNeighbor_bioGRID_noSelf_sample_4
commonNeighbor_bioGRID_noSelf_sample_5
commonNeighbor_bioGRID_noSelf_sample_6
commonNeighbor_bioGRID_noSelf_sample_7
commonNeighbor_bioGRID_noSelf_sample_8
commonNeighbor_bioGRID_noSelf_sample_9
L3Normalizing_bioGRID_noSelf_sample_0
L3Normalizing_bioGRID_noSelf_sample_1
L3Normalizing_bioGRID_noSelf_sample_2
L3Normalizing_bioGRID_noSelf_sample_3


L3Normalizing_bioGRID_noSelf_sample_4


L3Normalizing_bioGRID_noSelf_sample_5


L3Normalizing_bioGRID_noSelf_sample_6


L3Normalizing_bioGRID_noSelf_sample_7


L3Normalizing_bioGRID_noSelf_sample_8


L3Normalizing_bioGRID_noSelf_sample_9


CRA_bioGRID_noSelf_sample_0


CRA_bioGRID_noSelf_sample_1


CRA_bioGRID_noSelf_sample_2


CRA_bioGRID_noSelf_sample_3


CRA_bioGRID_noSelf_sample_4


CRA_bioGRID_noSelf_sample_5


CRA_bioGRID_noSelf_sample_6


CRA_bi

In [4]:
# Link prediction on the 5-fold cross-validation training folds of the yeast datasets.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'L3E1_f1', 'L3E1_f2', 'random']
ds_names = ['bioGRID_5FoldCV', 'STRING_5FoldCV', 'MINT_5FoldCV']

for ds_name in ds_names:
    # Load the training folds generated for this dataset.
    sample_path = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
    with open(sample_path, "r") as f:
        samplePPIs = json.load(f)

    # Run every method on every fold; store the run time and the predictions.
    for method in methods:
        for i, sample in enumerate(samplePPIs):
            saveFilename = "{}_{}_sample_{}".format(method, ds_name, i)
            print(saveFilename)

            # Resume support: skip runs whose PPI output file already exists.
            already_done = os.path.exists("./linkPred_out/"+saveFilename+"_PPI.json")
            if already_done:
                continue
            startTime = time.time()

            # Multi-core logging cannot be displayed inside a Jupyter notebook
            # (terminal only), so it is switched off.
            predPPI, predScore = pred.multiCore_PPILinkPred(
                sample, method, coreNo=14, logging=False)
            elapsed = time.time()-startTime
            helper.write_runTime(saveFilename, elapsed)
            helper.write_resultData(predPPI, predScore, saveFilename)

commonNeighbor_bioGRID_5FoldCV_sample_0
commonNeighbor_bioGRID_5FoldCV_sample_1
commonNeighbor_bioGRID_5FoldCV_sample_2
commonNeighbor_bioGRID_5FoldCV_sample_3
commonNeighbor_bioGRID_5FoldCV_sample_4
L3Normalizing_bioGRID_5FoldCV_sample_0
L3Normalizing_bioGRID_5FoldCV_sample_1
L3Normalizing_bioGRID_5FoldCV_sample_2
L3Normalizing_bioGRID_5FoldCV_sample_3
L3Normalizing_bioGRID_5FoldCV_sample_4
CRA_bioGRID_5FoldCV_sample_0
CRA_bioGRID_5FoldCV_sample_1
CRA_bioGRID_5FoldCV_sample_2
CRA_bioGRID_5FoldCV_sample_3
CRA_bioGRID_5FoldCV_sample_4
CH2_L3_bioGRID_5FoldCV_sample_0
CH2_L3_bioGRID_5FoldCV_sample_1
CH2_L3_bioGRID_5FoldCV_sample_2
CH2_L3_bioGRID_5FoldCV_sample_3
CH2_L3_bioGRID_5FoldCV_sample_4


Sim_bioGRID_5FoldCV_sample_0


Sim_bioGRID_5FoldCV_sample_1


Sim_bioGRID_5FoldCV_sample_2


Sim_bioGRID_5FoldCV_sample_3


Sim_bioGRID_5FoldCV_sample_4


L3E1_f1_bioGRID_5FoldCV_sample_0


L3E1_f1_bioGRID_5FoldCV_sample_1


L3E1_f1_bioGRID_5FoldCV_sample_2


L3E1_f1_bioGRID_5FoldCV_sample_3


L3E

In [5]:
# Link prediction on the contaminated samples (15% and 25% levels) of the yeast datasets.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'L3E1_f1', 'L3E1_f2', 'random']
ds_names = ['bioGRID', 'STRING', 'MINT']

for ds_name in ds_names:
    for saltSz in [15, 25]:
        # Load the replicate samples for this dataset and contamination level.
        sample_path = "./sampled_datasets/{}_contam{}Percent_sampledPPIs.json".format(
            ds_name, saltSz)
        with open(sample_path, "r") as f:
            samplePPIs = json.load(f)

        # Run every method on every replicate; store the run time and the predictions.
        for method in methods:
            for i, sample in enumerate(samplePPIs):
                saveFilename = "{}_{}_contam{}Percent_sample_{}".format(method, ds_name, saltSz, i)
                print(saveFilename)

                # Resume support: skip runs whose PPI output file already exists.
                already_done = os.path.exists("./linkPred_out/"+saveFilename+"_PPI.json")
                if already_done:
                    continue
                startTime = time.time()

                # Multi-core logging cannot be displayed inside a Jupyter notebook
                # (terminal only), so it is switched off.
                predPPI, predScore = pred.multiCore_PPILinkPred(
                    sample, method, coreNo=14, logging=False)
                elapsed = time.time()-startTime
                helper.write_runTime(saveFilename, elapsed)
                helper.write_resultData(predPPI, predScore, saveFilename)

commonNeighbor_bioGRID_contam15Percent_sample_0
commonNeighbor_bioGRID_contam15Percent_sample_1
commonNeighbor_bioGRID_contam15Percent_sample_2
commonNeighbor_bioGRID_contam15Percent_sample_3
commonNeighbor_bioGRID_contam15Percent_sample_4
commonNeighbor_bioGRID_contam15Percent_sample_5
commonNeighbor_bioGRID_contam15Percent_sample_6
commonNeighbor_bioGRID_contam15Percent_sample_7
commonNeighbor_bioGRID_contam15Percent_sample_8
commonNeighbor_bioGRID_contam15Percent_sample_9
L3Normalizing_bioGRID_contam15Percent_sample_0
L3Normalizing_bioGRID_contam15Percent_sample_1
L3Normalizing_bioGRID_contam15Percent_sample_2
L3Normalizing_bioGRID_contam15Percent_sample_3
L3Normalizing_bioGRID_contam15Percent_sample_4
L3Normalizing_bioGRID_contam15Percent_sample_5
L3Normalizing_bioGRID_contam15Percent_sample_6
L3Normalizing_bioGRID_contam15Percent_sample_7
L3Normalizing_bioGRID_contam15Percent_sample_8
L3Normalizing_bioGRID_contam15Percent_sample_9
CRA_bioGRID_contam15Percent_sample_0
CRA_bioGRID_c



CH2_L3_STRING_contam15Percent_sample_3


CH2_L3_STRING_contam15Percent_sample_4


CH2_L3_STRING_contam15Percent_sample_5


CH2_L3_STRING_contam15Percent_sample_6


CH2_L3_STRING_contam15Percent_sample_7


CH2_L3_STRING_contam15Percent_sample_8


CH2_L3_STRING_contam15Percent_sample_9


Sim_STRING_contam15Percent_sample_0


Sim_STRING_contam15Percent_sample_1


Sim_STRING_contam15Percent_sample_2


Sim_STRING_contam15Percent_sample_3


Sim_STRING_contam15Percent_sample_4


Sim_STRING_contam15Percent_sample_5


Sim_STRING_contam15Percent_sample_6


Sim_STRING_contam15Percent_sample_7


Sim_STRING_contam15Percent_sample_8


Sim_STRING_contam15Percent_sample_9


L3E1_f1_STRING_contam15Percent_sample_0


L3E1_f1_STRING_contam15Percent_sample_1


L3E1_f1_STRING_contam15Percent_sample_2


L3E1_f1_STRING_contam15Percent_sample_3


L3E1_f1_STRING_contam15Percent_sample_4


L3E1_f1_STRING_contam15Percent_sample_5


L3E1_f1_STRING_contam15Percent_sample_6


L3E1_f1_STRING_contam15Percent_sample

random_MINT_contam15Percent_sample_3
random_MINT_contam15Percent_sample_4
random_MINT_contam15Percent_sample_5
random_MINT_contam15Percent_sample_6
random_MINT_contam15Percent_sample_7
random_MINT_contam15Percent_sample_8
random_MINT_contam15Percent_sample_9
commonNeighbor_MINT_contam25Percent_sample_0


commonNeighbor_MINT_contam25Percent_sample_1


commonNeighbor_MINT_contam25Percent_sample_2


commonNeighbor_MINT_contam25Percent_sample_3


commonNeighbor_MINT_contam25Percent_sample_4


commonNeighbor_MINT_contam25Percent_sample_5


commonNeighbor_MINT_contam25Percent_sample_6


commonNeighbor_MINT_contam25Percent_sample_7


commonNeighbor_MINT_contam25Percent_sample_8


commonNeighbor_MINT_contam25Percent_sample_9


L3Normalizing_MINT_contam25Percent_sample_0


L3Normalizing_MINT_contam25Percent_sample_1


L3Normalizing_MINT_contam25Percent_sample_2


L3Normalizing_MINT_contam25Percent_sample_3


L3Normalizing_MINT_contam25Percent_sample_4


L3Normalizing_MINT_contam25Percent_sample

In [None]:
# Link prediction on the 60/70/80/90% samples of the yeast datasets.
# CH2_L3 is left out of this run because it takes too long; it may be run on HPC instead.
# NOTE(review): the original note also listed L3E1_f2 as skipped, yet it is
# still part of `methods` -- confirm which of the two is intended.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "Sim", "L3E1_f1", 'L3E1_f2', 'random']
ds_names = ['bioGRID', 'STRING', 'MINT']

for randSz in range(60, 100, 10):
    for ds_name in ds_names:
        # Load the replicate samples for this dataset and sample size.
        sample_path = "./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds_name, randSz)
        with open(sample_path, "r") as f:
            samplePPIs = json.load(f)

        # Run every method on every replicate; store the run time and the predictions.
        for method in methods:
            print(randSz, ds_name, method)
            for i, sample in enumerate(samplePPIs):
                saveFilename = "{}_{}_sample_{}_randSz{}Percent".format(method, ds_name, i, randSz)
                startTime = time.time()

                # Multi-core logging cannot be displayed inside a Jupyter notebook
                # (terminal only), so it is switched off.
                predPPI, predScore = pred.multiCore_PPILinkPred(
                    sample, method, coreNo=14, logging=False)
                elapsed = time.time()-startTime
                helper.write_runTime(saveFilename, elapsed)
                helper.write_resultData(predPPI, predScore, saveFilename)

In [None]:
# Link prediction on the 60/70/80/90% samples of the synthetic dataset.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "Sim", "CH2_L3", "L3E1_f1", 'L3E1_f2', 'random']
ds_names = ['synthetic']

for randSz in range(60, 100, 10):
    for ds_name in ds_names:
        # Load the replicate samples for this dataset and sample size.
        sample_path = "./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds_name, randSz)
        with open(sample_path, "r") as f:
            samplePPIs = json.load(f)

        # Run every method on every replicate; store the run time and the predictions.
        for method in methods:
            print(randSz, ds_name, method)
            for i, sample in enumerate(samplePPIs):
                saveFilename = "{}_{}_sample_{}_randSz{}Percent".format(method, ds_name, i, randSz)
                startTime = time.time()

                # Multi-core logging cannot be displayed inside a Jupyter notebook
                # (terminal only), so it is switched off.
                predPPI, predScore = pred.multiCore_PPILinkPred(
                    sample, method, coreNo=14, logging=False)
                elapsed = time.time()-startTime
                helper.write_runTime(saveFilename, elapsed)
                helper.write_resultData(predPPI, predScore, saveFilename)

60 synthetic commonNeighbor




















60 synthetic L3Normalizing




















60 synthetic CRA




















60 synthetic Sim




















60 synthetic CH2_L3


# Data Cleaning for Analysis & Processing

In [23]:
# Trim the raw link-prediction outputs of the yeast datasets: keep only the
# first n predictions (and their scores) of every trial, where n is the size
# of one sampled dataset, and bundle the 10 trials of each method into one JSON.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'random', "L3E1_f1", "L3E1_f2"]
ds_names = ['bioGRID', 'STRING', 'MINT']

# Location of the raw outputs. This is a machine-specific absolute path, kept
# in a single place so it only has to be adjusted once on another machine.
raw_out_dir = "E:/research/ppiLPred_BMC/notebook/linkPred_out"

for ds_name in ds_names:
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.loads(f.read())
    # Number of entries kept per trial = number of PPIs in the first sample.
    sampleSize = len(samplePPIs[0])

    # For each method, collect the truncated PPIs/scores of all trials.
    for method in methods:
        fullPPI, fullScore = [], []
        for trial in range(10):
            trial_prefix = "{}/{}_{}_sample_{}".format(raw_out_dir, method, ds_name, trial)
            with open(trial_prefix + "_PPI.json", "r") as f:
                fullPPI.append(json.loads(f.read())[0:sampleSize])
            with open(trial_prefix + "_score.json", "r") as f:
                fullScore.append(json.loads(f.read())[0:sampleSize])
        with open("./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name), "w") as f:
            f.write(json.dumps(fullPPI))
        with open("./linkPred_out_reduced/{}_{}_topScore.json".format(method, ds_name), "w") as f:
            f.write(json.dumps(fullScore))

In [5]:
# Trim the raw link-prediction outputs: keep only the first n predictions (and
# scores) of every trial, n being the size of one sampled dataset, and bundle
# the 10 trials of each method into one JSON file.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'random', "L3E1_f1", "L3E1_f2"]
# Alternative dataset selections kept for reference:
#ds_names = ['bioGRID', 'STRING', 'MINT']
#ds_names = [ds+"_contam{}Percent".format(salt) for ds in ds_names for salt in [15,25]]
#ds_names = ['synthetic']
ds_names = ['bioGRID_noSelf']

for ds_name in ds_names:
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.load(f)
    # Number of entries kept per trial = number of PPIs in the first sample.
    sampleSize = len(samplePPIs[0])

    for method in methods:
        saveFilename = "{}_{}".format(method, ds_name)
        print(saveFilename)
        # Skip combinations that were already trimmed in an earlier run.
        if os.path.exists("./linkPred_out_reduced/"+saveFilename+"_topPPI.json"):
            continue

        # Collect the truncated PPI and score lists of every trial.
        fullPPI = []
        fullScore = []
        for trial in range(10):
            ppi_path = "./linkPred_out/{}_{}_sample_{}_PPI.json".format(method, ds_name, trial)
            with open(ppi_path, "r") as f:
                fullPPI.append(json.load(f)[0:sampleSize])
            score_path = "./linkPred_out/{}_{}_sample_{}_score.json".format(method, ds_name, trial)
            with open(score_path, "r") as f:
                fullScore.append(json.load(f)[0:sampleSize])

        with open("./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name), "w") as f:
            f.write(json.dumps(fullPPI))
        with open("./linkPred_out_reduced/{}_{}_topScore.json".format(method, ds_name), "w") as f:
            f.write(json.dumps(fullScore))

commonNeighbor_bioGRID_noSelf
L3Normalizing_bioGRID_noSelf
CRA_bioGRID_noSelf
CH2_L3_bioGRID_noSelf
Sim_bioGRID_noSelf
random_bioGRID_noSelf
L3E1_f1_bioGRID_noSelf
L3E1_f2_bioGRID_noSelf


In [6]:
# Trim the raw link-prediction outputs of the 5-fold CV runs: keep only the
# first n predictions (and scores) of every fold and bundle the 5 folds of
# each method into one JSON file. Here n is derived from the training fold
# size as (train size / 0.8) * 0.2, i.e. roughly the size of a test fold.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'random', "L3E1_f1", "L3E1_f2"]
# Alternative dataset selections kept for reference:
#ds_names = ['bioGRID', 'STRING', 'MINT']
#ds_names = [ds+"_contam{}Percent".format(salt) for ds in ds_names for salt in [15,25]]
#ds_names = ['synthetic']
ds_names = ['bioGRID_5FoldCV', 'STRING_5FoldCV', 'MINT_5FoldCV']

for ds_name in ds_names:
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.load(f)
    sampleSize = int((len(samplePPIs[0])/0.8)*0.2)

    for method in methods:
        saveFilename = "{}_{}".format(method, ds_name)
        print(saveFilename)
        # Skip combinations that were already trimmed in an earlier run.
        if os.path.exists("./linkPred_out_reduced/"+saveFilename+"_topPPI.json"):
            continue

        # Collect the truncated PPI and score lists of every fold.
        fullPPI = []
        fullScore = []
        for trial in range(5):
            ppi_path = "./linkPred_out/{}_{}_sample_{}_PPI.json".format(method, ds_name, trial)
            with open(ppi_path, "r") as f:
                fullPPI.append(json.load(f)[0:sampleSize])
            score_path = "./linkPred_out/{}_{}_sample_{}_score.json".format(method, ds_name, trial)
            with open(score_path, "r") as f:
                fullScore.append(json.load(f)[0:sampleSize])

        with open("./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name), "w") as f:
            f.write(json.dumps(fullPPI))
        with open("./linkPred_out_reduced/{}_{}_topScore.json".format(method, ds_name), "w") as f:
            f.write(json.dumps(fullScore))

commonNeighbor_bioGRID_5FoldCV
L3Normalizing_bioGRID_5FoldCV
CRA_bioGRID_5FoldCV
CH2_L3_bioGRID_5FoldCV
Sim_bioGRID_5FoldCV
random_bioGRID_5FoldCV
L3E1_f1_bioGRID_5FoldCV
L3E1_f2_bioGRID_5FoldCV
commonNeighbor_STRING_5FoldCV
L3Normalizing_STRING_5FoldCV
CRA_STRING_5FoldCV
CH2_L3_STRING_5FoldCV
Sim_STRING_5FoldCV
random_STRING_5FoldCV
L3E1_f1_STRING_5FoldCV
L3E1_f2_STRING_5FoldCV
commonNeighbor_MINT_5FoldCV
L3Normalizing_MINT_5FoldCV
CRA_MINT_5FoldCV
CH2_L3_MINT_5FoldCV
Sim_MINT_5FoldCV
random_MINT_5FoldCV
L3E1_f1_MINT_5FoldCV
L3E1_f2_MINT_5FoldCV


In [9]:
allPaths = ["I:/research/ppiLPred_BMC/notebook"
            , "E:/research/ppiLPred_BMC/notebook"
            , "D:/research offline repo/ppiLPred_BMC/notebook"
            , "G:/research/ppiLPred_BMC/notebook"]
coreNo, trialNum = 24, 10
def verify(method, ds, randSz):
    """Locate the raw link-prediction output files for one (method, ds, randSz).

    Reads the module-level ``allPaths`` (candidate root folders), ``trialNum``
    and ``coreNo``.

    Returns a 3-tuple ``(status, filenames, isHPC)``:
      * ``(0, None, None)`` - the reduced output already exists in
        ./linkPred_out_reduced, nothing to do.
      * ``(2, None, None)`` - no raw output was found under any root.
      * ``(1, filenames, isHPC)`` - raw output found. ``filenames`` holds the
        paths with the trailing "_PPI.json" stripped, ordered by trial (and by
        core within a trial when ``isHPC`` is True). ``isHPC`` is True when the
        output is split into per-core "_c<core>" files, False when there is a
        single file per trial.

    Each file is listed at most once: if the same file exists under several
    roots, only the first root in ``allPaths`` is used, so the list keeps the
    trial/core ordering callers index into.
    """
    # The reduced-output check does not depend on the search roots, so do it once.
    if os.path.exists("./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(
        method, ds, randSz)): return 0, None, None

    # check HPC or not
    # The probe looks at trial index 9 only (presumably the last trial, i.e. a
    # marker that the run finished - TODO confirm).
    isHPC = None
    for path in allPaths:
        if os.path.exists("{}/linkPred_out/{}_{}_sample_9_randSz{}Percent_c0_PPI.json".format(
            path, method, ds, randSz)):
            isHPC = True
            break
        elif os.path.exists("{}/linkPred_out/{}_{}_sample_9_randSz{}Percent_PPI.json".format(
            path, method, ds, randSz)):
            isHPC = False
            break
    if isHPC is None: return 2, None, None

    # Expected file names, in the order callers rely on.
    if isHPC:
        names = ["{}_{}_sample_{}_randSz{}Percent_c{}_PPI.json".format(
                    method, ds, trial, randSz, core)
                 for trial in range(trialNum) for core in range(coreNo)]
    else:
        names = ["{}_{}_sample_{}_randSz{}Percent_PPI.json".format(
                    method, ds, trial, randSz)
                 for trial in range(trialNum)]

    # iterate the abs path to all related files
    filenames = []
    for name in names:
        for path in allPaths:
            filename = "{}/linkPred_out/{}".format(path, name)
            if os.path.exists(filename):
                filenames.append(filename.split("_PPI.json")[0])
                # First root wins; a copy on another drive must not be listed again.
                break
    # return available, list of files, also isHPC
    return 1, filenames, isHPC

In [11]:
# trim data that isn't trimmed yet
# For every sample size / dataset / method, collect the raw prediction files
# reported by verify() and write one reduced topPPI / topScore file holding all
# trials. Per-core (HPC) outputs are first merged into one file per trial under
# ./linkPred_out_combined.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'random', "L3E1_f1", "L3E1_f2"]
dss = ['bioGRID', 'STRING', 'MINT']
coreNo, trialNum = 24, 10

for randSz in range(60, 100, 10):
    for ds in dss:
        with open("./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds, randSz), "r") as f:
            samplePPIs = json.loads(f.read())
        # Number of predictions kept per trial: the size of the first sampled set.
        # (A later cell trims the reduced files further.)
        sampleSize = len(samplePPIs[0])

        for method in methods:
            available, filenames, isHPC = verify(method, ds, randSz)
            print(randSz, ds, method, isHPC, available)
            # 0 = already reduced, 2 = raw output not found
            if available != 1: continue

            fullPPIs, fullScores = [], []

            if isHPC:
                # Shards are addressed as filenames[trial*coreNo+core]; with a missing
                # or duplicated shard that index would point at the wrong file.
                if len(filenames) != trialNum*coreNo:
                    print("skipped: expected {} shard files, found {}".format(
                        trialNum*coreNo, len(filenames)))
                    continue

                for trial in range(trialNum):
                    topPPIs, topScores = [], []
                    for core in range(coreNo):
                        shard = filenames[trial*coreNo+core]
                        with open(shard+"_PPI.json", "r") as f: topPPIs += json.loads(f.read())
                        with open(shard+"_score.json", "r") as f: topScores += json.loads(f.read())
                        # Re-sort and cut after every shard so the running lists stay at
                        # most sampleSize long (sort_key_val presumably orders PPIs by
                        # score, best first - confirm in helper).
                        topPPIs, topScores = hr.sort_key_val(topPPIs, topScores)
                        topPPIs, topScores = topPPIs[:sampleSize], topScores[:sampleSize]
                    with open("./linkPred_out_combined/{}_{}_sample_{}_randSz{}_topPPI.json".format(method, ds, trial, randSz), "w") as f:
                        f.write(json.dumps(topPPIs))
                    with open("./linkPred_out_combined/{}_{}_sample_{}_randSz{}_topScore.json".format(method, ds, trial, randSz), "w") as f:
                        f.write(json.dumps(topScores))
                    # Keep the merged trial in memory instead of re-reading the file
                    # that was just written.
                    fullPPIs.append(topPPIs)
                    fullScores.append(topScores)

            else:
                # One file per trial: just cut each to sampleSize.
                for filename in filenames:
                    with open(filename+"_PPI.json", "r") as f: fullPPIs.append(json.loads(f.read())[0:sampleSize])
                    with open(filename+"_score.json", "r") as f: fullScores.append(json.loads(f.read())[0:sampleSize])

            with open("./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(method, ds, randSz), "w") as f:
                f.write(json.dumps(fullPPIs))
            with open("./linkPred_out_reduced/{}_{}_randSz{}_topScore.json".format(method, ds, randSz), "w") as f:
                f.write(json.dumps(fullScores))

60 bioGRID commonNeighbor False 1
60 bioGRID L3Normalizing False 1
60 bioGRID CRA False 1
60 bioGRID CH2_L3 True 1
60 bioGRID Sim False 1
60 bioGRID random False 1
60 bioGRID L3E1_f1 False 1
60 bioGRID L3E1_f2 True 1
60 STRING commonNeighbor False 1
60 STRING L3Normalizing False 1
60 STRING CRA False 1
60 STRING CH2_L3 True 1
60 STRING Sim False 1
60 STRING random False 1
60 STRING L3E1_f1 False 1
60 STRING L3E1_f2 True 1
60 MINT commonNeighbor False 1
60 MINT L3Normalizing False 1
60 MINT CRA False 1
60 MINT CH2_L3 True 1
60 MINT Sim False 1
60 MINT random False 1
60 MINT L3E1_f1 False 1
60 MINT L3E1_f2 True 1
70 bioGRID commonNeighbor False 1
70 bioGRID L3Normalizing False 1
70 bioGRID CRA False 1
70 bioGRID CH2_L3 True 1
70 bioGRID Sim False 1
70 bioGRID random False 1
70 bioGRID L3E1_f1 False 1
70 bioGRID L3E1_f2 True 1
70 STRING commonNeighbor False 1
70 STRING L3Normalizing False 1
70 STRING CRA False 1
70 STRING CH2_L3 True 1
70 STRING Sim False 1
70 STRING random False 1
70 STR

In [35]:
# The randSz outputs reduced above were cut to the wrong length; trim the
# reduced files further, in place.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'random', "L3E1_f1", "L3E1_f2"]
dss = ['bioGRID', 'STRING', 'MINT']

for randSz in range(60, 100, 10):
    for ds in dss:
        samplePath = "./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds, randSz)
        with open(samplePath, "r") as f:
            samplePPIs = json.load(f)
        sampleSize = len(samplePPIs[0])
        # sampleSize/(randSz/100) scales the sample back up by its percentage;
        # multiplying by (1 - randSz/100) gives the length to keep (presumably the
        # size of the part left out of the sample - confirm).
        keep = int(sampleSize/(randSz/100)*np.around(1-randSz/100, 2))

        for method in methods:
            ppiPath = "./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(method, ds, randSz)
            scorePath = "./linkPred_out_reduced/{}_{}_randSz{}_topScore.json".format(method, ds, randSz)

            with open(ppiPath, "r") as f:
                reducedPPIs = json.load(f)
            with open(scorePath, "r") as f:
                reducedScores = json.load(f)

            # Cut every trial's PPI list and score list to the same length.
            for i, trialPPIs in enumerate(reducedPPIs):
                reducedPPIs[i] = trialPPIs[:keep]
                reducedScores[i] = reducedScores[i][:keep]

            # Overwrite the reduced files with the shortened lists.
            with open(ppiPath, "w") as f:
                f.write(json.dumps(reducedPPIs))
            with open(scorePath, "w") as f:
                f.write(json.dumps(reducedScores))

# Generate GOSemSim

Run **GOSemSim_compute.R** (in the same directory); it scans ./linkPred_out and outputs GOSemSim scores in the same format as **xxx_topScore.json**.

# Generate precision recall

In [5]:
# Precision-recall for every dataset & predictor; each trial is saved to its own file.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "L3E1_f1", "L3E1_f2", "L3E_f1Alt", "L3E_f2Alt", "random"]
ds_names = ['bioGRID', 'STRING', 'MINT']

# Complete PPI list ([nodeA, nodeB] pairs) of each dataset, in ds_names order.
completePPIs_map = [
    [list(ppi) for ppi in np.asarray([*parse(root='../')][1][['nodeA', 'nodeB']])]
    for parse in (bioGRID.parse_bioGRID, STRING.parse_STRING, MINT.parse_MINT)
]
completePPIs = dict(zip(ds_names, completePPIs_map))


for ds_name in ds_names:
    samplePath = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
    with open(samplePath, "r") as f:
        samplePPIs = json.load(f)

    for method in methods:
        reducedPath = "./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name)
        with open(reducedPath, "r") as f:
            fullPPIs = json.load(f)

        # One job per trial: fullPPIs and samplePPIs are expected to hold one
        # entry per trial (10 trials), all scored against the same complete list.
        trialCount = len(fullPPIs)
        jobNames = ["{}_{}_topPPI_{}".format(method, ds_name, i) for i in range(trialCount)]
        precRecMap = pred.precRecMap_multiCore(
            jobNames, fullPPIs, samplePPIs, [completePPIs[ds_name]] * trialCount, coreNo=10)

        # Every key of the result becomes ./precision_recall_out/<key>.json
        for key in precRecMap:
            outPath = "./precision_recall_out/{}.json".format(key)
            with open(outPath, 'w') as f:
                f.write(json.dumps(precRecMap[key]))

In [9]:
# Precision-recall for the contaminated samples (15% and 25%) of every dataset
# & predictor; each trial is saved to its own file.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "L3E1_f1", "L3E1_f2", "random"]
ds_names = ['bioGRID', 'STRING', 'MINT']

# Complete PPI list ([nodeA, nodeB] pairs) of each dataset, in ds_names order.
completePPIs_map = [
    [list(ppi) for ppi in np.asarray([*parse(root='../')][1][['nodeA', 'nodeB']])]
    for parse in (bioGRID.parse_bioGRID, STRING.parse_STRING, MINT.parse_MINT)
]
completePPIs = dict(zip(ds_names, completePPIs_map))


for ds_name in ds_names:
    for salt in [15, 25]:
        samplePath = "./sampled_datasets/{}_contam{}Percent_sampledPPIs.json".format(ds_name, salt)
        with open(samplePath, "r") as f:
            samplePPIs = json.load(f)

        for method in methods:
            reducedPath = "./linkPred_out_reduced/{}_{}_contam{}Percent_topPPI.json".format(
                method, ds_name, salt)
            with open(reducedPath, "r") as f:
                fullPPIs = json.load(f)

            # One job per trial: fullPPIs and samplePPIs are expected to hold one
            # entry per trial (10 trials), all scored against the same complete list.
            trialCount = len(fullPPIs)
            jobNames = ["{}_{}_contam{}Percent_topPPI_{}".format(method, ds_name, salt, i)
                        for i in range(trialCount)]
            precRecMap = pred.precRecMap_multiCore(
                jobNames, fullPPIs, samplePPIs, [completePPIs[ds_name]] * trialCount, coreNo=10)

            # Every key of the result becomes ./precision_recall_out/<key>.json
            for key in precRecMap:
                outPath = "./precision_recall_out/{}.json".format(key)
                with open(outPath, 'w') as f:
                    f.write(json.dumps(precRecMap[key]))

In [10]:
# Precision-recall for the bioGRID_noSelf samples of every predictor; each trial
# is saved to its own file.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "L3E1_f1", "L3E1_f2", "random"]
ds_names = ['bioGRID_noSelf']

# Reference PPI list ([nodeA, nodeB] pairs): taken from the bioGRID parser output.
bioGRID_pairs = np.asarray(bioGRID.parse_bioGRID(root='../')[1][['nodeA', 'nodeB']])
completePPIs_map = [
    [list(ppi) for ppi in bioGRID_pairs]
]
completePPIs = dict(zip(ds_names, completePPIs_map))


for ds_name in ds_names:
    samplePath = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
    with open(samplePath, "r") as f:
        samplePPIs = json.load(f)

    for method in methods:
        reducedPath = "./linkPred_out_reduced/{}_{}_topPPI.json".format(
            method, ds_name)
        with open(reducedPath, "r") as f:
            fullPPIs = json.load(f)

        # One job per trial: fullPPIs and samplePPIs are expected to hold one
        # entry per trial (10 trials), all scored against the same complete list.
        trialCount = len(fullPPIs)
        jobNames = ["{}_{}_topPPI_{}".format(method, ds_name, i) for i in range(trialCount)]
        precRecMap = pred.precRecMap_multiCore(
            jobNames, fullPPIs, samplePPIs, [completePPIs[ds_name]] * trialCount, coreNo=10)

        # Every key of the result becomes ./precision_recall_out/<key>.json
        for key in precRecMap:
            outPath = "./precision_recall_out/{}.json".format(key)
            with open(outPath, 'w') as f:
                f.write(json.dumps(precRecMap[key]))

In [7]:
# for each 5-fold-CV dataset & predictor, get precision recall and save in one file for each fold
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "L3E1_f1", "L3E1_f2", "random"]
ds_names = ['bioGRID_5FoldCV', 'STRING_5FoldCV', 'MINT_5FoldCV']

# Complete PPI list ([nodeA, nodeB] pairs) of each dataset, in ds_names order.
# Exactly one entry per name: zip() below stops at the shorter input, so an extra
# entry here would be parsed and then silently dropped.
completePPIs_map = [
    [list(ppi) for ppi in np.asarray([*bioGRID.parse_bioGRID(root='../')][1][['nodeA', 'nodeB']])]
    , [list(ppi) for ppi in np.asarray([*STRING.parse_STRING(root='../')][1][['nodeA', 'nodeB']])]
    , [list(ppi) for ppi in np.asarray([*MINT.parse_MINT(root='../')][1][['nodeA', 'nodeB']])]
]
completePPIs = dict(zip(ds_names, completePPIs_map))


for ds_name in ds_names:
    samplePPIs = []
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.loads(f.read())

    for method in methods:
        fullPPIs = []
        with open("./linkPred_out_reduced/{}_{}_topPPI.json".format(
            method, ds_name), "r") as f:
            fullPPIs = json.loads(f.read())

        # fullPPIs and samplePPIs hold one entry per fold (5 for the 5-fold-CV data);
        # every fold is scored against the same complete PPI list.
        precRecMap = pred.precRecMap_multiCore(
            ["{}_{}_topPPI_{}".format(method, ds_name, i) for i in range(len(fullPPIs))]
          , fullPPIs, samplePPIs, [completePPIs[ds_name] for i in range(len(fullPPIs))]
          , coreNo=10)

        # every key of the result becomes ./precision_recall_out/<key>.json
        for key in precRecMap:
            with open("./precision_recall_out/{}.json".format(key), 'w') as f:
                f.write(json.dumps(precRecMap[key]))

In [36]:
# Precision-recall for every sample size (60-90%), dataset & predictor; each
# trial is saved to its own file.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "random", "L3E1_f1", "L3E1_f2"]
ds_names = ['bioGRID', 'STRING', 'MINT']

# Complete PPI list ([nodeA, nodeB] pairs) of each dataset, in ds_names order.
completePPIs_map = [
    [list(ppi) for ppi in np.asarray([*parse(root='../')][1][['nodeA', 'nodeB']])]
    for parse in (bioGRID.parse_bioGRID, STRING.parse_STRING, MINT.parse_MINT)
]
completePPIs = dict(zip(ds_names, completePPIs_map))

for randSz in range(60, 100, 10):
    for ds_name in ds_names:
        samplePath = "./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds_name, randSz)
        with open(samplePath, "r") as f:
            samplePPIs = json.load(f)

        for method in methods:
            reducedPath = "./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(method, ds_name, randSz)
            with open(reducedPath, "r") as f:
                fullPPIs = json.load(f)

            # One job per trial: fullPPIs and samplePPIs are expected to hold one
            # entry per trial (10 trials), all scored against the same complete list.
            trialCount = len(fullPPIs)
            jobNames = ["{}_{}_randSz{}_topPPI_{}".format(method, ds_name, randSz, i)
                        for i in range(trialCount)]
            precRecMap = pred.precRecMap_multiCore(
                jobNames, fullPPIs, samplePPIs, [completePPIs[ds_name]] * trialCount, coreNo=10)

            # Every key of the result becomes ./precision_recall_out/<key>.json
            for key in precRecMap:
                outPath = "./precision_recall_out/{}.json".format(key)
                with open(outPath, 'w') as f:
                    f.write(json.dumps(precRecMap[key]))