In [1]:
import sys
sys.path.append('..')

# std
import numpy as np
import random as rn
import json
import time
from collections import defaultdict
from itertools import combinations
import pandas as pd
import os
import math

# datasets
import STRING
import MINT
import bioGRID
import HuRI
import HI_II_14_src
import IM24272_src
import Lit_BM_13_src
import Lit_NB_13_src

# my lib
import PPILinkPred as pred
import helper as hr
import genData_helper as helper
import traversalHelper as tr

class ns:
    # Short aliases for the tr.Helper conversion functions used by the cells of this notebook.
    # NOTE(review): the per-alias notes below describe how each alias is *called* in this notebook;
    # the exact semantics live in traversalHelper — confirm there.
    BRToRelat = tr.Helper.binary_to_relation
    # applied to a PPI list before arr_pStr; presumably adds the reversed pair for every edge — TODO confirm
    toDualBR = tr.Helper.to_dual_binary_relation
    # applied to a PPI list; the result is wrapped in list() to obtain the pool of candidate nodes
    BRToNode = tr.Helper.binary_relation_to_node
    # list of pairs -> iterable of path strings (the result is fed to set())
    arr_pStr = tr.Helper.list_to_pathStrs
    # path strings -> list form that the sampling cells dump to JSON
    pStr_arr = tr.Helper.pathStrs_to_list
    # one [nodeA, nodeB] pair -> path string, used as a set member
    br_str = tr.Helper.br_to_pathStr

# Generate Random PPI Samples from Datasets

In [2]:
# Dataset: HuRI (human).
# Parsed DataFrame columns: {nodeA, nodeB, type, score}.
# Draw 10 independent random samples, each holding 50% of the edges, and save them as one JSON file.

import_funcs = [HuRI.parse_HuRI(root="../")]
names = ['HuRI']

for dsName, dsFrame in zip(names, import_funcs):
    # one [nodeA, nodeB] list per row of the frame
    edges = list(map(list, np.asarray(dsFrame[['nodeA', 'nodeB']])))
    halfSz = int(len(edges)*0.5)
    samples = []
    for _ in range(10):
        samples.append(rn.sample(edges, halfSz))
    outPath = "./sampled_datasets/{}_sampledPPIs.json".format(dsName)
    with open(outPath, "w") as f:
        f.write(json.dumps(samples))

In [3]:
# human dataset: IM-24272, Lit-BM-13 (binary & complex), HI-II-14
# DataFrame standard: {nodeA, nodeB, type, score}
# Draw a random 50% of each dataset 10 times and save the samples as JSON.
#
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'IM24272' is not defined". The notebook's import cell only brings in the
# *_src variants (IM24272_src, Lit_BM_13_src, HI_II_14_src); the modules IM24272, Lit_BM_13 and
# HI_II_14 used below must be imported there before this cell can run — TODO confirm they exist.

im24272_df = IM24272.parse_IM24272(root="../")
# Parse Lit-BM-13 once and reuse the result instead of parsing the same files twice.
# Presumably item 0 is the binary set and item 1 the complex set (per `names` below) — TODO confirm.
lit_bm_13_parsed = Lit_BM_13.parse_Lit_BM_13(root="../")
import_funcs = [im24272_df,
                lit_bm_13_parsed[0],
                lit_bm_13_parsed[1],
                HI_II_14.parse_HI(root="../")]
names = ['IM24272', "Lit_13_binary", "Lit_13_cplx", "HI_14"]

for n in range(len(names)):
    df = import_funcs[n]
    # one [nodeA, nodeB] list per row
    ppi = [list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])]
    sampledPPIs = [rn.sample(ppi, int(len(ppi)*0.5)) for i in range(10)]
    with open("./sampled_datasets/{}_sampledPPIs.json".format(names[n]), "w") as f:
        f.write(json.dumps(sampledPPIs))

NameError: name 'IM24272' is not defined

In [12]:
# Human datasets taken directly from the L3 source.
# Parsed DataFrame columns: {nodeA, nodeB, type, score}.
# For every dataset, draw 10 random samples of 50% of the edges and dump them to JSON.

import_funcs = [
    IM24272_src.parse_IM24272_src(root="../"),
    Lit_BM_13_src.parse_Lit_BM_src(root="../"),
    Lit_NB_13_src.parse_Lit_NB_src(root="../"),
    HI_II_14_src.parse_HI_src(root="../"),
]
names = ['IM24272_src', "Lit_BM_src", "Lit_NB_src", "HI_14_src"]

for dsName, dsFrame in zip(names, import_funcs):
    # one [nodeA, nodeB] list per row of the frame
    edges = list(map(list, np.asarray(dsFrame[['nodeA', 'nodeB']])))
    halfSz = int(len(edges)*0.5)
    samples = []
    for _ in range(10):
        samples.append(rn.sample(edges, halfSz))
    outPath = "./sampled_datasets/{}_sampledPPIs.json".format(dsName)
    with open(outPath, "w") as f:
        f.write(json.dumps(samples))

14390 0
5522 1
2453 2
6934 3


In [2]:
# human dataset: HuRI
# DataFrame standard: {nodeA, nodeB, type, score}
# randomly sample 55-95% of the dataset (5% steps, largest first), 10 times per size, save into json

import_funcs = [HuRI.parse_HuRI(root="../")]
names = ['HuRI']

# The edge list does not depend on the sample size, so build it once per dataset
# instead of once per (size, dataset) combination.
ppis = [[list(arr) for arr in np.asarray(parsed[['nodeA', 'nodeB']])] for parsed in import_funcs]

for randSz in range(95, 54, -5):
    for n in range(len(names)):
        ppi = ppis[n]
        # randSz is a percentage; the size expression is kept as-is so sample sizes match earlier runs
        sampledPPIs = [rn.sample(ppi, int(len(ppi)*(randSz*0.01))) for i in range(10)]
        with open("./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(names[n], randSz), "w") as f:
            f.write(json.dumps(sampledPPIs))

In [3]:
# Human datasets: bioGRID, STRING and MINT.
# Each parser call yields a pair; only its second item (presumably the PPI DataFrame) is sampled here.
import_funcs = [
    bioGRID.parse_bioGRID(
        filename='./data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
        wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
        wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
        root="../",
    ),
    STRING.parse_STRING(
        ppiFile='./data/STRING/9606.protein.links.v11.0.txt',
        typeFile='./data/STRING/9606.protein.actions.v11.0.txt',
        uniProtMap='./data/UniProt/uniprot-taxonomy_9606_STRING.tab',
        root='../',
        wFile_GGI='./data/parsed/STRING_homo_GGI.pkl',
        wFile_PPI='./data/parsed/STRING_homo_PPI.pkl',
    ),
    MINT.parse_MINT(
        ppiFile='./data/MINT/species human',
        uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab",
        wFile_GGI='./data/parsed/MINT_homo_GGI.pkl',
        wFile_PPI='./data/parsed/MINT_homo_PPI.pkl',
        root="../",
    ),
]
names = ['bioGRID_human', "STRING_human", "MINT_human"]

# 10 random samples of 50% of the edges per dataset, written to one JSON file each
for dsName, parsed in zip(names, import_funcs):
    _, dsFrame = parsed
    edges = list(map(list, np.asarray(dsFrame[['nodeA', 'nodeB']])))
    halfSz = int(len(edges)*0.5)
    samples = []
    for _ in range(10):
        samples.append(rn.sample(edges, halfSz))
    outPath = "./sampled_datasets/{}_sampledPPIs.json".format(dsName)
    with open(outPath, "w") as f:
        f.write(json.dumps(samples))

In [3]:
# human datasets: bioGRID, STRING, MINT
# randomly sample 55-95% of each dataset (5% steps, largest first), 10 times per size, save into json
import_funcs = [
    bioGRID.parse_bioGRID(filename='./data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt'
        , wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl'
        , wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl', root="../")

    , STRING.parse_STRING(ppiFile='./data/STRING/9606.protein.links.v11.0.txt'
        , typeFile='./data/STRING/9606.protein.actions.v11.0.txt'
        , uniProtMap='./data/UniProt/uniprot-taxonomy_9606_STRING.tab', root='../'
        , wFile_GGI='./data/parsed/STRING_homo_GGI.pkl', wFile_PPI='./data/parsed/STRING_homo_PPI.pkl')

    , MINT.parse_MINT(ppiFile='./data/MINT/species human', uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab"
        , wFile_GGI='./data/parsed/MINT_homo_GGI.pkl', wFile_PPI='./data/parsed/MINT_homo_PPI.pkl', root="../")
]
names = ['bioGRID_human', "STRING_human", "MINT_human"]

# The edge lists do not depend on the sample size, so build each one once
# instead of rebuilding it for every one of the nine sizes.
# Each parser result is a pair; only its second item is used here.
ppis = [[list(arr) for arr in np.asarray(df[['nodeA', 'nodeB']])] for _, df in import_funcs]

for randSz in range(95, 54, -5):
    for n in range(len(names)):
        ppi = ppis[n]
        # randSz is a percentage; the size expression is kept as-is so sample sizes match earlier runs
        sampledPPIs = [rn.sample(ppi, int(len(ppi)*(randSz*0.01))) for i in range(10)]
        with open("./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(names[n], randSz), "w") as f:
            f.write(json.dumps(sampledPPIs))

In [2]:
# human dataset
# sample non-PPIs of real-PPIs size:
# for every dataset, draw 10 sets of random node pairs that are not edges of the dataset,
# each set as large as the dataset's edge list

ds_names = ['bioGRID_human', 'STRING_human', 'MINT_human', 'HuRI']
import_funcs = [
    bioGRID.parse_bioGRID(filename='./data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt'
        , wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl'
        , wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl', root="../")

    , STRING.parse_STRING(ppiFile='./data/STRING/9606.protein.links.v11.0.txt'
        , typeFile='./data/STRING/9606.protein.actions.v11.0.txt'
        , uniProtMap='./data/UniProt/uniprot-taxonomy_9606_STRING.tab', root='../'
        , wFile_GGI='./data/parsed/STRING_homo_GGI.pkl', wFile_PPI='./data/parsed/STRING_homo_PPI.pkl')

    , MINT.parse_MINT(ppiFile='./data/MINT/species human', uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab"
        , wFile_GGI='./data/parsed/MINT_homo_GGI.pkl', wFile_PPI='./data/parsed/MINT_homo_PPI.pkl', root="../")
]
# Each parser result above is a pair whose second item holds the nodeA/nodeB columns
# (unpacked the same way as in the sampling cells); HuRI's parser returns the frame directly.
completePPIs_map = [
    [list(ppi) for ppi in np.asarray(ppi_df[['nodeA', 'nodeB']])]
    for _, ppi_df in import_funcs
]
completePPIs_map.append(
    [list(ppi) for ppi in np.asarray(HuRI.parse_HuRI(root="../")[['nodeA', 'nodeB']])])
ppi_ds = dict(zip(ds_names, completePPIs_map))

for ds, ppi in ppi_ds.items():
    validNodes = list(ns.BRToNode(ppi))
    nodeCount = len(validNodes)
    # known edges as path strings, used for membership tests
    ppi_str = set(ns.arr_pStr(ns.toDualBR(ppi)))
    # every ordered node pair (a, b) is encoded as the integer a*nodeCount + b
    ppiNumDoub = nodeCount*nodeCount-1

    sampled_nonPPIs = []
    for i in range(10):
        candidatePPIs = set()
        # NOTE(review): this loop cannot terminate if the graph has fewer non-edges than edges
        while len(candidatePPIs) < len(ppi):
            # exact integer decoding of the pair index (no float division)
            idxA, idxB = divmod(rn.randint(0, ppiNumDoub), nodeCount)
            nodeA, nodeB = validNodes[idxA], validNodes[idxB]
            if nodeA == nodeB: continue
            rnPPI = [nodeA, nodeB]
            rnPPI_str, rnPPI_str_rev = ns.br_str(rnPPI), ns.br_str(rnPPI[::-1])
            # reject real edges and pairs already drawn in either orientation
            if rnPPI_str in ppi_str or rnPPI_str_rev in candidatePPIs or rnPPI_str in candidatePPIs: continue
            candidatePPIs.add(rnPPI_str)
        sampled_nonPPIs.append(ns.pStr_arr(list(candidatePPIs)))

    with open("./sampled_datasets/{}_sampled_nonPPIs.json".format(ds), "w") as f:
        f.write(json.dumps(sampled_nonPPIs))

# Run Link Prediction

In [None]:
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "L3E1_f1", "L3E1_f2"]
ds_names = ['HuRI', 'MINT_human']

# Run every method on every 50% sample of every dataset and store predictions plus run time.
for ds_name in ds_names:
    # load the list of sampled PPI sets for this dataset
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.load(f)

    for method in methods:
        for i, sample in enumerate(samplePPIs):
            saveFilename = "{}_{}_sample_{}".format(method, ds_name, i)
            startTime = time.time()

            # logging is off: a jupyter notebook cannot display multi core logging (terminal only)
            predPPI, predScore = pred.multiCore_PPILinkPred(
                sample, method, coreNo=14, logging=False)
            elapsed = time.time()-startTime
            helper.write_runTime(saveFilename, elapsed)
            helper.write_resultData(predPPI, predScore, saveFilename)

In [13]:
# alternative method set kept for reference:
#methods = ['commonNeighbor', 'L3Normalizing', 'L3E1_f1', 'L3E1_f2']
methods = ['CRA']
ds_names = ['IM24272_src', "Lit_BM_src", "Lit_NB_src", "HI_14_src"]

# Run every method on every 50% sample of every dataset and store predictions plus run time.
for ds_name in ds_names:
    # load the list of sampled PPI sets for this dataset
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.load(f)

    for method in methods:
        for i, sample in enumerate(samplePPIs):
            saveFilename = "{}_{}_sample_{}".format(method, ds_name, i)
            startTime = time.time()

            # logging is off: a jupyter notebook cannot display multi core logging (terminal only)
            predPPI, predScore = pred.multiCore_PPILinkPred(
                sample, method, coreNo=14, logging=False)
            elapsed = time.time()-startTime
            helper.write_runTime(saveFilename, elapsed)
            helper.write_resultData(predPPI, predScore, saveFilename)



































































































































































































































































































































In [None]:
methods = ["commonNeighbor", "L3Normalizing", "CRA", "Sim", "L3E1_f1", 'L3E1_f2', 'random']
# CH2 is left out for now because it is slow (it may be run on HPC instead).
# NOTE(review): the original note said L3E1_f2 is skipped too, yet it is in the list above — confirm which is intended.
ds_names = ['HuRI', 'MINT_human']

# Run every method on every sample of the 60%, 70%, 80% and 90% sample sets.
for randSz in range(60, 100, 10):
    for ds_name in ds_names:
        # load the list of sampled PPI sets for this dataset and sample size
        samplePath = "./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds_name, randSz)
        with open(samplePath, "r") as f:
            samplePPIs = json.load(f)

        for method in methods:
            print(randSz, ds_name, method)
            for i, sample in enumerate(samplePPIs):
                saveFilename = "{}_{}_sample_{}_randSz{}Percent".format(method, ds_name, i, randSz)
                startTime = time.time()

                # logging is off: a jupyter notebook cannot display multi core logging (terminal only)
                predPPI, predScore = pred.multiCore_PPILinkPred(
                    sample, method, coreNo=14, logging=False)
                elapsed = time.time()-startTime
                helper.write_runTime(saveFilename, elapsed)
                helper.write_resultData(predPPI, predScore, saveFilename)

In [2]:
# "random" baseline predictor: randomly choose n node pairs that are not in the sample,
# where n is the number of edges that were removed from the full dataset to form the sample
ds_names = ['bioGRID_human', 'STRING_human', 'MINT_human', 'HuRI']

for randSz in range(50, 100, 10):
    for ds_name in ds_names:
        # the 50% samples are stored without a size suffix in the file name
        if randSz == 50:
            samplePath = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
        else:
            samplePath = "./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds_name, randSz)
        with open(samplePath, "r") as f:
            samplePPIs = json.loads(f.read())
        sampleSize = len(samplePPIs[0])
        # estimated full dataset size (sample size / sampled fraction) times the held-out fraction
        sampleSize = int(sampleSize/(randSz/100)*np.around(1-randSz/100, 2))

        # one list of random candidate pairs (and a matching list of constant scores) per trial
        fullPPIs, fullScores = [], []
        for trial in range(10):
            samplePPIbr = samplePPIs[trial]
            sampleNodes = list(ns.BRToNode(samplePPIbr))
            nodeCount = len(sampleNodes)
            # edges of the sample as path strings, used for membership tests
            samplePPIbr_str = set(ns.arr_pStr(ns.toDualBR(samplePPIbr)))
            # every ordered node pair (a, b) is encoded as the integer a*nodeCount + b
            ppiNumDoub = nodeCount*nodeCount-1

            candidatePPIs = set()
            while len(candidatePPIs) < sampleSize:
                # exact integer decoding of the pair index (no float division)
                idxA, idxB = divmod(rn.randint(0, ppiNumDoub), nodeCount)
                nodeA, nodeB = sampleNodes[idxA], sampleNodes[idxB]
                if nodeA == nodeB: continue
                rnPPI = [nodeA, nodeB]
                rnPPI_str, rnPPI_str_rev = ns.br_str(rnPPI), ns.br_str(rnPPI[::-1])
                # reject sample edges and pairs already drawn in either orientation
                if rnPPI_str in samplePPIbr_str or rnPPI_str_rev in candidatePPIs or rnPPI_str in candidatePPIs: continue
                candidatePPIs.add(rnPPI_str)

            fullPPIs.append(ns.pStr_arr(candidatePPIs))
            # every random candidate gets the same score of 1
            fullScores.append([1 for i in range(sampleSize)])

        if randSz == 50:
            with open("./linkPred_out_reduced/random_{}_topPPI.json".format(ds_name), "w") as f:
                f.write(json.dumps(fullPPIs))
            with open("./linkPred_out_reduced/random_{}_topScore.json".format(ds_name), "w") as f:
                f.write(json.dumps(fullScores))
        else:
            with open("./linkPred_out_reduced/random_{}_randSz{}_topPPI.json".format(ds_name, randSz), "w") as f:
                f.write(json.dumps(fullPPIs))
            with open("./linkPred_out_reduced/random_{}_randSz{}_topScore.json".format(ds_name, randSz), "w") as f:
                f.write(json.dumps(fullScores))

In [None]:
# Predictions for the bioGRID and STRING human datasets are generated by the generate_prediction_HPC.py script

# Data Cleaning for Analysis & Processing

In [None]:
# Keep only the top n predictions per trial, where n is the number of edges in one sampled dataset.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "L3E1_f1", "L3E1_f2"]
ds_names = ['bioGRID_human', 'STRING_human']

for ds_name in ds_names:
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        sampleSize = len(json.load(f)[0])

    # for each method and trial, fold the 24 per-core output files into one top-n list
    for method in methods:
        for trial in range(10):
            topPPIs, topScores = [], []
            for core in range(24):
                coreStem = "./linkPred_human_out/{}_{}_sample_{}_c{}".format(method, ds_name, trial, core)
                with open("./linkPred_human_out/{}_{}_sample_{}_c{}_PPI.json".format(method, ds_name, trial, core), "r") as f:
                    topPPIs += json.load(f)
                with open("./linkPred_human_out/{}_{}_sample_{}_c{}_score.json".format(method, ds_name, trial, core), "r") as f:
                    topScores += json.load(f)
                # re-rank and truncate after every core file so the running lists never exceed n entries
                ranked = hr.sort_key_val(topPPIs, topScores)
                topPPIs, topScores = ranked
                topPPIs = topPPIs[:sampleSize]
                topScores = topScores[:sampleSize]

            ppiOut = "./linkPred_human_out_combined/{}_{}_sample_{}_topPPI.json".format(method, ds_name, trial)
            with open(ppiOut, "w") as f:
                f.write(json.dumps(topPPIs))
            scoreOut = "./linkPred_human_out_combined/{}_{}_sample_{}_topScore.json".format(method, ds_name, trial)
            with open(scoreOut, "w") as f:
                f.write(json.dumps(topScores))

In [4]:
# extract only top n edges, n = size of sampled dataset
methods = ["L3E_f1Alt", "L3E_f2Alt"]
ds_names = ['STRING_human']
# NOTE(review): machine-specific absolute location of the raw and combined prediction folders.
# It is kept in one place so it can be repointed without editing every path below.
linkPredRoot = "E:/research/ppiLPred_BMC/notebook"

for ds_name in ds_names:
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        sampleSize = len(json.loads(f.read())[0])

    # for each method and trial, fold the 12 per-core output files into one top-n list
    for method in methods:
        for trial in range(10):
            topPPIs, topScores = [], []
            for core in range(12):
                with open("{}/linkPred_out/{}_{}_sample_{}_c{}_PPI.json".format(
                        linkPredRoot, method, ds_name, trial, core), "r") as f:
                    topPPIs += json.loads(f.read())
                with open("{}/linkPred_out/{}_{}_sample_{}_c{}_score.json".format(
                        linkPredRoot, method, ds_name, trial, core), "r") as f:
                    topScores += json.loads(f.read())
                # re-rank and truncate after every core file so the running lists never exceed n entries
                topPPIs, topScores = hr.sort_key_val(topPPIs, topScores)
                topPPIs, topScores = topPPIs[:sampleSize], topScores[:sampleSize]

            with open("{}/linkPred_out_combined/{}_{}_sample_{}_topPPI.json".format(
                    linkPredRoot, method, ds_name, trial), "w") as f:
                f.write(json.dumps(topPPIs))
            with open("{}/linkPred_out_combined/{}_{}_sample_{}_topScore.json".format(
                    linkPredRoot, method, ds_name, trial), "w") as f:
                f.write(json.dumps(topScores))

In [None]:
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "L3E1_f1", "L3E1_f2"]
ds_names = ['STRING_human', 'bioGRID_human']

# Gather the 10 per-trial "combined" files of each (dataset, method) into a single "reduced" file pair.
for ds_name in ds_names:
    for method in methods:
        fullPPIs = []
        fullScores = []
        for trial in range(10):
            ppiIn = "./linkPred_human_out_combined/{}_{}_sample_{}_topPPI.json".format(method, ds_name, trial)
            with open(ppiIn, "r") as f:
                fullPPIs.append(json.load(f))
            scoreIn = "./linkPred_human_out_combined/{}_{}_sample_{}_topScore.json".format(method, ds_name, trial)
            with open(scoreIn, "r") as f:
                fullScores.append(json.load(f))

        ppiOut = "./linkPred_human_out_reduced/{}_{}_topPPI.json".format(method, ds_name)
        with open(ppiOut, "w") as f:
            f.write(json.dumps(fullPPIs))
        scoreOut = "./linkPred_human_out_reduced/{}_{}_topScore.json".format(method, ds_name)
        with open(scoreOut, "w") as f:
            f.write(json.dumps(fullScores))

In [6]:
methods = ["L3E_f1Alt", "L3E_f2Alt"]
ds_names = ['STRING_human', 'bioGRID_human']
# NOTE(review): machine-specific absolute location of the combined prediction folder.
# It is kept in one place so it can be repointed without editing every path below.
linkPredRoot = "E:/research/ppiLPred_BMC/notebook"

# Gather the 10 per-trial "combined" files of each (dataset, method) into a single "reduced" file pair.
for ds_name in ds_names:
    for method in methods:
        fullPPIs, fullScores = [], []
        for trial in range(10):
            with open("{}/linkPred_out_combined/{}_{}_sample_{}_topPPI.json".format(
                    linkPredRoot, method, ds_name, trial), "r") as f:
                fullPPIs.append(json.loads(f.read()))
            with open("{}/linkPred_out_combined/{}_{}_sample_{}_topScore.json".format(
                    linkPredRoot, method, ds_name, trial), "r") as f:
                fullScores.append(json.loads(f.read()))

        # the reduced files are written relative to the notebook's working directory
        with open("./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name), "w") as f:
            f.write(json.dumps(fullPPIs))
        with open("./linkPred_out_reduced/{}_{}_topScore.json".format(method, ds_name), "w") as f:
            f.write(json.dumps(fullScores))

In [4]:
# Keep only the first n predictions per trial, where n is the number of edges in one sampled dataset.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "L3E1_f1", "L3E1_f2", "Sim"]
ds_names = ['MINT_human', 'HuRI']

for ds_name in ds_names:
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.load(f)
    sampleSize = len(samplePPIs[0])

    # per method: truncate each of the 10 trial outputs and collect them into one file pair
    for method in methods:
        fullPPI = []
        fullScore = []
        for trial in range(10):
            ppiIn = "./linkPred_human_out/{}_{}_sample_{}_PPI.json".format(method, ds_name, trial)
            with open(ppiIn, "r") as f:
                trialPPI = json.load(f)
            fullPPI.append(trialPPI[:sampleSize])
            scoreIn = "./linkPred_human_out/{}_{}_sample_{}_score.json".format(method, ds_name, trial)
            with open(scoreIn, "r") as f:
                trialScore = json.load(f)
            fullScore.append(trialScore[:sampleSize])
        ppiOut = "./linkPred_human_out_reduced/{}_{}_topPPI.json".format(method, ds_name)
        with open(ppiOut, "w") as f:
            f.write(json.dumps(fullPPI))
        scoreOut = "./linkPred_human_out_reduced/{}_{}_topScore.json".format(method, ds_name)
        with open(scoreOut, "w") as f:
            f.write(json.dumps(fullScore))

In [2]:
# Collect the complete (untruncated) per-trial predictions into one file pair per (method, dataset).
methods = ["L3E1_f1Alt", "L3E1_f2Alt"]
ds_names = ['IM24272_src', "Lit_BM_src", "Lit_NB_src", "HI_14_src"]

for ds_name in ds_names:
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.load(f)
    # computed as in the sibling cells, but not used for truncation in this cell
    sampleSize = len(samplePPIs[0])

    for method in methods:
        fullPPI = []
        fullScore = []
        print(method, ds_name)
        # optional skip when the reduced file already exists (disabled):
        #if os.path.exists("./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name)): continue
        for trial in range(10):
            ppiIn = "./linkPred_out/{}_{}_sample_{}_PPI.json".format(method, ds_name, trial)
            with open(ppiIn, "r") as f:
                fullPPI.append(json.load(f))
            scoreIn = "./linkPred_out/{}_{}_sample_{}_score.json".format(method, ds_name, trial)
            with open(scoreIn, "r") as f:
                fullScore.append(json.load(f))
        ppiOut = "./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name)
        with open(ppiOut, "w") as f:
            f.write(json.dumps(fullPPI))
        scoreOut = "./linkPred_out_reduced/{}_{}_topScore.json".format(method, ds_name)
        with open(scoreOut, "w") as f:
            f.write(json.dumps(fullScore))

L3E1_f1Alt IM24272_src
L3E1_f2Alt IM24272_src
L3E1_f1Alt Lit_BM_src
L3E1_f2Alt Lit_BM_src
L3E1_f1Alt Lit_NB_src
L3E1_f2Alt Lit_NB_src
L3E1_f1Alt HI_14_src
L3E1_f2Alt HI_14_src


In [5]:
# Roots that may hold a "linkPred_out" folder with raw prediction files (machine-specific drives).
allPaths = ["I:/research/ppiLPred_BMC/notebook"
            , "E:/research/ppiLPred_BMC/notebook"
            , "D:/research offline repo/ppiLPred_BMC/notebook"
            , "G:/research/ppiLPred_BMC/notebook"]
coreNo, trialNum = 12, 10
def verify(method, ds, randSz):
    """Locate the raw prediction files of one (method, dataset, sample size) combination.

    Reads the module-level ``allPaths``, ``coreNo`` and ``trialNum`` at call time.

    Returns a tuple ``(status, filenames, isHPC)``:
      * ``(0, None, None)`` - the reduced output already exists in ./linkPred_out_reduced.
      * ``(2, None, None)`` - no raw output for trial 9 was found under any root.
      * ``(1, filenames, isHPC)`` - ``filenames`` lists the found files with the trailing
        "_PPI.json" stripped, ordered by trial (and by core when ``isHPC``). ``isHPC`` is True
        when the output is split into per-core "_c<k>" files and False when there is one file
        per trial.

    Each (trial, core) slot contributes at most one entry: the first root in ``allPaths`` that
    holds the file wins, so a file mirrored on several drives is not listed (and later merged) twice.
    """
    # nothing to do when the reduced file is already there (the check does not depend on the root)
    if os.path.exists("./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(
        method, ds, randSz)): return 0, None, None

    # the last trial (sample_9) is probed to detect the layout: per-core files or one file per trial
    isHPC = None
    for path in allPaths:
        if os.path.exists("{}/linkPred_out/{}_{}_sample_9_randSz{}Percent_c0_PPI.json".format(
            path, method, ds, randSz)):
            isHPC = True
            break
        elif os.path.exists("{}/linkPred_out/{}_{}_sample_9_randSz{}Percent_PPI.json".format(
            path, method, ds, randSz)):
            isHPC = False
            break

    if isHPC is None: return 2, None, None

    # collect the path (minus the "_PPI.json" suffix) of every related file
    filenames = []
    if isHPC:
        for trial in range(trialNum):
            for core in range(coreNo):
                for path in allPaths:
                    filename = "{}/linkPred_out/{}_{}_sample_{}_randSz{}Percent_c{}_PPI.json".format(
                        path, method, ds, trial, randSz, core)
                    if os.path.exists(filename):
                        filenames.append(filename.split("_PPI.json")[0])
                        break  # first root wins; ignore mirrored copies
    else:
        for trial in range(trialNum):
            for path in allPaths:
                filename = "{}/linkPred_out/{}_{}_sample_{}_randSz{}Percent_PPI.json".format(
                    path, method, ds, trial, randSz)
                if os.path.exists(filename):
                    filenames.append(filename.split("_PPI.json")[0])
                    break  # first root wins; ignore mirrored copies
    # status 1 = available, plus the list of files and the detected layout
    return 1, filenames, isHPC

In [4]:
# Roots that may hold a "linkPred_out" folder with raw prediction files (machine-specific drives).
allPaths = ["I:/research/ppiLPred_BMC/notebook"
            , "E:/research/ppiLPred_BMC/notebook"
            , "D:/research offline repo/ppiLPred_BMC/notebook"
            , "G:/research/ppiLPred_BMC/notebook"]
coreNo, trialNum = 24, 10
def verify_tmp(method, ds):
    """Locate the per-core raw prediction files of one (method, dataset) pair for the 50% samples.

    Reads the module-level ``allPaths``, ``coreNo`` and ``trialNum`` at call time.

    Returns a tuple ``(status, filenames, isHPC)``:
      * ``(0, None, None)`` - the reduced output already exists in ./linkPred_out_reduced.
      * ``(1, filenames, True)`` - ``filenames`` lists the found "_c<k>" files with the trailing
        "_PPI.json" stripped, ordered by trial and then by core.

    Each (trial, core) slot contributes at most one entry: the first root in ``allPaths`` that
    holds the file wins. Callers that address the list positionally (trial*coreNo + core) rely
    on this; a slot missing under every root still shortens the list.
    """
    # nothing to do when the reduced file is already there (the check does not depend on the root)
    if os.path.exists("./linkPred_out_reduced/{}_{}_topPPI.json".format(
        method, ds)): return 0, None, None

    # collect the path (minus the "_PPI.json" suffix) of every related file
    filenames = []
    for trial in range(trialNum):
        for core in range(coreNo):
            for path in allPaths:
                filename = "{}/linkPred_out/{}_{}_sample_{}_c{}_PPI.json".format(
                    path, method, ds, trial, core)
                if os.path.exists(filename):
                    filenames.append(filename.split("_PPI.json")[0])
                    break  # first root wins; ignore mirrored copies
    # status 1 = available, plus the list of files; this layout is always the per-core (HPC) one
    return 1, filenames, True

# trim data that isn't trimmed yet:
# merge the per-core raw files of each trial into a top-n list ("combined"),
# then stack all trials into one file pair per (method, dataset) ("reduced")
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", 'random', "L3E1_f1", "L3E1_f2"]
dss = ['bioGRID_human', 'STRING_human']
coreNo, trialNum = 24, 10

for ds in dss:
    # n = number of edges in one 50% sample; only the top n predictions are kept per trial
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds), "r") as f:
        samplePPIs = json.loads(f.read())
    sampleSize = len(samplePPIs[0])

    for method in methods:
        available, filenames, isHPC = verify_tmp(method, ds)
        print(50, ds, method, isHPC, available)
        if available != 1: continue

        # The raw files are addressed positionally below (trial*coreNo + core), which is only
        # valid when exactly one file was found per (trial, core); otherwise skip loudly
        # instead of merging the wrong files or failing half-way through.
        expectedFiles = trialNum*coreNo
        if len(filenames) != expectedFiles:
            print("skipped {} {}: expected {} raw files, found {}".format(
                method, ds, expectedFiles, len(filenames)))
            continue

        fullPPIs, fullScores = [], []
        for trial in range(trialNum):
            topPPIs, topScores = [], []
            for core in range(coreNo):
                stem = filenames[trial*coreNo+core]
                with open(stem+"_PPI.json", "r") as f: topPPIs += json.loads(f.read())
                with open(stem+"_score.json", "r") as f: topScores += json.loads(f.read())
                # re-rank and truncate after every core file so the running lists never exceed n entries
                topPPIs, topScores = hr.sort_key_val(topPPIs, topScores)
                topPPIs, topScores = topPPIs[:sampleSize], topScores[:sampleSize]
            with open("./linkPred_out_combined/{}_{}_sample_{}_topPPI.json".format(method, ds, trial), "w") as f:
                f.write(json.dumps(topPPIs))
            with open("./linkPred_out_combined/{}_{}_sample_{}_topScore.json".format(method, ds, trial), "w") as f:
                f.write(json.dumps(topScores))
            # keep the trial result in memory rather than re-reading the file just written
            fullPPIs.append(topPPIs)
            fullScores.append(topScores)

        with open("./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds), "w") as f:
            f.write(json.dumps(fullPPIs))
        with open("./linkPred_out_reduced/{}_{}_topScore.json".format(method, ds), "w") as f:
            f.write(json.dumps(fullScores))

50 bioGRID_human commonNeighbor None 0
50 bioGRID_human L3Normalizing None 0
50 bioGRID_human CRA None 0
50 bioGRID_human CH2_L3 None 0
50 bioGRID_human Sim None 0
50 bioGRID_human random None 0
50 bioGRID_human L3E1_f1 None 0
50 bioGRID_human L3E1_f2 None 0
50 STRING_human commonNeighbor None 0
50 STRING_human L3Normalizing None 0
50 STRING_human CRA None 0
50 STRING_human CH2_L3 True 1
50 STRING_human Sim True 1
50 STRING_human random None 0
50 STRING_human L3E1_f1 True 1
50 STRING_human L3E1_f2 True 1


In [7]:
# trim data that isn't trimmed yet
methods = ["L3E1_f2", "CH2_L3"]
dss = ['bioGRID_human', 'STRING_human']
coreNo, trialNum = 24, 10

# assume rawData complete
def verify_tmp(method, ds, randSz):
    """Locate the per-core raw prediction files of one (method, dataset, sample size) combination.

    Reads the module-level ``allPaths``, ``coreNo`` and ``trialNum`` at call time.
    NOTE(review): this notebook also defines a two-argument ``verify_tmp`` in another cell;
    whichever cell ran last is the one in effect.

    Returns a tuple ``(status, filenames, isHPC)``:
      * ``(0, None, None)`` - the reduced output already exists in ./linkPred_out_reduced.
      * ``(1, filenames, True)`` - ``filenames`` lists the found "_c<k>" files with the trailing
        "_PPI.json" stripped, ordered by trial and then by core.

    For every trial the cores are scanned upwards from 0 and the scan stops at the first core
    whose file is missing under every root, so runs that used fewer than ``coreNo`` cores are
    handled. Each (trial, core) slot contributes at most one entry: the first root in
    ``allPaths`` that holds the file wins, so mirrored copies are not merged twice.
    """
    # nothing to do when the reduced file is already there (the check does not depend on the root)
    if os.path.exists("./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(
        method, ds, randSz)): return 0, None, None

    # collect the path (minus the "_PPI.json" suffix) of every related file
    filenames = []
    for trial in range(trialNum):
        for core in range(coreNo):
            for path in allPaths:
                filename = "{}/linkPred_out/{}_{}_sample_{}_randSz{}Percent_c{}_PPI.json".format(
                    path, method, ds, trial, randSz, core)
                if os.path.exists(filename):
                    filenames.append(filename.split("_PPI.json")[0])
                    break  # first root wins; ignore mirrored copies
            else:
                # no root holds this core's file: this trial has no further cores
                break
    # status 1 = available, plus the list of files; this layout is always the per-core (HPC) one
    return 1, filenames, True

# For every sampling percentage / dataset / predictor that has not been reduced
# yet: merge the per-core outputs of each trial into one top-ranked list, then
# stack all trials into a single "reduced" file.
for randSz in range(80, 91, 10):
    for ds in dss:
        samplePPIs = []
        with open("./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds, randSz), "r") as f:
            samplePPIs = json.loads(f.read())
        # Number of top predictions kept per trial, derived from the size of the
        # first sampled trial (presumably the count of edges left out of the
        # sample -- TODO confirm against the sampling cell).
        sampleSize = len(samplePPIs[0])
        sampleSize = int(sampleSize/(randSz/100)*np.around(1-randSz/100, 2))

        for method in methods:
            available, filenames, isHPC = verify_tmp(method, ds, randSz)
            print(randSz, ds, method, isHPC, available)
            if available != 1: continue

            for trial in range(trialNum):
                topPPIs, topScores = [], []
                # Match the trial on the file's base name with both delimiters,
                # so that neither a directory name nor a longer trial number
                # (e.g. "sample_1" inside "sample_10") can match by accident.
                trialTag = "_sample_{}_randSz".format(trial)
                relatedFiles = [file for file in filenames if trialTag in os.path.basename(file)]
                for file in relatedFiles:
                    with open(file+"_PPI.json", "r") as f: topPPIs += json.loads(f.read())
                    with open(file+"_score.json", "r") as f: topScores += json.loads(f.read())
                    # re-sort and truncate after every file so the running lists
                    # never grow beyond sampleSize plus one file's worth
                    topPPIs, topScores = hr.sort_key_val(topPPIs, topScores)
                    topPPIs, topScores = topPPIs[:sampleSize], topScores[:sampleSize]
                with open("./linkPred_out_combined/{}_{}_sample_{}_randSz{}_topPPI.json".format(method, ds, trial, randSz), "w") as f:
                    f.write(json.dumps(topPPIs))
                with open("./linkPred_out_combined/{}_{}_sample_{}_randSz{}_topScore.json".format(method, ds, trial, randSz), "w") as f:
                    f.write(json.dumps(topScores))

            # stack the per-trial files just written; iterate over trialNum so the
            # read side always agrees with the write side above
            fullPPIs, fullScores = [], []
            for trial in range(trialNum):
                with open("./linkPred_out_combined/{}_{}_sample_{}_randSz{}_topPPI.json".format(method, ds, trial, randSz), "r") as f:
                    fullPPIs.append(json.loads(f.read()))
                with open("./linkPred_out_combined/{}_{}_sample_{}_randSz{}_topScore.json".format(method, ds, trial, randSz), "r") as f:
                    fullScores.append(json.loads(f.read()))
            with open("./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(method, ds, randSz), "w") as f:
                f.write(json.dumps(fullPPIs))
            with open("./linkPred_out_reduced/{}_{}_randSz{}_topScore.json".format(method, ds, randSz), "w") as f:
                f.write(json.dumps(fullScores))

80 bioGRID_human L3E1_f2 True 1
80 bioGRID_human CH2_L3 True 1
80 STRING_human L3E1_f2 True 1
80 STRING_human CH2_L3 True 1
90 bioGRID_human L3E1_f2 True 1
90 bioGRID_human CH2_L3 True 1
90 STRING_human L3E1_f2 True 1
90 STRING_human CH2_L3 True 1


# Generate GOSemSim

Run **GOSemSim_compute.R** in the same directory; it scans ./linkPred_out and outputs the GOSemSim scores in the same format as **xxx_topScore.json**.

# Generate precision recall

In [9]:
# for each dataset & predictor, get precision recall and save in one file for each trial
methods = ["commonNeighbor", "L3Normalizing", "CRA", "CH2_L3", "Sim", "L3E1_f1", "L3E1_f2"]
ds_names = ['bioGRID_human', 'STRING_human', 'MINT_human', 'HuRI']

# parser results for the first three datasets, in ds_names order
import_funcs = [
    bioGRID.parse_bioGRID(
        filename='./data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
        wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
        wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
        root="../",
    ),
    STRING.parse_STRING(
        ppiFile='./data/STRING/9606.protein.links.v11.0.txt',
        typeFile='./data/STRING/9606.protein.actions.v11.0.txt',
        uniProtMap='./data/UniProt/uniprot-taxonomy_9606_STRING.tab',
        root='../',
        wFile_GGI='./data/parsed/STRING_homo_GGI.pkl',
        wFile_PPI='./data/parsed/STRING_homo_PPI.pkl',
    ),
    MINT.parse_MINT(
        ppiFile='./data/MINT/species human',
        uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab",
        wFile_GGI='./data/parsed/MINT_homo_GGI.pkl',
        wFile_PPI='./data/parsed/MINT_homo_PPI.pkl',
        root="../",
    ),
]

# element [1] of each parser result supplies the nodeA/nodeB pairs
completePPIs_map = [
    [list(ppi) for ppi in np.asarray([*parsed][1][['nodeA', 'nodeB']])]
    for parsed in import_funcs
]
# HuRI's parser result is indexed by column directly
completePPIs_map.append(
    [list(ppi) for ppi in np.asarray(HuRI.parse_HuRI(root="../")[['nodeA', 'nodeB']])]
)
completePPIs = dict(zip(ds_names, completePPIs_map))

for randSz in range(50, 91, 10):
    # the 50% runs use file names without a randSz tag
    tagged = randSz != 50
    for ds_name in ds_names:
        if tagged:
            samplePath = "./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds_name, randSz)
        else:
            samplePath = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
        with open(samplePath, "r") as f:
            samplePPIs = json.load(f)

        for method in methods:
            if tagged:
                reducedPath = "./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(method, ds_name, randSz)
            else:
                reducedPath = "./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name)
            with open(reducedPath, "r") as f:
                fullPPIs = json.load(f)

            # len(fullPPIs) = len(samplePPIs) = 10, because 10 trials
            trialCnt = len(fullPPIs)
            if tagged:
                keys = ["{}_{}_randSz{}_topPPI_{}".format(method, ds_name, randSz, i) for i in range(trialCnt)]
            else:
                keys = ["{}_{}_topPPI_{}".format(method, ds_name, i) for i in range(trialCnt)]
            precRecMap = pred.precRecMap_multiCore(
                keys, fullPPIs, samplePPIs,
                [completePPIs[ds_name] for _ in range(trialCnt)],
                coreNo=10)

            # one output file per returned key
            for key in precRecMap:
                with open("./precision_recall_out/{}.json".format(key), 'w') as f:
                    json.dump(precRecMap[key], f)

In [7]:
# for each dataset & predictor, get precision recall and save in one file for each trial
methods = ["L3E_f1Alt", "L3E_f2Alt"]
ds_names = ['bioGRID_human', 'STRING_human']

# parser results, in ds_names order
import_funcs = [
    bioGRID.parse_bioGRID(
        filename='./data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
        wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
        wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
        root="../",
    ),
    STRING.parse_STRING(
        ppiFile='./data/STRING/9606.protein.links.v11.0.txt',
        typeFile='./data/STRING/9606.protein.actions.v11.0.txt',
        uniProtMap='./data/UniProt/uniprot-taxonomy_9606_STRING.tab',
        root='../',
        wFile_GGI='./data/parsed/STRING_homo_GGI.pkl',
        wFile_PPI='./data/parsed/STRING_homo_PPI.pkl',
    ),
]

# element [1] of each parser result supplies the nodeA/nodeB pairs
completePPIs_map = [
    [list(ppi) for ppi in np.asarray([*parsed][1][['nodeA', 'nodeB']])]
    for parsed in import_funcs
]
completePPIs = dict(zip(ds_names, completePPIs_map))

# range(50, 51, 10) yields only 50, so just the untagged file names are used here
for randSz in range(50, 51, 10):
    tagged = randSz != 50
    for ds_name in ds_names:
        if tagged:
            samplePath = "./sampled_datasets/{}_sampledPPIs_{}Percent.json".format(ds_name, randSz)
        else:
            samplePath = "./sampled_datasets/{}_sampledPPIs.json".format(ds_name)
        with open(samplePath, "r") as f:
            samplePPIs = json.load(f)

        for method in methods:
            if tagged:
                reducedPath = "./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(method, ds_name, randSz)
            else:
                reducedPath = "./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name)
            with open(reducedPath, "r") as f:
                fullPPIs = json.load(f)

            # len(fullPPIs) = len(samplePPIs) = 10, because 10 trials
            trialCnt = len(fullPPIs)
            if tagged:
                keys = ["{}_{}_randSz{}_topPPI_{}".format(method, ds_name, randSz, i) for i in range(trialCnt)]
            else:
                keys = ["{}_{}_topPPI_{}".format(method, ds_name, i) for i in range(trialCnt)]
            precRecMap = pred.precRecMap_multiCore(
                keys, fullPPIs, samplePPIs,
                [completePPIs[ds_name] for _ in range(trialCnt)],
                coreNo=10)

            # one output file per returned key
            for key in precRecMap:
                with open("./precision_recall_out/{}.json".format(key), 'w') as f:
                    json.dump(precRecMap[key], f)

In [None]:
# for each dataset & predictor, get precision recall and save in one file for each trial
methods = ['commonNeighbor', 'L3Normalizing', 'L3E1_f1', 'L3E1_f2']
ds_names = ['IM24272_src', "Lit_BM_src", "Lit_NB_src", "HI_14_src"]

# parser results, in ds_names order
import_funcs = [
    IM24272_src.parse_IM24272_src(root="../"),
    Lit_BM_13_src.parse_Lit_BM_src(root="../"),
    Lit_NB_13_src.parse_Lit_NB_src(root="../"),
    HI_II_14_src.parse_HI_src(root="../"),
]
# nodeA/nodeB pairs of every parsed dataset as plain nested lists
completePPIs_map = [
    [list(arr) for arr in np.asarray(parsed[['nodeA', 'nodeB']])]
    for parsed in import_funcs
]

completePPIs = dict(zip(ds_names, completePPIs_map))

for ds_name in ds_names:
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.load(f)

    for method in methods:
        with open("./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds_name), "r") as f:
            fullPPIs = json.load(f)

        # len(fullPPIs) = len(samplePPIs) = 10, because 10 trials
        trialCnt = len(fullPPIs)
        keys = ["{}_{}_topPPI_{}".format(method, ds_name, i) for i in range(trialCnt)]
        precRecMap = pred.precRecMap_multiCore(
            keys, fullPPIs, samplePPIs,
            [completePPIs[ds_name] for _ in range(trialCnt)],
            coreNo=5)

        # one output file per returned key
        for key in precRecMap:
            with open("./precision_recall_out/{}.json".format(key), 'w') as f:
                json.dump(precRecMap[key], f)

In [6]:
# dynamic PR generate, to 10% recall
# for each dataset & predictor, get precision recall and save in one file for each trial
methods = ['L3E1_f1Alt', 'L3E1_f2Alt']
ds_names = ['IM24272_src', "Lit_BM_src", "Lit_NB_src", "HI_14_src"]

import_funcs = [IM24272_src.parse_IM24272_src(root="../"),
                Lit_BM_13_src.parse_Lit_BM_src(root="../"),
                Lit_NB_13_src.parse_Lit_NB_src(root="../"),
                HI_II_14_src.parse_HI_src(root="../")]
# keyed by dataset name so the loop below can look the parsed frame up directly
import_funcs = dict(zip(ds_names, import_funcs))

for ds_name in ds_names:
    samplePPIs = []
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds_name), "r") as f:
        samplePPIs = json.loads(f.read())
    completePPIs = [list(arr) for arr in np.asarray(import_funcs[ds_name][['nodeA', 'nodeB']])]
    # Path-string form of the complete dataset. It only depends on ds_name, so it
    # is built once here instead of once per (method, trial).
    completePPIStrs = set(ns.arr_pStr(completePPIs))
    for method in methods:
        topPPIs = []
        for trial in range(10):
            with open("./linkPred_out/{}_{}_sample_{}_PPI.json".format(method, ds_name, trial), "r") as f:
                topPPIs = json.loads(f.read())
            prCurve = {"prec": [], "rec": []}
            tpPPICnt = 0
            # targets: complete-dataset edges minus the trial's sampled edges,
            # the latter expanded by ns.toDualBR before removal
            PPIToHit = completePPIStrs-set(ns.arr_pStr(ns.toDualBR(samplePPIs[trial])))
            # NOTE: an empty target set makes the recall division below raise
            # ZeroDivisionError as soon as one prediction is processed
            truePPICnt = len(PPIToHit)
            for i in range(len(topPPIs)):
                # a prediction counts as a hit in either orientation
                if ns.br_str(topPPIs[i]) in PPIToHit or ns.br_str(topPPIs[i][::-1]) in PPIToHit:
                    tpPPICnt += 1
                recall = tpPPICnt/truePPICnt
                prCurve['prec'].append(tpPPICnt/(i+1))
                prCurve['rec'].append(recall)
                # stop once recall reaches 10%
                if recall >= 0.1: break
            with open("./precision_recall_out/{}_{}_topPPI_{}.json".format(
                method, ds_name, trial), 'w') as f:
                f.write(json.dumps(prCurve))