In [1]:
# Notebook setup: extend the import path, silence warnings, then import the
# standard / third-party modules and the project-local helpers.
import sys
# Add the parent directory to the module search path (presumably so the
# project-local modules imported at the end of this cell resolve — confirm).
sys.path.append('..')
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")


# standard
import json, pickle
from collections import defaultdict
import pandas as pd
import numpy as np
# NOTE(review): the recorded run of this cell stopped on the next line with
# "ModuleNotFoundError: No module named 'seaborn'", so none of the imports
# after it were executed in that run.
import seaborn as sns
from statistics import median
from matplotlib import pyplot as plt
from sklearn import metrics
import matplotlib.ticker as ticker
from itertools import combinations
from scipy import stats
import math
from matplotlib.ticker import FormatStrFormatter

# my lib
import PPILinkPred as pred
import genData_helper as helper
import traversalHelper as tr

ModuleNotFoundError: No module named 'seaborn'

In [2]:
# Colour assigned to each predictor, keyed by the predictor's short name.
colors = dict(
    L3E1_f1="tab:blue",
    L3E1_f2="tab:olive",
    L3E4_f1="tab:cyan",
    L3E4_f2="tab:pink",
    L3="tab:orange",
    CN="tab:green",
    CRA="tab:red",
    CH2="tab:brown",
    Sim="tab:purple",
    rand="tab:grey",
)

# Parallel lists, one entry per predictor and all in the same order:
#   methods     - predictor name as it appears in the output file names
#   methods_map - short name used as dictionary key
#   abbrev_map  - two-line abbreviation (L3E variants break before the f_i part)
#   label_map   - single-line label
methods = [
    "commonNeighbor", "L3Normalizing", "CRA", "Sim", "CH2_L3", "random",
    "L3E1_f1", "L3E1_f2",
]
methods_map = ["CN", "L3", "CRA", "Sim", "CH2", "rand", "L3E1_f1", "L3E1_f2"]
# The six baselines keep their short name; only the two L3E variants get a
# formatted label.
abbrev_map = methods_map[:6] + ["L3E\n($f_{%d}$)" % k for k in (1, 2)]
label_map = methods_map[:6] + ["L3E($f_{%d}$)" % k for k in (1, 2)]

# Lookup tables derived from the parallel lists above.
methods_names = {full: short for full, short in zip(methods, methods_map)}
abbrevs = {short: abbrev_map[pos] for pos, short in enumerate(methods_map)}
labels = {short: label_map[pos] for pos, short in enumerate(methods_map)}

In [3]:
# Load the top-ranked PPIs written by every predictor, for every dataset and
# every sample size, pooling all trials of a run into a single set.
#
# Result layout:
#   parseTopPPIs[sample size][dataset][short method name] -> set of pair keys,
#   where a pair key is the two sorted members of a PPI joined by a tab.
methods = ["commonNeighbor", "L3Normalizing", "CRA", "Sim", "CH2_L3", "L3E1_f1", "L3E1_f2", "random"]
ds_names = ['bioGRID_human', 'STRING_human', 'MINT_human', 'HuRI']
parseTopPPIs = {}

for randSz in range(50, 100, 10):
    per_dataset = parseTopPPIs[randSz] = {}
    for ds in ds_names:
        per_method = per_dataset[ds] = {}
        for method in methods:
            pooled = per_method[methods_names[method]] = set()
            # Files for the 50% sample size carry no sample-size tag in their name.
            filename = (
                "./linkPred_out_reduced/{}_{}_topPPI.json".format(method, ds)
                if randSz == 50
                else "./linkPred_out_reduced/{}_{}_randSz{}_topPPI.json".format(method, ds, randSz)
            )
            with open(filename, "r") as fh:
                trials = json.load(fh)
                for trial_ppis in trials:
                    # Sorting the two members makes the key independent of pair order.
                    pooled.update("\t".join(sorted(ppi)) for ppi in trial_ppis)

In [4]:
# Class-based overlap of the top-PPI sets.
#
# Every unordered pair of predictors (A listed before B in `myMethods`)
# contributes |A & B| / |A| to the bucket named after the principles of the
# two predictors ("CN_CN", "CN_L3", "L3_L3", ...). `principles` is aligned
# index-by-index with `myMethods`.
myMethods = ["CN", "CRA", "L3", "CH2", "Sim", "L3E1_f1", "L3E1_f2", "rand"]
principles = ["CN", "CN", "L3", "L3", "L3", "L3", "L3", "rand"]
ds_names = ['bioGRID_human', 'STRING_human', 'MINT_human', 'HuRI']
overlaps_mats_grouped = {}

for randSz in range(50, 100, 10):
    overlaps_mats_grouped[randSz] = {}
    for ds in ds_names:
        top = parseTopPPIs[randSz][ds]
        grouped = overlaps_mats_grouped[randSz][ds] = defaultdict(list)
        for first, methodA in enumerate(myMethods):
            # Only look at predictors listed after A so each pair is visited once.
            for second in range(first + 1, len(myMethods)):
                methodB = myMethods[second]
                grouped[principles[first] + "_" + principles[second]].append(
                    np.around(len(top[methodA] & top[methodB]) / len(top[methodA]), 4))

        # Extra single-value bucket: CRA against L3E1_f1 only, relative to CRA.
        grouped["CRA_L3E"].append(
            np.around(len(top["CRA"] & top["L3E1_f1"]) / len(top["CRA"]), 4))

# Collapse every bucket to its mean and standard deviation (two decimals).
overlaps_mats_mean, overlaps_mats_std = {}, {}
for randSz in range(50, 100, 10):
    overlaps_mats_mean[randSz] = {}
    overlaps_mats_std[randSz] = {}
    for ds in ds_names:
        buckets = overlaps_mats_grouped[randSz][ds]
        overlaps_mats_mean[randSz][ds] = {
            pair: np.around(np.mean(values), 2) for pair, values in buckets.items()
        }
        overlaps_mats_std[randSz][ds] = {
            pair: np.around(np.std(values), 2) for pair, values in buckets.items()
        }

print(overlaps_mats_mean)
print(overlaps_mats_std)

{50: {'bioGRID_human': {'CN_CN': 0.64, 'CN_L3': 0.38, 'CN_rand': 0.01, 'L3_L3': 0.69, 'L3_rand': 0.01, 'CRA_L3E': 0.37}, 'STRING_human': {'CN_CN': 0.54, 'CN_L3': 0.44, 'CN_rand': 0.01, 'L3_L3': 0.58, 'L3_rand': 0.01, 'CRA_L3E': 0.44}, 'MINT_human': {'CN_CN': 0.37, 'CN_L3': 0.04, 'CN_rand': 0.01, 'L3_L3': 0.71, 'L3_rand': 0.01, 'CRA_L3E': 0.05}, 'HuRI': {'CN_CN': 0.64, 'CN_L3': 0.24, 'CN_rand': 0.01, 'L3_L3': 0.79, 'L3_rand': 0.01, 'CRA_L3E': 0.23}}, 60: {'bioGRID_human': {'CN_CN': 0.67, 'CN_L3': 0.43, 'CN_rand': 0.01, 'L3_L3': 0.7, 'L3_rand': 0.01, 'CRA_L3E': 0.44}, 'STRING_human': {'CN_CN': 0.54, 'CN_L3': 0.46, 'CN_rand': 0.01, 'L3_L3': 0.6, 'L3_rand': 0.01, 'CRA_L3E': 0.55}, 'MINT_human': {'CN_CN': 0.49, 'CN_L3': 0.05, 'CN_rand': 0.0, 'L3_L3': 0.71, 'L3_rand': 0.0, 'CRA_L3E': 0.06}, 'HuRI': {'CN_CN': 0.7, 'CN_L3': 0.26, 'CN_rand': 0.01, 'L3_L3': 0.79, 'L3_rand': 0.01, 'CRA_L3E': 0.26}}, 70: {'bioGRID_human': {'CN_CN': 0.67, 'CN_L3': 0.44, 'CN_rand': 0.0, 'L3_L3': 0.62, 'L3_rand': 0.0

In [7]:
# Print the LaTeX rows of the overlap table for the 50% sample size.
# Rows: one per dataset. Columns: one per principle pair, read from
# overlaps_mats_mean / overlaps_mats_std.
principlePairs = ['CN_CN', 'L3_L3', 'CN_L3', 'CRA_L3E']
# NOTE(review): this rebinds `colors`, replacing the short-name -> colour dict
# assigned under the same name earlier in the notebook with a plain list.
colors = ["blue", "blue", "red", "red"]
colorsMap = dict(zip(principlePairs, colors))


def _as_percent(fraction):
    """Turn a fraction (already rounded to two decimals) into an integer percentage.

    The product is rounded, not truncated: 0.29 * 100 evaluates to
    28.999999999999996, which a bare int() would report as 28.
    """
    return int(round(float(fraction) * 100))


overlapMeanTB_str = []
randSz = 50  # only the 50% sample size is tabulated in this cell
for ds in ['bioGRID_human', 'STRING_human', 'MINT_human', 'HuRI']:
    # Escaped backslash: "\c" is not a valid escape sequence in a normal string.
    buildStr = "\\cellcolor{gray!15} "+ds
    for pair in principlePairs:
        if 'rand' in pair: continue
        cell = str(_as_percent(overlaps_mats_mean[randSz][ds][pair]))
        if 'CN_CN' == pair or "CRA" in pair:
            # These columns are printed as a bare percentage, without a spread.
            cell += "\\%"
        else:
            cell += " $\\pm$ "+str(_as_percent(overlaps_mats_std[randSz][ds][pair]))+" \\%"
        buildStr += " & \\textcolor{"+colorsMap[pair]+"}{"+cell+"}"
    print(buildStr+" \\\\ \\hline")

\cellcolor{gray!15} bioGRID_human & \textcolor{blue}{64\%} & \textcolor{blue}{69 $\pm$ 4 \%} & \textcolor{red}{38 $\pm$ 4 \%} & \textcolor{red}{37\%} \\ \hline
\cellcolor{gray!15} STRING_human & \textcolor{blue}{54\%} & \textcolor{blue}{58 $\pm$ 4 \%} & \textcolor{red}{44 $\pm$ 3 \%} & \textcolor{red}{44\%} \\ \hline
\cellcolor{gray!15} MINT_human & \textcolor{blue}{37\%} & \textcolor{blue}{71 $\pm$ 10 \%} & \textcolor{red}{4 $\pm$ 2 \%} & \textcolor{red}{5\%} \\ \hline
\cellcolor{gray!15} HuRI & \textcolor{blue}{64\%} & \textcolor{blue}{79 $\pm$ 7 \%} & \textcolor{red}{24 $\pm$ 3 \%} & \textcolor{red}{23\%} \\ \hline


In [8]:
# Print the LaTeX rows of the overlap table for every sample size.
# One header row per dataset, then one row per sample size (50% .. 90%);
# columns are the principle pairs, read from overlaps_mats_mean / overlaps_mats_std.
principlePairs = ['CN_CN', 'L3_L3', 'CN_L3', 'CRA_L3E']
# NOTE(review): this rebinds `colors`, replacing the short-name -> colour dict
# assigned under the same name earlier in the notebook with a plain list.
colors = ["blue", "blue", "red", "red"]
colorsMap = dict(zip(principlePairs, colors))

overlapMeanTB_str = []
for ds in ['bioGRID_human', 'STRING_human', 'MINT_human', 'HuRI']:
    # Header row shows the dataset name only. The previous " Yeast" suffix
    # contradicted the "_human" dataset names it was appended to.
    print("\\multicolumn{5}{|l|}{\\textbf{"+ds+"}} \\\\ \\hline")
    for randSz in range(50, 100, 10):
        buildStr = "\\cellcolor{gray!15} "+str(randSz)+"\\% sample size"
        for pair in principlePairs:
            if 'rand' in pair: continue
            # Round before int() so that products such as 28.999999999999996 print as 29.
            mean_pct = str(int(np.around(overlaps_mats_mean[randSz][ds][pair]*100, 2)))
            if 'CN_CN' == pair or "CRA" in pair:
                # These columns are printed as a bare percentage, without a spread.
                buildStr += " & \\textcolor{"+colorsMap[pair]+"}{"+mean_pct+"\\%}"
            else:
                std_pct = str(int(np.around(overlaps_mats_std[randSz][ds][pair]*100, 2)))
                buildStr += " & \\textcolor{"+colorsMap[pair]+"}{"+mean_pct+" $\\pm$ "+std_pct+" \\%}"
        print(buildStr+" \\\\ \\hline")

\multicolumn{5}{|l|}{\textbf{bioGRID_human Yeast}} \\ \hline
\cellcolor{gray!15} 50\% sample size & \textcolor{blue}{64\%} & \textcolor{blue}{69 $\pm$ 4 \%} & \textcolor{red}{38 $\pm$ 4 \%} & \textcolor{red}{37\%} \\ \hline
\cellcolor{gray!15} 60\% sample size & \textcolor{blue}{67\%} & \textcolor{blue}{70 $\pm$ 4 \%} & \textcolor{red}{43 $\pm$ 3 \%} & \textcolor{red}{44\%} \\ \hline
\cellcolor{gray!15} 70\% sample size & \textcolor{blue}{67\%} & \textcolor{blue}{62 $\pm$ 11 \%} & \textcolor{red}{44 $\pm$ 6 \%} & \textcolor{red}{48\%} \\ \hline
\cellcolor{gray!15} 80\% sample size & \textcolor{blue}{64\%} & \textcolor{blue}{61 $\pm$ 6 \%} & \textcolor{red}{44 $\pm$ 4 \%} & \textcolor{red}{50\%} \\ \hline
\cellcolor{gray!15} 90\% sample size & \textcolor{blue}{64\%} & \textcolor{blue}{57 $\pm$ 7 \%} & \textcolor{red}{43 $\pm$ 7 \%} & \textcolor{red}{52\%} \\ \hline
\multicolumn{5}{|l|}{\textbf{STRING_human Yeast}} \\ \hline
\cellcolor{gray!15} 50\% sample size & \textcolor{blue}{54\%} &