In [1]:
import collections
import copy
import os
import pickle
import warnings
from typing import Dict, Tuple, Sequence

import dgl
import dgl.function as fn
import dgl.nn as dglnn
import numpy as np
import pandas as pd
import torch
import torch as th
from dgl import save_graphs, load_graphs
from dgl.heterograph import DGLHeteroGraph
from pandas.core.frame import DataFrame, Series
from tqdm import tqdm

# Output control: echo the value of every expression statement in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Use fully-qualified option names. The short forms "max_columns" / "max_row"
# are treated as patterns; in newer pandas they match more than one option
# (display.* and styler.render.*) and set_option raises OptionError.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 300)
pd.set_option("display.float_format", lambda x: '%.5f' % x)

Using backend: pytorch


In [2]:
# Directory the input tables and the pickled term file are read from.
BASE_DIR = "../0-data/2-build_graphs/"
# Directory the pickled nets and graphs are written to.
OUT_DIR = "../0-data/3-models/"
# Pickle holding the (terms, namespace) pair unpacked by load_files.
TERM_FILE = "terms.pkl"

# Per-species input files, keyed by the species name used throughout the notebook.
#   "id"          -> prefix of the output file names (presumably the NCBI taxonomy id -- TODO confirm)
#   "ppsn-min"    -> edge table with protein1 / protein2 / score columns
#   "uniprot-min" -> protein table with id / uniprot_id / sequence / labels columns
INPUT_DATASET = {
    "human": {
        "id": "9606",
        "ppsn-min": "9606-ppsn-min.csv",
        "uniprot-min": "9606-uniprot-min.csv"
    },
    "mouse": {
        "id": "10090",
        "ppsn-min": "10090-ppsn-min.csv",
        "uniprot-min": "10090-uniprot-min.csv"
    }
}

# GO ids of the three sub-ontology roots; build_single_net uses them to
# recognise terms that are expected to have no parents.
sub_ontologies = {
    "bp": "GO:0008150",
    "cc": "GO:0005575",
    "mf": "GO:0003674"
}

# 1. Function definitions for constructing the networks

In [3]:
def load_files(species: str) -> Tuple[DataFrame, DataFrame, dict, dict]:
    """Load the input tables and the GO term data for one species.

    Args:
        species: key of ``INPUT_DATASET`` (e.g. ``"human"``).

    Returns:
        ``(df_ppsn, df_uniprot, terms, namespace)``: the two tab-separated tables
        configured for ``species`` under ``BASE_DIR``, followed by the two objects
        stored in the pickled ``TERM_FILE``.

    Raises:
        KeyError: if ``species`` is not configured in ``INPUT_DATASET``.
    """
    dataset = INPUT_DATASET[species]

    # file paths
    ppsn_path = os.path.join(BASE_DIR, dataset['ppsn-min'])
    uniprot_path = os.path.join(BASE_DIR, dataset['uniprot-min'])
    term_path = os.path.join(BASE_DIR, TERM_FILE)

    # The ".csv" inputs are tab-separated.
    df_ppsn = pd.read_csv(ppsn_path, sep='\t')
    df_uniprot = pd.read_csv(uniprot_path, sep='\t')

    # NOTE: pickle.load executes arbitrary code from the file; only use it on
    # term files produced by this project.
    with open(term_path, "rb") as f:
        terms, namespace = pickle.load(f)

    return df_ppsn, df_uniprot, terms, namespace


def build_ppsn_net(df_uniprot: DataFrame, df_ppsn: DataFrame) -> dict:
    """Pack the protein table and the edge table into a dict of numpy arrays.

    Args:
        df_uniprot: protein table; the ``uniprot_id`` and ``sequence`` columns are read.
        df_ppsn: edge table; the ``protein1``, ``protein2`` and ``score`` columns are read.

    Returns:
        Dict with the keys ``id2node``, ``edge_src``, ``edge_dst``, ``edge_score``
        and ``node_seq`` (in that order), each holding the matching column as an array.
    """
    # Key order is kept stable because build_networks prints the entries in dict order.
    net = {
        'id2node': df_uniprot['uniprot_id'].to_numpy(),
        'edge_src': df_ppsn['protein1'].to_numpy(),
        'edge_dst': df_ppsn['protein2'].to_numpy(),
        'edge_score': df_ppsn['score'].to_numpy(),
        'node_seq': df_uniprot['sequence'].to_numpy(),
    }

    # data validation: edge endpoints and node attributes must pair up
    n_src, n_dst = len(net['edge_src']), len(net['edge_dst'])
    assert n_src == n_dst
    n_nodes, n_seqs = len(net['id2node']), len(net['node_seq'])
    assert n_nodes == n_seqs

    return net


def propagate(annotation: set, terms: dict) -> set:
    """Close an annotation set under the True Path Rule.

    Every term's parents (as listed in ``terms``) are added repeatedly until the
    set stops growing. The set is modified in place and also returned.

    Args:
        annotation: annotation set of one protein.
        terms: adjacency list of the gene ontology DAG, for example
            key -> 'GO:0000001'
            value -> ['GO:0048308', 'GO:0048311']
            Terms that are missing or mapped to ``None`` contribute no parents.

    Returns:
        The same ``annotation`` object, now including all ancestors.
    """
    size_before = -1
    while size_before != len(annotation):
        size_before = len(annotation)
        # Collect first, update afterwards: the set must not change while iterated.
        parents = [
            parent
            for term in annotation
            if terms.get(term) is not None
            for parent in terms[term]
        ]
        annotation.update(parents)
    # No new term was added in the last round.
    return annotation


def resolve_annotations(df_uniprot: DataFrame, terms: dict) -> dict:
    """Parse each protein's label string and propagate it up the ontology.

    Args:
        df_uniprot: protein table; the ``id`` and ``labels`` columns are read.
            ``labels`` is a ';'-separated string of terms. Float entries are
            left untouched (presumably NaN for unannotated proteins -- TODO confirm).
        terms: term -> parent list mapping handed to ``propagate``.

    Returns:
        Dict mapping protein id to its propagated term set, or to the original
        float value when the protein has no labels.
    """
    annotations = dict(zip(df_uniprot['id'], df_uniprot['labels']))
    labelled = [pid for pid, raw in annotations.items() if not isinstance(raw, float)]

    # Pass 1: split the raw label strings into stripped term sets.
    total = 0
    for pid in labelled:
        annotations[pid] = {token.strip() for token in annotations[pid].split(';')}
        total += len(annotations[pid])
    print("\nbefore propagate: ", total)

    # Pass 2: add all ancestors of every term.
    total = 0
    for pid in labelled:
        annotations[pid] = propagate(annotations[pid], terms)
        total += len(annotations[pid])
    print("after propagate: ", total)

    return annotations


def split_branches(annotations: dict, namespace: dict, terms: dict) -> dict:
    """Split propagated annotations into the bp / cc / mf sub-ontologies.

    Prints summary statistics along the way.

    Args:
        annotations: protein id -> iterable of terms; float values (proteins
            without labels) are skipped. The input is only read, never modified.
        namespace: term -> 'biological_process' | 'molecular_function' |
            'cellular_component'.
        terms: mapping whose keys are the known terms; labels not in it are dropped.

    Returns:
        Dict with the keys ``"full"``, ``"bp"``, ``"cc"``, ``"mf"`` (in that order).
        Each value is ``{"labels": protein id -> terms, "counter": Counter of
        term frequencies}``. ``"full"`` labels are sets; the per-branch labels
        are ``defaultdict(list)`` containing only proteins with a term in that branch.

    Raises:
        KeyError: if a term has no entry in ``namespace``.
        AssertionError: if a term belongs to none of the three namespaces.
    """
    print("-"*30, "Total proteins/terms", "-"*30)

    # Assign every known term to its sub-ontology.
    namespace_to_branch = {
        'biological_process': "bp",
        'molecular_function': "mf",
        'cellular_component': "cc",
    }
    term_branch = {}
    for term in terms:
        branch_name = namespace_to_branch.get(namespace[term])
        if branch_name is not None:
            term_branch[term] = branch_name
    branch_sizes = collections.Counter(term_branch.values())

    print("Total proteins:\t", len(annotations))
    print("Total terms:\t", len(terms))
    print("Total terms in bp branch:\t", branch_sizes["bp"])
    print("Total terms in mf branch:\t", branch_sizes["mf"])
    print("Total terms in cc branch:\t", branch_sizes["cc"])
    assert len(term_branch) == len(terms), "every term must belong to bp, mf or cc"

    # Keep only labels that are known terms. Proteins without any label are
    # discarded. New sets are built, so no copy of the input is needed.
    labels_with_go = {}
    for protein_id, assigned in annotations.items():
        if isinstance(assigned, float):
            continue
        labels_with_go[protein_id] = {term for term in assigned if term in terms}

    # Split by sub-ontology; a protein only gets an entry in a branch it has terms in.
    branch_labels = {name: collections.defaultdict(list) for name in ("bp", "cc", "mf")}
    for protein_id, assigned in labels_with_go.items():
        for term in assigned:
            branch_name = term_branch.get(term)
            if branch_name is not None:
                branch_labels[branch_name][protein_id].append(term)

    print("-"*30, "proteins/terms used", "-"*30)

    full_counter, full_edges = _summarize_labels(labels_with_go)
    print("full ontology: ",
          "\n\tTotal proteins: ", len(labels_with_go),
          "\n\tTotal terms: ", len(full_counter),
          "\n\tTotal edges: ", full_edges)

    term_branches = {
        "full": {
            "labels": labels_with_go,
            "counter": full_counter
        }
    }
    for name in ("bp", "cc", "mf"):
        counter, edges = _summarize_labels(branch_labels[name])
        print(name + ": \n",
              "\n\tproteins: ", len(branch_labels[name]),
              "\n\tterms: ", len(counter),
              "\n\tedges: ", edges)
        term_branches[name] = {
            "labels": branch_labels[name],
            "counter": counter
        }

    return term_branches


def _summarize_labels(labels: dict) -> tuple:
    """Return (Counter of term frequencies, total number of protein-term pairs)."""
    counter = collections.Counter()
    edge_count = 0
    for assigned in labels.values():
        counter.update(assigned)
        edge_count += len(assigned)
    return counter, edge_count


def build_single_net(id2term: list, annotations: dict, son_of: dict) -> dict:
    """Build the term network and the annotation network for one term set.

    Args:
        id2term: term list; the index is the term id and the value is the term's
            gene ontology ID.
        annotations: {'protein_id': ['term_GO_ID1', 'term_GO_ID2'...]}; float
            values (proteins without labels) are skipped.
        son_of: {'son_term_GO_ID': ['father_GO_ID1', 'father_GO_ID2']}; ``None``
            marks a term without parents.

    Returns:
        ``{'term_net': {'id2node', 'edge_src', 'edge_dst'},
           'annotation_net': {'edge_src', 'edge_dst'}}``.
        Term-net edges run son -> father. Annotation edges run protein id -> term id.
        Only terms in ``id2term`` appear. An empty ``id2term``, or a term set with
        no edges between its members, yields empty edge lists rather than an error.

    Raises:
        KeyError: if a term of ``id2term`` has no entry in ``son_of``.
    """
    term2id = {term: idx for idx, term in enumerate(id2term)}
    for term, idx in term2id.items():
        assert term == id2term[idx]
    n_terms = len(term2id)

    # build annotation net
    protein2term_src = []
    protein2term_dst = []
    for protein_id, annotation in annotations.items():
        if isinstance(annotation, float):
            continue
        for term in annotation:
            if term in term2id:
                protein2term_src.append(protein_id)
                protein2term_dst.append(term2id[term])

    assert len(protein2term_src) == len(protein2term_dst)
    # max() raises on an empty list, so only check the bound when there are edges.
    assert not protein2term_dst or max(protein2term_dst) < n_terms

    # build term net
    son_of_src = []
    son_of_dst = []
    for son, son_id in term2id.items():
        fathers = son_of[son]
        if fathers is None:
            # Only the three sub-ontology roots are expected to have no parents;
            # report any other parentless term.
            if son not in sub_ontologies.values():
                print(son)
            continue
        for father in fathers:
            if father in term2id:
                son_of_src.append(son_id)
                son_of_dst.append(term2id[father])

    assert not son_of_src or max(son_of_src) < n_terms
    assert not son_of_dst or max(son_of_dst) < n_terms

    # Report terms that are isolated in the term net (neither son nor father here).
    occurence = set(son_of_src).union(son_of_dst)
    for term_id in range(n_terms):
        if term_id not in occurence:
            print("Outlier error: ", id2term[term_id])

    net = {
        'term_net': {
            "id2node":  id2term,
            "edge_src": son_of_src,
            "edge_dst": son_of_dst
        },
        'annotation_net': {
            "edge_src": protein2term_src,
            "edge_dst": protein2term_dst
        }
    }

    return net



def build_term_and_annotation_nets(branch: dict, annotations: dict, son_of: dict) -> dict:
    """Build one net per cumulative 5% slice of the most frequent terms.

    Terms are ranked by descending frequency. Slice ``k`` (1..20) holds the first
    ``k * int(n * 0.05)`` terms, and the last slice holds all of them.

    Args:
        branch: term -> frequency mapping (a branch's ``"counter"``).
        annotations: protein id -> terms, passed on to ``build_single_net``.
        son_of: term -> parent list, passed on to ``build_single_net``.

    Returns:
        Dict keyed by ``str(0.05 * k)[:4]`` with the ``build_single_net`` result
        for each slice. Because of float rounding the keys are irregular
        ("0.05", "0.1", ..., "0.30", ..., "0.60", ..., "1.0"); they are kept
        as they are.
    """
    ranked_terms = [
        term
        for term, _ in sorted(branch.items(), key=lambda item: item[1], reverse=True)
    ]

    total = len(ranked_terms)
    # With fewer than 20 terms the step is 0 and every slice but the last is empty.
    step = int(total*0.05)

    terms_by_fraction = {}
    for k in tqdm(range(1, 21)):
        cutoff = total if k == 20 else step * k
        terms_by_fraction[str(0.05*k)[:4]] = ranked_terms[0: cutoff]

    return {
        fraction: build_single_net(subset, annotations, son_of)
        for fraction, subset in tqdm(terms_by_fraction.items())
    }

def build_default_term_net(branches: dict, annotations: dict, son_of: dict,
                           min_count: int = 100, bp_min_count: int = 300) -> dict:
    """Build the "default" nets from terms that reach a minimum frequency.

    Args:
        branches: result of ``split_branches``; each non-"full" entry's
            ``"counter"`` (term -> frequency) is read.
        annotations: protein id -> terms, passed on to ``build_single_net``.
        son_of: term -> parent list, passed on to ``build_single_net``.
        min_count: minimum frequency for terms of every branch except "bp".
        bp_min_count: minimum frequency for terms of the "bp" branch.

    Returns:
        Dict with one ``build_single_net`` result per non-"full" branch, plus a
        ``"full"`` entry built from the union of the selected terms.
    """
    selected = {}
    selected_full = set()
    selected_total = 0

    for branch_name, branch in branches.items():
        if branch_name == "full":
            continue
        threshold = bp_min_count if branch_name == "bp" else min_count
        selected[branch_name] = {
            term for term, freq in branch["counter"].items() if freq >= threshold
        }
        selected_full |= selected[branch_name]
        selected_total += len(selected[branch_name])

    # The branches must be disjoint, otherwise the union is smaller than the sum.
    assert len(selected_full) == selected_total
    selected["full"] = selected_full

    # Sort the terms: set iteration order for strings differs between interpreter
    # runs (hash randomization), which would make the term ids irreproducible.
    return {
        branch_name: build_single_net(sorted(term_set), annotations, son_of)
        for branch_name, term_set in selected.items()
    }

# 2. Function definitions for building the graphs

In [4]:
# graph building function
def build_network_from_path(ppi_path, term_path, graph_path='./temp_graph.bin', save=False,
                            device="cuda:1"):
    """Deprecated: build a heterogeneous graph from two pickled sub-nets.

    Use ``build_networks`` instead. This helper reads keys
    ('node_vertex_embedding', 'protein2term_src', 'protein2term_dst', 'id2term')
    that the net builders in this notebook do not produce, so it needs the older
    pickle layout.

    Args:
        ppi_path: pickle with the protein net (edge_src, edge_dst, edge_score,
            id2node, node_vertex_embedding).
        term_path: pickle with the term net (edge_src, edge_dst,
            protein2term_src, protein2term_dst, id2term).
        graph_path: where the graph is written when ``save`` is true.
        save: serialize the graph with ``save_graphs`` before moving it to ``device``.
        device: device the returned graph is moved to. The default keeps the
            previously hard-coded "cuda:1".

    Returns:
        ``(hetero_graph, ppi_net, term_net)``. ``ppi_net['edge_score']`` is
        replaced by the scores duplicated for both edge directions.
    """
    warnings.warn("build_network_from_path is deprecated, use build_network instead", DeprecationWarning)
    print("build network from sub_net...")
    # NOTE: pickle.load executes arbitrary code from the file; only load trusted pickles.
    with open(ppi_path, "rb") as f:
        ppi_net = pickle.load(f)

    with open(term_path, "rb") as f:
        term_net = pickle.load(f)

    # undirected: add every interaction in both directions
    interaction_src = np.concatenate([ppi_net['edge_src'], ppi_net['edge_dst']])
    interaction_dst = np.concatenate([ppi_net['edge_dst'], ppi_net['edge_src']])
    # duplicate the weights so they line up with the doubled edge list
    ppi_net['edge_score'] = np.concatenate([ppi_net['edge_score'], ppi_net['edge_score']])

    # directed
    is_a_src = term_net['edge_src']
    is_a_dst = term_net['edge_dst']

    # protein <-> term association, stored as two opposite edge types
    annotated_by_src = term_net['protein2term_src']
    annotated_by_dst = term_net['protein2term_dst']
    annotate_src = term_net['protein2term_dst']
    annotate_dst = term_net['protein2term_src']

    hetero_graph = dgl.heterograph({
        ('protein', 'interaction', 'protein'): (interaction_src, interaction_dst),
        ('term', 'is_a', 'term'): (is_a_src, is_a_dst),
        ('protein', 'annotated_by', 'term'): (annotated_by_src, annotated_by_dst),
        ('term', 'annotate', 'protein'): (annotate_src, annotate_dst)})

    # node / edge attributes
    hetero_graph.nodes['protein'].data['node_vertex_embedding'] = torch.from_numpy(ppi_net['node_vertex_embedding'])
    # term features are random placeholders
    hetero_graph.nodes['term'].data['feature'] = torch.randn(len(term_net['id2term']), 128)
    hetero_graph.edges['interaction'].data['weight'] = torch.from_numpy(ppi_net['edge_score'])
    # randomly generated training masks: protein nodes (node classification) and
    # annotated_by edges (link prediction)
    hetero_graph.nodes['protein'].data['train_mask'] = torch.zeros(len(ppi_net['id2node']), dtype=torch.bool).bernoulli(0.7)
    hetero_graph.edges['annotated_by'].data['train_mask'] = torch.zeros(len(term_net['protein2term_dst']), dtype=torch.bool).bernoulli(0.7)

    # serialize so the preprocessing does not have to be repeated
    if save:
        save_graphs(graph_path, hetero_graph)

    hetero_graph = hetero_graph.to(device)
    return hetero_graph, ppi_net, term_net
    
    
def load_network_from_path(graph_path, device="cuda:1"):
    """Deprecated: load the first graph stored in ``graph_path``.

    Args:
        graph_path: file previously written with ``save_graphs``.
        device: device the graph is moved to. The default keeps the previously
            hard-coded "cuda:1".

    Returns:
        The first graph of the file, moved to ``device``.
    """
    warnings.warn("load_network_from_path is deprecated, use build_network instead", DeprecationWarning)
    print("load network from {} file...".format(graph_path))
    # load_graphs returns (graph list, label dict); only the graphs are needed.
    graph_list, _ = load_graphs(graph_path)
    return graph_list[0].to(device)

In [5]:
def build_single_network(ppsn_net: dict, term_net: dict, annotation_net: dict, branch_name: str, namespace: dict) -> DGLHeteroGraph:
    """Assemble one DGL heterograph from a protein net, a term net and their annotations.

    Args:
        ppsn_net: protein net; ``id2node``, ``edge_src`` and ``edge_dst`` are read.
        term_net: term net; ``id2node``, ``edge_src`` and ``edge_dst`` are read.
        annotation_net: protein -> term edges (``edge_src``, ``edge_dst``).
        branch_name: branch the term net belongs to. Only "full" graphs get the
            per-sub-ontology term masks.
        namespace: term -> 'biological_process' | 'molecular_function' |
            'cellular_component'.

    Returns:
        Graph with node types 'protein' and 'term' and edge types 'similar_with',
        'son_of', 'annotated_by' and 'annotate' (the reverse of 'annotated_by').
        For ``branch_name == "full"`` the term nodes carry 0/1 ``bp_mask``,
        ``mf_mask`` and ``cc_mask`` data.
    """
    # protein-protein edges are used as stored (not mirrored here)
    interaction_src = ppsn_net['edge_src']
    interaction_dst = ppsn_net['edge_dst']

    # directed: son -> father
    son_of_src = term_net['edge_src']
    son_of_dst = term_net['edge_dst']

    # protein <-> term association, stored as two opposite edge types
    annotated_by_src = annotation_net['edge_src']
    annotated_by_dst = annotation_net['edge_dst']
    annotate_src = annotation_net['edge_dst']
    annotate_dst = annotation_net['edge_src']

    # Pass the node counts explicitly. Otherwise DGL infers them from the largest
    # id that appears in an edge, and trailing nodes without edges would be
    # dropped, leaving the masks below with the wrong length.
    # NOTE(review): assumes protein ids in the edges index into ppsn_net['id2node'] -- confirm.
    num_nodes = {
        'protein': len(ppsn_net['id2node']),
        'term': len(term_net['id2node'])
    }

    hetero_graph = dgl.heterograph({
        ('protein', 'similar_with', 'protein'): (interaction_src, interaction_dst),
        ('term', 'son_of', 'term'): (son_of_src, son_of_dst),
        ('protein', 'annotated_by', 'term'): (annotated_by_src, annotated_by_dst),
        ('term', 'annotate', 'protein'): (annotate_src, annotate_dst)},
        num_nodes_dict=num_nodes, idtype=th.int32)

    # handle properties
    if branch_name == "full":
        id2term = term_net['id2node']
        branch_map = {
            'biological_process': [0] * len(id2term),
            'molecular_function': [0] * len(id2term),
            'cellular_component': [0] * len(id2term)
        }

        for term_id, term in enumerate(id2term):
            branch_map[namespace[term]][term_id] = 1

        # every term must be flagged in exactly one mask
        assert sum(sum(mask) for mask in branch_map.values()) == len(id2term)

        hetero_graph.nodes['term'].data['bp_mask'] = torch.from_numpy(np.array(branch_map['biological_process']))
        hetero_graph.nodes['term'].data['mf_mask'] = torch.from_numpy(np.array(branch_map['molecular_function']))
        hetero_graph.nodes['term'].data['cc_mask'] = torch.from_numpy(np.array(branch_map['cellular_component']))

    return hetero_graph

def build_networks(ppsn_net: dict, term_nets: dict, namespace: dict) -> dict:
    """Build a heterograph for every (branch, frequency) net and print a summary.

    Args:
        ppsn_net: protein net shared by all graphs.
        term_nets: branch name -> frequency key -> {'term_net', 'annotation_net'}.
        namespace: term -> namespace name, passed on to ``build_single_network``.

    Returns:
        Dict with the same two-level layout as ``term_nets`` whose leaves are the
        graphs returned by ``build_single_network``.
    """
    rule = "-"*30

    # echo some base information of the input nets
    print("ppsn_net: ")
    for field, values in ppsn_net.items():
        print("\t" + field + "->" + str(len(values)))

    print("term_nets: ")
    for branch, nets_by_freq in term_nets.items():
        print("\n" + rule, branch, rule)
        for freq, net_pair in nets_by_freq.items():
            print(freq)
            for net_name, net in net_pair.items():
                print(net_name + ":\t", end = "")
                for field, values in net.items():
                    print(field + "->" + str(len(values)), " ", end = "")
                print()

    # one graph per branch and frequency key
    graphs = {
        branch_name: {
            freq: build_single_network(ppsn_net, net_pair['term_net'], net_pair['annotation_net'], branch_name, namespace)
            for freq, net_pair in tqdm(nets_by_freq.items())
        }
        for branch_name, nets_by_freq in term_nets.items()
    }

    print("graphs: ")
    for branch, branch_graphs in graphs.items():
        print("\n" + rule, branch, rule)
        for freq, graph in branch_graphs.items():
            # cyan, bold heading for each graph
            print('\033[1;36m' + branch + ": " + freq + '\033[0m')
            print(graph)
    return graphs

In [6]:
# Driver: for every configured species, load the inputs, build the nets and the
# graphs, and pickle the results under OUT_DIR.
for species in INPUT_DATASET:
    nets_out_path = OUT_DIR + INPUT_DATASET[species]['id'] + "-nets-min.pkl"
    graphs_out_path = OUT_DIR + INPUT_DATASET[species]['id'] + "-graphs-min.pkl"
    df_ppsn, df_uniprot, terms, term_namespace = load_files(species)

    print("\n" + "*"*35, species, "*"*35, ": ")
    # Bare expressions are intentional: with ast_node_interactivity = "all" the
    # notebook displays their values (see the recorded output of this cell).
    df_ppsn.head()
    df_uniprot.head()
    print("terms:")
    list(terms.items())[0:10]

    # protein similarity net
    ppsn_net = build_ppsn_net(df_uniprot, df_ppsn)

    # parse + propagate labels, then split them by sub-ontology
    annotations = resolve_annotations(df_uniprot, terms)
    branches = split_branches(annotations, term_namespace, terms)

    branches.keys()
    print("-"*30, "build term and annotation nets", "-"*30)
    
    # build default net
    default_nets = build_default_term_net(branches, annotations, terms)
    # build freq nets
    term_nets = {}
    for branch_name, branch in branches.items():
        print("-"*30, branch_name, "-"*30)
        term_nets[branch_name] = build_term_and_annotation_nets(branch['counter'], annotations, terms)
        # the frequency-threshold net is stored next to the 5%-slice nets under the key 'default'
        term_nets[branch_name]['default'] = default_nets[branch_name]
        
    
    # save nets
    # NOTE(review): only ppsn_net is written here; term_nets (which hold the
    # term id -> GO id lists) are not persisted -- confirm this is intended.
    with open(nets_out_path, "wb+") as f:
        pickle.dump(ppsn_net, f)
        
    print("-"*30, "build term and annotation graphs", "-"*30)
    # build and save graphs
    graphs = build_networks(ppsn_net, term_nets, term_namespace)
    with open(graphs_out_path, "wb+") as f:
        pickle.dump(graphs, f)


*********************************** human *********************************** : 


Unnamed: 0,protein1,protein2,score
0,0,76,162.0
1,0,93,207.0
2,0,105,194.0
3,0,152,540.0
4,0,163,183.0


Unnamed: 0,id,uniprot_id,string_id,sequence,labels
0,0,Q66K14,9606.ENSP00000349291,MWLSPEEVLVANALWVTERANPFFVLQRRRGHGRGGGLTGLLVGTL...,GO:0005096; GO:0005509; GO:0006886; GO:0016021...
1,1,Q9UMR3,9606.ENSP00000386170,MEFTASPKPQLSSRANAFSIAALMSSGGSKEKEATENTIKPLEQFV...,GO:0000122; GO:0000785; GO:0000977; GO:0000978...
2,2,Q9P031,9606.ENSP00000256151,MAPVRRSAKWRPGGIEARGEGVSTVGYRNKNVRQKTWRPNHPQAFV...,GO:0003723; GO:0005654; GO:0044267
3,3,Q6PEY2,9606.ENSP00000318197,MRECISIHVGQAGVQIGNACWELYCLEHGIQPDGQMPSDKTIGGGD...,GO:0000226; GO:0000278; GO:0003924; GO:0005200...
4,4,Q9P016,9606.ENSP00000341657,MSRPRKRLAGTSGSDKGLSGKRTKTENSGEALAKVEDSNPQKTSAT...,GO:0005634


terms:


[('GO:0008150', None),
 ('GO:0005575', None),
 ('GO:0003674', None),
 ('GO:0000001', ['GO:0048308', 'GO:0048311']),
 ('GO:0000002', ['GO:0007005']),
 ('GO:0000003', ['GO:0008150']),
 ('GO:0000006', ['GO:0005385']),
 ('GO:0000007', ['GO:0005385']),
 ('GO:0000009', ['GO:0000030']),
 ('GO:0000010', ['GO:0004659'])]


before propagate:  263264
after propagate:  1888515
------------------------------ Total proteins/terms ------------------------------
Total proteins:	 18560
Total terms:	 44085
Total terms in bp branch:	 28748
Total terms in mf branch:	 11153
Total terms in cc branch:	 4184
------------------------------ proteins/terms used ------------------------------
full ontology:  
	Total proteins:  17873 
	Total terms:  22456 
	Total edges:  1888512
bp: 
 
	proteins:  16546 
	terms:  15658 
	edges:  1273112
cc: 
 
	proteins:  17388 
	terms:  1995 
	edges:  370968
mf: 
 
	proteins:  15108 
	terms:  4803 
	edges:  244432


dict_keys(['full', 'bp', 'cc', 'mf'])

------------------------------ build term and annotation nets ------------------------------


100%|██████████| 20/20 [00:00<00:00, 600.64it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

------------------------------ full ------------------------------


100%|██████████| 20/20 [00:11<00:00,  1.68it/s]
100%|██████████| 20/20 [00:00<00:00, 587.82it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

------------------------------ bp ------------------------------


100%|██████████| 20/20 [00:09<00:00,  2.10it/s]
100%|██████████| 20/20 [00:00<00:00, 11529.15it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

------------------------------ cc ------------------------------


100%|██████████| 20/20 [00:05<00:00,  3.91it/s]
100%|██████████| 20/20 [00:00<00:00, 1627.65it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

------------------------------ mf ------------------------------


100%|██████████| 20/20 [00:04<00:00,  4.03it/s]


------------------------------ build term and annotation graphs ------------------------------
ppsn_net: 
	id2node->18560
	edge_src->11154322
	edge_dst->11154322
	edge_score->11154322
	node_seq->18560
term_nets: 

------------------------------ full ------------------------------
0.05
term_net:	id2node->1122  edge_src->1895  edge_dst->1895  
annotation_net:	edge_src->1466183  edge_dst->1466183  
0.1
term_net:	id2node->2244  edge_src->3878  edge_dst->3878  
annotation_net:	edge_src->1641049  edge_dst->1641049  
0.15
term_net:	id2node->3366  edge_src->5966  edge_dst->5966  
annotation_net:	edge_src->1723706  edge_dst->1723706  
0.2
term_net:	id2node->4488  edge_src->8040  edge_dst->8040  
annotation_net:	edge_src->1770727  edge_dst->1770727  
0.25
term_net:	id2node->5610  edge_src->10148  edge_dst->10148  
annotation_net:	edge_src->1801189  edge_dst->1801189  
0.30
term_net:	id2node->6732  edge_src->12270  edge_dst->12270  
annotation_net:	edge_src->1822120  edge_dst->1822120  
0.35
term

  0%|          | 0/21 [00:00<?, ?it/s]

0.8
term_net:	id2node->1584  edge_src->2778  edge_dst->2778  
annotation_net:	edge_src->370427  edge_dst->370427  
0.85
term_net:	id2node->1683  edge_src->2938  edge_dst->2938  
annotation_net:	edge_src->370625  edge_dst->370625  
0.9
term_net:	id2node->1782  edge_src->3091  edge_dst->3091  
annotation_net:	edge_src->370755  edge_dst->370755  
0.95
term_net:	id2node->1881  edge_src->3253  edge_dst->3253  
annotation_net:	edge_src->370854  edge_dst->370854  
1.0
term_net:	id2node->1995  edge_src->3454  edge_dst->3454  
annotation_net:	edge_src->370968  edge_dst->370968  
default
term_net:	id2node->293  edge_src->516  edge_dst->516  
annotation_net:	edge_src->347459  edge_dst->347459  

------------------------------ mf ------------------------------
0.05
term_net:	id2node->240  edge_src->302  edge_dst->302  
annotation_net:	edge_src->201281  edge_dst->201281  
0.1
term_net:	id2node->480  edge_src->618  edge_dst->618  
annotation_net:	edge_src->218259  edge_dst->218259  
0.15
term_net:	i

100%|██████████| 21/21 [00:13<00:00,  1.57it/s]
100%|██████████| 21/21 [00:09<00:00,  2.25it/s]
100%|██████████| 21/21 [00:04<00:00,  4.39it/s]
100%|██████████| 21/21 [00:03<00:00,  5.46it/s]


graphs: 

------------------------------ full ------------------------------
[1;36mfull: 0.05[0m
Graph(num_nodes={'protein': 18560, 'term': 1122},
      num_edges={('protein', 'annotated_by', 'term'): 1466183, ('protein', 'similar_with', 'protein'): 11154322, ('term', 'annotate', 'protein'): 1466183, ('term', 'son_of', 'term'): 1895},
      metagraph=[('protein', 'term', 'annotated_by'), ('protein', 'protein', 'similar_with'), ('term', 'protein', 'annotate'), ('term', 'term', 'son_of')])
[1;36mfull: 0.1[0m
Graph(num_nodes={'protein': 18560, 'term': 2244},
      num_edges={('protein', 'annotated_by', 'term'): 1641049, ('protein', 'similar_with', 'protein'): 11154322, ('term', 'annotate', 'protein'): 1641049, ('term', 'son_of', 'term'): 3878},
      metagraph=[('protein', 'term', 'annotated_by'), ('protein', 'protein', 'similar_with'), ('term', 'protein', 'annotate'), ('term', 'term', 'son_of')])
[1;36mfull: 0.15[0m
Graph(num_nodes={'protein': 18560, 'term': 3366},
      num_edges=

Unnamed: 0,protein1,protein2,score
0,0,80,247.0
1,0,241,242.0
2,0,253,210.0
3,0,286,179.0
4,0,398,220.0


Unnamed: 0,id,uniprot_id,string_id,sequence,labels
0,0,Q8VCZ3,10090.ENSMUSP00000097701,MPRHCSAAGCCTRDTRETRNRGISFHRLPKKDNPRRGLWLANCQRL...,GO:0000122; GO:0001226; GO:0003677; GO:0005634...
1,1,Q62264,10090.ENSMUSP00000042988,MQVLTKRYPKNCLLTVMDRYSAVVRNMEQVVMIPSLLRDVQLSGPG...,GO:0005654; GO:0005829; GO:0006629; GO:0009617...
2,2,Q6QNU9,10090.ENSMUSP00000074381,MGRYWLLPGLLLSLPLVTGWSTSNCLVTEGSRLPLVSRYFTFCRHS...,GO:0002224; GO:0004888; GO:0005887; GO:0006954...
3,3,Q8BHE4,10090.ENSMUSP00000140027,MKRSLQALYCQLLSFLLTLALTKALVLAVHEPSPRESLQTLPSGSP...,GO:0005769; GO:0006898; GO:0008090; GO:0010008...
4,4,Q9JLF7,10090.ENSMUSP00000106625,MACQLDLLIGVIFMASPVLVISPCSSDGRIAFFRGCNLTQIPWILN...,GO:0002224; GO:0002755; GO:0004888; GO:0005149...


terms:


[('GO:0008150', None),
 ('GO:0005575', None),
 ('GO:0003674', None),
 ('GO:0000001', ['GO:0048308', 'GO:0048311']),
 ('GO:0000002', ['GO:0007005']),
 ('GO:0000003', ['GO:0008150']),
 ('GO:0000006', ['GO:0005385']),
 ('GO:0000007', ['GO:0005385']),
 ('GO:0000009', ['GO:0000030']),
 ('GO:0000010', ['GO:0004659'])]


before propagate:  250127
after propagate:  1729340
------------------------------ Total proteins/terms ------------------------------
Total proteins:	 16420
Total terms:	 44085
Total terms in bp branch:	 28748
Total terms in mf branch:	 11153
Total terms in cc branch:	 4184
------------------------------ proteins/terms used ------------------------------
full ontology:  
	Total proteins:  16025 
	Total terms:  22595 
	Total edges:  1729337
bp: 
 
	proteins:  14843 
	terms:  15838 
	edges:  1175632
cc: 
 
	proteins:  15530 
	terms:  1985 
	edges:  329818
mf: 
 
	proteins:  13537 
	terms:  4772 
	edges:  223887


dict_keys(['full', 'bp', 'cc', 'mf'])

------------------------------ build term and annotation nets ------------------------------


100%|██████████| 20/20 [00:00<00:00, 406.94it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

------------------------------ full ------------------------------


 55%|█████▌    | 11/20 [00:05<00:04,  1.89it/s]

Outlier error:  GO:0036486


100%|██████████| 20/20 [00:11<00:00,  1.71it/s]
100%|██████████| 20/20 [00:00<00:00, 1075.13it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

------------------------------ bp ------------------------------


100%|██████████| 20/20 [00:09<00:00,  2.14it/s]
100%|██████████| 20/20 [00:00<00:00, 9751.93it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

------------------------------ cc ------------------------------


 45%|████▌     | 9/20 [00:02<00:03,  3.49it/s]

Outlier error:  GO:0036057


100%|██████████| 20/20 [00:05<00:00,  3.78it/s]
100%|██████████| 20/20 [00:00<00:00, 4065.04it/s]
  5%|▌         | 1/20 [00:00<00:03,  5.42it/s]

------------------------------ mf ------------------------------


100%|██████████| 20/20 [00:04<00:00,  4.74it/s]


------------------------------ build term and annotation graphs ------------------------------
ppsn_net: 
	id2node->16420
	edge_src->9780551
	edge_dst->9780551
	edge_score->9780551
	node_seq->16420
term_nets: 

------------------------------ full ------------------------------
0.05
term_net:	id2node->1129  edge_src->1913  edge_dst->1913  
annotation_net:	edge_src->1324806  edge_dst->1324806  
0.1
term_net:	id2node->2258  edge_src->3919  edge_dst->3919  
annotation_net:	edge_src->1489702  edge_dst->1489702  
0.15
term_net:	id2node->3387  edge_src->5999  edge_dst->5999  
annotation_net:	edge_src->1567861  edge_dst->1567861  
0.2
term_net:	id2node->4516  edge_src->8066  edge_dst->8066  
annotation_net:	edge_src->1612949  edge_dst->1612949  
0.25
term_net:	id2node->5645  edge_src->10239  edge_dst->10239  
annotation_net:	edge_src->1642276  edge_dst->1642276  
0.30
term_net:	id2node->6774  edge_src->12382  edge_dst->12382  
annotation_net:	edge_src->1662591  edge_dst->1662591  
0.35
term_ne

  0%|          | 0/21 [00:00<?, ?it/s]


annotation_net:	edge_src->315244  edge_dst->315244  
0.25
term_net:	id2node->495  edge_src->883  edge_dst->883  
annotation_net:	edge_src->319568  edge_dst->319568  
0.30
term_net:	id2node->594  edge_src->1064  edge_dst->1064  
annotation_net:	edge_src->322132  edge_dst->322132  
0.35
term_net:	id2node->693  edge_src->1247  edge_dst->1247  
annotation_net:	edge_src->323850  edge_dst->323850  
0.4
term_net:	id2node->792  edge_src->1432  edge_dst->1432  
annotation_net:	edge_src->325141  edge_dst->325141  
0.45
term_net:	id2node->891  edge_src->1614  edge_dst->1614  
annotation_net:	edge_src->326168  edge_dst->326168  
0.5
term_net:	id2node->990  edge_src->1795  edge_dst->1795  
annotation_net:	edge_src->326955  edge_dst->326955  
0.55
term_net:	id2node->1089  edge_src->1971  edge_dst->1971  
annotation_net:	edge_src->327575  edge_dst->327575  
0.60
term_net:	id2node->1188  edge_src->2134  edge_dst->2134  
annotation_net:	edge_src->328072  edge_dst->328072  
0.65
term_net:	id2node->1287

100%|██████████| 21/21 [00:11<00:00,  1.79it/s]
100%|██████████| 21/21 [00:09<00:00,  2.19it/s]
100%|██████████| 21/21 [00:03<00:00,  5.27it/s]
100%|██████████| 21/21 [00:03<00:00,  5.42it/s]


graphs: 

------------------------------ full ------------------------------
[1;36mfull: 0.05[0m
Graph(num_nodes={'protein': 16420, 'term': 1129},
      num_edges={('protein', 'annotated_by', 'term'): 1324806, ('protein', 'similar_with', 'protein'): 9780551, ('term', 'annotate', 'protein'): 1324806, ('term', 'son_of', 'term'): 1913},
      metagraph=[('protein', 'term', 'annotated_by'), ('protein', 'protein', 'similar_with'), ('term', 'protein', 'annotate'), ('term', 'term', 'son_of')])
[1;36mfull: 0.1[0m
Graph(num_nodes={'protein': 16420, 'term': 2258},
      num_edges={('protein', 'annotated_by', 'term'): 1489702, ('protein', 'similar_with', 'protein'): 9780551, ('term', 'annotate', 'protein'): 1489702, ('term', 'son_of', 'term'): 3919},
      metagraph=[('protein', 'term', 'annotated_by'), ('protein', 'protein', 'similar_with'), ('term', 'protein', 'annotate'), ('term', 'term', 'son_of')])
[1;36mfull: 0.15[0m
Graph(num_nodes={'protein': 16420, 'term': 3387},
      num_edges={(