In [1]:
import time
import numpy as np
import itertools
from itertools import combinations_with_replacement
import networkx as nx
from itertools import product


def read_gene_file(gene_file):
    """
    Read named gene sets from a tab-separated text file.

    Each data line is one row: the first column is the identifier of the
    set (e.g. a disease or pathway name) and every further column is one
    member of that set.

    * Lines that start with '#' are ignored.
    * Blank (whitespace-only) lines are ignored.
    * If an identifier occurs on several lines, the last line wins.

    Parameters
    ----------
    gene_file : str or path-like
        Path of the file to read. It is decoded as UTF-8 and undecodable
        bytes are dropped (errors='ignore').

    Returns
    -------
    dict
        Maps each identifier to the set of values found on its line. An
        identifier with no further columns maps to an empty set.
    """
    genes_set = {}
    # 'with' guarantees the handle is closed even if parsing raises.
    with open(gene_file, 'r', encoding='utf-8', errors='ignore') as fp:
        for line in fp:
            # lines starting with '#' will be ignored
            if line.startswith('#'):
                continue
            stripped = line.strip()
            # A blank line would otherwise create a bogus '' identifier.
            if not stripped:
                continue
            # The first column is the identifier, the rest are its members.
            line_data = stripped.split('\t')
            genes_set[line_data[0]] = set(line_data[1:])

    return genes_set


# Function giving the average shortest-path distance between two sets of nodes.
def av_short_dist_comb(G, a, b):
    """
    Return the mean shortest-path length over all pairs (u, v) with u taken
    from `a` and v taken from `b`.

    Nodes of `a` and `b` that are not present in `G` are ignored. Pairs in
    which u and v are the same node are included in the average.

    Parameters
    ----------
    G : networkx graph
        Graph on which path lengths are measured.
    a, b : iterable of nodes
        The two node sets to compare.

    Returns
    -------
    float
        Sum of all pairwise shortest-path lengths divided by the number of
        pairs, or NaN when either set has no node in `G` (there is no pair
        to average over).

    Raises
    ------
    networkx.NetworkXNoPath
        Propagated from nx.shortest_path_length when a pair is not
        connected in `G`.
    """
    # Build the node set once instead of once per input set.
    graph_nodes = set(G.nodes())
    set1 = graph_nodes & set(a)
    set2 = graph_nodes & set(b)

    sp = [nx.shortest_path_length(G, source, target)
          for source, target in itertools.product(set1, set2)]

    # No pairs at all: avoid ZeroDivisionError and signal "undefined".
    if not sp:
        return float('nan')

    return sum(sp) / len(sp)
    

In [2]:
# Compute the average shortest-path distance for every pair of gene sets.

# These are the dictionaries with the pathways and their associated nodes.
# NOTE(review): both sides are read from the same file, so every set is
# compared with every set (itself included) - confirm this is intended.
genelist_A=read_gene_file('mydata/mono_AIM_AIF_IDs.txt')
genelist_B=read_gene_file('mydata/mono_AIM_AIF_IDs.txt')

# This is the network that the distances are measured on.
G0=nx.read_edgelist('mydata/Supplementary_File_1_interactome_edgelist_cleaned.txt')
# Keep only the largest connected component (LCC) so that a shortest path
# exists between every pair of nodes. connected_components() does not yield
# the components ordered by size, so the largest one is selected explicitly
# rather than taking whichever component happens to come first.
G = G0.subgraph(max(nx.connected_components(G0), key=len))

# Node set of the LCC, built once instead of twice per pathway pair.
graph_nodes = set(G.nodes())

# distance calculations starting
t0 = time.time()
mycombinations= list(itertools.product(set(genelist_A.keys()), set(genelist_B.keys())))

distance_dict={} # this dictionary will store the distance of every pathway pair
for pathwaytubple in mycombinations:
    genes_A = set(genelist_A[pathwaytubple[0]]) & graph_nodes
    genes_B = set(genelist_B[pathwaytubple[1]]) & graph_nodes
    # Pairs where one side has no gene in the network are skipped.
    if genes_A and genes_B:
        mydist=av_short_dist_comb(G,genes_A,genes_B)
        mykey = pathwaytubple[0], pathwaytubple[1]
        distance_dict[mykey]=mydist
print('computing time - group dist calculation: %.2f (sec)' %(float(time.time()-t0)))

#print(distance_dict)

FileNotFoundError: [Errno 2] No such file or directory: 'mydata/mono_AIM_AIF_IDs.txt'

In [None]:
#
# significance calculation
#
# For every pair of gene sets a null distribution is built by drawing
# `number` pairs of random node sets of the same sizes from the network and
# measuring their distance. The observed distance is then expressed as a
# z-score against that null distribution.
#
# NOTE(review): the distance function and the observed-distance dictionary
# used here are av_short_dist_comb and distance_dict, i.e. the ones defined
# earlier in this notebook. The previous names (distance_min,
# min_distance_dict) are not defined anywhere in the notebook - confirm that
# the average shortest distance is the intended metric.

import time
import random as rand
import statistics as stat

t0 = time.time()
print('randomisation')

# Fixed seed so that the random draws can be reproduced.
SEED = 42
rand.seed(SEED)

rand_res_dict={}   # (len(genes_A), len(genes_B)) -> cached null distribution
usage_of_dict=0    # how often a cached null distribution was reused
number=1000        # random draws per null distribution
rand_resu={}       # (groupa, groupb) -> null distribution

# random.sample() needs a sequence; passing the node view directly raises
# TypeError on Python >= 3.11. Sorting gives the seeded draws a fixed
# population order (assumes node labels are mutually comparable, e.g. the
# strings produced by nx.read_edgelist).
node_list = sorted(G.nodes())
graph_nodes = set(node_list)

for groupa, genesa in genelist_A.items():
    for groupb, genesb in genelist_B.items():
        genes_A = set(genesa) & graph_nodes
        genes_B = set(genesb) & graph_nodes
        # Pairs where one side has no gene in the network are skipped.
        if not genes_A or not genes_B:
            continue

        mykey = groupa, groupb
        # The null distribution only depends on the two set sizes, so it is
        # shared between all pairs with the same sizes.
        rand_res_key=len(genes_A), len(genes_B)
        if rand_res_key in rand_res_dict:
            rand_list=rand_res_dict[rand_res_key]
            usage_of_dict+=1
        else:
            rand_list=[]
            while len(rand_list)<number:
                random_nodes_1=set(rand.sample(node_list,len(genes_A)))
                random_nodes_2=set(rand.sample(node_list,len(genes_B)))
                d_AB_rand = av_short_dist_comb(G,random_nodes_1,random_nodes_2)
                rand_list.append(d_AB_rand)
            print('done randomisation for %s, %s' %(groupa,groupb ))
            rand_res_dict[rand_res_key]=rand_list
        rand_resu[mykey]=rand_list


print('computing time - group distance significance with %sX permut and rand res dict %.2f (sec)' %( number, float(time.time()-t0)))
print('dict used: %s times' %(usage_of_dict))

zscores={}
for my_key, results in rand_resu.items():
    # Keep the distances as floats; truncating them to int would distort
    # both the mean and the standard deviation.
    nums = [float(value) for value in results]
    try:
        devi=stat.stdev(nums)
    except stat.StatisticsError:
        # Fewer than two samples: no z-score can be computed for this pair.
        print("stdev not possible")
        continue
    if devi==0:
        # Avoid division by zero when all random distances are identical.
        devi=0.000000000001

    my_Z=(distance_dict[my_key]-(stat.mean(nums)))/(devi)
    zscores[my_key]=my_Z