In [7]:
#
#
#
# reading in network and autoimmune/autoinflammatory genes
#
#
#

def read_gene_file(gene_file):
    """
    Reads named gene sets from a tab-separated text file.

    * Each row describes one set: the first column is the set identifier
    (e.g. a disease group or a pathway name), the remaining columns are
    the genes belonging to that set.

    * Lines that start with '#' are ignored, and so are blank lines.

    Parameters
    ----------
    gene_file : str or path-like
        Path of the file to read. It is decoded as UTF-8 and bytes that
        cannot be decoded are silently dropped.

    Returns
    -------
    dict
        Maps every identifier to the set of genes listed on its row. When
        an identifier appears on several rows, the last row wins.
    """
    genes_set = {}
    # `with` guarantees the handle is closed even if parsing raises
    with open(gene_file, 'r', encoding='utf-8', errors='ignore') as fp:
        for line in fp:
            # lines starting with '#' will be ignored
            if line.startswith('#'):
                continue
            line_data = line.strip().split('\t')
            # a blank (or whitespace-only) line would otherwise register
            # an empty set under the identifier ''
            if line_data == ['']:
                continue
            # first column: set identifier, remaining columns: its genes
            genes_set[line_data[0]] = set(line_data[1:])

    return genes_set

import networkx as nx

import itertools
import time 

t0 = time.time()
G0=nx.read_edgelist('mydata/Supplementary_File_1_interactome_edgelist_cleaned.txt')
# Restrict the analysis to the largest connected component of the interactome.
# nx.connected_components() yields components in no particular size order, so
# the largest one has to be selected explicitly rather than taking the first.
largest_component = max(nx.connected_components(G0), key=len)
# subgraph() returns a read-only view onto G0 (no copy is made)
G = G0.subgraph(largest_component)

groups=read_gene_file('mydata/aimm_ainfl_grouping_3_jan21_IDs.txt') # file that contains autoimmune/autoinflammatory grouping of monogenic autoimmune/autoinflammatory gene defects


> done loading network:
> network contains 18854 nodes and 483644 links

> done reading gene sets:
> 187 genes found in 3 sets


In [8]:
#
# first neighbors
#
# For every group, collect the union of the network neighbours of its genes
# (first_neighbors) and remember how many distinct neighbours that is
# (first_neighbors_no).
first_neighbors = {}
first_neighbors_no = {}
for cluster_name, cluster_genes in groups.items():
    reached = set()
    for seed_gene in cluster_genes:
        # G.neighbors() yields the nodes adjacent to seed_gene
        reached.update(G.neighbors(seed_gene))
    first_neighbors[cluster_name] = reached
    first_neighbors_no[cluster_name] = len(reached)
print(first_neighbors.keys())

dict_keys(['autoinflammation', 'autoimmunity', 'autoimmunity and autoinflammation'])


In [9]:
#
# pathway annotation read in 
#

# `bft` is never imported in this notebook, so calling bft.read_gene_file
# fails on a fresh kernel; use the read_gene_file helper defined above, which
# returns a dict mapping each pathway name to its set of genes.
enrichment_groups=read_gene_file('mydata/reactome_pathways.txt')

#
#
# number of pathways per group
#
#
# For every group: find the pathways that contain at least one gene of the
# group's first-neighbour set, write one "group<TAB>pathway" row per hit, and
# collect per-group summary statistics in pathway_data.
pathway_data=[]
pathways_reached_dict={}
pathways_reached_number={}
# `with` closes the output file even if an exception interrupts the loop
with open('mydata/pathways_reached_by_aimm_ainfl_monsterppi_per_group.txt','w') as fp:
    for myclus, genelist in first_neighbors.items():
        # a pathway is "reached" as soon as one neighbourhood gene belongs to it
        pathways_reached = {
            path
            for path, geneset in enrichment_groups.items()
            if any(mygene in geneset for mygene in genelist)
        }

        pathways_reached_dict[myclus]=pathways_reached
        # sorted so the output file is identical from run to run
        for pathwayset in sorted(pathways_reached):
            fp.write('%s\t%s\n' %(myclus,pathwayset ))
        pathways_reached_number[myclus]=len(pathways_reached)

        neighborhood_size = first_neighbors_no[myclus]
        # pathways per neighbour; undefined (NaN) for an empty neighbourhood
        # instead of raising ZeroDivisionError
        if neighborhood_size:
            pathway_diversity = len(pathways_reached)/neighborhood_size
        else:
            pathway_diversity = float('nan')
        pathway_data.append({
            'cluster':myclus,
            'cluster_size': len(groups[myclus]),
            'neighborhood_size':neighborhood_size,
            'pathways_reached':len(pathways_reached),
            'pathway_diversity':pathway_diversity,
        })

# Invert pathways_reached_dict: map every pathway to the ';'-joined names of
# all groups that reach it, in the iteration order of pathways_reached_dict.
pathway_to_clus = {}
for cluster_name, reached_pathways in pathways_reached_dict.items():
    for pathway in reached_pathways:
        joined_so_far = pathway_to_clus.get(pathway)
        if joined_so_far is None:
            # first group seen for this pathway
            pathway_to_clus[pathway] = cluster_name
        else:
            pathway_to_clus[pathway] = joined_so_far + ';' + cluster_name



> done reading gene sets:
> 89855 genes found in 1554 sets


In [29]:
#
#
# printing out results
#
#


import pandas as pd

# One row per group, built from the summary dicts collected in pathway_data.
neigh_datatable = pd.DataFrame(pathway_data)
print(neigh_datatable)

# Persist the table as a tab-separated file without the row index.
diversity_table_path = 'mydata/pathway_diversity_autoimmune_autoinflammatory_genes.txt'
neigh_datatable.to_csv(diversity_table_path, sep='\t', index=False)

   adjusted_pathways_reached                            cluster  cluster_size  \
0                   3.542254                   autoinflammation            75   
1                   3.245161                       autoimmunity            83   
2                   4.502326  autoimmunity and autoinflammation            29   

   neighborhood_size  pathways_reached  
0                284              1006  
1                310              1006  
2                215               968  
