# This notebook demonstrates Workflow 9, gene-cooccurrence detection

In [9]:
import biclusterco, importlib, asyncio

In [18]:
# Instantiate the bicluster co-occurrence helper used for the gene-based part of this notebook.
GeneCoocurrenceByBiclusterObject = biclusterco.CoocurrenceByBicluster()

# In the cells below, either provide the URL of a text file or manually enter the genes to be used as input. Once that is done, the remaining cells can be executed without modification.

In [23]:
#curated_geneset = GeneCoocurrenceByBiclusterObject.run_getinput('https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_1_core_complex.txt')

In [24]:
# Manually entered input gene list (a single gene ID here); edit this list to query other genes.
curated_geneset = ['ENSG00000187741.10']

In [22]:
curated_geneset

['ENSG00000187741.10']

In [6]:
# Event loop used below to drive the async bicluster lookup to completion.
# NOTE(review): if the kernel's event loop is already running, loop.run_until_complete()
# raises RuntimeError; top-level `await` may be needed instead — confirm for the target Jupyter version.
loop = asyncio.get_event_loop()

In [7]:
# Run the async lookup for the input genes and block until it has finished.
related_biclusters_and_genes_for_each_input_gene = loop.run_until_complete(GeneCoocurrenceByBiclusterObject.find_related_biclusters_async(curated_geneset))

In [8]:
# Derive the bicluster occurrence dictionary from the lookup result.
# Uses GeneCoocurrenceByBiclusterObject, the instance actually created in this notebook;
# the former name `CoocurrenceByBiclusterObject` is never defined and fails with
# NameError on a fresh kernel.
bicluster_occurences_dict = GeneCoocurrenceByBiclusterObject.bicluster_occurences_dict(related_biclusters_and_genes_for_each_input_gene)

In [9]:
# Reduce the occurrence dictionary to the biclusters of interest (displayed below as a
# list of bicluster ID strings).
# Uses GeneCoocurrenceByBiclusterObject, the instance actually created in this notebook;
# the former name `CoocurrenceByBiclusterObject` is never defined and fails with
# NameError on a fresh kernel.
unique_biclusters = GeneCoocurrenceByBiclusterObject.unique_biclusters(bicluster_occurences_dict)

In [10]:
len(unique_biclusters)

8

In [10]:
unique_biclusters

['480',
 '653',
 '1085',
 '169',
 '604',
 '626',
 '643',
 '775',
 '799',
 '921',
 '452',
 '456',
 '679',
 '686',
 '764',
 '1313',
 '251',
 '615',
 '698',
 '18',
 '204',
 '323',
 '482',
 '555',
 '1221',
 '176',
 '569',
 '717',
 '864',
 '1303',
 '168',
 '610',
 '660',
 '680',
 '740',
 '748',
 '1116',
 '1342',
 '196',
 '320',
 '365',
 '1073',
 '1329',
 '1355',
 '268',
 '611',
 '47',
 '158',
 '518',
 '720',
 '25',
 '331',
 '471',
 '504',
 '663',
 '809',
 '912',
 '964']

In [11]:
# Collect the genes belonging to the selected biclusters, using the original lookup result.
# Uses GeneCoocurrenceByBiclusterObject, the instance actually created in this notebook;
# the former name `CoocurrenceByBiclusterObject` is never defined and fails with
# NameError on a fresh kernel.
genes_in_unique_biclusters = GeneCoocurrenceByBiclusterObject.genes_in_unique_biclusters(unique_biclusters, related_biclusters_and_genes_for_each_input_gene)

In [12]:
# Produce the {output_gene: occurrence count} mapping shown in the next cell.
# Uses GeneCoocurrenceByBiclusterObject, the instance actually created in this notebook;
# the former name `CoocurrenceByBiclusterObject` is never defined and fails with
# NameError on a fresh kernel.
dict_of_genes_in_unique_biclusters_not_in_inputs = GeneCoocurrenceByBiclusterObject.genes_in_unique_biclusters_not_in_input_gene_list(curated_geneset, genes_in_unique_biclusters)

In [13]:
dict_of_genes_in_unique_biclusters_not_in_inputs

defaultdict(dict,
            {'ncbigene:9582': 4,
             'ncbigene:9824': 7,
             'ncbigene:79915': 7,
             'ncbigene:6790': 7,
             'ncbigene:9212': 8,
             'ncbigene:332': 5,
             'ncbigene:641': 3,
             'ncbigene:699': 10,
             'ncbigene:701': 6,
             'ncbigene:80178': 4,
             'ncbigene:57082': 4,
             'ncbigene:9133': 7,
             'ncbigene:991': 4,
             'ncbigene:995': 5,
             'ncbigene:157313': 4,
             'ncbigene:113130': 8,
             'ncbigene:55143': 7,
             'ncbigene:983': 7,
             'ncbigene:81620': 4,
             'ncbigene:1058': 5,
             'ncbigene:1063': 9,
             'ncbigene:64946': 4,
             'ncbigene:79019': 4,
             'ncbigene:79172': 4,
             'ncbigene:55165': 6,
             'ncbigene:150468': 8,
             'ncbigene:63967': 4,
             'ncbigene:55789': 5,
             'ncbigene:81624': 4,
             

## The dictionary above lists the genes found in the uniquely occurring biclusters for the given set of input genes. Its form is {output_gene : number of occurrences of this gene across the relevant biclusters}

## The dictionary output above is truncated in the display... Let's find the gene with the most occurrences.

In [35]:
# Re-import biclusterco so edits made to the module on disk are picked up without restarting the kernel.
# Note: objects created before the reload keep using the old class definition.
importlib.reload(biclusterco) # handy during local module development

<module 'biclusterco' from '/Users/colincurtis/Documents/renci/translator-workflows/WorkFlow9/biclusterco.py'>

In [14]:
# Rank the output genes; the next cell shows (count, gene) tuples with the highest counts first.
# Uses GeneCoocurrenceByBiclusterObject, the instance actually created in this notebook;
# the former name `CoocurrenceByBiclusterObject` is never defined and fails with
# NameError on a fresh kernel.
sorted_list_of_output_genes = GeneCoocurrenceByBiclusterObject.sorted_list_of_output_genes(dict_of_genes_in_unique_biclusters_not_in_inputs)

In [15]:
sorted_list_of_output_genes

[(31, 'ncbigene:1'),
 (10, 'ncbigene:699'),
 (10, 'ncbigene:3833'),
 (9, 'ncbigene:9787'),
 (9, 'ncbigene:64151'),
 (9, 'ncbigene:4751'),
 (9, 'ncbigene:24137'),
 (9, 'ncbigene:1063'),
 (8, 'ncbigene:9833'),
 (8, 'ncbigene:9212'),
 (8, 'ncbigene:890'),
 (8, 'ncbigene:83540'),
 (8, 'ncbigene:56992'),
 (8, 'ncbigene:4998'),
 (8, 'ncbigene:3832'),
 (8, 'ncbigene:150468'),
 (8, 'ncbigene:113130'),
 (7, 'ncbigene:9837'),
 (7, 'ncbigene:983'),
 (7, 'ncbigene:9824'),
 (7, 'ncbigene:9493'),
 (7, 'ncbigene:9133'),
 (7, 'ncbigene:79915'),
 (7, 'ncbigene:7272'),
 (7, 'ncbigene:6790'),
 (7, 'ncbigene:55723'),
 (7, 'ncbigene:55247'),
 (7, 'ncbigene:55143'),
 (7, 'ncbigene:5347'),
 (7, 'ncbigene:51203'),
 (7, 'ncbigene:29127'),
 (7, 'ncbigene:22974'),
 (7, 'ncbigene:220134'),
 (7, 'ncbigene:157570'),
 (7, 'ncbigene:11065'),
 (7, 'ncbigene:10460'),
 (7, 'ncbigene:10112'),
 (7, 'ncbigene:10024'),
 (6, 'ncbigene:9928'),
 (6, 'ncbigene:9700'),
 (6, 'ncbigene:83990'),
 (6, 'ncbigene:79801'),
 (6, 'ncbige

In [16]:
len(sorted_list_of_output_genes)

4486

# OK - now let's try this process again but for tissue enrichment! We will begin with a single tissue ID = UBERON:0001157

In [36]:
# Input for the tissue-based run: a list holding a single UBERON tissue ID.
tissue_ID_set = ['UBERON:0001157']

In [37]:
tissue_ID_set

['UBERON:0001157']

In [38]:
# Event loop used to drive the tissue lookup below.
# NOTE(review): presumably this is the same loop object already bound to `loop` earlier
# in the notebook, which would make `loop2` redundant — confirm.
loop2 = asyncio.get_event_loop()

In [39]:
# Separate CoocurrenceByBicluster instance dedicated to the tissue-based run.
Tissue_Object = biclusterco.CoocurrenceByBicluster()

In [41]:
# Run the async tissue lookup for the input tissue IDs and block until it has finished.
related_biclusters_and_genes_for_each_input_tissue = loop2.run_until_complete(Tissue_Object.find_tissue_related_biclusters_async(tissue_ID_set))

In [43]:
# Intermediate step: derive the occurrence dictionary from the tissue lookup result;
# it is consumed by unique_biclusters() in the next cell.
tissue_bicluster_occurences_dict = Tissue_Object.bicluster_occurences_dict(related_biclusters_and_genes_for_each_input_tissue)

In [44]:
# Reduce the occurrence dictionary to the biclusters of interest (displayed below as a
# list of bicluster ID strings).
unique_tissue_biclusters = Tissue_Object.unique_biclusters(tissue_bicluster_occurences_dict)

In [45]:
len(unique_tissue_biclusters)

25

In [46]:
unique_tissue_biclusters

['1',
 '2',
 '3',
 '29',
 '87',
 '105',
 '139',
 '151',
 '287',
 '415',
 '545',
 '568',
 '605',
 '656',
 '693',
 '823',
 '843',
 '911',
 '940',
 '989',
 '1123',
 '1224',
 '1241',
 '1301',
 '1357']

In [49]:
# Collect the genes belonging to the selected tissue biclusters, using the original lookup result.
genes_in_unique_tissue_biclusters = Tissue_Object.genes_in_unique_biclusters(unique_tissue_biclusters, related_biclusters_and_genes_for_each_input_tissue)

In [51]:
# Produce the {gene: occurrence count} mapping shown in the next cell.
# NOTE(review): the method name refers to an input *gene* list, but tissue IDs are passed
# here; presumably nothing is filtered out because the outputs are gene IDs — confirm
# against biclusterco.
dict_of_genes_in_unique_biclusters_not_in_tissue_ID_input = Tissue_Object.genes_in_unique_biclusters_not_in_input_gene_list(tissue_ID_set, genes_in_unique_tissue_biclusters)

In [52]:
dict_of_genes_in_unique_biclusters_not_in_tissue_ID_input

defaultdict(dict,
            {'ENSG00000069431.6': 1,
             'ENSG00000073670.9': 1,
             'ENSG00000197381.11': 2,
             'ENSG00000166025.13': 4,
             'ENSG00000154945.6': 1,
             'ENSG00000177119.11': 2,
             'ENSG00000240771.2': 2,
             'ENSG00000107518.12': 1,
             'ENSG00000185963.9': 2,
             'ENSG00000173068.13': 1,
             'ENSG00000112276.9': 1,
             'ENSG00000142686.7': 2,
             'ENSG00000088854.11': 4,
             'ENSG00000180354.11': 1,
             'ENSG00000196557.6': 1,
             'ENSG00000153956.11': 1,
             'ENSG00000105767.2': 2,
             'ENSG00000134072.6': 1,
             'ENSG00000198624.8': 1,
             'ENSG00000181072.7': 1,
             'ENSG00000105270.10': 1,
             'ENSG00000166250.7': 1,
             'ENSG00000044459.10': 1,
             'ENSG00000172301.6': 2,
             'ENSG00000162733.12': 3,
             'ENSG00000151240.11': 1,
        

In [54]:
# Rank the output genes; the next cell shows (count, gene) tuples with the highest counts first.
sorted_list_of_outputs_for_tissue_cooccurrence = Tissue_Object.sorted_list_of_output_genes(dict_of_genes_in_unique_biclusters_not_in_tissue_ID_input)

In [55]:
sorted_list_of_outputs_for_tissue_cooccurrence

[(10, 'ENSG00000233041.4'),
 (6, 'ENSG00000198853.7'),
 (5, 'ENSG00000221890.2'),
 (5, 'ENSG00000171016.7'),
 (5, 'ENSG00000149639.10'),
 (5, 'ENSG00000095539.11'),
 (4, 'ENSG00000166025.13'),
 (4, 'ENSG00000165424.6'),
 (4, 'ENSG00000163586.5'),
 (4, 'ENSG00000153814.7'),
 (4, 'ENSG00000148516.17'),
 (4, 'ENSG00000121297.6'),
 (4, 'ENSG00000088854.11'),
 (4, 'ENSG00000072840.8'),
 (4, 'ENSG00000061918.8'),
 (3, 'ENSG00000213996.8'),
 (3, 'ENSG00000198947.10'),
 (3, 'ENSG00000183317.12'),
 (3, 'ENSG00000171533.7'),
 (3, 'ENSG00000167081.12'),
 (3, 'ENSG00000166143.5'),
 (3, 'ENSG00000166086.8'),
 (3, 'ENSG00000162733.12'),
 (3, 'ENSG00000156711.12'),
 (3, 'ENSG00000152022.7'),
 (3, 'ENSG00000145012.8'),
 (3, 'ENSG00000144426.14'),
 (3, 'ENSG00000144045.9'),
 (3, 'ENSG00000138794.5'),
 (3, 'ENSG00000138685.8'),
 (3, 'ENSG00000137825.6'),
 (3, 'ENSG00000134138.15'),
 (3, 'ENSG00000132465.6'),
 (3, 'ENSG00000130414.7'),
 (3, 'ENSG00000123094.11'),
 (3, 'ENSG00000121380.8'),
 (3, 'ENSG0000