In [145]:
import pandas as pd
import bs4
import json
import re
import tqdm

import scipy.stats as st
import numpy as np

import bokeh
from bokeh.plotting import show as show_interactive
from bokeh.plotting import output_file, output_notebook
from bokeh.layouts import column, row
from bokeh.models import CustomJS, TextInput, LassoSelectTool, Select, MultiSelect, ColorBar, Legend, LegendItem
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn, Button, HTMLTemplateFormatter
from bokeh.events import SelectionGeometry
from bokeh.transform import linear_cmap, jitter
from bokeh.models import FactorRange


In [146]:
complete_annotation = pd.read_csv('../eggnog/complete_eggnog_annotation.csv')
go_df = pd.read_csv('./go_annotations.csv')
kegg_df = pd.read_csv('./kegg_annotations.csv')
ec_df = pd.read_csv('./ec_annotations.csv')

In [147]:
complete_annotation.head()

Unnamed: 0,TTHERM_ID,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs,TGD2021_description
0,TTHERM_01528530,5911.EAS00195,2.2e-14,49.9,"KOG0594@1|root,KOG0594@2759|Eukaryota",2759|Eukaryota,G,cyclin-dependent protein serine/threonine kina...,-,-,...,-,-,-,-,-,-,-,-,-,protein kinase
1,TTHERM_01528510,5911.EAR81750,1.1e-194,633.4,"2EI75@1|root,2SNPE@2759|Eukaryota,3ZBUW@5878|C...",5878|Ciliophora,-,-,-,-,...,-,-,-,-,-,-,-,-,-,hypothetical protein
2,TTHERM_01528500,5911.EAR81749,4e-141,462.0,"COG1100@1|root,KOG0074@2759|Eukaryota",2759|Eukaryota,KLT,GTP binding,ARL13B,"GO:0000902,GO:0000904,GO:0001947,GO:0002009,GO...",...,-,-,-,-,"ko00000,ko04031",-,-,-,-,ADP-ribosylation factor family protein
3,TTHERM_0015284992,5911.EAR97791,8.4e-64,205.1,"2ERUB@1|root,2SUIJ@2759|Eukaryota",5911.EAR97791|-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,Leucine-rich repeat-containing protein 74A
4,TTHERM_00897120,5911.EAR86071,5.2e-77,248.8,"2ERUB@1|root,2SUIJ@2759|Eukaryota",5911.EAR86071|-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,kinase domain protein


In [148]:
complete_annotation.shape

(26258, 22)

In [149]:
annotation_cols = ['TTHERM_ID', 'GOs', 'KEGG_ko', 'EC']

In [150]:
complete_annotated = complete_annotation.loc[
    (complete_annotation['GOs'] != '-') |
    (complete_annotation['KEGG_ko'] != '-') |
    (complete_annotation['EC'] != '-')
].loc[:, annotation_cols]


In [151]:
complete_annotated

Unnamed: 0,TTHERM_ID,GOs,KEGG_ko,EC
2,TTHERM_01528500,"GO:0000902,GO:0000904,GO:0001947,GO:0002009,GO...",ko:K07962,-
13,TTHERM_00898250,-,ko:K12833,-
17,TTHERM_00898300,-,ko:K05643,-
22,TTHERM_01074550,-,ko:K01373,3.4.22.41
25,TTHERM_01071490,-,"ko:K08794,ko:K13412","2.7.11.1,2.7.11.17"
...,...,...,...,...
17269,TTHERM_01076950,-,ko:K09391,-
17270,TTHERM_01076960,-,ko:K12869,-
17271,TTHERM_01076970,-,"ko:K04958,ko:K04960",-
17272,TTHERM_01076980,-,ko:K06631,2.7.11.21


In [152]:
7210/26258

0.2745829842333765

In [153]:
full_filtered_df = pd.read_csv('../microarray_probe_alignment_and_filtering/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv')
full_filtered_df = full_filtered_df.rename(columns={'Unnamed: 0': 'TTHERM_ID'})
full_filtered_df.shape

(20326, 48)

In [154]:
full_filtered_df['TTHERM_ID'].values

array(['TTHERM_000000042', 'TTHERM_000000045', 'TTHERM_00000010', ...,
       'TTHERM_02555200', 'TTHERM_02607240', 'TTHERM_02653470'],
      dtype=object)

In [155]:
complete_annotated_filtered = complete_annotated.loc[complete_annotated['TTHERM_ID'].isin(full_filtered_df['TTHERM_ID'].values)]
complete_annotated_filtered

Unnamed: 0,TTHERM_ID,GOs,KEGG_ko,EC
2,TTHERM_01528500,"GO:0000902,GO:0000904,GO:0001947,GO:0002009,GO...",ko:K07962,-
13,TTHERM_00898250,-,ko:K12833,-
17,TTHERM_00898300,-,ko:K05643,-
22,TTHERM_01074550,-,ko:K01373,3.4.22.41
25,TTHERM_01071490,-,"ko:K08794,ko:K13412","2.7.11.1,2.7.11.17"
...,...,...,...,...
17269,TTHERM_01076950,-,ko:K09391,-
17270,TTHERM_01076960,-,ko:K12869,-
17271,TTHERM_01076970,-,"ko:K04958,ko:K04960",-
17272,TTHERM_01076980,-,ko:K06631,2.7.11.21


In [156]:
complete_annotated_filtered.shape[0]/full_filtered_df.shape[0]

0.3259372232608482

In [157]:
go_df.head()

Unnamed: 0,GOs,GOs_description,GOs_definition,GOs_obsolete
0,-,-,-,-
1,GO:0000001,mitochondrion inheritance,"The distribution of mitochondria, including th...",False
2,GO:0000002,mitochondrial genome maintenance,The maintenance of the structure and integrity...,False
3,GO:0000003,reproduction,The production of new individuals that contain...,False
4,GO:0000011,vacuole inheritance,The distribution of vacuoles into daughter cel...,False


In [158]:
go_df['GOs_description'].loc[go_df['GOs'] == 'GO:0000001'].values[0]

'mitochondrion inheritance'

In [159]:
kegg_df.head()

Unnamed: 0,KEGG_ko,KEGG_ko_description
0,-,-
1,ko:K00002,alcohol dehydrogenase (NADP+)
2,ko:K00006,glycerol-3-phosphate dehydrogenase (NAD+)
3,ko:K00011,aldehyde reductase
4,ko:K00012,UDPglucose 6-dehydrogenase


In [160]:
ec_df.head()

Unnamed: 0,EC,EC_description
0,-,-
1,1.1.1.1,alcohol dehydrogenase
2,1.1.1.105,all-trans-retinol dehydrogenase (NAD(+))
3,1.1.1.141,15-hydroxyprostaglandin dehydrogenase (NAD(+))
4,1.1.1.178,3-hydroxy-2-methylbutyryl-CoA dehydrogenase


In [161]:
# As of 2020 https://www.ncbi.nlm.nih.gov/research/cog/
COG_dict = {
    "A" : "RNA processing and modification",
    "B" : "Chromatin structure and dynamics",
    "C" : "Energy production and conversion",
    "D" : "Cell cycle control, cell division, chromosome partitioning",
    "E" : "Amino acid transport and metabolism",
    "F" : "Nucleotide transport and metabolism",
    "G" : "Carbohydrate transport and metabolism",
    "H" : "Coenzyme transport and metabolism",
    "I" : "Lipid transport and metabolism",
    "J" : "Translation, ribosomal structure and biogenesis",
    "K" : "Transcription",
    "L" : "Replication, recombination, and repair",
    "M" : "Cell wall/membrane/envelope biogenesis",
    "N" : "Cell motility",
    "O" : "Posttranslational modification, protein turnover, chaperones",
    "P" : "Inorganic ion transport and metabolism",
    "Q" : "Secondary metabolites biosynthesis, transport and catabolism",
    "T" : "Signal transduction mechanisms",
    "U" : "Intracellular trafficking, secretion, and vesicular transport",
    "V" : "Defense mechanisms",
    "W" : "Extracellular structures",
    "X" : "Mobilome: prophages, transposons",
    "Y" : "Nuclear structure",
    "Z" : "Cytoskeleton",
    "R" : "General function prediction only",
    "S" : "Function unknown",
}

In [162]:
def term_count_dict_from_annotation_df(annot_df, term_column):
    
    column = annot_df[term_column].values
    
    funct_terms = []
    for entry in column:
        if entry != '-':
            if term_column == 'COG_category':
                # terms = [f'{e}: {COG_dict[e]}' for e in entry]
                terms = list(entry)
            else:
                terms = entry.split(',')
            for t in terms:
                funct_terms.append(t)

#     len(terms)
    
    term_count_dict = {}
    
    for t in funct_terms:
        count = term_count_dict.get(t, 0)
        count += 1
        term_count_dict[t] = count
        
    return term_count_dict

In [163]:
def enrichment_analysis(module, leiden_label_df, phases, background_annotation, term_column):
    
    module_ttids = leiden_label_df.loc[leiden_label_df[f'leiden_label_{phases}'] == module]['TTHERM_ID'].values
    
    module_annotation = background_annotation.loc[background_annotation['TTHERM_ID'].isin(module_ttids)]
    
    background_term_dict = term_count_dict_from_annotation_df(background_annotation, term_column)
    module_term_dict = term_count_dict_from_annotation_df(module_annotation, term_column)
    
    bs = []
    ps = []
    folds = []
    terms = []
    
    for t, module_count in module_term_dict.items():
        
        background_count = background_term_dict[t]
        module_size = len(module_annotation)
        background_size = len(background_annotation)
        
        standard_contingency_table = [
                                [module_count, background_count - module_count], 
                                [module_size - module_count, background_size - module_size - (background_count - module_count)]
                            ]
        
        # The -1 and +1 make this more conservative (see explanation from the DAVID database: 
        # https://david.ncifcrf.gov/helps/functional_annotation.html#geneenrich)
        conservative_contingency_table = [
                                [module_count - 1, background_count - module_count + 1], 
                                [module_size - module_count, background_size - module_size - (background_count - module_count)]
                            ]
        
        
        odds, p_standard = st.fisher_exact(standard_contingency_table, 'greater')
        odds, p_conservative = st.fisher_exact(conservative_contingency_table, 'greater')
        
        p_reasonable = np.mean([p_standard, p_conservative])
        
        bonferroni  = p_reasonable * len(module_term_dict)

        fold_enrichment = (module_count/module_size) / (background_count/background_size)

        if bonferroni <= 0.05:
            
            ps.append(p_reasonable)
            bs.append(bonferroni)
            folds.append(fold_enrichment)
            terms.append(t)
            
#         else:
#             ps.append('')
#             bs.append('')
#             folds.append('')
#             terms.append('')
            
    return ps, bs, folds, terms
            
            

In [164]:
def get_GO_info(go_term):
    
    name = go_df['GOs_description'].loc[go_df['GOs'] == go_term].values[0]
    
    definition = go_df['GOs_definition'].loc[go_df['GOs'] == go_term].values[0]
    
    obsolete = go_df['GOs_obsolete'].loc[go_df['GOs'] == go_term].values[0]
    
    return name, definition, obsolete

In [165]:
get_GO_info('GO:0000001')

('mitochondrion inheritance',
 'The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.',
 'False')

In [166]:
def get_KEGG_info(term):
    return kegg_df['KEGG_ko_description'].loc[kegg_df['KEGG_ko'] == term].values[0]

In [167]:
get_KEGG_info('ko:K00002')

'alcohol dehydrogenase (NADP+)'

In [168]:
def get_EC_info(term):
    return ec_df['EC_description'].loc[ec_df['EC'] == term].values[0]

In [169]:
get_EC_info('1.1.1.141')

'15-hydroxyprostaglandin dehydrogenase (NAD(+))'

In [170]:
def write_enrichment(lldf, phases, out_prefix, term_column):
    
    with open(f'../enrichment/{out_prefix}_enrichment.txt', 'w') as fl:
    
        for m in tqdm.tqdm(sorted(lldf[f'leiden_label_{phases}'].unique())):

            # print(f'Module {m}')
            ps, bs, folds, terms = enrichment_analysis(m, lldf, phases, complete_annot, term_column)

            fl.write(f'Module {m}\n')
            for b, fold, t in zip(bs, folds, terms):
                
                if term_column == 'COG_category':
                    
                    definition = COG_dict[t]
                    t = f'{t}: {definition}'
                
                elif term_column == 'GOs':
                    name, definition, obsolete = get_GO_info(t)
                    if obsolete:
                        t = f'{name.capitalize()} ({t}): {definition} (obsolete)'
                    else:
                        t = f'{name.capitalize()} ({t}): {definition}'
                        
                elif term_column.split('_')[0] == 'KEGG':
                    name = get_KEGG_info(t)
                    
                    t = f'{t}: {name}'
                    
                elif term_column == 'EC':
                    
                    name = get_EC_info(t)
                    t = f'{t}: {name}'
                    
                        
                # print(f"{t}\nFold-enrichment: {fold:.02f}\nBonferroni-corrected p-value: {b:.02e}\n\n")

                fl.write(f'{t}\nFold-enrichment: {fold:.02f}\nBonferroni-corrected p-value: {b:.02e}\n\n')

In [171]:
def get_enrichment_df(lldf, phases, background_annotation, term_columns=['COG_category', 'GOs', 'KEGG_ko', 'EC'], outfile=None):
    
    module_dfs = []
    
    for m in tqdm.tqdm(sorted(lldf[f'leiden_label_{phases}'].unique())):
        
        term_dfs = []

        for tc in term_columns:
        
            ps, bs, folds, terms = enrichment_analysis(m, lldf, phases, background_annotation, tc)

            # fl.write(f'Module {m}\n')
            
            info = []

            if tc == 'GOs':

                for t in terms:
                    name, definition, obsolete = get_GO_info(t)
                    if obsolete:
                        info.append(f'{name.capitalize()}: {definition} (obsolete)')
                    else:
                        info.append(f'{name.capitalize()}: {definition}')
                        
            elif tc == 'COG_category':
                for t in terms:
                    info.append(COG_dict[t])
                                
            elif tc == 'KEGG_ko':
                for t in terms:
                    info.append(get_KEGG_info(t))
                    
            elif tc == 'EC':
                for t in terms:
                    info.append(get_EC_info(t))
                    
            term_df = pd.DataFrame({'module': [m]*len(terms),
                                    'term': terms,
                                    'info': info,
                                    'fold_change': folds,
                                    'bonferroni': bs})
            
            term_dfs.append(term_df)
            
        module_df = pd.concat(term_dfs)
        
        module_dfs.append(module_df)
        
    all_enrichment_df = pd.concat(module_dfs)
    
    if outfile:
        all_enrichment_df.to_csv(outfile, index=False)
    
    return all_enrichment_df
        
        
                        
            
                    
            
        
#         for b, fold, t in zip(bs, folds, terms):

#             if term_column == 'GOs':
#                 name, definition, obsolete = get_GO_info(t)
#                 if obsolete:
#                     t = f'{name.capitalize()} ({t}): {definition} (obsolete)'
#                 else:
#                     t = f'{name.capitalize()} ({t}): {definition}'

#             if term_column.split('_')[0] == 'KEGG':
#                 dfs = pd.read_html(f'https://www.genome.jp/entry/{t}')

#                 data = dfs[4]
#                 entry = data.loc[data[0] == 'Entry'][1].values[0]
#                 name = data.loc[data[0] == 'Name'][1].values[0]

#                 t = f'{t}: {name}'

In [172]:
complete_annotation = pd.read_csv('../eggnog/complete_eggnog_annotation.csv')

In [173]:
lldf_nn3 = pd.read_csv('../embedding/test_nn3_leiden_label_df_round_1.csv')

In [174]:
gene_list_test = ['TTHERM_000013409', 'TTHERM_01321550', 'TTHERM_00011710', 'TTHERM_00321680', 'TTHERM_00355700', 'TTHERM_00938950', 'TTHERM_01372820', 'TTHERM_00013410', 'TTHERM_00390080', 'TTHERM_00516380', 'TTHERM_00038880', 'TTHERM_00059370', 'TTHERM_00473020', 'TTHERM_00497590', 'TTHERM_00558350', 'TTHERM_00052190', 'TTHERM_00392790', 'TTHERM_00410180', 'TTHERM_00685980', 'TTHERM_00445920', 'TTHERM_00471040', 'TTHERM_00140780', 'TTHERM_00145480', 'TTHERM_00321720', 'TTHERM_00628650', 'TTHERM_00526730', 'TTHERM_01156770', 'TTHERM_00312200', 'TTHERM_01332070', 'TTHERM_00318900', 'TTHERM_00340180', 'TTHERM_00592740', 'TTHERM_00440600', 'TTHERM_01321570', 'TTHERM_00537380', 'TTHERM_00585170', 'TTHERM_01197130', 'TTHERM_00554390', 'TTHERM_00649180', 'TTHERM_00691410', ]
for gene in gene_list_test:
    print((lldf_nn3.loc[lldf_nn3['TTHERM_ID'] == gene])['leiden_label_full'].values[0])

839
839
839
839
839
839
839
837
837
837
837
837
837
837
837
837
837
838
837
837
837
838
838
838
838
838
838
839
839
838
838
838
839
839
837
837
839
837
837
837


In [175]:
lldf_nn3['leiden_label_full'].unique()

array([1283, 1282, 1281, ...,    2,    1,    0])

In [176]:
# %pdb

In [177]:
enrich_full_nn3 = get_enrichment_df(lldf_nn3, 'full', complete_annotation, outfile='./test_nn3_full_enrichment.csv')

  0%|          | 2/1284 [00:00<02:35,  8.27it/s]

100%|██████████| 1284/1284 [02:03<00:00, 10.36it/s]


In [178]:
enrich_full_nn3 = pd.read_csv('./test_nn3_full_enrichment.csv')

In [179]:
enrich_full_nn3.head(40)

Unnamed: 0,module,term,info,fold_change,bonferroni
0,1.0,U,"Intracellular trafficking, secretion, and vesi...",32.699875,4.473578e-07
1,1.0,GO:0005575,"Cellular_component: A location, relative to ce...",13.756647,0.01892595
2,1.0,GO:0005622,Intracellular anatomical structure: A componen...,14.877054,0.01493668
3,1.0,GO:0005623,Obsolete cell: OBSOLETE. The basic structural ...,14.309537,0.01680029
4,1.0,GO:0005737,Cytoplasm: The contents of a cell excluding th...,18.12459,0.00822529
5,1.0,GO:0005794,Golgi apparatus: A membrane-bound cytoplasmic ...,140.042667,1.722394e-05
6,1.0,GO:0006464,Protein modification process: The covalent alt...,42.609331,0.0006239795
7,1.0,GO:0006471,Obsolete protein adp-ribosylation: OBSOLETE. T...,2100.64,3.739236e-09
8,1.0,GO:0006807,Nitrogen compound metabolic process: The chemi...,22.957814,0.004028969
9,1.0,GO:0006810,Transport: The directed movement of substances...,51.739901,0.0003475154


In [180]:
for term in enrich_full_nn3['term'].unique():
    if 'ko:' not in term and 'GO:' not in term and '.' not in term:
        print(term)

U
G
Q
M
O
T
B
Y
L
Z
D
K
J
S
P
A
W
F
C
E
H
I


In [181]:
print(max(enrich_full_nn3['module'].values))
print(min(enrich_full_nn3['module'].values))

not_enriched_mod_list = []
enriched_mod_list = []

for num in range(int(max(enrich_full_nn3['module'].values)) + 1):
    if num not in enrich_full_nn3['module'].values:
        not_enriched_mod_list.append(num)
    if num in enrich_full_nn3['module'].values:
        enriched_mod_list.append(num)

print(len(not_enriched_mod_list) + len(enriched_mod_list))
print(max(not_enriched_mod_list + enriched_mod_list))
print()

print(set(range(int(max(enrich_full_nn3['module'].values)) + 1)) - set(enrich_full_nn3['module'].values))
print(len(set(enrich_full_nn3['module'].values)))
print(len(enriched_mod_list))
print(len(not_enriched_mod_list))

1281.0
1.0
1282
1281

{0, 5, 10, 11, 14, 15, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 99, 100, 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119, 120, 122, 123, 125, 128, 130, 134, 136, 137, 138, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 155, 156, 157, 159, 160, 162, 163, 164, 165, 172, 173, 174, 175, 176, 177, 178, 180, 182, 183, 184, 185, 187, 188, 190, 192, 193, 194, 196, 203, 205, 206, 207, 208, 209, 210, 214, 215, 217, 219, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 231, 232, 234, 235, 238, 239, 240, 241, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 259, 260, 261, 263, 264, 265, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 27

In [182]:
print(not_enriched_mod_list)

[0, 5, 10, 11, 14, 15, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 99, 100, 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119, 120, 122, 123, 125, 128, 130, 134, 136, 137, 138, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 155, 156, 157, 159, 160, 162, 163, 164, 165, 172, 173, 174, 175, 176, 177, 178, 180, 182, 183, 184, 185, 187, 188, 190, 192, 193, 194, 196, 203, 205, 206, 207, 208, 209, 210, 214, 215, 217, 219, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 231, 232, 234, 235, 238, 239, 240, 241, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 259, 260, 261, 263, 264, 265, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,

In [183]:
enrich_full_nn3.loc[enrich_full_nn3['module']==72]

Unnamed: 0,module,term,info,fold_change,bonferroni


In [184]:
enrich_full_nn3['module'].unique()

array([1.000e+00, 2.000e+00, 3.000e+00, 4.000e+00, 6.000e+00, 7.000e+00,
       8.000e+00, 9.000e+00, 1.200e+01, 1.300e+01, 1.600e+01, 1.800e+01,
       3.000e+01, 5.400e+01, 9.300e+01, 9.700e+01, 9.800e+01, 1.070e+02,
       1.130e+02, 1.210e+02, 1.240e+02, 1.260e+02, 1.270e+02, 1.290e+02,
       1.310e+02, 1.320e+02, 1.330e+02, 1.350e+02, 1.390e+02, 1.440e+02,
       1.540e+02, 1.580e+02, 1.610e+02, 1.660e+02, 1.670e+02, 1.680e+02,
       1.690e+02, 1.700e+02, 1.710e+02, 1.790e+02, 1.810e+02, 1.860e+02,
       1.890e+02, 1.910e+02, 1.950e+02, 1.970e+02, 1.980e+02, 1.990e+02,
       2.000e+02, 2.010e+02, 2.020e+02, 2.040e+02, 2.110e+02, 2.120e+02,
       2.130e+02, 2.160e+02, 2.180e+02, 2.250e+02, 2.330e+02, 2.360e+02,
       2.370e+02, 2.430e+02, 2.580e+02, 2.620e+02, 2.660e+02, 2.840e+02,
       2.950e+02, 3.040e+02, 3.090e+02, 3.100e+02, 3.130e+02, 3.140e+02,
       3.280e+02, 3.300e+02, 3.360e+02, 3.470e+02, 3.520e+02, 3.530e+02,
       3.540e+02, 3.580e+02, 3.620e+02, 3.900e+02, 

In [185]:
lldf_nn3['leiden_label_full'].unique()

array([1283, 1282, 1281, ...,    2,    1,    0])

In [186]:
lldf_nn3.loc[lldf_nn3['TTHERM_ID'] == 'TTHERM_01386050']

Unnamed: 0,TTHERM_ID,leiden_label_full
8200,TTHERM_01386050,748


In [187]:
gene_list_test = ['TTHERM_000013409', 'TTHERM_01321550', 'TTHERM_00011710', 'TTHERM_00321680', 'TTHERM_00355700', 'TTHERM_00938950', 'TTHERM_01372820', 'TTHERM_00013410', 'TTHERM_00390080', 'TTHERM_00516380', 'TTHERM_00038880', 'TTHERM_00059370', 'TTHERM_00473020', 'TTHERM_00497590', 'TTHERM_00558350', 'TTHERM_00052190', 'TTHERM_00392790', 'TTHERM_00410180', 'TTHERM_00685980', 'TTHERM_00445920', 'TTHERM_00471040', 'TTHERM_00140780', 'TTHERM_00145480', 'TTHERM_00321720', 'TTHERM_00628650', 'TTHERM_00526730', 'TTHERM_01156770', 'TTHERM_00312200', 'TTHERM_01332070', 'TTHERM_00318900', 'TTHERM_00340180', 'TTHERM_00592740', 'TTHERM_00440600', 'TTHERM_01321570', 'TTHERM_00537380', 'TTHERM_00585170', 'TTHERM_01197130', 'TTHERM_00554390', 'TTHERM_00649180', 'TTHERM_00691410', ]
for gene in gene_list_test:
    print((lldf_nn3.loc[lldf_nn3['TTHERM_ID'] == gene])['leiden_label_full'].values[0])

839
839
839
839
839
839
839
837
837
837
837
837
837
837
837
837
837
838
837
837
837
838
838
838
838
838
838
839
839
838
838
838
839
839
837
837
839
837
837
837


In [188]:
len(set(enrich_full_nn3['module'].values))

285

end here. Return to embedding notebook!