In [1]:
import pandas as pd
import bs4
import requests
import json
import re
import tqdm

import scipy.stats as st
import numpy as np

import bokeh
from bokeh.plotting import show as show_interactive
from bokeh.plotting import output_file, output_notebook
from bokeh.layouts import column, row
from bokeh.models import CustomJS, TextInput, LassoSelectTool, Select, MultiSelect, ColorBar, Legend, LegendItem
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn, Button, HTMLTemplateFormatter
from bokeh.events import SelectionGeometry
from bokeh.transform import linear_cmap, jitter
from bokeh.models import FactorRange


In [2]:
# As of 2020 https://www.ncbi.nlm.nih.gov/research/cog/
COG_dict = {
    "A" : "RNA processing and modification",
    "B" : "Chromatin structure and dynamics",
    "C" : "Energy production and conversion",
    "D" : "Cell cycle control, cell division, chromosome partitioning",
    "E" : "Amino acid transport and metabolism",
    "F" : "Nucleotide transport and metabolism",
    "G" : "Carbohydrate transport and metabolism",
    "H" : "Coenzyme transport and metabolism",
    "I" : "Lipid transport and metabolism",
    "J" : "Translation, ribosomal structure and biogenesis",
    "K" : "Transcription",
    "L" : "Replication, recombination, and repair",
    "M" : "Cell wall/membrane/envelope biogenesis",
    "N" : "Cell motility",
    "O" : "Posttranslational modification, protein turnover, chaperones",
    "P" : "Inorganic ion transport and metabolism",
    "Q" : "Secondary metabolites biosynthesis, transport and catabolism",
    "T" : "Signal transduction mechanisms",
    "U" : "Intracellular trafficking, secretion, and vesicular transport",
    "V" : "Defense mechanisms",
    "W" : "Extracellular structures",
    "X" : "Mobilome: prophages, transposons",
    "Y" : "Nuclear structure",
    "Z" : "Cytoskeleton",
    "R" : "General function prediction only",
    "S" : "Function unknown",
}

In [3]:
def term_count_dict_from_annotation_df(annot_df, term_column):
    
    column = annot_df[term_column].values
    
    funct_terms = []
    for entry in column:
        if entry != '-':
            if term_column == 'COG_category':
                # terms = [f'{e}: {COG_dict[e]}' for e in entry]
                terms = list(entry)
            else:
                terms = entry.split(',')
            for t in terms:
                funct_terms.append(t)

#     len(terms)
    
    term_count_dict = {}
    
    for t in funct_terms:
        count = term_count_dict.get(t, 0)
        count += 1
        term_count_dict[t] = count
        
    return term_count_dict

In [4]:
def enrichment_analysis(module, leiden_label_df, phases, background_annotation, term_column):
    
    module_ttids = leiden_label_df.loc[leiden_label_df[f'leiden_label_{phases}'] == module]['TTHERM_ID'].values
    
    module_annotation = background_annotation.loc[background_annotation['TTHERM_ID'].isin(module_ttids)]
    
    background_term_dict = term_count_dict_from_annotation_df(background_annotation, term_column)
    module_term_dict = term_count_dict_from_annotation_df(module_annotation, term_column)
    
    bs = []
    ps = []
    folds = []
    terms = []
    
    for t, module_count in module_term_dict.items():
        
        background_count = background_term_dict[t]
        module_size = len(module_annotation)
        background_size = len(background_annotation)
        
        standard_contingency_table = [
                                [module_count, background_count - module_count], 
                                [module_size - module_count, background_size - module_size - (background_count - module_count)]
                            ]
        
        # The -1 and +1 make this more conservative (see explanation from the DAVID database: 
        # https://david.ncifcrf.gov/helps/functional_annotation.html#geneenrich)
        conservative_contingency_table = [
                                [module_count - 1, background_count - module_count + 1], 
                                [module_size - module_count, background_size - module_size - (background_count - module_count)]
                            ]
        
        
        odds, p_standard = st.fisher_exact(standard_contingency_table, 'greater')
        odds, p_conservative = st.fisher_exact(conservative_contingency_table, 'greater')
        
        p_reasonable = np.mean([p_standard, p_conservative])
        
        bonferroni  = p_reasonable * len(module_term_dict)

        fold_enrichment = (module_count/module_size) / (background_count/background_size)

        if bonferroni <= 0.05:
            
            ps.append(p_reasonable)
            bs.append(bonferroni)
            folds.append(fold_enrichment)
            terms.append(t)
            
#         else:
#             ps.append('')
#             bs.append('')
#             folds.append('')
#             terms.append('')
            
    return ps, bs, folds, terms
            
            

In [5]:
def get_GO_info(go_term):
    
    r = requests.get(f'https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{go_term}/complete', 'html5lib')
    
    go_info = json.loads(r.text)
    
    name = go_info['results'][0]['name']
    
    definition = go_info['results'][0]['definition']['text']
    
    obsolete = go_info['results'][0]['isObsolete']
    
    return name, definition, obsolete

In [6]:
def write_enrichment(lldf, phases, out_prefix, term_column):
    
    with open(f'../enrichment/{out_prefix}_enrichment.txt', 'w') as fl:
    
        for m in tqdm.tqdm(sorted(lldf[f'leiden_label_{phases}'].unique())):

            # print(f'Module {m}')
            ps, bs, folds, terms = enrichment_analysis(m, lldf, phases, complete_annot, term_column)

            fl.write(f'Module {m}\n')
            for b, fold, t in zip(bs, folds, terms):
                
                if term_column == 'COG_category':
                    
                    definition = COG_dict[t]
                    t = f'{t}: {definition}'
                
                elif term_column == 'GOs':
                    name, definition, obsolete = get_GO_info(t)
                    if obsolete:
                        t = f'{name.capitalize()} ({t}): {definition} (obsolete)'
                    else:
                        t = f'{name.capitalize()} ({t}): {definition}'
                        
                elif term_column.split('_')[0] == 'KEGG':
                    dfs = pd.read_html(f'https://www.genome.jp/entry/{t}')
                    
                    data = dfs[4]
                    entry = data.loc[data[0] == 'Entry'][1].values[0]
                    name = data.loc[data[0] == 'Name'][1].values[0]
                    
                    t = f'{t}: {name}'
                        
                # print(f"{t}\nFold-enrichment: {fold:.02f}\nBonferroni-corrected p-value: {b:.02e}\n\n")

                fl.write(f'{t}\nFold-enrichment: {fold:.02f}\nBonferroni-corrected p-value: {b:.02e}\n\n')

In [7]:
def get_KEGG_info(term):
    try:
        dfs = pd.read_html(f'https://www.genome.jp/entry/{term}')
    except:
        return "NA"

    data = dfs[3]
    entry = data.loc[data[0] == 'Entry'][1].values[0]
    name = data.loc[data[0] == 'Name'][1].values[0]
    
    return name

In [8]:
def get_enrichment_df(lldf, phases, background_annotation, term_columns=['COG_category', 'GOs', 'KEGG_ko'], outfile=None):
    
    module_dfs = []
    
    for m in tqdm.tqdm(sorted(lldf[f'leiden_label_{phases}'].unique())):
        
        term_dfs = []

        for tc in term_columns:
        
            ps, bs, folds, terms = enrichment_analysis(m, lldf, phases, background_annotation, tc)

            # fl.write(f'Module {m}\n')
            
            info = []

            if tc == 'GOs':

                for t in terms:
                    name, definition, obsolete = get_GO_info(t)
                    if obsolete:
                        info.append(f'{name.capitalize()}: {definition} (obsolete)')
                    else:
                        info.append(f'{name.capitalize()}: {definition}')
                        
            elif tc == 'COG_category':
                for t in terms:
                    info.append(COG_dict[t])
                                
            elif tc == 'KEGG_ko':
                for t in terms:
                    info.append(get_KEGG_info(t))
                    
            term_df = pd.DataFrame({'module': [m]*len(terms),
                                    'term': terms,
                                    'info': info,
                                    'fold_change': folds,
                                    'bonferroni': bs})
            
            term_dfs.append(term_df)
            
        module_df = pd.concat(term_dfs)
        
        module_dfs.append(module_df)
        
    all_enrichment_df = pd.concat(module_dfs)
    
    if outfile:
        all_enrichment_df.to_csv(outfile, index=False)
    
    return all_enrichment_df
        
        
                        
            
                    
            
        
#         for b, fold, t in zip(bs, folds, terms):

#             if term_column == 'GOs':
#                 name, definition, obsolete = get_GO_info(t)
#                 if obsolete:
#                     t = f'{name.capitalize()} ({t}): {definition} (obsolete)'
#                 else:
#                     t = f'{name.capitalize()} ({t}): {definition}'

#             if term_column.split('_')[0] == 'KEGG':
#                 dfs = pd.read_html(f'https://www.genome.jp/entry/{t}')

#                 data = dfs[4]
#                 entry = data.loc[data[0] == 'Entry'][1].values[0]
#                 name = data.loc[data[0] == 'Name'][1].values[0]

#                 t = f'{t}: {name}'

In [9]:
complete_annotation = pd.read_csv('../eggnog/complete_eggnog_annotation.csv')

In [89]:
lldf_nn3 = pd.read_csv('../embedding/test_nn3_leiden_label_df_round_1_rearranged.csv')

In [90]:
lldf_nn3['leiden_label_full'].unique()

array([43, 37, 16, 41, 24, 30, 54, 19,  3, 45,  4, 22, 57, 28, 20,  8, 21,
        1, 60, 55, 48, 10, 53, 12,  5, 33, 46, 39, 47, 49, 23, 25, 32, 18,
       34,  2, 44, 13, 15, 52, 50, 11, 26, 42, 56, 14, 40, 38,  9, 35, 31,
       36, 59, 17, 58,  7, 51,  0, 29,  6, 27])

In [91]:
lldf_nn2 = pd.read_csv('../embedding/test_nn2_leiden_label_df_round_1.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../embedding/test_nn2_leiden_label_df_round_1.csv'

In [15]:
%pdb

Automatic pdb calling has been turned ON


In [92]:
enrich_full_nn3 = get_enrichment_df(lldf_nn3, 'full', complete_annotation, outfile='./test_nn3_full_enrichment.csv')

100%|██████████| 61/61 [04:08<00:00,  4.08s/it]


In [79]:
enrich_full_nn3 = pd.read_csv('./test_nn3_full_enrichment.csv')

In [80]:
enrich_full_nn3.head()

Unnamed: 0,module,term,info,fold_change,bonferroni
0,0.0,O,"Posttranslational modification, protein turnov...",3.825466,8.344037e-13
1,0.0,GO:0000338,Protein deneddylation: The removal of a ubiqui...,112.213675,0.001572177
2,0.0,GO:0008180,Cop9 signalosome: A protein complex that catal...,112.213675,0.001572177
3,0.0,GO:0016333,Morphogenesis of follicular epithelium: The pr...,96.18315,0.002743434
4,0.0,GO:0019827,Stem cell population maintenance: The process ...,48.091575,0.02796343


In [81]:
enrich_full_nn3['module'].unique()

array([ 0.,  2.,  3.,  4.,  5.,  6.,  9., 10., 11., 12., 13., 14., 15.,
       16., 19., 21., 22., 25., 27., 28., 29., 30., 31., 32., 33., 35.,
       38., 39., 40., 41., 43., 46., 48., 50., 51., 53., 55., 57., 59.,
       60.])

In [82]:
lldf_nn3['leiden_label_full'].unique()

array([43, 60,  1, 21,  8, 20, 28, 57, 22,  4, 45,  3, 19, 54, 30, 37, 16,
       41, 24, 39, 56, 42, 50, 11, 26, 34, 48, 52, 15, 13, 44, 25, 32, 18,
       31, 23, 49, 10, 33, 47, 12,  5, 53, 46, 55, 59, 17, 58, 27,  6, 29,
        0, 51,  2,  7, 35,  9, 38, 40, 14, 36])

In [74]:
lldf_nn3.loc[lldf_nn3['TTHERM_ID'] == 'TTHERM_01386050']

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
17314,TTHERM_01386050,47,3,47


In [135]:
palette64 = """
white\n#FA002E\n#22FC22\n#221CFA\n#FF3DD6\n#FFDA00\n#00FEFB\n#F48684\n#CEB4FE\n#FFFFE5\n#0D933D\n#CC00F8\n#800D5D\n#F10084\n#22267A\n#0DADFF\n#CBFD71\n#9A761C\n#F96C00\n#6399A6\n#FFBCDA\n#8D0DA3\n#F79F26\n#00FFBF\n#A37CFB\n#F68EEB\n#720D0D\n#F163AA\n#7E926A\n#826386\n#B41C32\n#9BEBCE\n#E2DB83\n#56D4FA\n#E6E2FB\n#925D58\n#F7C3A7\n#62E970\n#220DBD\n#5583BB\n#7EA01C\n#CDFDB6\n#FD00FB\n#B30D97\n#F5FF00\n#DD77FD\n#4282FC\n#BBA6A4\n#0D8068\n#AB5F26\n#F7C26E\n#9EFE00\n#9B2EFD\n#C56887\n#FD3D68\n#ABF2FD\n#835FAC\n#FF16B1\n#325371\n#CA16CA\n#D26322\n#AFCFFE\n#91A1FA\nfloralwhite
""".split()

In [136]:
colors = [palette64[int(m)] for m in enrich_full_nn3['module'].values]
enrich_full_nn3['color'] = colors
enrich_full_nn3.head()

Unnamed: 0,module,term,info,fold_change,bonferroni,color
0,2.0,U,"Intracellular trafficking, secretion, and vesi...",1.84981,0.02851542,#22FC22
1,3.0,J,"Translation, ribosomal structure and biogenesis",28.298794,6.50174e-81,#221CFA
2,3.0,GO:0002181,Cytoplasmic translation: The chemical reaction...,127.522296,2.056548e-17,#221CFA
3,3.0,GO:0003674,Molecular_function: A molecular process that c...,3.811736,5.612665e-07,#221CFA
4,3.0,GO:0003676,Nucleic acid binding: Binding to a nucleic acid.,18.458027,4.172928e-15,#221CFA


In [143]:
ecds = bokeh.models.ColumnDataSource(enrich_full_nn3)

In [147]:
len(enrich_full_nn3)

401

In [149]:
len(ecds.data['module'])

401

In [150]:
ecds.data['module'].unique

AttributeError: 'numpy.ndarray' object has no attribute 'unique'

In [141]:
def plot_enrichment(enrich_df):
    
    # pdb.set_trace()
    
    # y_range = FactorRange(factors=[str(y) for y in enrich_df['module'].unique()])
    
    # grouped = enrich_df.groupby('module')
    
    hover = [
        ('module', '@module'),
        ('term', '@term'),
        ('info', '@info'),
        ('fold-change', '@fold_change'),
        ('bonferroni', '@bonferroni')
    ]
    
    p = bokeh.plotting.figure(
        height=1200,
        width=400,
        # y_range=y_range,
        title='Functional term enrichment in modules',
        x_axis_label='fold-change',
        y_axis_label='module',
        x_axis_type='log',
        tooltips=hover,
        # background_fill_color='black'
    )
    
    cds = bokeh.models.ColumnDataSource(enrich_df)
    print(enrich_df.head())
    
    p.circle(y=jitter('module', width=0.4), x='fold_change', source=cds, alpha=0.3, size=7, color='color', line_color='black')
    # p.xaxis.major_label_orientation = 45
    p.ygrid.minor_grid_line_color = 'navy'
    p.ygrid.minor_grid_line_alpha = 0.1
    # p.xgrid.band_fill_alpha = 0.1
    # p.xgrid.band_fill_color = "navy"
    p.yaxis.ticker = [m for m in enrich_df['module'].unique()]
    p.y_range.flipped = True
    
    return p

In [142]:
p = plot_enrichment(enrich_full_nn3)
bokeh.io.show(p)

   module        term                                               info  \
0     2.0           U  Intracellular trafficking, secretion, and vesi...   
1     3.0           J    Translation, ribosomal structure and biogenesis   
2     3.0  GO:0002181  Cytoplasmic translation: The chemical reaction...   
3     3.0  GO:0003674  Molecular_function: A molecular process that c...   
4     3.0  GO:0003676   Nucleic acid binding: Binding to a nucleic acid.   

   fold_change    bonferroni    color  
0     1.849810  2.851542e-02  #22FC22  
1    28.298794  6.501740e-81  #221CFA  
2   127.522296  2.056548e-17  #221CFA  
3     3.811736  5.612665e-07  #221CFA  
4    18.458027  4.172928e-15  #221CFA  


In [76]:
FactorRange(factors=[y for y in enrich_full_nn3['module'].unique()]).factors

['m00',
 'm01',
 'm02',
 'm03',
 'm04',
 'm05',
 'm06',
 'm07',
 'm08',
 'm09',
 'm10',
 'm11',
 'm13',
 'm14',
 'm15',
 'm16',
 'm17',
 'm18',
 'm19',
 'm20',
 'm21',
 'm22',
 'm24',
 'm26',
 'm27',
 'm28',
 'm29',
 'm30',
 'm31',
 'm32',
 'm33',
 'm35',
 'm36',
 'm38',
 'm39',
 'm41',
 'm42',
 'm45',
 'm46',
 'm47',
 'm50',
 'm54',
 'm56',
 'm60',
 'm61',
 'm63']

In [63]:
enrich_full = get_enrichment_df(lldf, 'full', complete_annotation, outfile='./full_enrichment_leiden_round_1.csv')

100%|███████████████████████████████████████████████████████████████████████████████████| 64/64 [05:39<00:00,  5.31s/it]


In [64]:
enrich_sex = get_enrichment_df(lldf, 'sex', complete_annotation, outfile='./sex_enrichment_leiden_round_1.csv')

100%|███████████████████████████████████████████████████████████████████████████████████| 59/59 [03:36<00:00,  3.67s/it]


In [65]:
enrich_veg = get_enrichment_df(lldf, 'veg', complete_annotation, outfile='./veg_enrichment_leiden_round_1.csv')

100%|███████████████████████████████████████████████████████████████████████████████████| 58/58 [05:22<00:00,  5.56s/it]


In [50]:
enrich['module'].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 24., 26., 27., 28.,
       29., 30., 31., 32., 33., 35., 36., 38., 39., 41., 42., 45., 46.,
       47., 50., 54., 56., 60., 61., 63.])

In [None]:
write_enrichment(leiden_label_df_round_1, 'full', 'full_leiden_GOs', 'GOs')

 23%|███████████████████▍                                                               | 15/64 [02:10<00:57,  1.18s/it]

In [None]:
write_enrichment(consensus_lldf_full_3, 'full', 'consensus_1_full_leiden_GOs', 'GOs')

In [89]:
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.sampledata.commits import data
from bokeh.transform import jitter



In [88]:
data

Unnamed: 0_level_0,day,time
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-04-22 15:11:58-05:00,Sat,15:11:58
2017-04-21 14:20:57-05:00,Fri,14:20:57
2017-04-20 14:35:08-05:00,Thu,14:35:08
2017-04-20 10:34:29-05:00,Thu,10:34:29
2017-04-20 09:17:23-05:00,Thu,09:17:23
...,...,...
2013-01-24 17:08:57-06:00,Thu,17:08:57
2013-01-21 16:22:39-06:00,Mon,16:22:39
2013-01-03 16:28:49-06:00,Thu,16:28:49
2013-01-02 17:46:43-06:00,Wed,17:46:43


In [96]:
data = data.replace({'Mon': 'm00', 'Tue': 'm01', 'Wed': 'm02', 'Thu': 'm03', 'Fri': 'm04', 'Sat': 'm05', 'Sun': 'm06'})
data

Unnamed: 0_level_0,day,time
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-04-22 15:11:58-05:00,m05,15:11:58
2017-04-21 14:20:57-05:00,m04,14:20:57
2017-04-20 14:35:08-05:00,m03,14:35:08
2017-04-20 10:34:29-05:00,m03,10:34:29
2017-04-20 09:17:23-05:00,m03,09:17:23
...,...,...
2013-01-24 17:08:57-06:00,m03,17:08:57
2013-01-21 16:22:39-06:00,m00,16:22:39
2013-01-03 16:28:49-06:00,m03,16:28:49
2013-01-02 17:46:43-06:00,m02,17:46:43


In [97]:
output_file("bars.html")

DAYS = ['m00', 'm01', 'm02', 'm03', 'm04', 'm05', 'm06']

source = ColumnDataSource(data)

p = figure(height=300, y_range=DAYS, x_axis_type='datetime',
           toolbar_location=None, sizing_mode="stretch_width",
           title="Commits by time of day (US/Central) 2012—2016")

p.circle(x='time', y=jitter('day', width=0.6, range=p.y_range),  source=source, alpha=0.3)

p.xaxis[0].formatter.days = ['%Hh']
p.x_range.range_padding = 0
p.ygrid.grid_line_color = None

show(p)