## Import all packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pdb

import bokeh
from bokeh.plotting import show as show_interactive
from bokeh.plotting import output_file, output_notebook
from bokeh.layouts import column, row
from bokeh.models import CustomJS, TextInput, LassoSelectTool, Select, MultiSelect, ColorBar, Legend, LegendItem
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn, Button, HTMLTemplateFormatter
from bokeh.events import SelectionGeometry
from bokeh.transform import linear_cmap, jitter
from matplotlib.pyplot import show as show_static
# from clustergrammer2 import net, Network, CGM2

import igraph as ig
import leidenalg as la
from sklearn.preprocessing import StandardScaler

import scipy.stats as st
import scipy.spatial
import scipy.cluster.hierarchy

import glob
import json
import re
import copy

import requests
import bs4
import tqdm
import os

from Bio import SeqIO

import umap
import pymde

import torch

# bokeh.io.output_notebook()

## Define functions

In [183]:
def get_geom_mean_expression(expression_df):
    """
    
    Function to take an expression dataframe from the microarrays and collapse it into the means of
    all replicate chips.
    """
    # C2 and S12 got removed during quality control
    x = [
        'Ll', 
        'Lm', 
        'Lh', 
        'S0', 
        'S3', 
        'S6', 
        'S9', 
        # 'S12', 
        'S15', 
        'S24', 
        'C0', 
        # 'C2', 
        'C4', 
        'C6', 
        'C8', 
        'C10', 
        'C12', 
        'C14', 
        'C16', 
        'C18']
    
    # cols = expression_df.columns[1:]
    # x = [c for c in x if c in cols]
    
    condition_expr_dict = {c.split("_")[0]: [] for c in expression_df.columns[1:]}
    
    for c in list(expression_df.columns)[1:]:
        
        cond = c.split('_')[0]
        if cond in condition_expr_dict.keys():
            expr_list = condition_expr_dict.get(cond, [])

            # Need to avoid true zeros
            expr_list.append(expression_df[c].values)
            condition_expr_dict[cond] = expr_list
        
    condition_mean_dict = {c: (st.mstats.gmean(np.array(condition_expr_dict[c]) + 1, 0) - 1) for c in condition_expr_dict.keys() if c in x}
    
    mean_expr_df = pd.DataFrame(condition_mean_dict)
    mean_expr_df['TTHERM_ID'] = expression_df['TTHERM_ID'].values
    cols = list(mean_expr_df.columns)
    reorder = cols[-1:] + cols[:-1]
    mean_expr_df = mean_expr_df[reorder]
    
    return mean_expr_df

def normalizer(array):
    """
    Normalizes the values of an array to range from zero to one
    """
    
    a = np.array(array)
    
    normalized = (array - np.min(array)) / (np.max(array) - np.min(array))
    
    return normalized

def normalize_expression_per_gene(expression_df):
    """
    Function to normalize all gene expression to range from zero to one.
    """
    if 'TTHERM_ID' in expression_df.columns:
        ttids = expression_df['TTHERM_ID'].values
        data = expression_df[list(expression_df.columns)[1:]]
        
        norm_expression_df = data.apply(lambda row: normalizer(row), axis=1)
        norm_expression_df['TTHERM_ID'] = ttids
        
        columns = norm_expression_df.columns.tolist()
        
        rearrangment = columns[-1:] + columns[:-1]
        
        norm_expression_df = norm_expression_df[rearrangment]
        
    else:
        norm_expression_df = expression_df.apply(lambda row: normalizer(row), axis=1)
    
    return norm_expression_df
    


def run_leiden(df, n_components=2, n_neighbors=3, random_state=42, metric='manhattan', return_dists=True):
    """
    Function to compute the simplicial sets for coexpression using UMAP and to then apply
    the Leiden algorithm to cluster the resulting graph.
    
    Parameters:
    -----------
    df : pandas dataframe
        the expression data
    n_components : int (default 2)
        the number of dimensions onto which the data should be projected
    n_neighbors : int (default 15)
        a parameter for the UMAP algorithm. I think it has to do with balancing
        local vs. global topology in the data
    random_state : float (default 42)
        Constraining this parameter makes the output reproducible
    metric : str (default "euclidean")
        The distance function
    return_dists : Bool (default True)
        Whether the function should return the computed distances
        
    Returns:
    --------
    leiden_modules : np array
        An array of ints, each corresponding to the module (or cluster) to which a gene belongs,
        listed in ortder of the input dataframe
    """
    
    data = df[list(df.columns)[1:]].values
    
#     mapper = umap.UMAP(random_state=random_state, n_components=n_components, n_neighbors=n_neighbors).fit(data)
    
    result, sigmas, rhos, dists = umap.umap_.fuzzy_simplicial_set(data, n_neighbors, random_state, metric, return_dists=return_dists)
    
    sources, targets = result.nonzero()
    edge_list = zip(sources, targets)
    weights = result.data
    
    g = ig.Graph(edges=edge_list, edge_attrs={'weight': weights})
    
    partition = la.find_partition(g, la.ModularityVertexPartition, seed=random_state, weights='weight')
    leiden_modules = np.array(partition.membership)
    
    return leiden_modules, dists

def build_leiden_label_df(data_df, phases, random_state=42, n_neighbors=3, metric='manhattan', lldf=None):
    """
    Function to build a dataframe of genes labeled according to their UMAP/Leiden modules
    
    Parameters:
    -----------
    data_df : pandas DataFrame
        The expression data
    phases : str ('full', 'veg', or 'sex')
        The physiological phases for which expression data is being provided
    lldf : pandas DataFrame (default None)
        Another leiden label df (lldf) to which to add a column
        
    Returns:
    --------
    lldf : pandas DataFrame
        Leiden Label DataFrame. Gene IDs and their corresponding UMAP/Leiden module
        computed for a specific physiological regime (full set (full), vegetative only
        (veg), or sexual only (sex))
    """
    
    if type(lldf) == type(None):
        lldf = pd.DataFrame.from_dict({'TTHERM_ID': []})
    
    leiden_modules, dists = run_leiden(data_df, random_state=random_state, n_neighbors=n_neighbors, metric=metric)
    lldf['TTHERM_ID'] = data_df['TTHERM_ID'].values
    
    lldf[f'leiden_label_{phases}'] = leiden_modules
    
    return lldf, dists


# The two functions below are taken and adapted from the UMAP package
def _get_umap_embedding(umap_object):
    if hasattr(umap_object, "embedding_"):
        return umap_object.embedding_
    elif hasattr(umap_object, "embedding"):
        return umap_object.embedding
    else:
        raise ValueError("Could not find embedding attribute of umap_object")
        
def plot_enrichment(enrich_column_data_source):
    
    # pdb.set_trace()
    
    # y_range = FactorRange(factors=[str(y) for y in enrich_df['module'].unique()])
    
    # grouped = enrich_df.groupby('module')
    
    hover = [
        ('module', '@module'),
        ('term', '@term'),
        ('info', '@info'),
        ('fold-change', '@fold_change'),
        ('bonferroni', '@bonferroni')
    ]
    
    p = bokeh.plotting.figure(
        height=1000,
        width=400,
        # y_range=y_range,
        title='Functional term enrichment in modules',
        x_axis_label='fold-change',
        y_axis_label='module',
        x_axis_type='log',
        tooltips=hover,
        # background_fill_color='black'
    )
    
    # cds = bokeh.models.ColumnDataSource(enrich_df)
    # print(enrich_df.head())
    
    p.circle(y=jitter('module', width=0.4), x='fold_change', source=enrich_column_data_source, alpha=0.3, size=7, color='color', line_color='black')
    # p.xaxis.major_label_orientation = 45
    p.ygrid.minor_grid_line_color = 'navy'
    p.ygrid.minor_grid_line_alpha = 0.1
    # p.xgrid.band_fill_alpha = 0.1
    # p.xgrid.band_fill_color = "navy"
    ticker = []
    for m in enrich_column_data_source.data['module']:
        if m not in ticker:
            ticker.append(m)
    p.yaxis.ticker = ticker
    p.y_range.flipped = True
    p.xaxis.major_label_text_font_size = '12pt'
    p.yaxis.major_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xaxis.axis_label_text_font_size = '12pt'
    
    return p

def heatmap(column_data_source, ls_color_palette, r_low, r_high, x_axis_factors, y_axis_factors, s_z="normalized_expression", index_name='TTHERM_ID', col_name='phase'):
    # adapted from https://gitlab.com/biotransistor/bokehheat/-/blob/master/bokehheat/heat.py
    """
    input:
        df_matrx: a dataframe in same xy orientation as the final heatmap.
          the index should cary the y axis label.
          the column should cary the x axis label.
          the matrix as such should only cary the z axis values.

        ls_color_palette: a list color strings to specify the color spectrum.
            this variable is compatible with the ordinary bokeh palettes:
            https://bokeh.pydata.org/en/latest/docs/reference/palettes.html

        r_low: quantitative minimum value. the dataset can contain lower values,
            but for color labeling they will be mapped to this minimum value.
            e.g.: -8.

        r_high: quantitative maximum value. the dataset can contain lower values,
            but for color labeling they will be mapped to this maximum value.
            e.g.: 8.

        s_z: string. label that specifies what the values in the matrix actually
            are. e.g.: 'gene expression [log2]'

    output:
        p: bokeh plot object.

    description:
        this function will return a bokeh based interactive heatmap plot.
        the color are representing the z value.
    """
    # index as string
#     df_matrix.index = df_matrix.index.astype(str)
#     df_matrix.columns = df_matrix.columns.astype(str)

#     # handle y and x axis name
#     if (df_matrix.index.name == None):
#         df_matrix.index.name = "y_axis"
#     if (df_matrix.columns.name == None):
#         df_matrix.columns.name = "x_axis"
    # pdb.set_trace()
    s_y = index_name
    
    # df_matrix.columns.name = 'phase'
    s_x = col_name
    
    
    # print(df_matrix.head())
    
    # melt dataframe
    # df_tidy = df_matrix.reset_index().melt(
    #     id_vars=[df_matrix.index.name],
    #     value_name=s_z
    # )
    # print(df_tidy.head())
    # color declaration
    d_zcolormapper = linear_cmap(
        field_name=s_z,
        palette=ls_color_palette,
        low=r_low,
        high=r_high
    )
    # tooltip declaration
    lt_tooltip = [
        (s_y, f"@{s_y}"),
        (s_x, f"@{s_x}"),
        (s_z, f"@{s_z}"),
        ('module', f'@module')
    ]
    # generate figure
    o_colorbar = ColorBar(color_mapper=d_zcolormapper['transform'])
    p = bokeh.plotting.figure(
        y_range=y_axis_factors,
        x_range=x_axis_factors,
        width=400,
        height=1000,
        tools = "box_zoom,hover,pan,reset,wheel_zoom,save",  # have to be set hardcoded
        active_drag = "box_zoom",  # have to be set hardcoded
        tooltips=lt_tooltip,
        title=s_z,
        toolbar_location='right',
        
    )
    
    p.rect(
        source=column_data_source,
        x=s_x,
        y=s_y,
        color=d_zcolormapper,
        width=1,
        height=1,
        fill_alpha='fill_alpha',
        line_alpha='line_alpha',
        # line_color='white',
        nonselection_fill_alpha=0.01,
        nonselection_line_alpha=0.01,
        # nonselection_line_color="white"
    )
    p.add_layout(o_colorbar, place='left')
    # p.yaxis.major_label_orientation = "horizontal"
    p.xaxis.major_label_orientation = 45
    # p.yaxis.major_label_text_font_size = '0pt'
    p.yaxis.visible = False
    p.xaxis.major_label_text_font_size = '12pt'

    # out
    return(p)
        
def interactive(
    embedding_df,
    x,
    # mean_expression_df,
    title=None,
    labels=None,
    values=None,
    hover_data=None,
    theme=None,
    cmap="Blues",
    color_key=None,
    color_key_cmap="Spectral",
    background="white",
#     width=800,
#     height=800,
    point_size=None,
    radius=None, # My contribution
#     subset_points=None,
    interactive_text_search=False,
    interactive_text_search_columns=None,
    interactive_text_search_alpha_contrast=0.999,
    alpha=None,
    normalized=True
):
    """Create an interactive bokeh plot of a UMAP embedding.
    While static plots are useful, sometimes a plot that
    supports interactive zooming, and hover tooltips for
    individual points is much more desireable. This function
    provides a simple interface for creating such plots. The
    result is a bokeh plot that will be displayed in a notebook.
    Note that more complex tooltips etc. will require custom
    code -- this is merely meant to provide fast and easy
    access to interactive plotting.
    Parameters
    ----------
    embedding_df: pandas DataFrame
        A expression dataframe with columns x and y, which are the
        2D embedding of a model (e.g., UMAP or pyMDE) on the expression data, and all the
        annotations, geometric means of expression, etc.
    x: list
        The categories for the x-axes of the heatmap and expression profiles
    labels: array, shape (n_samples,) (optional, default None)
        An array of labels (assumed integer or categorical),
        one for each data sample.
        This will be used for coloring the points in
        the plot according to their label. Note that
        this option is mutually exclusive to the ``values``
        option.
    values: array, shape (n_samples,) (optional, default None)
        An array of values (assumed float or continuous),
        one for each sample.
        This will be used for coloring the points in
        the plot according to a colorscale associated
        to the total range of values. Note that this
        option is mutually exclusive to the ``labels``
        option.
    hover_data: DataFrame, shape (n_samples, n_tooltip_features)
    (optional, default None)
        A dataframe of tooltip data. Each column of the dataframe
        should be a Series of length ``n_samples`` providing a value
        for each data point. Column names will be used for
        identifying information within the tooltip.
    theme: string (optional, default None)
        A color theme to use for plotting. A small set of
        predefined themes are provided which have relatively
        good aesthetics. Available themes are:
           * 'blue'
           * 'red'
           * 'green'
           * 'inferno'
           * 'fire'
           * 'viridis'
           * 'darkblue'
           * 'darkred'
           * 'darkgreen'
    cmap: string (optional, default 'Blues')
        The name of a matplotlib colormap to use for coloring
        or shading points. If no labels or values are passed
        this will be used for shading points according to
        density (largely only of relevance for very large
        datasets). If values are passed this will be used for
        shading according the value. Note that if theme
        is passed then this value will be overridden by the
        corresponding option of the theme.
    color_key: dict or array, shape (n_categories) (optional, default None)
        A way to assign colors to categoricals. This can either be
        an explicit dict mapping labels to colors (as strings of form
        '#RRGGBB'), or an array like object providing one color for
        each distinct category being provided in ``labels``. Either
        way this mapping will be used to color points according to
        the label. Note that if theme
        is passed then this value will be overridden by the
        corresponding option of the theme.
    color_key_cmap: string (optional, default 'Spectral')
        The name of a matplotlib colormap to use for categorical coloring.
        If an explicit ``color_key`` is not given a color mapping for
        categories can be generated from the label list and selecting
        a matching list of colors from the given colormap. Note
        that if theme
        is passed then this value will be overridden by the
        corresponding option of the theme.
    background: string (optional, default 'white')
        The color of the background. Usually this will be either
        'white' or 'black', but any color name will work. Ideally
        one wants to match this appropriately to the colors being
        used for points etc. This is one of the things that themes
        handle for you. Note that if theme
        is passed then this value will be overridden by the
        corresponding option of the theme.
    width: int (optional, default 800)
        The desired width of the plot in pixels.
    height: int (optional, default 800)
        The desired height of the plot in pixels
    point_size: int (optional, default None)
        The size of each point marker
    radius: int (optional, default None)
        The radius of each point marker (adjusts the point size while zooming)
    subset_points: array, shape (n_samples,) (optional, default None)
        A way to select a subset of points based on an array of boolean
        values.
    interactive_text_search: bool (optional, default False)
        Whether to include a text search widget above the interactive plot
    interactive_text_search_columns: list (optional, default None)
        Columns of data source to search. Searches labels and hover_data by default.
    interactive_text_search_alpha_contrast: float (optional, default 0.95)
        Alpha value for points matching text search. Alpha value for points
        not matching text search will be 1 - interactive_text_search_alpha_contrast
    alpha: float (optional, default: None)
        The alpha blending value, between 0 (transparent) and 1 (opaque).
    Returns
    -------
    """
    if theme is not None:
        cmap = _themes[theme]["cmap"]
        color_key_cmap = _themes[theme]["color_key_cmap"]
        background = _themes[theme]["background"]

    if labels is not None and values is not None:
        raise ValueError(
            "Conflicting options; only one of labels or values should be set"
        )
        
    if alpha is not None:
        if not 0.0 <= alpha <= 1.0:
            raise ValueError("Alpha must be between 0 and 1 inclusive")

    if point_size is None and radius is None:
        point_size = 100.0 / np.sqrt(points.shape[0])
        
    data = embedding_df
    # data = data.set_index('TTHERM_ID')
    # pdb.set_trace()
    if radius is not None:
        data['radius'] = radius

    if labels is not None:
        data["label"] = labels

        if color_key is None:
            unique_labels = np.unique(labels)
            num_labels = unique_labels.shape[0]
            color_key = _to_hex(
                plt.get_cmap(color_key_cmap)(np.linspace(0, 1, num_labels))
            )

        if isinstance(color_key, dict):
            data["color"] = pd.Series(labels).map(color_key)
        else:
            # print('here')
            unique_labels = np.unique(labels)
            if len(color_key) < unique_labels.shape[0]:
                # raise ValueError(
                #     "Color key must have enough colors for the number of labels"
                # )
                
                print('Color key has fewer colors than labels. Making all white')
                data['color'] = ['white']*len(labels)
            else:

                new_color_key = {k: color_key[i] for i, k in enumerate(unique_labels)}
                data["color"] = pd.Series(labels).map(new_color_key)

        colors = "color"

    elif values is not None:
        data["value"] = values
        palette = _to_hex(plt.get_cmap(cmap)(np.linspace(0, 1, 256)))
        colors = btr.linear_cmap(
            "value", palette, low=np.min(values), high=np.max(values)
        )

    else:
        colors = matplotlib.colors.rgb2hex(plt.get_cmap(cmap)(0.5))

    # print(data['color'].unique())
    # print(colors)

    if hover_data is not None:
        tooltip_dict = {}
        for col_name in hover_data:
            data[col_name] = hover_data[col_name]
            tooltip_dict[col_name] = "@{" + col_name + "}"
        tooltips = list(tooltip_dict.items())
    else:
        tooltips = None

    if alpha is not None:
        data["alpha"] = alpha
    else:
        alpha = 1
        data["alpha"] = alpha

    data_source = bokeh.plotting.ColumnDataSource(data)
    data_source.data['module'] = hover_data['module']
    data_source.data['ID'] = hover_data['ID']
    data_source.data['radius'] = np.ones_like(hover_data['ID']) * radius
    data_source.data['alpha'] = np.ones_like(hover_data['ID']) * alpha
    
    # print(data_source.data['ID'][:5])

    plot = bokeh.plotting.figure(
        width=800,
        height=500,
        tooltips=tooltips,
        tools="tap,box_select,pan,wheel_zoom,box_zoom,reset,save",
        background_fill_color=background,
        title=title
#             x_range=(np.floor(min(points[:,0])), np.ceil(max(points[:,0]))), # Get axes
#             y_range=(np.floor(min(points[:,1])), np.ceil(max(points[:,1])))
    )

    if point_size is not None:

        plot.circle(
            x="x",
            y="y",
            source=data_source,
            color=colors,
            size=point_size,
            alpha="alpha",
            line_color='black'
        )

    elif radius is not None:
        plot.circle(
            x="x",
            y="y",
            source=data_source,
            color=colors,
            radius=radius,
            alpha="alpha",
            line_color='black'
        )

    plot.grid.visible = False
    plot.axis.visible = False

    
    x_heatmap_profile = x
    
    # ['Ll', 
    #      'Lm', 
    #      'Lh', 
    #      'S0', 
    #      'S3', 
    #      'S6', 
    #      'S9', 
    #      # 'S12', 
    #      'S15', 
    #      'S24', 
    #      'C0', 
    #      # 'C2', 
    #      'C4', 
    #      'C6', 
    #      'C8', 
    #      'C10', 
    #      'C12', 
    #      'C14', 
    #      'C16', 
    #      'C18']
    
    if normalized:
        hm_min = 0
        hm_max = 1
        
    else:
        hm_min = 2
        hm_max = 16
    
    # For companion heatmap plot
    ttherm_ids = embedding_df['TTHERM_ID'].values
    hm_df = embedding_df[['TTHERM_ID'] + x_heatmap_profile]
    hm_df['module'] = hover_data['module'].values
    hm_df_tidy = hm_df.melt(id_vars=['TTHERM_ID', 'module'], var_name='phase', value_name='normalized_expression')
    hm_cds = bokeh.plotting.ColumnDataSource(hm_df_tidy)
    hm_cds.data['fill_alpha'] = [0.7]*len(hm_df_tidy)
    hm_cds.data['line_alpha'] = [0.7]*len(hm_df_tidy)
    # hm_cds.data['y_axis'] = ttherm_ids
    
    hm = heatmap(hm_cds, bokeh.palettes.Inferno256, hm_min, hm_max, x_heatmap_profile, ttherm_ids)
    
    

    # For companion expression plot

    expr_source = bokeh.plotting.ColumnDataSource(dict(
        ID=['blah'], 
        expr_xs=[['Ll']], 
        expr_ys=[[0]],
        alpha=[0],
        color=['black']))
    
    if normalized:
        y_axis_label = 'Geometric mean expression of normalized replicates'
        y_range = (-0.01, 1.01)
        
    else:
        y_axis_label = 'Geometric mean expression of replicates (log2-scale)'
        y_range = (3.9, 16.1)
    
    expr_fig = bokeh.plotting.figure(width=800, 
                                     height=500,
                                     background_fill_color=background,
                                     # x_axis_label='Phase or condition',
                                     y_axis_label=y_axis_label,
                                     x_range=x_heatmap_profile, 
                                     y_range=y_range
                                    )

    expr_fig.multi_line('expr_xs', 
                        'expr_ys', 
                        source=expr_source, 
                        alpha='alpha', 
                        line_width=3, 
                        line_join='round',
                        line_color="color"
                       )

    expr_fig.xaxis.major_label_orientation = np.pi/4
    expr_fig.xaxis.major_label_text_font_size = '12pt'
    expr_fig.yaxis.major_label_text_font_size = '12pt'
    expr_fig.yaxis.axis_label_text_font_size = '12pt'
    expr_fig.xgrid.grid_line_color='whitesmoke'
    expr_fig.xgrid.grid_line_alpha=0.2
    expr_fig.ygrid.grid_line_color='whitesmoke'
    expr_fig.ygrid.grid_line_alpha=0.2

    # For data table
    s2 = bokeh.plotting.ColumnDataSource(data=dict(ID=[]))

    columns = [TableColumn(field="ID",  title="TTHERM_ID", width=160, formatter=HTMLTemplateFormatter(template='<a href="http://ciliate.org/index.php/feature/details/<%= ID %>"target="_blank"><%= ID %></a>')),
               TableColumn(field="module",  title="Module", width=160),
               TableColumn(field='TGD2021_description', title='TGD2021_description', width=160),
               TableColumn(field="Description", title="eggNOG_description", width=160),
               TableColumn(field="Preferred_name", title="eggNOG_preferred_name", width=160),
               TableColumn(field="max_annot_lvl", title="max_annot_lvl", width=160),
               TableColumn(field="COG_category", title="COG_category", width=160),
               TableColumn(field='EC', title='EC', width=160),
               TableColumn(field='GOs', title='GOs', width=160),
               TableColumn(field='KEGG_ko', title='KEGG_ko', width=160),
               TableColumn(field='KEGG_Pathway', title='KEGG_Pathway', width=160),
               TableColumn(field='KEGG_Module', title='KEGG_Module', width=160),
               TableColumn(field='KEGG_Reaction', title='KEGG_Reaction', width=160),
               TableColumn(field='KEGG_rclass', title='KEGG_rclass', width=160),
               TableColumn(field='BRITE', title='BRITE', width=160),
               TableColumn(field='KEGG_TC', title='KEGG_TC', width=160),
               TableColumn(field='CAZy', title='CAZy', width=160),
               TableColumn(field='BiGG_Reaction', title='BiGG_Reaction', width=160),
#                    TableColumn(field="x",  title="x"),
#                    TableColumn(field="y",  title="y")
              ]
    table = DataTable(source=s2, 
                      columns=columns, 
                      width=1600, 
                      height=500,
                      editable=True,
                      selectable=True,
                      sortable=True,
                      index_width=10,
                      fit_columns=False,
                     )
    
    heatmap_callback = CustomJS(
        args=dict(
            s1=data_source,
            s_hm=hm_cds,
            cols=x_heatmap_profile
        ),
        code="""
        var d1 = s1.data;
        var d_hm = s_hm.data;
        
        var inds = s1.selected.indices;
        const num_cols = cols.length;
        
        //d_hm['TTHERM_ID'] = []
        //d_hm['normalized_expression'] = []
        d_hm['fill_alpha'] = []
        d_hm['line_alpha'] = []
        
        var selected_ttherm_ids = [];
        
        var ttids = d_hm['TTHERM_ID'].slice(0, 16595);
        
        if (inds.length == 0) {
            d_hm['fill_alpha'] = Array(d_hm['TTHERM_ID'].length).fill(0.7)
            d_hm['line_alpha'] = Array(d_hm['TTHERM_ID'].length).fill(0.7)
        }else{
        
            // Start with everything deselected
            d_hm['fill_alpha'] = Array(d_hm['TTHERM_ID'].length).fill(0.01)
            d_hm['line_alpha'] = Array(d_hm['TTHERM_ID'].length).fill(0.01)
        
            // Get the selected indices
            for (var i = 0; i < inds.length; i++) {
                selected_ttherm_ids.push(d1['ID'][inds[i]])
            }
            console.log(selected_ttherm_ids);
            
            // iterate over the selected ttherm ids
            for (var j = 0; j < selected_ttherm_ids.length; j++) {
            
                // var selected_gene = selected_ttherm_ids[j];
                // console.log(selected_gene);

                // ad hoc function to find if ttherm ids match
                var match = (element) => element == selected_ttherm_ids[j];
            
                // get index of matching ttherm id in heatmap
                var gene_index = ttids.findIndex(match);
                console.log(gene_index);
                
                // loop over the columns and highlight the selected genes
                for (var k = 0; k < num_cols; k++) {
                
                    d_hm['fill_alpha'][gene_index] = 0.7
                    d_hm['line_alpha'][gene_index] = 0.7

                    gene_index = gene_index + ttids.length
                
                }
            
            }
            
        }
        
        console.log(d_hm);
        
        s_hm.change.emit();
        
        """
    )

    expression_callback = CustomJS(
        args=dict(
            s1=data_source,
            s_expr=expr_source,
            alpha=alpha,
        ),
        code="""
        var d1 = s1.data;
        var d_expr = s_expr.data;

        var inds = s1.selected.indices;
        // console.log(inds)

        // console.log(d1['ID'].length)

        // d1['alpha'] = Array(d1['ID'].length).fill(0.2)

        // console.log(d_expr['ID'].length, d_expr['expr_xs'].length, d_expr['expr_ys'].length)

        d_expr['ID'] = [['blah']]
        d_expr['expr_xs'] = [['Ll']]
        d_expr['expr_ys'] = [[0]]
        d_expr['alpha'] = [0]
        d_expr['color'] = ['black']
        // s_expr.change.emit();

        // debugger;

        for (var i = 0; i < inds.length; i++) {
            // d_expr['alpha'][inds[i]] = 1/(inds.length * 2)
            // console.log(inds[i], i)
            d_expr['ID'].push(Array(18).fill(d1['ID'][inds[i]]))
            d_expr['expr_xs'].push(d1['expr_xs'][inds[i]])
            d_expr['expr_ys'].push(d1['expr_ys'][inds[i]])
            d_expr['alpha'].push(Math.min(1, Math.max(7/(inds.length), 0.05)))
            d_expr['color'].push(d1['color'][inds[i]])
            // console.log(d_expr)
            // console.log(i)
            // console.log(
            //     d_expr['ID'].length, 
            //     d_expr['expr_xs'].length, 
            //     d_expr['expr_ys'].length
            // )
        }

        // s1.change.emit();
        s_expr.change.emit();
        // console.log(s_expr.data)

        """

    )

    selection_callback =  CustomJS(args=dict(
                                          s1=data_source, 
                                          s2=s2,
                                          table=table), 
                                               code="""

        var d1 = s1.data;
        var d2 = s2.data;


        var inds = s1.selected.indices;

        d2['module'] = []
        d2['ID'] = []
        d2['TGD2021_description'] = []
        d2['Description'] = []
        d2['Preferred_name'] = []
        d2['max_annot_lvl'] = []
        d2['COG_category'] = []
        d2['EC'] = []
        d2['GOs'] = []
        d2['KEGG_ko'] = []
        d2['KEGG_Pathway'] = []
        d2['KEGG_Module'] = []
        d2['KEGG_Reaction'] = []
        d2['KEGG_rclass'] = []
        d2['BRITE'] = []
        d2['KEGG_TC'] = []
        d2['CAZy'] = []
        d2['BiGG_Reaction'] = []

        for (var i = 0; i < inds.length; i++) {
            d2['module'].push(d1['module'][inds[i]])
            d2['ID'].push(d1['ID'][inds[i]])
            d2['TGD2021_description'].push(d1['TGD2021_description'][inds[i]])
            d2['Description'].push(d1['Description'][inds[i]])
            d2['Preferred_name'].push(d1['Preferred_name'][inds[i]])
            d2['max_annot_lvl'].push(d1['max_annot_lvl'][inds[i]])
            d2['COG_category'].push(d1['COG_category'][inds[i]])
            d2['EC'].push(d1['EC'][inds[i]])
            d2['GOs'].push(d1['GOs'][inds[i]])
            d2['KEGG_ko'].push(d1['KEGG_ko'][inds[i]])
            d2['KEGG_Pathway'].push(d1['KEGG_Pathway'][inds[i]])
            d2['KEGG_Module'].push(d1['KEGG_Module'][inds[i]])
            d2['KEGG_Reaction'].push(d1['KEGG_Reaction'][inds[i]])
            d2['KEGG_rclass'].push(d1['KEGG_rclass'][inds[i]])
            d2['BRITE'].push(d1['BRITE'][inds[i]])
            d2['KEGG_TC'].push(d1['KEGG_TC'][inds[i]])
            d2['CAZy'].push(d1['CAZy'][inds[i]])
            d2['BiGG_Reaction'].push(d1['BiGG_Reaction'][inds[i]])
        }
        s2.change.emit();
        table.change.emit();
    """)

    data_source.selected.js_on_change('indices', selection_callback, expression_callback, heatmap_callback)

    if interactive_text_search:
        text_input = TextInput(value="Search module(s) or TTHERM_ID(s), e.g. TTHERM_00321680, TTHERM_00313130...", width=600)

        if interactive_text_search_columns is None:
            interactive_text_search_columns = []
            if hover_data is not None:
                interactive_text_search_columns.extend(hover_data.columns)
            if labels is not None:
                interactive_text_search_columns.append("label")

        if len(interactive_text_search_columns) == 0:
            warn(
                "interactive_text_search_columns set to True, but no hover_data or labels provided."
                "Please provide hover_data or labels to use interactive text search."
            )

        else:
            callback = CustomJS(
                args=dict(
                    source=data_source,
                    s2=s2,
                    table=table,
                    matching_alpha=interactive_text_search_alpha_contrast,
                    non_matching_alpha=1 - interactive_text_search_alpha_contrast,
                    search_columns=interactive_text_search_columns,
                    default_radius=radius,
                    default_alpha=alpha
                ),
                code="""
                var data = source.data;
                var text_search = cb_obj.value;
                var d2 = s2.data;

                // var ref_expr = ref_e_s.data;
                // var d3 = sel_e_s.data;

                var search_terms = text_search.split(',');

                d2['module'] = []
                d2['ID'] = []

                // d3['xs'] = []
                // d3['ys'] = []

                var search_columns_dict = {}
                for (var col in search_columns){
                    search_columns_dict[col] = search_columns[col]
                }

                // First, clear the data table and selection
                data['alpha'] = []
                data['radius'] = []
                source.selected.indices = []

                // source.change.emit();
                s2.change.emit();
                // sel_e_s.change.emit();
                table.change.emit();

                // Run search
                if (text_search.length > 0){
                    // Loop over columns and values
                    // If there is no match for any column for a given row, change the alpha value
                    var string_match = false;
                    for (var i = 0; i < data.x.length; i++) {
                        string_match = false
                        for (var j in search_columns_dict) {
                            if (search_terms.some(t => String(data[search_columns_dict[j]][i]).includes(t.trim()))) {
                                string_match = true
                            }
                        }
                        if (string_match){
                            data['alpha'][i] = matching_alpha
                            data['radius'][i] = 1
                            d2['module'].push(data['module'][i])
                            d2['ID'].push(data['ID'][i])

                            // d3['xs'].push(ref_expr['xs'][i])
                            // d3['ys'].push(ref_expr['ys'][i])

                            // So that these points are actually considered selected
                            source.selected.indices.push(i)

                        }else{
                            data['alpha'][i] = non_matching_alpha
                            data['radius'][i] = 0.01
                        }
                    }
                    source.change.emit();
                    s2.change.emit();
                    // sel_e_s.change.emit();
                    table.change.emit();

                } else {

                    // Loop over columns and values
                    // If there is no match for any column for a given row, change the alpha value
                    var string_match = false;
                    for (var i = 0; i < data.x.length; i++) {
                        string_match = false
                        for (var j in search_columns_dict) {
                            if (search_terms.some(t => String(data[search_columns_dict[j]][i]).includes(t.trim()))) {
                                string_match = true
                            }
                        }
                        if (string_match){
                            data['alpha'][i] = default_alpha
                            data['radius'][i] = default_radius
                            d2['module'].push()
                            d2['ID'].push()

                            // d3['xs'].push()
                            // d3['ys'].push()

                        }else{
                            data['alpha'][i] = non_matching_alpha
                            data['radius'][i] = 0.01
                        }
                    }
                    source.change.emit();
                    s2.change.emit();
                    // sel_e_s.change.emit();
                    table.change.emit();

                }




            """,
            )

            text_input.js_on_change("value", callback, selection_callback, expression_callback, heatmap_callback)

    # Lifted from https://stackoverflow.com/questions/31824124/is-there-a-way-to-save-bokeh-data-table-content
    button1 = Button(label="Download Annotation Table", button_type="success", width=550)
    button1.js_on_click(
        CustomJS(
            args=dict(source_data=data_source),
            code="""
            var inds = source_data.selected.indices;
            var data = source_data.data;
            var out = "TTHERM_ID\tmodule\tTGD2021_description\teggNOG_description\teggNOG_preferred_name\tmax_annot_lvl\tCOG_category\tGOs\tEC\tKEGG_ko\tKEGG_Pathway\tKEGG_Module\tKEGG_Reaction\tKEGG_rclass\tBRITE\tKEGG_TC\tCAZy\tBiGG_Reaction\\n";
            for (var i = 0; i < inds.length; i++) {
                out += data['ID'][inds[i]] + "\t" + data['module'][inds[i]] + "\t" + data['TGD2021_description'][inds[i]] + "\t" + data['Description'][inds[i]] + "\t" + data['Preferred_name'][inds[i]] + "\t" + data['max_annot_lvl'][inds[i]] + "\t" + data['COG_category'][inds[i]] + "\t" + data['GOs'][inds[i]] + "\t" + data['EC'][inds[i]] + "\t" + data['KEGG_ko'][inds[i]] + "\t" + data['KEGG_Pathway'][inds[i]] + "\t" + data['KEGG_Module'][inds[i]] + "\t" + data['KEGG_Reaction'][inds[i]] + "\t" + data['KEGG_rclass'][inds[i]] + "\t" + data['BRITE'][inds[i]] + "\t" + data['KEGG_TC'][inds[i]] + "\t" + data['CAZy'][inds[i]] + "\t" + data['BiGG_Reaction'][inds[i]] + "\\n";
            }
            var file = new Blob([out], {type: 'text/plain'});
            var elem = window.document.createElement('a');
            elem.href = window.URL.createObjectURL(file);
            elem.download = 'selected-annotation-data.tsv';
            document.body.appendChild(elem);
            elem.click();
            document.body.removeChild(elem);
            """))        
    
    # NEED TO STOP HARDCODING THIS FILE
    enrich_df = pd.read_csv('../enrichment/test_nn3_full_enrichment.csv')
    colors = [color_key[int(m)] for m in enrich_df['module'].values]
    enrich_df['color'] = colors
    
    enrich_cds = bokeh.models.ColumnDataSource(enrich_df)
    enrich_p = plot_enrichment(enrich_cds)
    
    button2 = Button(label="Download Functional Enrichment Data", button_type="success", width=450)
    button2.js_on_click(
        CustomJS(
            args=dict(source_data=enrich_cds),
            code="""
            // var inds = source_data.selected.indices;
            var data = source_data.data;
            var out = "module\tterm\tinfo\tfold_change\tbonferroni\\n";
            for (var i = 0; i < data['module'].length; i++) {
                out += data['module'][i] + "\t" + data['term'][i] + "\t" + data['info'][i] + "\t" + data['fold_change'][i] + "\t" + data['bonferroni'][i] + "\\n";
            }
            var file = new Blob([out], {type: 'text/plain'});
            var elem = window.document.createElement('a');
            elem.href = window.URL.createObjectURL(file);
            elem.download = 'enrichment-data.tsv';
            document.body.appendChild(elem);
            elem.click();
            document.body.removeChild(elem);
            """))  
    
    
    
    
    if interactive_text_search:
        plot = column(row(column(plot, expr_fig), hm, enrich_p), row(text_input, button1, button2), table)
    else:
        plot = column(row(column(plot, expr_fig), hm, enrich_p), row(button1, button2), table)

    return plot

def get_centroid(module_df):
    
    # get rid of ttherm_ids
    data_cols = [c for c in module_df.columns if ('TTHERM' not in c) and ('label' not in c)]
    data = module_df[data_cols]
    
    centroid = data.apply(np.mean, axis=0).values
    
    return centroid

def get_module_centroid_df(expr_df, cluster_label_df, alg, phases):
    
    merge = expr_df.merge(cluster_label_df, on='TTHERM_ID')
    
    grouped = merge.groupby(f'{alg}_label_{phases}')
    
    centroid_rows = []
    
    for label, grp_df in grouped:
        # print(grp_df.head())
        centroid = get_centroid(grp_df)
        centroid_rows.append(centroid)
    
    data_cols = [c for c in merge.columns if ('TTHERM' not in c) and ('label' not in c)]
    
    centroid_df = pd.DataFrame(centroid_rows)
    centroid_df.columns = data_cols
    centroid_df.index.rename('module', inplace=True)
        
    return centroid_df

def get_all_module_centroids(expr_df, cluster_label_df, alg, phases):
    
    merge = expr_df.merge(cluster_label_df, on='TTHERM_ID')
    
    grouped = merge.groupby(f'{alg}_label_{phases}')
    
    module_centroid_list = []
    
    for label, grp_df in grouped:
        
        centroid = get_centroid(grp_df)
        module_centroid_list.append( (label, centroid) )
        
    return module_centroid_list

def arrange_modules(expr_df, cluster_label_df, alg, phases):
    
    if phases == 'full':
        
        x = ['Ll', 
             'Lm', 
             'Lh', 
             'S0', 
             'S3', 
             'S6', 
             'S9', 
             # 'S12',
             'S15', 
             'S24', 
             'C0', 
             # 'C2', 
             'C4', 
             'C6', 
             'C8', 
             'C10', 
             'C12', 
             'C14', 
             'C16', 
             'C18']
        
        
    elif phases == 'veg':
        
        x = ['Ll', 
             'Lm', 
             'Lh', 
             'S0', 
             'S3', 
             'S6', 
             'S9', 
             # 'S12', 
             'S15', 
             'S24']
        
    elif phases == 'sex':
        
        x = ['C0', 
             # 'C2', 
             'C4', 
             'C6', 
             'C8', 
             'C10',
             'C12',
             'C14', 
             'C16', 
             'C18']
        
    cols = ['TTHERM_ID'] + [c for c in expr_df.columns if c.split('_')[0] in x]
    
    module_centroid_df = get_module_centroid_df(expr_df[cols], cluster_label_df, alg, phases)
    
    linkage = scipy.cluster.hierarchy.linkage(module_centroid_df, method='average', metric='correlation', optimal_ordering=True)
    r_cophcorre, ar_copdist = scipy.cluster.hierarchy.cophenet(linkage, scipy.spatial.distance.pdist(module_centroid_df, metric='correlation'))
    
    # print(f'The Copheretic correlation is: {r_cophcorre}')
    
    d_dendro = scipy.cluster.hierarchy.dendrogram(linkage, no_plot=True)
    cat_sorted = list(module_centroid_df.iloc[d_dendro['leaves'],:].index)
    
    sorter_index = dict(zip(cat_sorted, range(len(cat_sorted))))
    
    reassigned_df = cluster_label_df.copy(deep=True)
    
    
    
    reassigned_df[f'{alg}_label_{phases}'] = reassigned_df[f'{alg}_label_{phases}'].map(sorter_index)
    print(len(reassigned_df))
    
    arranged_dfs = []
    
    for cat in cat_sorted:
        
        mini_df = reassigned_df.loc[reassigned_df[f'{alg}_label_{phases}'] == cat]
        # gene_count += len(mini_df)

        arranged_dfs.append(mini_df)
        
#     gene_count = 0
    
#     for mdf in arranged_dfs:
#         gene_count += len(mdf)
    
#     print(gene_count)
        
    arranged_df = pd.concat(arranged_dfs)
    
    
    return arranged_df

def plot_embedding(expression_df, embedding_df, annotation_df, label_df, clust_alg, phases, palette, n_components=2, n_neighbors=15, title=None, random_state=42, radius=0.01, normalized=True):
    
    """
    Function to plot the UMAP of expression data.
    
    
    """
    
    # get new index for clustered heatmap
    label_df = arrange_modules(expression_df, label_df, clust_alg, phases)
    
    # Weirdly, the heatmap looks better-arranged when I just sort by the modules, as
    # given by the hierarchical clustering done by arrange_modules(), than if
    # I stay with the order they were given automatically
    label_df = label_df.sort_values(by=[f'{clust_alg}_label_{phases}', 'TTHERM_ID'], ascending=False)
    new_index = label_df.index
    
    labels = label_df[f'{clust_alg}_label_{phases}'].values
    
    data = expression_df[list(expression_df.columns)[1:]].values
    
    embedding_df['TTHERM_ID'] = expression_df['TTHERM_ID'].values
    
    merge = expression_df.merge(embedding_df, on='TTHERM_ID')
    
    merge = merge.reindex(new_index)
    
    # take part of annotation df that shared TTHERM_IDs with expression df
    relevant_annot = annotation_df.iloc[np.in1d(annotation_df['TTHERM_ID'].values, merge['TTHERM_ID'].values)]
    merge = merge.merge(relevant_annot, on='TTHERM_ID')
    

    mean_expression_df = get_geom_mean_expression(merge)
    
    ttherm_ids = merge['TTHERM_ID'].values
    merge = merge.merge(mean_expression_df, on='TTHERM_ID')
    
    if phases == 'full':
        
        x = ['Ll', 
             'Lm', 
             'Lh', 
             'S0', 
             'S3', 
             'S6', 
             'S9', 
             # 'S12',
             'S15', 
             'S24', 
             'C0', 
             # 'C2', 
             'C4', 
             'C6', 
             'C8', 
             'C10', 
             'C12', 
             'C14', 
             'C16', 
             'C18']
        
        
    elif phases == 'veg':
        
        x = ['Ll', 
             'Lm', 
             'Lh', 
             'S0', 
             'S3', 
             'S6', 
             'S9', 
             # 'S12', 
             'S15', 
             'S24']
        
    elif phases == 'sex':
        
        x = ['C0', 
             # 'C2', 
             'C4', 
             'C6', 
             'C8', 
             'C10',
             'C12',
             'C14', 
             'C16', 
             'C18']
        
    else:
        print('Selected phases must be one of full, sex, or veg!')
        return

    xs = [x for ttid in ttherm_ids]
    ys = [merge.loc[merge['TTHERM_ID'] == ttid, x].values[0] for ttid in ttherm_ids]

    merge['expr_xs'] = xs
    merge['expr_ys'] = ys
    
    # print(merge.head())

    
#     pdb.set_trace()
    hover_data = pd.DataFrame({
                               # 'index':np.arange(len(data)),
                               'ID':merge['TTHERM_ID'].values,
                               'module':[f'm{int(l):02d}' for l in labels]})
    
#     palette = [palette[l] for l in sorted(label_df[label_key].unique())]
    
    p = interactive(merge,
                    x,
                    # mean_expression_df,
                    title=title,
                    hover_data=hover_data, 
                    labels=labels, 
                    color_key=palette, 
#                     color_key_cmap='Paired',
                    background='black', 
                    radius=radius,
                    alpha=0.7,
#                     width=600, 
#                     height=500,
                    interactive_text_search=True,
                    normalized=normalized
                   )
    
    #p.children[1].title = title
    
    return p



## Define palettes for plotting

These palettes are from the R package Polychrome. The first is just palette36 with the first color replaced by white. The second is the alphabet palette with white prepended.

The R code for 64 colors:

library(Polychrome);
seed <- c("#000000", "#ff0000", "#00ff00", "#0000ff");
p64 <- createPalette(64, seed, range=c(40,100));
paste(p64, sep="\n");

Then, replace the first and last with ~white

In [3]:
palette45 = """
#51635F\n#FF1C16\n#16FC26\n#403DFC\n#FE0DCE\n#F9AA00\n#00FFD5\n#22BFFE\n#BB3551\n#E6FE97\n#ECADFF\n#FFBFBD\n#CF00F5\n#0D8B00\n#D7FEFF\n#8D7200\n#F76C00\n#AD3288\n#5C5AB8\n#FC0080\n#B8FF16\n#00AAB4\n#FBE11C\n#9AAAD9\n#8BBB8C\n#934B47\n#6EFE99\n#9C6D91\n#FB9778\n#9D32AF\n#D40087\n#FFDC9D\n#FF8DB6\n#A96AFC\n#FDDDFB\n#168CF7\n#FD6CF9\n#F64553\n#4D6A00\n#FAFEDB\n#A7977D\n#0DFBFF\n#86B80D\n#FD8AE4\n#B7B126
""".split()

palette32 = """
white\n#F91622\n#16FC0D\n#5138FB\n#FD00CF\n#FDD51C\n#16FDD7\n#FC8B8E\n#16BFFF\n#DF9BFD\n#669C2A\n#FEE7C4\n#F31685\n#DF16FD\n#C1F1FE\n#A23D7E\n#D5FD0D\n#8C5A0D\n#FC790D\n#4F5CBC\n#FFCBEF\n#168D72\n#68FA93\n#C4FDC9\n#F7A449\n#16789B\n#AD0DAB\n#C4262E\n#0DF1FF\n#EFF994\n#B6C1FE\n#8F22CD
""".split()

palette35 = """
#585F6A\n#FE1626\n#00FB0D\n#2E40FC\n#FD0DCE\n#FCD200\n#F7868C\n#16FFDC\n#22BEFB\n#D28EF6\n#609000\n#FFE7C9\n#F51683\n#FF730D\n#CAFE16\n#AA3586\n#BEEEFD\n#BD00FA\n#895D22\n#FEC7F0\n#495AA1\n#73F995\n#229270\n#ED963B\n#F6FE97\n#C5FFD0\n#C50DC8\n#6993FF\n#C22A35\n#16ECFC\n#AA707E\n#7A3BCB\n#7C845C\n#358FAA\n#BDBAF6
""".split()

palette38 = """
#636265\n#F60D16\n#00F90D\n#3540FB\n#FD0DD0\n#FDDB0D\n#00FFE2\n#FA8884\n#2ABEFE\n#E5A3FF\n#518F00\n#FEFDD5\n#D51CFF\n#ED007F\n#A33879\n#96731C\n#C8FB16\n#C0ECFE\n#FBC1DA\n#5658BA\n#F96900\n#F69F1C\n#58FA9C\n#008E72\n#BA22B9\n#167D97\n#794D8A\n#CEFE9C\n#BB222E\n#954D45\n#00DCEF\n#FD66B0\n#B2FDD3\n#FDBD9F\n#A9B4F1\n#B371FE\n#849566\n#2A8EFF
""".split()

palette64 = """
white\n#FA002E\n#22FC22\n#221CFA\n#FF3DD6\n#FFDA00\n#00FEFB\n#F48684\n#CEB4FE\n#FFFFE5\n#0D933D\n#CC00F8\n#800D5D\n#F10084\n#22267A\n#0DADFF\n#CBFD71\n#9A761C\n#F96C00\n#6399A6\n#FFBCDA\n#8D0DA3\n#F79F26\n#00FFBF\n#A37CFB\n#F68EEB\n#720D0D\n#F163AA\n#7E926A\n#826386\n#B41C32\n#9BEBCE\n#E2DB83\n#56D4FA\n#E6E2FB\n#925D58\n#F7C3A7\n#62E970\n#220DBD\n#5583BB\n#7EA01C\n#CDFDB6\n#FD00FB\n#B30D97\n#F5FF00\n#DD77FD\n#4282FC\n#BBA6A4\n#0D8068\n#AB5F26\n#F7C26E\n#9EFE00\n#9B2EFD\n#C56887\n#FD3D68\n#ABF2FD\n#835FAC\n#FF16B1\n#325371\n#CA16CA\n#D26322\n#AFCFFE\n#91A1FA\nfloralwhite
""".split()

In [4]:
palette36 = ["#FFFFFF", 
             "#E4E1E3", 
             "#F6222E", 
             "#FE00FA", 
             "#16FF32", 
             "#3283FE", 
             "#FEAF16", 
             "#B00068", 
             "#1CFFCE",
             "#90AD1C", 
             "#2ED9FF", 
             "#DEA0FD", 
             "#AA0DFE", 
             "#F8A19F", 
             "#325A9B", 
             "#C4451C", 
             "#1C8356", 
             "#85660D",
             "#B10DA1", 
             "#FBE426", 
             "#1CBE4F", 
             "#FA0087", 
             "#FC1CBF", 
             "#F7E1A0", 
             "#C075A6", 
             "#782AB6", 
             "#AAF400",
             "#BDCDFF", 
             "#822E1C", 
             "#B5EFB5", 
             "#7ED7D1", 
             "#1C7F93", 
             "#D85FF7", 
             "#683B79", 
             "#66B0FF", 
             "#3B00FB",
             "magenta"]

change index 5 to #778899. Change index 9 to #2F4F4F. Add #FF7F50. Change index 9 to #FFBCD9. Change index 14 to #DEA5A4

In [5]:
palette27 = ["#FFFFFF", 
             "#AA0DFE", 
             "#3283FE", 
             "#85660D", 
             "#782AB6", 
             "#778899", 
             "#1C8356", 
             "#16FF32", 
             "#F7E1A0", 
#              "#2F4F4F",
             "#FFBCD9", 
             "#C4451C", 
             "#DEA0FD", 
             "#FE00FA", 
#              "#325A9B", 
             "#FEAF16", 
             "#DEA5A4", 
             "#90AD1C", 
             "#F6222E",
             "#1CFFCE", 
             "#2ED9FF", 
             "#B10DA1", 
#              "#C075A6", 
#              "#FC1CBF", 
#              "#B00068", 
             "#FBE426", 
             "#FA0087",
             "#FF7F50"
            ]

## Get the expression data

In [100]:
full_filtered_df = pd.read_csv('../microarray_probe_alignment_and_filtering/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv')
full_filtered_df = full_filtered_df.rename(columns={'Unnamed: 0': 'TTHERM_ID'})

# full_filtered_norm_df = pd.read_csv('../microarray_probe_alignment_and_filtering/greedy_full_norm_filt_agg_tidy_2021aligned_qc_rma_expression.csv')
# full_filtered_norm_df = full_filtered_norm_df.rename(columns={'Unnamed: 0': 'TTHERM_ID'})

# veg_filtered_df = pd.read_csv('../microarray_probe_alignment_and_filtering/greedy_filt_agg_tidy_2021aligned_qc_rma_expression_veg.csv')
# veg_filtered_df = veg_filtered_df.rename(columns={'Unnamed: 0': 'TTHERM_ID'})

# veg_filtered_norm_df = pd.read_csv('../microarray_probe_alignment_and_filtering/greedy_veg_norm_filt_agg_tidy_2021aligned_qc_rma_expression.csv')
# veg_filtered_norm_df = veg_filtered_norm_df.rename(columns={'Unnamed: 0': 'TTHERM_ID'})

# sex_filtered_df = pd.read_csv('../microarray_probe_alignment_and_filtering/greedy_filt_agg_tidy_2021aligned_qc_rma_expression_sex.csv')
# sex_filtered_df = sex_filtered_df.rename(columns={'Unnamed: 0': 'TTHERM_ID'})

# sex_filtered_norm_df = pd.read_csv('../microarray_probe_alignment_and_filtering/greedy_sex_norm_filt_agg_tidy_2021aligned_qc_rma_expression.csv')
# sex_filtered_norm_df = sex_filtered_norm_df.rename(columns={'Unnamed: 0': 'TTHERM_ID'})

In [101]:
len(full_filtered_df['TTHERM_ID'].values)

17503

In [102]:
len(full_filtered_df['TTHERM_ID'].unique())

17503

In [103]:
[c for c in full_filtered_df.columns if c[0] != 'C']

['TTHERM_ID',
 'Ll_GSM283687',
 'Ll_GSM284355',
 'Lm_GSM283690',
 'Lm_GSM284357',
 'Lm_GSM284363',
 'Lh_GSM283691',
 'Lh_GSM284360',
 'Lh_GSM284364',
 'S0_GSM285363',
 'S0_GSM285554',
 'S0_GSM285561',
 'S3_GSM285542',
 'S3_GSM285555',
 'S3_GSM285562',
 'S6_GSM285543',
 'S6_GSM285556',
 'S6_GSM285563',
 'S9_GSM285544',
 'S9_GSM285564',
 'S15_GSM285559',
 'S15_GSM285566',
 'S24_GSM285547',
 'S24_GSM285560']

In [104]:
full_filtered_norm_df = normalize_expression_per_gene(full_filtered_df)

In [105]:
full_filtered_norm_df.columns

Index(['TTHERM_ID', 'Ll_GSM283687', 'Ll_GSM284355', 'Lm_GSM283690',
       'Lm_GSM284357', 'Lm_GSM284363', 'Lh_GSM283691', 'Lh_GSM284360',
       'Lh_GSM284364', 'S0_GSM285363', 'S0_GSM285554', 'S0_GSM285561',
       'S3_GSM285542', 'S3_GSM285555', 'S3_GSM285562', 'S6_GSM285543',
       'S6_GSM285556', 'S6_GSM285563', 'S9_GSM285544', 'S9_GSM285564',
       'S15_GSM285559', 'S15_GSM285566', 'S24_GSM285547', 'S24_GSM285560',
       'C0_GSM285570', 'C0_GSM285586', 'C0_GSM656230', 'C4_GSM285574',
       'C4_GSM285588', 'C4_GSM656234', 'C6_GSM285575', 'C6_GSM656232',
       'C8_GSM285576', 'C8_GSM285590', 'C8_GSM656236', 'C10_GSM285578',
       'C10_GSM285591', 'C12_GSM285579', 'C12_GSM285592', 'C12_GSM656237',
       'C14_GSM285580', 'C14_GSM285593', 'C14_GSM656238', 'C16_GSM285582',
       'C16_GSM285595', 'C16_GSM656239', 'C18_GSM285583', 'C18_GSM285596',
       'C18_GSM656240'],
      dtype='object')

In [106]:
full_filtered_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,S0_GSM285363,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_000000045,9.634896,9.989517,9.718303,9.618189,10.226969,10.292076,10.467299,10.695231,10.882099,...,11.128746,11.220612,11.021916,10.627867,11.037196,11.021967,10.228801,11.101338,11.179929,10.568351
1,TTHERM_00000010,5.043358,4.757261,6.139791,4.598367,4.724888,5.816432,5.347399,5.466728,9.402276,...,6.279493,7.425076,7.466925,7.380543,7.146096,7.711429,7.054908,7.647231,7.496493,6.891263
2,TTHERM_00000020,4.697104,4.639458,6.922725,5.116679,4.755424,8.460449,4.524214,4.915478,8.587625,...,5.254276,4.975218,5.749876,5.317474,5.200892,7.062564,5.274993,5.107731,5.486354,5.020556
3,TTHERM_00000030,4.648142,4.543234,5.0419,4.578504,4.887833,5.911232,4.536991,4.735445,8.787451,...,4.619825,4.895429,4.640622,4.868888,4.747579,4.748293,4.503546,4.823935,4.929809,4.635393
4,TTHERM_00000040,7.7988,7.646394,7.881154,7.603919,7.446313,7.192676,7.369151,7.194242,7.221652,...,6.973045,8.212708,7.989757,7.462811,7.182194,7.516854,6.875331,7.596719,7.376497,7.15595


In [107]:
# mean_full_filt_df = get_geom_mean_expression(full_filtered_df)
# mean_full_filt_df.head()

In [108]:
# full_filtered_norm_df = normalize_expression_per_gene(mean_full_filt_df)

In [109]:
full_filtered_norm_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,S0_GSM285363,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_000000045,0.009147,0.203314,0.054815,0.0,0.333326,0.368974,0.464915,0.589715,0.692031,...,0.827078,0.877377,0.768585,0.552831,0.776951,0.768613,0.334329,0.812071,0.855103,0.520244
1,TTHERM_00000010,0.067378,0.024059,0.233393,0.0,0.019157,0.184432,0.113414,0.131482,0.727378,...,0.254546,0.428003,0.434339,0.42126,0.385761,0.471361,0.371954,0.46164,0.438816,0.347176
2,TTHERM_00000020,0.031978,0.02411,0.335741,0.089244,0.039938,0.545616,0.008381,0.061783,0.562974,...,0.108024,0.069936,0.175665,0.116649,0.100737,0.354827,0.110851,0.088022,0.139698,0.076124
3,TTHERM_00000030,0.028657,0.007866,0.106694,0.014856,0.07616,0.278983,0.006628,0.045959,0.849008,...,0.023045,0.077666,0.027166,0.072405,0.048364,0.048505,0.0,0.063497,0.084479,0.02613
4,TTHERM_00000040,0.609623,0.5336,0.650703,0.512413,0.433796,0.307277,0.395306,0.308058,0.321731,...,0.197721,0.816088,0.704876,0.442026,0.302048,0.468983,0.14898,0.508821,0.398971,0.288958


SHOULD NORMALIZE AFTER GEOMETRIC MEAN, NOT BEFORE? But what about clustering in full space?

In [110]:
veg_filtered_norm_df = full_filtered_norm_df[[c for c in full_filtered_norm_df.columns if c[0] != 'C']]
sex_filtered_norm_df = full_filtered_norm_df[[c for c in full_filtered_norm_df.columns if not c[0] in ['L', 'S']]]

In [111]:
full_mean_norm = get_geom_mean_expression(full_filtered_norm_df)
full_mean = get_geom_mean_expression(full_filtered_df)
full_mean_norm.to_csv('./full_mean_norm_df.csv', index=False)

patrick_genes = """
TTHERM_00658810, TTHERM_000193469, TTHERM_00047330, TTHERM_000486279, TTHERM_00141040, TTHERM_00227750, TTHERM_00317390, TTHERM_00670750, TTHERM_01122800, TTHERM_01213910, TTHERM_00527180, TTHERM_00473020, TTHERM_00624730, TTHERM_00624720, TTHERM_00378890, TTHERM_00321730, TTHERM_00522600, TTHERM_01055600, TTHERM_01018540, TTHERM_00221120, TTHERM_00221130
""".split(', ')

patrick_genes = [p.strip() for p in patrick_genes]

patrick_df_norm = full_mean_norm.loc[full_mean_norm['TTHERM_ID'].isin(patrick_genes)]
patrick_df = full_mean.loc[full_mean['TTHERM_ID'].isin(patrick_genes)]

In [65]:
len(patrick_genes)

21

In [66]:
len(patrick_df)

20

In [13]:
patrick_df.to_csv('/Users/eukarya/Downloads/patrick_genes.csv', index=False)
patrick_df_norm.to_csv('/Users/eukarya/Downloads/patrick_genes_normalized', index=False)

In [67]:
veg_mean = get_geom_mean_expression(veg_filtered_norm_df)
veg_mean

Unnamed: 0,TTHERM_ID,Ll,Lm,Lh,S0,S3,S6,S9,S15,S24
0,TTHERM_000000045,0.101962,0.120395,0.471778,0.580312,0.716956,0.659768,0.849064,0.782075,0.990288
1,TTHERM_00000010,0.045494,0.079230,0.142715,0.598423,0.606028,0.713802,0.673972,0.313933,0.434661
2,TTHERM_00000020,0.028037,0.148026,0.182826,0.600568,0.342948,0.333151,0.220792,0.100392,0.137812
3,TTHERM_00000030,0.018208,0.065212,0.104289,0.828899,0.405712,0.610600,0.462508,0.109629,0.073734
4,TTHERM_00000040,0.571152,0.529709,0.336251,0.499084,0.437534,0.270672,0.483558,0.567710,0.649672
...,...,...,...,...,...,...,...,...,...,...
16568,TTHERM_02105572,0.761906,0.700157,0.812984,0.697806,0.661529,0.797139,0.633939,0.731486,0.779311
16569,TTHERM_02272860,0.862911,0.799772,0.831902,0.548981,0.533192,0.719390,0.463936,0.649265,0.676229
16570,TTHERM_02385080,0.590120,0.589438,0.664468,0.724519,0.655034,0.695514,0.654462,0.797614,0.889262
16571,TTHERM_02607240,0.539939,0.309201,0.303815,0.263367,0.358926,0.392351,0.415952,0.618327,0.565566


In [68]:
sex_mean = get_geom_mean_expression(sex_filtered_norm_df)
sex_mean

Unnamed: 0,TTHERM_ID,C0,C4,C6,C8,C10,C12,C14,C16,C18
0,TTHERM_000000045,0.848964,0.705521,0.515015,0.339432,0.786260,0.715562,0.727563,0.612589,0.722472
1,TTHERM_00000010,0.908004,0.246933,0.160877,0.177079,0.193064,0.247425,0.427857,0.409015,0.415001
2,TTHERM_00000020,0.534512,0.198514,0.124202,0.121827,0.101501,0.120762,0.119918,0.183244,0.100939
3,TTHERM_00000030,0.305737,0.215437,0.084690,0.043367,0.059830,0.072690,0.058834,0.032034,0.057759
4,TTHERM_00000040,0.256189,0.693733,0.574922,0.819643,0.027598,0.491155,0.646649,0.300126,0.396031
...,...,...,...,...,...,...,...,...,...,...
16568,TTHERM_02105572,0.739414,0.492361,0.351058,0.385906,0.414258,0.406603,0.696805,0.810859,0.869315
16569,TTHERM_02272860,0.559081,0.598204,0.402831,0.318770,0.246693,0.393300,0.560204,0.731273,0.759633
16570,TTHERM_02385080,0.677853,0.613697,0.527564,0.422570,0.690764,0.448935,0.625155,0.736993,0.785636
16571,TTHERM_02607240,0.868162,0.733789,0.611442,0.587093,0.116234,0.594143,0.842640,0.827077,0.594316


Get the annnotations

In [112]:
complete_annot = pd.read_csv('../eggnog/complete_eggnog_annotation.csv')

Add the TGD annotations for all the genes to the dataframe

In [113]:
complete_annot

Unnamed: 0,TTHERM_ID,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs,TGD2021_description
0,TTHERM_01528530,5911.EAS00195,2.2e-14,49.9,"KOG0594@1|root,KOG0594@2759|Eukaryota",2759|Eukaryota,G,cyclin-dependent protein serine/threonine kina...,-,-,...,-,-,-,-,-,-,-,-,-,protein kinase
1,TTHERM_01528510,5911.EAR81750,1.1e-194,633.4,"2EI75@1|root,2SNPE@2759|Eukaryota,3ZBUW@5878|C...",5878|Ciliophora,-,-,-,-,...,-,-,-,-,-,-,-,-,-,hypothetical protein
2,TTHERM_01528500,5911.EAR81749,4e-141,462.0,"COG1100@1|root,KOG0074@2759|Eukaryota",2759|Eukaryota,KLT,GTP binding,ARL13B,"GO:0000902,GO:0000904,GO:0001947,GO:0002009,GO...",...,-,-,-,-,"ko00000,ko04031",-,-,-,-,ADP-ribosylation factor family protein
3,TTHERM_0015284992,5911.EAR97791,8.4e-64,205.1,"2ERUB@1|root,2SUIJ@2759|Eukaryota",5911.EAR97791|-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,Leucine-rich repeat-containing protein 74A
4,TTHERM_00897120,5911.EAR86071,5.2e-77,248.8,"2ERUB@1|root,2SUIJ@2759|Eukaryota",5911.EAR86071|-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,kinase domain protein
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26253,TTHERM_01076820,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,Endoglucanase 1
26254,TTHERM_001076861,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,transmembrane protein putative
26255,TTHERM_01076910,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,Endoglucanase 1
26256,TTHERM_001076981,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,Endoglucanase 1


In [71]:
# checkout = """
# TTHERM_00000030\nTTHERM_000001459\nTTHERM_000002658\nTTHERM_000002709\nTTHERM_000011269\nTTHERM_000011798\nTTHERM_00001210\nTTHERM_00001310\nTTHERM_000013569\nTTHERM_000013649\nTTHERM_000016298\nTTHERM_000028468\nTTHERM_000028809\nTTHERM_000030519\nTTHERM_000033909\nTTHERM_000035399\nTTHERM_000037449\nTTHERM_000037782\nTTHERM_000039088\nTTHERM_000041632\nTTHERM_000041635\nTTHERM_000042736\nTTHERM_000042742\nTTHERM_000051719\nTTHERM_000058888\nTTHERM_000059119\nTTHERM_000059499\nTTHERM_00006050\nTTHERM_00006170\nTTHERM_00006290\nTTHERM_000066709\nTTHERM_000066939\nTTHERM_000069529\nTTHERM_000075950\nTTHERM_000079989\nTTHERM_000083289\nTTHERM_000083329\nTTHERM_000085199\nTTHERM_000086777\nTTHERM_000092856\nTTHERM_000092919\nTTHERM_000095648\nTTHERM_000096738\nTTHERM_000096767\nTTHERM_000097819\nTTHERM_000099988\nTTHERM_000101249\nTTHERM_000105399\nTTHERM_000105638\nTTHERM_000105664\nTTHERM_000115543\nTTHERM_000123578\nTTHERM_000123759\nTTHERM_000123799\nTTHERM_000125189\nTTHERM_000128699\nTTHERM_000128738\nTTHERM_000128899\nTTHERM_000134819\nTTHERM_000135979\nTTHERM_000136019\nTTHERM_000136238\nTTHERM_00013650\nTTHERM_000142399\nTTHERM_000145039\nTTHERM_000145409\nTTHERM_00014930\nTTHERM_000150028\nTTHERM_000151655\nTTHERM_000151657\nTTHERM_000151659\nTTHERM_000151707\nTTHERM_000155257\nTTHERM_000158289\nTTHERM_00016230\nTTHERM_00016450\nTTHERM_000170478\nTTHERM_000170479\nTTHERM_000182182\nTTHERM_000185789\nTTHERM_000188551\nTTHERM_000191328\nTTHERM_000191709\nTTHERM_000193177\nTTHERM_00019600\nTTHERM_00019650\nTTHERM_000198388\nTTHERM_000198509\nTTHERM_000201618\nTTHERM_000204119\nTTHERM_000205206\nTTHERM_000207258\nTTHERM_000212534\nTTHERM_000213644\nTTHERM_000216168\nTTHERM_000218239\nTTHERM_000218619\nTTHERM_000218939\nTTHERM_000222369\nTTHERM_000225898\nTTHERM_000225988\nTTHERM_000227538\nTTHERM_000227669\nTTHERM_000227689\nTTHERM_000238739\nTTHERM_000239299\nTTHERM_00024030\nTTHERM_00024149\nTTHERM_000241779\nTTHERM_000242459\nTTHERM_000242629\nTTHERM_000243879\nTTHERM_000244008\nTTHERM_000245699\nTTHERM_000246949\nTTHERM_000248239\nTTHERM_000251049\nTTHERM_000252389\nTTHERM_000252419\nTTHERM_000257099\nTTHERM_000261919\nTTHERM_000264819\nTTHERM_000266261\nTTHERM_000268259\nTTHERM_000274479\nTTHERM_000275799\nTTHERM_000277338\nTTHERM_000279719\nTTHERM_000279858\nTTHERM_000284240\nTTHERM_00028800\nTTHERM_000294649\nTTHERM_000294809\nTTHERM_00030000\nTTHERM_000300069\nTTHERM_000300369\nTTHERM_000300459\nTTHERM_000300489\nTTHERM_000300549\nTTHERM_00030070\nTTHERM_00030150\nTTHERM_00030220\nTTHERM_00030420\nTTHERM_000304243\nTTHERM_000305499\nTTHERM_000307767\nTTHERM_000309879\nTTHERM_000310179\nTTHERM_000310809\nTTHERM_000316229\nTTHERM_000317059\nTTHERM_000320389\nTTHERM_000320439\nTTHERM_00032870\nTTHERM_000334255\nTTHERM_000335848\nTTHERM_000343739\nTTHERM_000344109\nTTHERM_000344329\nTTHERM_000344368\nTTHERM_00034920\nTTHERM_00035270\nTTHERM_000355149\nTTHERM_000355799\nTTHERM_00035670\nTTHERM_000361561\nTTHERM_000362979\nTTHERM_000364298\nTTHERM_000365339\nTTHERM_000365478\nTTHERM_000365519\nTTHERM_00036990\nTTHERM_000372548\nTTHERM_00037610\nTTHERM_00037730\nTTHERM_000378449\nTTHERM_000382129\nTTHERM_000383469\nTTHERM_000388289\nTTHERM_000390049\nTTHERM_00039090\nTTHERM_000391477\nTTHERM_000396989\nTTHERM_000398140\nTTHERM_000401919\nTTHERM_000402210\nTTHERM_000404309\nTTHERM_000406649\nTTHERM_000408795\nTTHERM_000408949\nTTHERM_000415569\nTTHERM_000418349\nTTHERM_000418498\nTTHERM_000420499\nTTHERM_000423265\nTTHERM_000423321\nTTHERM_000424459\nTTHERM_000424558\nTTHERM_00042640\nTTHERM_00042740\nTTHERM_000429641\nTTHERM_000429979\nTTHERM_000433738\nTTHERM_000436229\nTTHERM_000437619\nTTHERM_000440509\nTTHERM_000440549\nTTHERM_000442267\nTTHERM_000442799\nTTHERM_000443079\nTTHERM_000445939\nTTHERM_000448823\nTTHERM_000455598\nTTHERM_000455680\nTTHERM_000456708\nTTHERM_000458197\nTTHERM_000458218\nTTHERM_00046100\nTTHERM_00046110\nTTHERM_000461810\nTTHERM_00046300\nTTHERM_000463588\nTTHERM_000463589\nTTHERM_000463838\nTTHERM_00046460\nTTHERM_000467909\nTTHERM_000471259\nTTHERM_000471329\nTTHERM_000473239\nTTHERM_00047350\nTTHERM_000474539\nTTHERM_000476461\nTTHERM_000486249\nTTHERM_000487049\nTTHERM_00048760\nTTHERM_00048780\nTTHERM_000488179\nTTHERM_000490799\nTTHERM_000491079\nTTHERM_00049180\nTTHERM_000492408\nTTHERM_00050530\nTTHERM_000516450\nTTHERM_000516461\nTTHERM_000519809\nTTHERM_000519829\nTTHERM_00052070\nTTHERM_00052120\nTTHERM_000526429\nTTHERM_000530108\nTTHERM_000531819\nTTHERM_000535351\nTTHERM_000537309\nTTHERM_00053940\nTTHERM_000541549\nTTHERM_000544739\nTTHERM_000548089\nTTHERM_000548129\nTTHERM_000558438\nTTHERM_000560099\nTTHERM_000560142\nTTHERM_000561598\nTTHERM_000571589\nTTHERM_000572032\nTTHERM_000573209\nTTHERM_000574269\nTTHERM_000575519\nTTHERM_000575579\nTTHERM_000577049\nTTHERM_000578441\nTTHERM_000580367\nTTHERM_000581749\nTTHERM_000581869\nTTHERM_000582139\nTTHERM_00058360\nTTHERM_00058400\nTTHERM_000584531\nTTHERM_000584749\nTTHERM_000584759\nTTHERM_000585019\nTTHERM_000585149\nTTHERM_00058580\nTTHERM_000586861\nTTHERM_00058950\nTTHERM_000590479\nTTHERM_000597629\nTTHERM_000598670\nTTHERM_000600499\nTTHERM_000616089\nTTHERM_000616459\nTTHERM_000619890\nTTHERM_000624648\nTTHERM_000630099\nTTHERM_000633271\nTTHERM_000633479\nTTHERM_000635749\nTTHERM_000637208\nTTHERM_000637209\nTTHERM_000640080\nTTHERM_000641369\nTTHERM_000644779\nTTHERM_000646835\nTTHERM_000646840\nTTHERM_000646899\nTTHERM_000647419\nTTHERM_000648878\nTTHERM_000652549\nTTHERM_000654259\nTTHERM_000655619\nTTHERM_000657361\nTTHERM_000672289\nTTHERM_000678048\nTTHERM_000681909\nTTHERM_000691225\nTTHERM_00069190\nTTHERM_000691929\nTTHERM_000693107\nTTHERM_000693288\nTTHERM_000695779\nTTHERM_00069600\nTTHERM_000699759\nTTHERM_000703757\nTTHERM_00070870\nTTHERM_00071130\nTTHERM_000716179\nTTHERM_000717709\nTTHERM_000717999\nTTHERM_000723051\nTTHERM_000723389\nTTHERM_000726079\nTTHERM_000726339\nTTHERM_000731450\nTTHERM_000748947\nTTHERM_000748978\nTTHERM_000753493\nTTHERM_000753495\nTTHERM_000753599\nTTHERM_00075500\nTTHERM_00075540\nTTHERM_000756121\nTTHERM_00075660\nTTHERM_00075690\nTTHERM_00075730\nTTHERM_00075750\nTTHERM_000762999\nTTHERM_000766429\nTTHERM_000768570\nTTHERM_000770619\nTTHERM_000773079\nTTHERM_00077360\nTTHERM_000774831\nTTHERM_00077530\nTTHERM_00077560\nTTHERM_00077600\nTTHERM_00077810\nTTHERM_000783219\nTTHERM_00079290\nTTHERM_00079550\nTTHERM_00079580\nTTHERM_000798148\nTTHERM_000800229\nTTHERM_000801323\nTTHERM_000804799\nTTHERM_000810475\nTTHERM_000810562\nTTHERM_00081100\nTTHERM_000812603\nTTHERM_000815140\nTTHERM_000817290\nTTHERM_000823383\nTTHERM_000827189\nTTHERM_00083400\nTTHERM_00083780\nTTHERM_00083790\nTTHERM_00083800\nTTHERM_00083930\nTTHERM_000845809\nTTHERM_00085250\nTTHERM_000862770\nTTHERM_000867570\nTTHERM_00086820\nTTHERM_000873623\nTTHERM_000881459\nTTHERM_000885885\nTTHERM_000885891\nTTHERM_000886917\nTTHERM_000891240\nTTHERM_00089200\nTTHERM_000895865\nTTHERM_000895969\nTTHERM_000898161\nTTHERM_000901634\nTTHERM_000901749\nTTHERM_00090250\nTTHERM_00090320\nTTHERM_00090340\nTTHERM_000903858\nTTHERM_00090430\nTTHERM_00091520\nTTHERM_00091580\nTTHERM_00091632\nTTHERM_000923209\nTTHERM_000925559\nTTHERM_00092860\nTTHERM_00094020\nTTHERM_000942839\nTTHERM_000944157\nTTHERM_000948690\nTTHERM_000951962\nTTHERM_000956521\nTTHERM_000957531\nTTHERM_000957669\nTTHERM_000963272\nTTHERM_000964350\nTTHERM_000965394\nTTHERM_000967548\nTTHERM_00096769\nTTHERM_00097820\nTTHERM_000985179\nTTHERM_000993039\nTTHERM_000997598\nTTHERM_001001378\nTTHERM_001008626\nTTHERM_001009941\nTTHERM_00101250\nTTHERM_001018451\nTTHERM_001026363\nTTHERM_001027719\nTTHERM_001034480\nTTHERM_001035719\nTTHERM_001035762\nTTHERM_001037789\nTTHERM_001038803\nTTHERM_001041997\nTTHERM_001042011\nTTHERM_001044673\nTTHERM_00105400\nTTHERM_001054129\nTTHERM_001054139\nTTHERM_001054349\nTTHERM_001055501\nTTHERM_00107080\nTTHERM_00107110\nTTHERM_001076814\nTTHERM_001081695\nTTHERM_00108220\nTTHERM_001100419\nTTHERM_001107439\nTTHERM_00112480\nTTHERM_001126428\nTTHERM_00112940\nTTHERM_001140389\nTTHERM_001141651\nTTHERM_001144931\nTTHERM_001144951\nTTHERM_001144953\nTTHERM_001147129\nTTHERM_001159889\nTTHERM_001164069\nTTHERM_001166292\nTTHERM_001169451\nTTHERM_001170538\nTTHERM_00117550\nTTHERM_001186268\nTTHERM_001192479\nTTHERM_001198169\nTTHERM_001202231\nTTHERM_001212889\nTTHERM_001214996\nTTHERM_00122120\nTTHERM_00122450\nTTHERM_00122460\nTTHERM_00122470\nTTHERM_00122549\nTTHERM_001225669\nTTHERM_001230168\nTTHERM_001232265\nTTHERM_00123640\nTTHERM_00123900\nTTHERM_00123960\nTTHERM_001250151\nTTHERM_00125220\nTTHERM_00125660\nTTHERM_00125750\nTTHERM_001262899\nTTHERM_001264989\nTTHERM_00127060\nTTHERM_00127090\nTTHERM_001273284\nTTHERM_001284761\nTTHERM_00128590\nTTHERM_001287954\nTTHERM_00128880\nTTHERM_00128980\nTTHERM_00129020\nTTHERM_001292214\nTTHERM_001292250\nTTHERM_00129270\nTTHERM_001295314\nTTHERM_00129570\nTTHERM_00129640\nTTHERM_00129790\nTTHERM_00129930\nTTHERM_001302821\nTTHERM_001305848\nTTHERM_001305849\nTTHERM_00131130\nTTHERM_00131210\nTTHERM_00131320\nTTHERM_001325754\nTTHERM_001325773\nTTHERM_001333220\nTTHERM_00133610\nTTHERM_001344745\nTTHERM_00136160\nTTHERM_001371771\nTTHERM_001374862\nTTHERM_001374871\nTTHERM_00137530\nTTHERM_00137830\nTTHERM_00138000\nTTHERM_00138110\nTTHERM_00138300\nTTHERM_001383000\nTTHERM_00141150\nTTHERM_001415159\nTTHERM_001420339\nTTHERM_001421352\nTTHERM_00142240\nTTHERM_001423407\nTTHERM_00143560\nTTHERM_00143570\nTTHERM_00145060\nTTHERM_001451032\nTTHERM_00145630\nTTHERM_001473460\nTTHERM_001476529\nTTHERM_00148810\nTTHERM_00149080\nTTHERM_00149990\nTTHERM_001500990\nTTHERM_00151410\nTTHERM_00151620\nTTHERM_00151760\nTTHERM_001527452\nTTHERM_001541732\nTTHERM_00155240\nTTHERM_00155510\nTTHERM_00155530\nTTHERM_00156762\nTTHERM_00158530\nTTHERM_00160820\nTTHERM_00161150\nTTHERM_00161680\nTTHERM_00170200\nTTHERM_00170580\nTTHERM_00170590\nTTHERM_00171650\nTTHERM_00171680\nTTHERM_00171690\nTTHERM_00171700\nTTHERM_00185130\nTTHERM_00185150\nTTHERM_00185240\nTTHERM_00185380\nTTHERM_00185530\nTTHERM_00185590\nTTHERM_00185620\nTTHERM_00186020\nTTHERM_00187060\nTTHERM_00187200\nTTHERM_00187210\nTTHERM_00187230\nTTHERM_00189240\nTTHERM_00189410\nTTHERM_00189480\nTTHERM_00190650\nTTHERM_00190700\nTTHERM_00190850\nTTHERM_00191360\nTTHERM_00191470\nTTHERM_00191650\nTTHERM_00191990\nTTHERM_00192070\nTTHERM_00192100\nTTHERM_00192140\nTTHERM_00193240\nTTHERM_00193280\nTTHERM_00193290\nTTHERM_00193730\nTTHERM_00194130\nTTHERM_00194430\nTTHERM_00196340\nTTHERM_00196500\nTTHERM_00197685\nTTHERM_00198200\nTTHERM_00198240\nTTHERM_00198350\nTTHERM_00198510\nTTHERM_00198520\nTTHERM_00200560\nTTHERM_00201640\nTTHERM_00202830\nTTHERM_00202860\nTTHERM_00203030\nTTHERM_00204160\nTTHERM_00207259\nTTHERM_00209330\nTTHERM_00209390\nTTHERM_00210490\nTTHERM_00214670\nTTHERM_00216110\nTTHERM_00218710\nTTHERM_00220880\nTTHERM_00220900\nTTHERM_00222350\nTTHERM_00222370\nTTHERM_00222400\nTTHERM_00225670\nTTHERM_00225750\nTTHERM_00225930\nTTHERM_00225980\nTTHERM_00227350\nTTHERM_00227430\nTTHERM_00228920\nTTHERM_00229940\nTTHERM_00233020\nTTHERM_00234140\nTTHERM_00234180\nTTHERM_00236260\nTTHERM_00237340\nTTHERM_00239010\nTTHERM_00241480\nTTHERM_00241920\nTTHERM_00241930\nTTHERM_00242420\nTTHERM_00243700\nTTHERM_00245240\nTTHERM_00245610\nTTHERM_00245810\nTTHERM_00247160\nTTHERM_00248270\nTTHERM_00249590\nTTHERM_00249670\nTTHERM_00249690\nTTHERM_00249790\nTTHERM_00251020\nTTHERM_00251050\nTTHERM_00251110\nTTHERM_00251150\nTTHERM_00251240\nTTHERM_00252260\nTTHERM_00256930\nTTHERM_00257030\nTTHERM_00257040\nTTHERM_00257130\nTTHERM_00257180\nTTHERM_00258260\nTTHERM_00259290\nTTHERM_00259339\nTTHERM_00259540\nTTHERM_00261920\nTTHERM_00263000\nTTHERM_00263080\nTTHERM_00263120\nTTHERM_00263200\nTTHERM_00263320\nTTHERM_00263409\nTTHERM_00263420\nTTHERM_00263620\nTTHERM_002653450\nTTHERM_002653530\nTTHERM_00266280\nTTHERM_00266540\nTTHERM_00266660\nTTHERM_00268020\nTTHERM_00268080\nTTHERM_00268250\nTTHERM_00268300\nTTHERM_00268320\nTTHERM_00268340\nTTHERM_00274510\nTTHERM_00274530\nTTHERM_00275880\nTTHERM_00277170\nTTHERM_00279660\nTTHERM_00283160\nTTHERM_00283320\nTTHERM_00283500\nTTHERM_00283570\nTTHERM_00285260\nTTHERM_00285330\nTTHERM_00285479\nTTHERM_00285650\nTTHERM_00289110\nTTHERM_00289160\nTTHERM_00289490\nTTHERM_00290620\nTTHERM_00290650\nTTHERM_00292110\nTTHERM_00292120\nTTHERM_00294600\nTTHERM_00294630\nTTHERM_00295350\nTTHERM_00295680\nTTHERM_00297100\nTTHERM_00297120\nTTHERM_00298490\nTTHERM_00299790\nTTHERM_00301870\nTTHERM_00309930\nTTHERM_00310010\nTTHERM_00310020\nTTHERM_00310460\nTTHERM_00310690\nTTHERM_00310780\nTTHERM_00310810\nTTHERM_00312020\nTTHERM_00312030\nTTHERM_00312682\nTTHERM_00313170\nTTHERM_00313200\nTTHERM_00313230\nTTHERM_00313390\nTTHERM_00313580\nTTHERM_00313660\nTTHERM_00316130\nTTHERM_00317250\nTTHERM_00317380\nTTHERM_00317420\nTTHERM_00320039\nTTHERM_00320160\nTTHERM_00321540\nTTHERM_00324460\nTTHERM_00325630\nTTHERM_00328520\nTTHERM_00329780\nTTHERM_00329910\nTTHERM_00329980\nTTHERM_00331050\nTTHERM_00338160\nTTHERM_00339740\nTTHERM_00340130\nTTHERM_00343300\nTTHERM_00343640\nTTHERM_00343740\nTTHERM_00343840\nTTHERM_00344080\nTTHERM_00344110\nTTHERM_00344130\nTTHERM_00344391\nTTHERM_00346910\nTTHERM_00348640\nTTHERM_00348890\nTTHERM_00351140\nTTHERM_00353430\nTTHERM_00355150\nTTHERM_00355169\nTTHERM_00355440\nTTHERM_00355480\nTTHERM_00355650\nTTHERM_00355850\nTTHERM_00359160\nTTHERM_00361610\nTTHERM_00361700\nTTHERM_00361750\nTTHERM_00361800\nTTHERM_00361900\nTTHERM_00363110\nTTHERM_00370950\nTTHERM_00371170\nTTHERM_00371290\nTTHERM_00372630\nTTHERM_00376203\nTTHERM_00377210\nTTHERM_00378420\nTTHERM_00378650\nTTHERM_00378980\nTTHERM_00382400\nTTHERM_00383520\nTTHERM_00383590\nTTHERM_00384650\nTTHERM_00384980\nTTHERM_00387150\nTTHERM_00391260\nTTHERM_00391470\nTTHERM_00392890\nTTHERM_00393110\nTTHERM_00398090\nTTHERM_00399150\nTTHERM_00399340\nTTHERM_00399660\nTTHERM_00400760\nTTHERM_00400810\nTTHERM_00405580\nTTHERM_00408920\nTTHERM_00409090\nTTHERM_00409121\nTTHERM_00410291\nTTHERM_00411440\nTTHERM_00411450\nTTHERM_00412000\nTTHERM_00412040\nTTHERM_00412090\nTTHERM_00414390\nTTHERM_00415690\nTTHERM_00417880\nTTHERM_00417950\nTTHERM_00420590\nTTHERM_00421190\nTTHERM_00421200\nTTHERM_00423380\nTTHERM_00424473\nTTHERM_00425890\nTTHERM_00426050\nTTHERM_00427580\nTTHERM_00429710\nTTHERM_00429729\nTTHERM_00430130\nTTHERM_00430180\nTTHERM_00433440\nTTHERM_00433760\nTTHERM_00434220\nTTHERM_00437300\nTTHERM_00437490\nTTHERM_00437520\nTTHERM_00438810\nTTHERM_00439260\nTTHERM_00441700\nTTHERM_00441800\nTTHERM_00444810\nTTHERM_00444860\nTTHERM_00446060\nTTHERM_00446100\nTTHERM_00448770\nTTHERM_00449230\nTTHERM_00450830\nTTHERM_00450860\nTTHERM_00450900\nTTHERM_00450910\nTTHERM_00456700\nTTHERM_00458160\nTTHERM_00462820\nTTHERM_00463220\nTTHERM_00463460\nTTHERM_00463700\nTTHERM_00463840\nTTHERM_00467400\nTTHERM_00467440\nTTHERM_00470780\nTTHERM_00473230\nTTHERM_00473240\nTTHERM_00475350\nTTHERM_00475380\nTTHERM_00476690\nTTHERM_00476890\nTTHERM_00476920\nTTHERM_00476952\nTTHERM_00481170\nTTHERM_00483410\nTTHERM_00485850\nTTHERM_00485960\nTTHERM_00488380\nTTHERM_00490860\nTTHERM_00490960\nTTHERM_00490970\nTTHERM_00490980\nTTHERM_00491000\nTTHERM_00491060\nTTHERM_00491190\nTTHERM_00494300\nTTHERM_00494730\nTTHERM_00494780\nTTHERM_00494848\nTTHERM_00497690\nTTHERM_00497720\nTTHERM_00498080\nTTHERM_00499390\nTTHERM_00500730\nTTHERM_00500940\nTTHERM_00502350\nTTHERM_00502450\nTTHERM_00502530\nTTHERM_00508990\nTTHERM_00516410\nTTHERM_00516430\nTTHERM_00518470\nTTHERM_00518480\nTTHERM_00519840\nTTHERM_00520930\nTTHERM_00522400\nTTHERM_00525120\nTTHERM_00526300\nTTHERM_00526340\nTTHERM_00527290\nTTHERM_00527420\nTTHERM_00529490\nTTHERM_00529520\nTTHERM_00530039\nTTHERM_00530050\nTTHERM_00530410\nTTHERM_00530659\nTTHERM_00532140\nTTHERM_00532300\nTTHERM_00533900\nTTHERM_00535200\nTTHERM_00535390\nTTHERM_00535510\nTTHERM_00535650\nTTHERM_00537310\nTTHERM_00538999\nTTHERM_00540020\nTTHERM_00540040\nTTHERM_00541510\nTTHERM_00541520\nTTHERM_00543570\nTTHERM_00543640\nTTHERM_00545880\nTTHERM_00548380\nTTHERM_00549450\nTTHERM_00554440\nTTHERM_00558700\nTTHERM_00559850\nTTHERM_00559960\nTTHERM_00560130\nTTHERM_00561150\nTTHERM_00561210\nTTHERM_00561430\nTTHERM_00561540\nTTHERM_00561610\nTTHERM_00566910\nTTHERM_00569070\nTTHERM_00569230\nTTHERM_00569340\nTTHERM_00571750\nTTHERM_00572180\nTTHERM_00575460\nTTHERM_00578740\nTTHERM_00578970\nTTHERM_00580400\nTTHERM_00581460\nTTHERM_00581909\nTTHERM_00582039\nTTHERM_00582040\nTTHERM_00582190\nTTHERM_00583400\nTTHERM_00584770\nTTHERM_00584820\nTTHERM_00586560\nTTHERM_00586690\nTTHERM_00586840\nTTHERM_00588860\nTTHERM_00590300\nTTHERM_00590310\nTTHERM_00592870\nTTHERM_00592970\nTTHERM_00594180\nTTHERM_00594240\nTTHERM_00597620\nTTHERM_00598660\nTTHERM_00607120\nTTHERM_00607170\nTTHERM_00607260\nTTHERM_00609390\nTTHERM_00616020\nTTHERM_00616490\nTTHERM_00617800\nTTHERM_00620900\nTTHERM_00621040\nTTHERM_00621570\nTTHERM_00625970\nTTHERM_00627080\nTTHERM_00627280\nTTHERM_00629770\nTTHERM_00629890\nTTHERM_00629940\nTTHERM_00629970\nTTHERM_00630180\nTTHERM_00630350\nTTHERM_00630370\nTTHERM_00630530\nTTHERM_00630570\nTTHERM_00630620\nTTHERM_00630710\nTTHERM_00630720\nTTHERM_00633200\nTTHERM_00633210\nTTHERM_00633339\nTTHERM_00633640\nTTHERM_00634672\nTTHERM_00635690\nTTHERM_00635760\nTTHERM_00637190\nTTHERM_00637430\nTTHERM_00637460\nTTHERM_00640060\nTTHERM_00640070\nTTHERM_00641260\nTTHERM_00641340\nTTHERM_00643410\nTTHERM_00644660\nTTHERM_00644670\nTTHERM_00644760\nTTHERM_00647450\nTTHERM_00648820\nTTHERM_00648830\nTTHERM_00648850\nTTHERM_00649510\nTTHERM_00652530\nTTHERM_00653700\nTTHERM_00654000\nTTHERM_00656070\nTTHERM_00657540\nTTHERM_00657590\nTTHERM_00658720\nTTHERM_00658990\nTTHERM_00660350\nTTHERM_00661510\nTTHERM_00666940\nTTHERM_00670130\nTTHERM_00670150\nTTHERM_00672050\nTTHERM_00672110\nTTHERM_00675720\nTTHERM_00675750\nTTHERM_00675770\nTTHERM_00675780\nTTHERM_00678340\nTTHERM_00680640\nTTHERM_00683040\nTTHERM_00683320\nTTHERM_00684670\nTTHERM_00685830\nTTHERM_00685840\nTTHERM_00686130\nTTHERM_00688530\nTTHERM_00688640\nTTHERM_00688740\nTTHERM_00689830\nTTHERM_00689870\nTTHERM_00689910\nTTHERM_00691620\nTTHERM_00691910\nTTHERM_00694310\nTTHERM_00694320\nTTHERM_00694530\nTTHERM_00695840\nTTHERM_00697030\nTTHERM_00697050\nTTHERM_00697120\nTTHERM_00697122\nTTHERM_00697570\nTTHERM_00698720\nTTHERM_00699750\nTTHERM_00699830\nTTHERM_00703350\nTTHERM_00704030\nTTHERM_00705090\nTTHERM_00707470\nTTHERM_00709560\nTTHERM_00709630\nTTHERM_00709650\nTTHERM_00709670\nTTHERM_00709700\nTTHERM_00712000\nTTHERM_00713000\nTTHERM_00713260\nTTHERM_00713430\nTTHERM_00713540\nTTHERM_00715940\nTTHERM_00716160\nTTHERM_00716170\nTTHERM_00716330\nTTHERM_00717530\nTTHERM_00717600\nTTHERM_00717650\nTTHERM_00717880\nTTHERM_00717950\nTTHERM_00720100\nTTHERM_00720102\nTTHERM_00721190\nTTHERM_00721470\nTTHERM_00721560\nTTHERM_00721870\nTTHERM_00722970\nTTHERM_00723520\nTTHERM_00723650\nTTHERM_00725950\nTTHERM_00725960\nTTHERM_00726030\nTTHERM_00727530\nTTHERM_00727670\nTTHERM_00727740\nTTHERM_00730250\nTTHERM_00732680\nTTHERM_00733980\nTTHERM_00734070\nTTHERM_00734140\nTTHERM_00735190\nTTHERM_00736530\nTTHERM_00736532\nTTHERM_00748940\nTTHERM_00751032\nTTHERM_00753610\nTTHERM_00754680\nTTHERM_00754690\nTTHERM_00756180\nTTHERM_00758900\nTTHERM_00760440\nTTHERM_00760550\nTTHERM_00760760\nTTHERM_00765350\nTTHERM_00773180\nTTHERM_00773200\nTTHERM_00773440\nTTHERM_00773550\nTTHERM_00775870\nTTHERM_00775910\nTTHERM_00775949\nTTHERM_00777040\nTTHERM_00778400\nTTHERM_00780520\nTTHERM_00782150\nTTHERM_00784250\nTTHERM_00784730\nTTHERM_00787130\nTTHERM_00790550\nTTHERM_00790850\nTTHERM_00790870\nTTHERM_00794010\nTTHERM_00794220\nTTHERM_00794360\nTTHERM_00794460\nTTHERM_00797800\nTTHERM_00798050\nTTHERM_00798120\nTTHERM_00798140\nTTHERM_00798170\nTTHERM_00799182\nTTHERM_00799210\nTTHERM_00802360\nTTHERM_00802530\nTTHERM_00803590\nTTHERM_00803690\nTTHERM_00804780\nTTHERM_00807900\nTTHERM_00808050\nTTHERM_00809110\nTTHERM_00809130\nTTHERM_00809220\nTTHERM_00809250\nTTHERM_00809320\nTTHERM_00809420\nTTHERM_00810530\nTTHERM_00812910\nTTHERM_00816240\nTTHERM_00819450\nTTHERM_00820750\nTTHERM_00821880\nTTHERM_00822150\nTTHERM_00822280\nTTHERM_00823420\nTTHERM_00823840\nTTHERM_00823890\nTTHERM_00824010\nTTHERM_00824050\nTTHERM_00825140\nTTHERM_00827200\nTTHERM_00829330\nTTHERM_00833810\nTTHERM_00834900\nTTHERM_00834920\nTTHERM_00834960\nTTHERM_00835170\nTTHERM_00835210\nTTHERM_00836710\nTTHERM_00840040\nTTHERM_00841290\nTTHERM_00842570\nTTHERM_00842650\nTTHERM_00846960\nTTHERM_00847040\nTTHERM_00848080\nTTHERM_00852770\nTTHERM_00857790\nTTHERM_00857890\nTTHERM_00858020\nTTHERM_00860520\nTTHERM_00861620\nTTHERM_00861650\nTTHERM_00861660\nTTHERM_00861670\nTTHERM_00861680\nTTHERM_00864910\nTTHERM_00869580\nTTHERM_00874740\nTTHERM_00874760\nTTHERM_00875870\nTTHERM_00881320\nTTHERM_00884680\nTTHERM_00887990\nTTHERM_00888000\nTTHERM_00888002\nTTHERM_00888010\nTTHERM_00891270\nTTHERM_00892330\nTTHERM_00894430\nTTHERM_00894440\nTTHERM_00895720\nTTHERM_00895770\nTTHERM_00895790\nTTHERM_00896080\nTTHERM_00898140\nTTHERM_00898160\nTTHERM_00899540\nTTHERM_00901690\nTTHERM_00904010\nTTHERM_00904050\nTTHERM_00907070\nTTHERM_00907090\nTTHERM_00912240\nTTHERM_00912280\nTTHERM_00918470\nTTHERM_00923200\nTTHERM_00925370\nTTHERM_00925430\nTTHERM_00925440\nTTHERM_00925680\nTTHERM_00925730\nTTHERM_00925790\nTTHERM_00925830\nTTHERM_00925850\nTTHERM_00925870\nTTHERM_00927330\nTTHERM_00929490\nTTHERM_00930710\nTTHERM_00930880\nTTHERM_00932030\nTTHERM_00933110\nTTHERM_00935490\nTTHERM_00939010\nTTHERM_00939070\nTTHERM_00941470\nTTHERM_00944110\nTTHERM_00944150\nTTHERM_00947340\nTTHERM_00947370\nTTHERM_00951750\nTTHERM_00951810\nTTHERM_00954080\nTTHERM_00954260\nTTHERM_00957600\nTTHERM_00961900\nTTHERM_00962000\nTTHERM_00972990\nTTHERM_00974040\nTTHERM_00974150\nTTHERM_00974260\nTTHERM_00975330\nTTHERM_00975400\nTTHERM_00979890\nTTHERM_00979940\nTTHERM_00985050\nTTHERM_00985130\nTTHERM_00985190\nTTHERM_00989310\nTTHERM_00992650\nTTHERM_00992670\nTTHERM_00994190\nTTHERM_00994280\nTTHERM_00994370\nTTHERM_00997600\nTTHERM_00997730\nTTHERM_00997770\nTTHERM_00998920\nTTHERM_00999110\nTTHERM_01002570\nTTHERM_01002700\nTTHERM_01002820\nTTHERM_01002900\nTTHERM_01005040\nTTHERM_01006602\nTTHERM_01012090\nTTHERM_01013120\nTTHERM_01013370\nTTHERM_01014500\nTTHERM_01014610\nTTHERM_01015950\nTTHERM_01018430\nTTHERM_01020670\nTTHERM_01026400\nTTHERM_01027580\nTTHERM_01027600\nTTHERM_01030020\nTTHERM_01034450\nTTHERM_01040850\nTTHERM_01043192\nTTHERM_01044560\nTTHERM_01046990\nTTHERM_01050450\nTTHERM_01050550\nTTHERM_01055730\nTTHERM_01058762\nTTHERM_01063940\nTTHERM_01068020\nTTHERM_01071450\nTTHERM_01074550\nTTHERM_01075750\nTTHERM_01075770\nTTHERM_01076820\nTTHERM_01076930\nTTHERM_01079070\nTTHERM_01079230\nTTHERM_01079330\nTTHERM_01080370\nTTHERM_01080430\nTTHERM_01084310\nTTHERM_01087890\nTTHERM_01088000\nTTHERM_01092360\nTTHERM_01093760\nTTHERM_01097910\nTTHERM_01097920\nTTHERM_01097970\nTTHERM_01099080\nTTHERM_01102790\nTTHERM_01104870\nTTHERM_01106170\nTTHERM_01106210\nTTHERM_01107460\nTTHERM_01108590\nTTHERM_01114210\nTTHERM_01117280\nTTHERM_01117320\nTTHERM_01121640\nTTHERM_01122810\nTTHERM_01125220\nTTHERM_01127460\nTTHERM_01127470\nTTHERM_01129750\nTTHERM_01131820\nTTHERM_01132860\nTTHERM_01137233\nTTHERM_01138283\nTTHERM_01139330\nTTHERM_01139360\nTTHERM_01141460\nTTHERM_01141630\nTTHERM_01142700\nTTHERM_01148180\nTTHERM_01149250\nTTHERM_01149310\nTTHERM_01150350\nTTHERM_01150420\nTTHERM_01156800\nTTHERM_01156890\nTTHERM_01165250\nTTHERM_01169420\nTTHERM_01169510\nTTHERM_01169520\nTTHERM_01170540\nTTHERM_01178670\nTTHERM_01178750\nTTHERM_01183080\nTTHERM_01186250\nTTHERM_01188330\nTTHERM_01188340\nTTHERM_01188390\nTTHERM_01194810\nTTHERM_01197010\nTTHERM_01197150\nTTHERM_01202212\nTTHERM_01206400\nTTHERM_01207610\nTTHERM_01217230\nTTHERM_01223622\nTTHERM_01224650\nTTHERM_01227690\nTTHERM_01227790\nTTHERM_01232200\nTTHERM_01236340\nTTHERM_01244600\nTTHERM_01245650\nTTHERM_01246720\nTTHERM_01248940\nTTHERM_01250150\nTTHERM_01250170\nTTHERM_01250180\nTTHERM_01251200\nTTHERM_01260670\nTTHERM_01263970\nTTHERM_012701404\nTTHERM_01273260\nTTHERM_01284820\nTTHERM_01287960\nTTHERM_01293260\nTTHERM_01302830\nTTHERM_01306850\nTTHERM_01307940\nTTHERM_01311230\nTTHERM_01311270\nTTHERM_01311280\nTTHERM_01318430\nTTHERM_01318450\nTTHERM_01318470\nTTHERM_01321580\nTTHERM_01321640\nTTHERM_01322650\nTTHERM_01323720\nTTHERM_01323730\nTTHERM_01324740\nTTHERM_01326860\nTTHERM_01326870\nTTHERM_01330050\nTTHERM_01332100\nTTHERM_01334240\nTTHERM_01338460\nTTHERM_01338530\nTTHERM_01347950\nTTHERM_01351080\nTTHERM_01354280\nTTHERM_01364620\nTTHERM_01365632\nTTHERM_01365640\nTTHERM_01374860\nTTHERM_01393300\nTTHERM_01396430\nTTHERM_01399520\nTTHERM_01403830\nTTHERM_01405890\nTTHERM_01407940\nTTHERM_01413110\nTTHERM_01415160\nTTHERM_01417300\nTTHERM_01421350\nTTHERM_01423410\nTTHERM_01423430\nTTHERM_01424460\nTTHERM_01434610\nTTHERM_01434620\nTTHERM_01434650\nTTHERM_01446960\nTTHERM_01461140\nTTHERM_01462160\nTTHERM_01466260\nTTHERM_01467270\nTTHERM_01469340\nTTHERM_01485662\nTTHERM_01498890\nTTHERM_01512220\nTTHERM_01532560\nTTHERM_01538670\nTTHERM_01542800\nTTHERM_01546850\nTTHERM_01551970\nTTHERM_01571180\nTTHERM_01594610\nTTHERM_01606700\nTTHERM_01607710\nTTHERM_01615782\nTTHERM_016600704\nTTHERM_01671170\nTTHERM_01683270\nTTHERM_01808870
# """.split()

In [19]:
# checkout_df = complete_annot.loc[complete_annot['TTHERM_ID'].isin(checkout)]

In [20]:
# len(checkout)

In [21]:
# checkout_df.to_csv('../checkout_different_filtered_genes.csv', index=False)

In [22]:
# complete_annot.to_csv('/Users/eukarya/Downloads/tgn_complete_annot.csv', index=False)

## Round one of plotting and consensus 

First, cluster accoding to UMAP/Leiden in full-dimensional space, using Manhattan distance and setting the number of neighbors to 5

In [152]:
leiden_label_df_round_1, full_dists = build_leiden_label_df(full_filtered_norm_df, 'full', random_state=42, n_neighbors=3)
leiden_label_df_round_1, veg_dists = build_leiden_label_df(veg_filtered_norm_df, 'veg', lldf=leiden_label_df_round_1, random_state=42, n_neighbors=3)
leiden_label_df_round_1, sex_dists = build_leiden_label_df(sex_filtered_norm_df, 'sex', lldf=leiden_label_df_round_1, random_state=42, n_neighbors=3)

In [153]:
leiden_label_df_round_1

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
0,TTHERM_000000045,15,19,29
1,TTHERM_00000010,26,39,21
2,TTHERM_00000020,26,25,53
3,TTHERM_00000030,50,39,6
4,TTHERM_00000040,0,8,9
...,...,...,...,...
17498,TTHERM_02105572,36,30,25
17499,TTHERM_02272860,36,30,25
17500,TTHERM_02385080,23,30,25
17501,TTHERM_02607240,17,38,38


Check out how many clusters we have

In [154]:
# leiden_label_df_round_1.to_csv('../clustergrammer/leiden_label_df_round_1_nn2.csv', index=False)

In [155]:
max(leiden_label_df_round_1['leiden_label_full'].values)

60

In [156]:
max(leiden_label_df_round_1['leiden_label_veg'].values)

46

In [157]:
max(leiden_label_df_round_1['leiden_label_sex'].values)

59

In [158]:
module_size_list = []
for l in leiden_label_df_round_1['leiden_label_full'].unique():
    df = leiden_label_df_round_1.loc[leiden_label_df_round_1['leiden_label_full'] == l]
    module_size_list.append(len(df))
    
np.mean(module_size_list)

286.9344262295082

In [159]:
np.median(module_size_list)

277.0

In [160]:
np.max(module_size_list)

808

In [161]:
labeled_annot = leiden_label_df_round_1.merge(complete_annot, on='TTHERM_ID')
labeled_annot.head()

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,...,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs,TGD2021_description
0,TTHERM_000000045,15,19,29,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,transmembrane protein putative
1,TTHERM_00000010,26,39,21,5911.EAR87737,1.7e-272,898.9,"KOG0575@1|root,KOG0575@2759|Eukaryota",2759|Eukaryota,T,...,"ko04068,ko04110,ko04114,ko04136,ko04138,ko0413...",-,-,-,"ko00000,ko00001,ko01000,ko01001,ko03029,ko0303...",-,-,-,-,protein kinase
2,TTHERM_00000020,26,25,53,5911.EAR87739,1.6e-77,260.4,"KOG0032@1|root,KOG0032@2759|Eukaryota",2759|Eukaryota,T,...,"ko04010,ko04114,ko04136,ko04138,ko04139,ko0414...",-,-,-,"ko00000,ko00001,ko01000,ko01001,ko03019,ko0302...",-,-,-,-,Serine/Threonine kinase domain protein
3,TTHERM_00000030,50,39,6,5888.CAK56476,5.8999999999999995e-52,172.6,"KOG0615@1|root,KOG0615@2759|Eukaryota",2759|Eukaryota,T,...,"ko04140,ko04152,ko04211,ko04626,ko04920,ko0492...",-,-,-,"ko00000,ko00001,ko01000,ko01001",-,-,-,-,Serine/Threonine kinase domain protein
4,TTHERM_00000040,0,8,9,5911.EAR87740,0.0,2139.6,"2CVX5@1|root,2RT23@2759|Eukaryota,3ZDPV@5878|C...",5878|Ciliophora,-,...,-,-,-,-,-,-,-,-,-,B-box zinc finger protein


In [162]:
leiden_label_df_round_1.loc[leiden_label_df_round_1['leiden_label_full']==0]

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
4,TTHERM_00000040,0,8,9
6,TTHERM_000001189,0,0,27
20,TTHERM_000011759,0,33,44
60,TTHERM_000030389,0,27,7
89,TTHERM_00004840,0,0,27
...,...,...,...,...
17381,TTHERM_01443850,0,6,35
17459,TTHERM_01587460,0,12,27
17462,TTHERM_01592580,0,12,7
17464,TTHERM_01592592,0,42,2


In [163]:
leiden_label_df_round_1.loc[leiden_label_df_round_1['TTHERM_ID']=='TTHERM_00532700']

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
10315,TTHERM_00532700,25,41,1


In [164]:
labeled_annot.loc[labeled_annot['leiden_label_full'] == 10][['TTHERM_ID', 'Description', 'TGD2021_description']]

Unnamed: 0,TTHERM_ID,Description,TGD2021_description
12,TTHERM_00001080,Rab subfamily of small GTPases,RAB4A Rab-family small GTPase Rab4B
38,TTHERM_00001440,"Transmembrane protein, putative",transmembrane protein putative
56,TTHERM_000028669,-,hypothetical protein
67,TTHERM_000037389,-,transmembrane protein putative
142,TTHERM_00006390,calcium ion binding,EF hand protein
...,...,...,...
17247,TTHERM_01344710,Protein kinase domain,calmodulin-domain kinase
17252,TTHERM_01345760,Belongs to the small GTPase superfamily. Arf f...,ADP-ribosylation factor(Arf)/Arf-like (ARL) sm...
17326,TTHERM_01398470,"oxidoreductase activity, acting on paired dono...",cytochrome P450 family monooxygenase
17331,TTHERM_01403800,SNAP receptor activity,SNARE domain protein


In [165]:
leiden_label_df_round_1.to_csv('./test_nn3_leiden_label_df_round_1.csv', index=False)

In [166]:
np.sum(np.array(module_size_list) >= 5)

59

In [168]:
leiden_label_df_round_1.loc[leiden_label_df_round_1['TTHERM_ID'] == 'TTHERM_01386050']

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
17314,TTHERM_01386050,47,3,47


In [169]:
# full_pymde_data = full_filtered_norm_df[list(full_filtered_norm_df.columns)[1:]].values

# pymde.seed(42)
# torch.seed(42)

# full_embedding = pymde.preserve_neighbors(
#     full_pymde_data, 
#     verbose=True,
#     n_neighbors=5,
#     constraint=pymde.Standardized()
# ).embed()

# pymde_full_df = pd.DataFrame(np.array(full_embedding), columns=('x', 'y'))
# bokeh.plotting.output_file(filename='./plots/full_norm_pymde_leiden_round_1.2.html', mode='inline')

# p = plot_embedding(full_filtered_norm_df, pymde_full_df, complete_annot, leiden_label_df_round_1, 'full', palette32, n_neighbors=5, radius=0.005)
# bokeh.plotting.save(p)

For ordering the leiden clusters in the heatmap, try computing the cityblock distance between centroids and then ordering by smallest distance, starting with largest cluster.

In [170]:
test_centroid = pd.DataFrame({'a': np.array([1, 2, 3, 4]), 'b': np.array([0, 2, 0, 2])})
test_centroid

Unnamed: 0,a,b
0,1,0
1,2,2
2,3,0
3,4,2


In [171]:
test_centroid.mean(axis=0).values

array([2.5, 1. ])

In [172]:
l = get_all_module_centroids(full_filtered_norm_df, leiden_label_df_round_1, 'leiden', 'full')

Module 0 is always the largest for leiden and WGCNA; need to check about CLR/MCL

In [193]:
cdf = get_module_centroid_df(full_filtered_norm_df, leiden_label_df_round_1, 'leiden', 'full')

In [195]:
arr_df = arrange_modules(full_filtered_norm_df, leiden_label_df_round_1, 'leiden', 'full')
arr_df

17503


Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
191,TTHERM_000092809,43,7,26
236,TTHERM_00011350,43,7,26
263,TTHERM_00011710,43,7,26
289,TTHERM_000129409,43,7,42
324,TTHERM_00013410,43,7,26
...,...,...,...,...
17126,TTHERM_01292190,27,32,42
17138,TTHERM_01298450,27,5,8
17386,TTHERM_01444910,27,18,32
17472,TTHERM_01624890,27,16,36


In [196]:
arrange_modules(full_filtered_df, leiden_label_df_round_1, 'leiden', 'full')


17503


Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
138,TTHERM_00006340,43,27,47
291,TTHERM_00012970,43,27,26
434,TTHERM_00016330,43,27,8
557,TTHERM_00024170,43,27,4
942,TTHERM_00038850,43,27,17
...,...,...,...,...
16764,TTHERM_01169380,36,8,31
16985,TTHERM_01243520,36,29,21
17097,TTHERM_01284740,36,37,16
17189,TTHERM_01311290,36,33,4


In [197]:
arr_df.loc[arr_df['leiden_label_full'] == 0]

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
28,TTHERM_00001300,0,14,2
131,TTHERM_00006260,0,29,20
285,TTHERM_000128319,0,9,15
320,TTHERM_00013340,0,3,27
708,TTHERM_00030440,0,10,24
...,...,...,...,...
16579,TTHERM_01123990,0,3,20
16787,TTHERM_01179850,0,3,34
17003,TTHERM_01248890,0,4,8
17122,TTHERM_01289160,0,10,34


In [187]:
arr_df.loc[arr_df['TTHERM_ID'] == 'TTHERM_01386050']

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
17314,TTHERM_01386050,28,3,47


In [188]:
leiden_label_df_round_1.loc[leiden_label_df_round_1['TTHERM_ID'] == 'TTHERM_01386050']

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
17314,TTHERM_01386050,47,3,47


In [198]:
arr_df.to_csv('./test_nn3_leiden_label_df_round_1_rearranged.csv')

In [190]:
leiden_label_df_round_1.head()

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
0,TTHERM_000000045,15,19,29
1,TTHERM_00000010,26,39,21
2,TTHERM_00000020,26,25,53
3,TTHERM_00000030,50,39,6
4,TTHERM_00000040,0,8,9


In [192]:
arrange_modules(full_filtered_norm_df, leiden_label_df_round_1, 'leiden', 'full')

17503


Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
191,TTHERM_000092809,43,7,26
236,TTHERM_00011350,43,7,26
263,TTHERM_00011710,43,7,26
289,TTHERM_000129409,43,7,42
324,TTHERM_00013410,43,7,26
...,...,...,...,...
17126,TTHERM_01292190,27,32,42
17138,TTHERM_01298450,27,5,8
17386,TTHERM_01444910,27,18,32
17472,TTHERM_01624890,27,16,36


In [46]:
arrange_modules(veg_filtered_df, leiden_label_df_round_1, 'leiden', 'veg')

NameError: name 'veg_filtered_df' is not defined

In [208]:
arrange_modules(sex_filtered_df, leiden_label_df_round_1, 'leiden', 'sex')

17503


Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
29,TTHERM_00001310,9,39,55
40,TTHERM_000016167,9,1,55
42,TTHERM_000016298,9,0,55
128,TTHERM_00006230,20,20,55
133,TTHERM_00006290,9,0,55
...,...,...,...,...
17185,TTHERM_01310180,1,14,14
17298,TTHERM_01367710,12,19,14
797,TTHERM_00034960,14,20,43
5857,TTHERM_00289180,14,28,43


In [96]:
def generate_and_save_umap(outfile_name, expression_df, annotation_df, label_df, clust_alg, phase, palette, title, n_neighbors=5, n_components=2, radius=0.02, random_state=42, normalized=True):
    
    data = expression_df[list(expression_df.columns)[1:]].values
    
    umap_mapper = umap.UMAP(random_state=random_state, n_components=n_components, n_neighbors=n_neighbors).fit(data)
    embedding = _get_umap_embedding(umap_mapper)
    
    umap_df = pd.DataFrame(np.array(embedding), columns=('x', 'y'))
    
    # bokeh.plotting.output_file(filename=outfile_name, title=title, mode='inline')
    p = plot_embedding(expression_df, umap_df, annotation_df, label_df, clust_alg, phase, palette, title=title, n_neighbors=n_neighbors, radius=radius, normalized=normalized)
    # bokeh.plotting.save(p)
    print(outfile_name)
    return p

In [48]:
%pdb

Automatic pdb calling has been turned ON


In [97]:
leiden_label_df_round_1

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex
0,TTHERM_000000045,2,0,2
1,TTHERM_00000010,7,42,7
2,TTHERM_00000020,7,40,7
3,TTHERM_00000030,7,40,3
4,TTHERM_00000040,14,4,48
...,...,...,...,...
16568,TTHERM_02105572,17,24,17
16569,TTHERM_02272860,17,13,17
16570,TTHERM_02385080,17,24,17
16571,TTHERM_02607240,10,21,8


In [199]:
p = generate_and_save_umap('./plots/interview_dashboard.html', full_filtered_norm_df, complete_annot, leiden_label_df_round_1, 'leiden', 'full', palette64, 'Full normalized expression w/ Leiden clustering (round 1) (nn=3)', radius=0.07, normalized=True)
bokeh.io.show(p)

17503


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hm_df['module'] = hover_data['module'].values


./plots/interview_dashboard.html


In [50]:
generate_and_save_umap('./plots/full_norm_umap_leiden_round_1_nn3.html', full_filtered_norm_df, complete_annot, leiden_label_df_round_1, 'full', palette64, 'Full normalized expression w/ Leiden clustering (round 1) (nn=3)')
generate_and_save_umap('./plots/veg_norm_umap_leiden_round_1_nn3.html', veg_filtered_norm_df, complete_annot, leiden_label_df_round_1, 'veg', palette64, 'Veg normalized expression w/ Leiden clustering (round 1) (nn=3)')
generate_and_save_umap('./plots/sex_norm_umap_leiden_round_1_nn3.html', sex_filtered_norm_df, complete_annot, leiden_label_df_round_1, 'sex', palette64, 'Sex normalized expression w/ Leiden clustering (round 1) (nn=3)')

TypeError: interactive() missing 1 required positional argument: 'expr_df'

In [53]:
wgcna_label_df = pd.read_csv('../WGCNA/wgcna_labels_round_1.csv')
wgcna_label_df.head()

Unnamed: 0,TTHERM_ID,wgcna_label_full,wgcna_label_veg,wgcna_label_sex
0,TTHERM_000000045,16,11,3
1,TTHERM_00000010,5,7,2
2,TTHERM_00000020,0,3,0
3,TTHERM_00000030,0,7,4
4,TTHERM_00000040,0,0,27


In [54]:
print(max(wgcna_label_df['wgcna_label_full']), max(wgcna_label_df['wgcna_label_veg']), max(wgcna_label_df['wgcna_label_sex']))

19 20 37


In [45]:
# wgcna_label_df.to_csv('/Users/eukarya/Downloads/tgn_wgcna_labels_round_1.csv', index=False)

In [65]:
mcl_dict = {'full': {},
            'veg': {},
            'sex': {}}
with open('../CLR_MCL/full_labeled_MCL_nn2_round_1_clusters.txt', 'r') as f:
    clusters = f.readlines()
    full = mcl_dict['full']
    for i, c in enumerate(clusters):
        ttids = c.split()
        for t in ttids:
            full[t] = i
            

with open('../CLR_MCL/veg_labeled_MCL_nn2_round_1_clusters.txt', 'r') as f:
    clusters = f.readlines()
    veg = mcl_dict['veg']
    for i, c in enumerate(clusters):
        ttids = c.split()
        for t in ttids:
            veg[t] = i
            
with open('../CLR_MCL/sex_labeled_MCL_nn2_round_1_clusters.txt', 'r') as f:
    clusters = f.readlines()
    sex = mcl_dict['sex']
    for i, c in enumerate(clusters):
        ttids = c.split()
        for t in ttids:
            sex[t] = i

In [66]:
len(mcl_dict['sex'])

16595

In [67]:
mcl_label_df = pd.DataFrame({'TTHERM_ID': wgcna_label_df['TTHERM_ID'].values})

In [68]:
mcl_label_df['mcl_label_full'] = [mcl_dict['full'][t] for t in mcl_label_df['TTHERM_ID'].values]

mcl_label_df['mcl_label_veg'] = [mcl_dict['veg'][t] for t in mcl_label_df['TTHERM_ID'].values]
mcl_label_df['mcl_label_sex'] = [mcl_dict['sex'][t] for t in mcl_label_df['TTHERM_ID'].values]

In [69]:
mcl_label_df.head()

Unnamed: 0,TTHERM_ID,mcl_label_full,mcl_label_veg,mcl_label_sex
0,TTHERM_000000045,60,1193,69
1,TTHERM_00000010,3217,336,1296
2,TTHERM_00000020,332,10,488
3,TTHERM_00000030,1203,348,2418
4,TTHERM_00000040,520,2579,2782


In [62]:
mcl_label_df.to_csv('../CLR_MCL/mcl_label_df_round_1_I14.csv', index=False)

In [70]:
all_alg_label_df = leiden_label_df_round_1.merge(wgcna_label_df, on='TTHERM_ID')
all_alg_label_df = all_alg_label_df.merge(mcl_label_df, on='TTHERM_ID')

In [71]:
all_alg_label_df.head()

Unnamed: 0,TTHERM_ID,leiden_label_full,leiden_label_veg,leiden_label_sex,wgcna_label_full,wgcna_label_veg,wgcna_label_sex,mcl_label_full,mcl_label_veg,mcl_label_sex
0,TTHERM_000000045,2,0,4,16,11,3,60,1193,69
1,TTHERM_00000010,19,20,7,5,7,2,3217,336,1296
2,TTHERM_00000020,9,38,11,0,3,0,332,10,488
3,TTHERM_00000030,9,20,11,0,7,4,1203,348,2418
4,TTHERM_00000040,15,32,5,0,0,27,520,2579,2782


In [62]:
wgcna_p = generate_and_save_umap('./plots/test_heatmap.html', full_filtered_norm_df, complete_annot, all_alg_label_df, 'wgcna', 'full', palette64, 'Full normalized expression w/ WGCNA clustering (round 1)')

The Copheretic correlation is: 0.7330470980757399
16595


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hm_df['module'] = hover_data['module'].values


./plots/test_heatmap.html


In [63]:
bokeh.io.show(wgcna_p)

In [72]:
mcl_p = generate_and_save_umap('./plots/test_heatmap.html', full_filtered_norm_df, complete_annot, all_alg_label_df, 'mcl', 'full', palette64, 'Full normalized expression w/ CLR/MCL clustering (round 1)')
bokeh.io.show(mcl_p)

The Copheretic correlation is: 0.5513830666275431
16595
Color key has fewer colors than labels. Making all white


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hm_df['module'] = hover_data['module'].values


./plots/test_heatmap.html


In [158]:
# mcl_label_df.to_csv('/Users/eukarya/Downloads/tgn_mcl_label_df_round_1.csv', index=False)

In [175]:
def get_tthermids_in_module(label_df, ttherm_id_query, label_key):
    """
    Function to get the TTHERM_IDs of genes in a specific module (as defined by a gene of interest) from
    a specific analysis (as defined by the applied algorithm and physiological regime).
    
    Parameters:
    -----------
    label_df : Pandas DataFrame
        The df with TTHERM_IDs and corresponding module labels
        
    ttherm_id_query : str (format TTHERM_########)
        The TTHERM_ID (gene ID) of interest
        
    label_key : str
        Defines the algorithm and physiological phases: e.g., "clr_label_full"
        
    Returns:
    --------
    set(ttherm_ids) : set
        The TTHERM_IDs that were identified
    """
    
    module = label_df.loc[label_df['TTHERM_ID'] == ttherm_id_query][label_key].values[0]
    ttherm_ids = label_df.loc[label_df[label_key] == module]['TTHERM_ID'].values
    
    return set(ttherm_ids)

def get_hybrid_clusters(leiden_lab, wgcna_lab, mcl_lab, phases):
    """
    Function to find hybrid (consensus) clusters of genes about which
    UMAP/Leiden, WGCNA, and CLR all agree.
    
    Parameters:
    -----------
    leiden_lab : pandas DataFrame
        dataframe with the UMAP/Leiden labels
    wgcna_lab : pandas DataFrame
        dataframe with the WGCNA labels
    clr_lab : pandas DataFrame
        dataframe with the CLR labels
    phases : str ('full', 'veg', or 'sex')
        the physiological phase(s) of interest
        
    Returns:
    --------
    hybrid_clusters : list of sets of TTHERM_IDs
        the consensus between UMAP/Leiden, WGCNA, and MCL
    """
    
    
    assigned = {t: False for t in leiden_lab['TTHERM_ID'].values}
    hybrid_clusters = []
    
    for t in leiden_lab['TTHERM_ID'].values:
        
        if assigned[t] == True:
            continue
        
        leiden_module = get_tthermids_in_module(leiden_lab, t, f'leiden_label_{phases}')
        wgcna_module = get_tthermids_in_module(wgcna_lab, t, f'wgcna_label_{phases}')
        mcl_module = get_tthermids_in_module(mcl_lab, t, f'mcl_label_{phases}')

        intersection = leiden_module.intersection(wgcna_module).intersection(mcl_module)
        hybrid_clusters.append(intersection)

        for g in intersection:
            if assigned[g] == False:
                assigned[g] = True

            else:
                print(f'{g} already assigned. Something went wrong.')
                break
                
    return hybrid_clusters

def populate_hybrid_label_df(hybrid_clusters, phases, size_threshold):
    """
    Function to build a label dataframe out of the hybrid (consensus) modules.
    
    Parameters:
    -----------
    hybrid_clusters : list of sets of TTHERM_IDs
        as returned by get_hybrid_clusters()
        
    phases : str ('full', 'veg', or 'sex')
        the physiological phase(s) of interest
        
    size_threshold : int
        The minimum consensus set size to include. Setting it to zero
        would be equivalent to taking all of the genes, consensus 
        nonwithstanding. The higher the threshold, the fewer genes
        will be included (i.e., the more strict the consensus). I
        found that 5 is good, especially for doing a second consensus
        iteration.
    """
    
    df = pd.DataFrame({'TTHERM_ID': [],
                       f'hybrid_label_{phases}': []})

    row_num = 0
    for i, c in enumerate(hybrid_clusters):
        if len(c) >= size_threshold:
            for ttid in c:
#                 print(ttid, int(i))
                df.loc[row_num] = [ttid, int(i)]
                row_num += 1
                
    return df

In [176]:
full_hybrid_clusters = get_hybrid_clusters(leiden_label_df_round_1, wgcna_label_df, mcl_label_df, 'full')

veg_hybrid_clusters = get_hybrid_clusters(leiden_label_df_round_1, wgcna_label_df, mcl_label_df, 'veg')
sex_hybrid_clusters = get_hybrid_clusters(leiden_label_df_round_1, wgcna_label_df, mcl_label_df, 'sex')

In [177]:
len(full_hybrid_clusters)

14173

In [178]:
len([c for c in full_hybrid_clusters if len(c) >= 2])

1241

In [179]:
len([c for c in veg_hybrid_clusters if len(c) >= 2])

919

In [180]:
len([c for c in sex_hybrid_clusters if len(c) >= 2])

1091

In [181]:
# hybrid_label_df = pd.DataFrame({'TTHERM_ID': []})
full_hybrid_label_3 = populate_hybrid_label_df(full_hybrid_clusters, 'full', 2)

veg_hybrid_label_3 = populate_hybrid_label_df(veg_hybrid_clusters, 'veg', 2)
sex_hybrid_label_3 = populate_hybrid_label_df(sex_hybrid_clusters, 'sex', 2)

In [182]:
full_consensus_round_1 = full_filtered_norm_df.loc[full_filtered_norm_df['TTHERM_ID'].isin(full_hybrid_label_3['TTHERM_ID'].values)]

veg_consensus_round_1 = veg_filtered_norm_df.loc[veg_filtered_norm_df['TTHERM_ID'].isin(veg_hybrid_label_3['TTHERM_ID'].values)]
sex_consensus_round_1 = sex_filtered_norm_df.loc[sex_filtered_norm_df['TTHERM_ID'].isin(sex_hybrid_label_3['TTHERM_ID'].values)]

In [None]:
full_consensus_round_1.to_csv('../full_expr_consensus_round_1.csv', index=False)
veg_consensus_round_1.to_csv('../veg_expr_consensus_round_1.csv', index=False)
sex_consensus_round_1.to_csv('../sex_expr_consensus_round_1.csv', index=False)

In [183]:
len(full_consensus_round_1)

3663

In [184]:
len(veg_consensus_round_1)

2508

In [185]:
len(sex_consensus_round_1)

2983

In [186]:
len(
    set(full_consensus_round_1['TTHERM_ID'].values).union(
    set(veg_consensus_round_1['TTHERM_ID'].values)).union(
        set(sex_consensus_round_1['TTHERM_ID'].values)
    )
   )

6193

In [None]:
# consensus_round_1_df_full_4 = leiden_x_wgcna_x_mcl_filt(full_filtered_norm_df, full_hybrid_label_4)
# consensus_round_1_df_veg_4 = leiden_x_wgcna_x_mcl_filt(veg_filtered_norm_df, veg_hybrid_label_4)
# consensus_round_1_df_sex_4 = leiden_x_wgcna_x_mcl_filt(sex_filtered_norm_df, sex_hybrid_label_4)


In [None]:
# len(consensus_round_1_df_full_4)

In [192]:
consensus_lldf_full_3, consensus_full_dists = build_leiden_label_df(full_consensus_round_1, 'full', n_neighbors=5)
consensus_lldf_veg_3, consensus_veg_dists = build_leiden_label_df(veg_consensus_round_1, 'veg', n_neighbors=5)
consensus_lldf_sex_3, consensus_sex_dists = build_leiden_label_df(sex_consensus_round_1, 'sex', n_neighbors=5)

In [None]:
consensus_lldf_full_3.to_csv('~/Downloads/tgn_consensus_lldf_full_3.csv', index=False)
consensus_lldf_veg_3.to_csv('~/Downloads/tgn_consensus_lldf_veg_3.csv', index=False)
consensus_lldf_sex_3.to_csv('~/Downloads/tgn_consensus_lldf_sex_3.csv', index=False)

In [193]:
consensus_lldf_full_3.head()

Unnamed: 0,TTHERM_ID,leiden_label_full
0,TTHERM_00000070,12
1,TTHERM_000011208,10
2,TTHERM_00001150,14
3,TTHERM_000011759,17
4,TTHERM_00001230,9


In [194]:
max(consensus_lldf_full_3['leiden_label_full'])

36

In [195]:
max(consensus_lldf_veg_3['leiden_label_veg'])

28

In [196]:
max(consensus_lldf_sex_3['leiden_label_sex'])

33

In [200]:
!pwd

/Users/eukarya/Documents/git/TGNE-2022/TGNE/embedding


In [205]:
p = generate_and_save_umap('./plots/full_norm_umap_leiden_consensus_1_nn5.html', full_consensus_round_1, complete_annot, consensus_lldf_full_3, 'full', palette36, 'Full normalized expression w/ Leiden clustering (consensus 1) (nn=5)')

./plots/full_norm_umap_leiden_consensus_1_nn5.html


In [206]:
bokeh.io.show(p)

In [198]:
generate_and_save_umap('./plots/veg_norm_umap_leiden_consensus_1_nn5.html', veg_consensus_round_1, complete_annot, consensus_lldf_veg_3, 'veg', palette36, 'Veg normalized expression w/ Leiden clustering (consensus 1) (nn=5)')

In [199]:
generate_and_save_umap('./plots/sex_norm_umap_leiden_consensus_1_nn5.html', sex_consensus_round_1, complete_annot, consensus_lldf_sex_3, 'sex', palette36, 'Sex normalized expression w/ Leiden clustering (consensus 1) (nn=5)')

### Round 2 consensus!!

In [None]:
wgcna_label_round_2_df = pd.read_csv('../WGCNA/wgcna_labels_round_2.csv')
wgcna_label_round_2_df.head()

In [None]:
wgcna_label_round_2_df.to_csv('/Users/eukarya/Downloads/tgn_wgcna_labels_round_2.csv', index=False)

In [None]:
wgcna_full_round_2 = wgcna_label_round_2_df[['TTHERM_ID', 'wgcna_label_full']].dropna()
wgcna_veg_round_2 = wgcna_label_round_2_df[['TTHERM_ID', 'wgcna_label_veg']].dropna()
wgcna_sex_round_2 = wgcna_label_round_2_df[['TTHERM_ID', 'wgcna_label_sex']].dropna()
wgcna_sex_round_2

In [None]:
mcl_dict2 = {'full': {},
            'veg': {},
            'sex': {}}
with open('../CLR_MCL/MCL/full_labeled_MCL_round_2_clusters.txt', 'r') as f:
    clusters = f.readlines()
    full = mcl_dict2['full']
    for i, c in enumerate(clusters):
        ttids = c.split()
        for t in ttids:
            full[t] = i
            
with open('../CLR_MCL/MCL/veg_labeled_MCL_round_2_clusters.txt', 'r') as f:
    clusters = f.readlines()
    veg = mcl_dict2['veg']
    for i, c in enumerate(clusters):
        ttids = c.split()
        for t in ttids:
            veg[t] = i
            
with open('../CLR_MCL/MCL/sex_labeled_MCL_round_2_clusters.txt', 'r') as f:
    clusters = f.readlines()
    sex = mcl_dict2['sex']
    for i, c in enumerate(clusters):
        ttids = c.split()
        for t in ttids:
            sex[t] = i

In [None]:
len(mcl_dict2['sex'])

In [None]:
mcl_round_2_ttids = wgcna_label_round_2_df['TTHERM_ID'].values

In [None]:
len(mcl_round_2_ttids)

In [None]:
mcl_label_round_2_df = pd.DataFrame({'TTHERM_ID': list(mcl_round_2_ttids)})

In [None]:
mcl_label_round_2_df['mcl_label_full'] = [mcl_dict2['full'][t] if t in mcl_dict2['full'].keys() else None for t in mcl_label_round_2_df['TTHERM_ID'].values]
mcl_label_round_2_df['mcl_label_veg'] = [mcl_dict2['veg'][t] if t in mcl_dict2['veg'].keys() else None for t in mcl_label_round_2_df['TTHERM_ID'].values]
mcl_label_round_2_df['mcl_label_sex'] = [mcl_dict2['sex'][t] if t in mcl_dict2['sex'].keys() else None for t in mcl_label_round_2_df['TTHERM_ID'].values]

In [None]:
mcl_label_round_2_df.head()

In [None]:
mcl_label_round_2_df.to_csv('/Users/eukarya/Downloads/tgn_mcl_labels_round_2.csv', index=False)

In [None]:
mcl_full_round_2 = mcl_label_round_2_df[['TTHERM_ID', 'mcl_label_full']].dropna()
mcl_veg_round_2 = mcl_label_round_2_df[['TTHERM_ID', 'mcl_label_veg']].dropna()
mcl_sex_round_2 = mcl_label_round_2_df[['TTHERM_ID', 'mcl_label_sex']].dropna()
mcl_sex_round_2

In [None]:
len(consensus_lldf_full_3)

In [None]:
len(wgcna_full_round_2)

In [None]:
len(mcl_full_round_2)

In [None]:
%pdb

In [None]:
full_consensus_2_hybrid_clusters = get_hybrid_clusters(consensus_lldf_full_3, wgcna_full_round_2, mcl_full_round_2, 'full')
veg_consensus_2_hybrid_clusters = get_hybrid_clusters(consensus_lldf_veg_3, wgcna_veg_round_2, mcl_veg_round_2, 'veg')
sex_consensus_2_hybrid_clusters = get_hybrid_clusters(consensus_lldf_sex_3, wgcna_sex_round_2, mcl_sex_round_2, 'sex')

In [None]:
full_consensus_2_hybrid_label_4 = populate_hybrid_label_df(full_consensus_2_hybrid_clusters, 'full', 4)
veg_consensus_2_hybrid_label_4 = populate_hybrid_label_df(veg_consensus_2_hybrid_clusters, 'veg', 4)
sex_consensus_2_hybrid_label_4 = populate_hybrid_label_df(sex_consensus_2_hybrid_clusters, 'sex', 4)

In [None]:
full_consensus_round_2 = full_filtered_norm_df.loc[full_filtered_norm_df['TTHERM_ID'].isin(full_consensus_2_hybrid_label_4['TTHERM_ID'].values)]
veg_consensus_round_2 = veg_filtered_norm_df.loc[veg_filtered_norm_df['TTHERM_ID'].isin(veg_consensus_2_hybrid_label_4['TTHERM_ID'].values)]
sex_consensus_round_2 = sex_filtered_norm_df.loc[sex_filtered_norm_df['TTHERM_ID'].isin(sex_consensus_2_hybrid_label_4['TTHERM_ID'].values)]

In [None]:
consensus_round_2_lldf_full_4 = build_leiden_label_df(full_consensus_round_2, 'full')
consensus_round_2_lldf_veg_4 = build_leiden_label_df(veg_consensus_round_2, 'veg')
consensus_round_2_lldf_sex_4 = build_leiden_label_df(sex_consensus_round_2, 'sex')

In [None]:
len(
    set(consensus_round_2_lldf_sex_4['TTHERM_ID'].values).union(
        set(consensus_round_2_lldf_full_4['TTHERM_ID'].values)
    ).union(
        set(consensus_round_2_lldf_veg_4['TTHERM_ID'].values)
    )       
)

In [None]:
max(consensus_round_2_lldf_full_4['leiden_label_full'])

In [None]:
max(consensus_round_2_lldf_veg_4['leiden_label_veg'])

In [None]:
max(consensus_round_2_lldf_sex_4['leiden_label_sex'])

In [None]:
p = plot_hybrid(full_consensus_round_2, complete_annot, consensus_round_2_lldf_full_4, palette35, radius=0.03, phases='full')
bokeh.io.show(p)

In [None]:
p = plot_hybrid(veg_consensus_round_2, complete_annot, consensus_round_2_lldf_veg_4, palette35, radius=0.03, phases='veg')
bokeh.io.show(p)

In [None]:
p = plot_hybrid(sex_consensus_round_2, complete_annot, consensus_round_2_lldf_sex_4, palette35, radius=0.03, phases='sex')
bokeh.io.show(p)