In [59]:
import pandas as pd
import pickle
import sys

# Extend the module search path with the directory two levels above the
# working directory; presumably this is where the project-local `utils`
# package imported on the next line lives (confirm against the repo layout).
sys.path.append('../../')
from utils import bokeh_ui_utils, microarray_utils, file_utils

In [60]:
# Expression table: read the CSV and rename its 'Unnamed: 0' column to
# 'TTHERM_ID' in a single chained expression.
full_filtered_df = pd.read_csv(
    '../microarray_probe_alignment_and_filtering/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv'
).rename(columns={'Unnamed: 0': 'TTHERM_ID'})

# Per-gene normalised version produced by the project helper (z=True is
# forwarded as-is; see microarray_utils for what it selects).
full_filtered_norm_df = microarray_utils.normalize_expression_per_gene(
    full_filtered_df,
    z=True,
)

# Round-1 Leiden labels and the eggNOG annotation table.
leiden_label_df_round_1 = pd.read_csv(
    './test_nn3_leiden_label_df_round_1.csv'
)
complete_annot = pd.read_csv(
    '../eggnog/complete_eggnog_annotation.csv'
)

In [61]:
# Sanity check: (n_rows, n_columns) of the loaded expression table.
full_filtered_df.shape

(19152, 48)

In [62]:
# List the annotation columns available for the filtering steps below.
complete_annot.columns

Index(['TTHERM_ID', 'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs',
       'max_annot_lvl', 'COG_category', 'Description', 'Preferred_name', 'GOs',
       'EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction',
       'KEGG_rclass', 'BRITE', 'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'PFAMs',
       'TGD2021_description'],
      dtype='object')

In [63]:
# Keep only the annotation rows whose TTHERM_ID also occurs in the
# expression table.
has_expression = complete_annot['TTHERM_ID'].isin(
    full_filtered_df['TTHERM_ID'].values
)
filtered_annot = complete_annot.loc[has_expression]

In [64]:
# Display the four functional-term columns for every gene where at least one
# of them differs from the '-' placeholder.
term_columns = ['GOs', 'EC', 'KEGG_ko', 'PFAMs']
has_any_term = filtered_annot[term_columns].ne('-').any(axis=1)
filtered_annot.loc[has_any_term, term_columns]

Unnamed: 0,GOs,EC,KEGG_ko,PFAMs
1,-,-,ko:K03036,PCI
2,"GO:0000228,GO:0000723,GO:0000781,GO:0000782,GO...",-,-,"POT1,POT1PC"
4,"GO:0000003,GO:0002064,GO:0002065,GO:0002066,GO...",-,ko:K17973,"DEC-1_C,DEC-1_N,Dec-1"
5,-,-,-,Pkinase
7,-,3.4.25.1,ko:K02737,Proteasome
...,...,...,...,...
15723,-,-,ko:K03456,-
15724,-,-,-,MORN
15725,-,-,-,adh_short
15727,-,3.4.19.12,ko:K11842,UCH


In [65]:
# Report, per term column, how many genes carry a value other than '-'.
for col in ['GOs', 'EC', 'KEGG_ko', 'PFAMs']:
    n_with_term = int((filtered_annot[col] != '-').sum())
    print(f'# genes with {col} term(s):', n_with_term)

# genes with GOs term(s): 1392
# genes with EC term(s): 2597
# genes with KEGG_ko term(s): 5845
# genes with PFAMs term(s): 8861


## Define palettes for plotting

These palettes are from the R package Polychrome. The first is just palette36 with the first color replaced by white. The second is the alphabet palette with white prepended.

The R code for 64 colors:

library(Polychrome);
seed <- c("#000000", "#ff0000", "#00ff00", "#0000ff");
p64 <- createPalette(64, seed, range=c(40,100));
paste(p64, sep="\n");

Then, replace the first and last with ~white

In [66]:
def _split_palette(raw_block):
    """Return the colour tokens of a whitespace-delimited palette block.

    The blocks below are pasted output in which colours are separated by
    ``\\n`` escapes; inside a regular string literal those become real
    newlines, so a plain whitespace split yields one token per colour.
    """
    return raw_block.split()


# 45 colours.
palette45 = _split_palette("""
#51635F\n#FF1C16\n#16FC26\n#403DFC\n#FE0DCE\n#F9AA00\n#00FFD5\n#22BFFE\n#BB3551\n#E6FE97\n#ECADFF\n#FFBFBD\n#CF00F5\n#0D8B00\n#D7FEFF\n#8D7200\n#F76C00\n#AD3288\n#5C5AB8\n#FC0080\n#B8FF16\n#00AAB4\n#FBE11C\n#9AAAD9\n#8BBB8C\n#934B47\n#6EFE99\n#9C6D91\n#FB9778\n#9D32AF\n#D40087\n#FFDC9D\n#FF8DB6\n#A96AFC\n#FDDDFB\n#168CF7\n#FD6CF9\n#F64553\n#4D6A00\n#FAFEDB\n#A7977D\n#0DFBFF\n#86B80D\n#FD8AE4\n#B7B126
""")

# 32 colours, starting with white.
palette32 = _split_palette("""
white\n#F91622\n#16FC0D\n#5138FB\n#FD00CF\n#FDD51C\n#16FDD7\n#FC8B8E\n#16BFFF\n#DF9BFD\n#669C2A\n#FEE7C4\n#F31685\n#DF16FD\n#C1F1FE\n#A23D7E\n#D5FD0D\n#8C5A0D\n#FC790D\n#4F5CBC\n#FFCBEF\n#168D72\n#68FA93\n#C4FDC9\n#F7A449\n#16789B\n#AD0DAB\n#C4262E\n#0DF1FF\n#EFF994\n#B6C1FE\n#8F22CD
""")

# 35 colours.
palette35 = _split_palette("""
#585F6A\n#FE1626\n#00FB0D\n#2E40FC\n#FD0DCE\n#FCD200\n#F7868C\n#16FFDC\n#22BEFB\n#D28EF6\n#609000\n#FFE7C9\n#F51683\n#FF730D\n#CAFE16\n#AA3586\n#BEEEFD\n#BD00FA\n#895D22\n#FEC7F0\n#495AA1\n#73F995\n#229270\n#ED963B\n#F6FE97\n#C5FFD0\n#C50DC8\n#6993FF\n#C22A35\n#16ECFC\n#AA707E\n#7A3BCB\n#7C845C\n#358FAA\n#BDBAF6
""")

# 38 colours.
palette38 = _split_palette("""
#636265\n#F60D16\n#00F90D\n#3540FB\n#FD0DD0\n#FDDB0D\n#00FFE2\n#FA8884\n#2ABEFE\n#E5A3FF\n#518F00\n#FEFDD5\n#D51CFF\n#ED007F\n#A33879\n#96731C\n#C8FB16\n#C0ECFE\n#FBC1DA\n#5658BA\n#F96900\n#F69F1C\n#58FA9C\n#008E72\n#BA22B9\n#167D97\n#794D8A\n#CEFE9C\n#BB222E\n#954D45\n#00DCEF\n#FD66B0\n#B2FDD3\n#FDBD9F\n#A9B4F1\n#B371FE\n#849566\n#2A8EFF
""")

# 64 colours; first entry is white and last is floralwhite.
palette64 = _split_palette("""
white\n#FA002E\n#22FC22\n#221CFA\n#FF3DD6\n#FFDA00\n#00FEFB\n#F48684\n#CEB4FE\n#FFFFE5\n#0D933D\n#CC00F8\n#800D5D\n#F10084\n#22267A\n#0DADFF\n#CBFD71\n#9A761C\n#F96C00\n#6399A6\n#FFBCDA\n#8D0DA3\n#F79F26\n#00FFBF\n#A37CFB\n#F68EEB\n#720D0D\n#F163AA\n#7E926A\n#826386\n#B41C32\n#9BEBCE\n#E2DB83\n#56D4FA\n#E6E2FB\n#925D58\n#F7C3A7\n#62E970\n#220DBD\n#5583BB\n#7EA01C\n#CDFDB6\n#FD00FB\n#B30D97\n#F5FF00\n#DD77FD\n#4282FC\n#BBA6A4\n#0D8068\n#AB5F26\n#F7C26E\n#9EFE00\n#9B2EFD\n#C56887\n#FD3D68\n#ABF2FD\n#835FAC\n#FF16B1\n#325371\n#CA16CA\n#D26322\n#AFCFFE\n#91A1FA\nfloralwhite
""")

# 65 colours; like the 64-colour block but with gainsboro as the second entry.
palette65 = _split_palette("""
white\ngainsboro\n#FA002E\n#22FC22\n#221CFA\n#FF3DD6\n#FFDA00\n#00FEFB\n#F48684\n#CEB4FE\n#FFFFE5\n#0D933D\n#CC00F8\n#800D5D\n#F10084\n#22267A\n#0DADFF\n#CBFD71\n#9A761C\n#F96C00\n#6399A6\n#FFBCDA\n#8D0DA3\n#F79F26\n#00FFBF\n#A37CFB\n#F68EEB\n#720D0D\n#F163AA\n#7E926A\n#826386\n#B41C32\n#9BEBCE\n#E2DB83\n#56D4FA\n#E6E2FB\n#925D58\n#F7C3A7\n#62E970\n#220DBD\n#5583BB\n#7EA01C\n#CDFDB6\n#FD00FB\n#B30D97\n#F5FF00\n#DD77FD\n#4282FC\n#BBA6A4\n#0D8068\n#AB5F26\n#F7C26E\n#9EFE00\n#9B2EFD\n#C56887\n#FD3D68\n#ABF2FD\n#835FAC\n#FF16B1\n#325371\n#CA16CA\n#D26322\n#AFCFFE\n#91A1FA\nfloralwhite
""")

In [67]:
# NOTE: despite the name this list holds 37 entries: white first, then 35
# hex colours, with the named colour "magenta" appended at the end.
palette36 = [
    "#FFFFFF", "#E4E1E3", "#F6222E", "#FE00FA", "#16FF32", "#3283FE",
    "#FEAF16", "#B00068", "#1CFFCE", "#90AD1C", "#2ED9FF", "#DEA0FD",
    "#AA0DFE", "#F8A19F", "#325A9B", "#C4451C", "#1C8356", "#85660D",
    "#B10DA1", "#FBE426", "#1CBE4F", "#FA0087", "#FC1CBF", "#F7E1A0",
    "#C075A6", "#782AB6", "#AAF400", "#BDCDFF", "#822E1C", "#B5EFB5",
    "#7ED7D1", "#1C7F93", "#D85FF7", "#683B79", "#66B0FF", "#3B00FB",
    "magenta",
]

Change index 5 to #778899. Change index 9 to #2F4F4F. Add #FF7F50. Change index 9 to #FFBCD9. Change index 14 to #DEA5A4.

In [68]:
# Hand-tuned categorical palette. NOTE: despite the name, only 23 entries are
# active; the commented-out colours are kept in place for reference.
palette27 = [
    "#FFFFFF", "#AA0DFE", "#3283FE", "#85660D", "#782AB6", "#778899",
    "#1C8356", "#16FF32", "#F7E1A0",
    # "#2F4F4F",
    "#FFBCD9", "#C4451C", "#DEA0FD", "#FE00FA",
    # "#325A9B",
    "#FEAF16", "#DEA5A4", "#90AD1C", "#F6222E", "#1CFFCE", "#2ED9FF",
    "#B10DA1",
    # "#C075A6", "#FC1CBF", "#B00068",
    "#FBE426", "#FA0087", "#FF7F50",
]

In [69]:
# Load the large pre-generated colour list.
# NOTE(security): pickle.load can execute arbitrary code during
# deserialisation; only use a 'colors_2000_1' file you generated yourself.
with open('colors_2000_1', 'rb') as file:
    color_palette_raw = pickle.load(file)

# Number of colours needed so that every label value up to the maximum has
# its own palette slot (presumably labels are integers starting at 0 and are
# used as palette indices downstream; confirm in bokeh_ui_utils).
# Computed once with the vectorised Series.max(); int() keeps the value
# usable as a slice bound even if the column was read with a float dtype.
n_labels = int(leiden_label_df_round_1['label'].max()) + 1

if len(color_palette_raw) >= n_labels:
    # Enough pickled colours: take exactly one per label.
    color_palette = color_palette_raw[:n_labels]
else:
    # Fallback: the fixed built-in palette. NOTE(review): it is used regardless
    # of its own length, so it can still be shorter than n_labels.
    color_palette = palette65

In [70]:
# UMAP settings forwarded to the project plotting helper.
embedding_metric = 'manhattan'
n_neighbors = 3

# Build the numbered output path first, then render and save the dashboard.
umap_export_path = file_utils.generate_uniquely_numbered_export_path(
    './plots/',
    'dashboard',
    '.html',
    tags=['microarr', 'umap'],
)

# NOTE(review): the full `complete_annot` table (not `filtered_annot`) is
# passed here, exactly as in the original call; confirm that is intended.
p = bokeh_ui_utils.generate_and_save_umap(
    umap_export_path,
    full_filtered_norm_df,
    complete_annot,
    leiden_label_df_round_1,
    'full',
    color_palette,
    'Full normalized expression w/ Leiden clustering (nn=3)',
    expr_min=0,
    expr_max=1,
    n_neighbors=n_neighbors,
    embedding_metric=embedding_metric,
)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


Color key has fewer colors than labels. Making all red


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hm_df['module'] = hover_data['module'].values


./plots/dashboard236_microarr_umap.html


In [71]:
# pca_p = bokeh_ui_utils.generate_and_save_pca(file_utils.generate_uniquely_numbered_export_path('./plots/', 'dashboard', '.html', tags=['pca']), full_filtered_norm_df, complete_annot, leiden_label_df_round_1, 'full', color_palette, 'Full normalized expression w/ Leiden clustering (nn=5)', expr_min=0, expr_max=1)

In [72]:
# tsne_p = bokeh_ui_utils.generate_and_save_tsne(file_utils.generate_uniquely_numbered_export_path('./plots/', 'dashboard', '.html', tags=['tsne']), full_filtered_norm_df, complete_annot, leiden_label_df_round_1, 'full', color_palette, 'Full normalized expression w/ Leiden clustering (nn=5)', expr_min=0, expr_max=1)

In [73]:
# from sklearn.decomposition import PCA

# # Assuming df is your DataFrame with the first column as labels
# # and the rest of the columns as features
# labels = full_filtered_norm_df.iloc[:, 0]
# features = full_filtered_norm_df.iloc[:, 1:]

# # Apply PCA to the standardized features
# pca = PCA()
# pca.fit(features)

# # Get the explained variance ratio
# explained_variance_ratio = pca.explained_variance_ratio_

# # Calculate the cumulative explained variance
# cumulative_variance = np.cumsum(explained_variance_ratio)

# # Plot the scree plot
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, len(explained_variance_ratio) + 1), cumulative_variance, marker='o', linestyle='--')
# plt.title('Scree Plot')
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.grid(True)
# plt.show()

In [74]:
# num_components = 2
# pca = PCA(n_components=num_components)
# pca.fit(features)

# # Get the explained variance for the specified number of components
# explained_variance = pca.explained_variance_ratio_

# # Sum the explained variance to get the total variance explained
# total_variance_explained = sum(explained_variance)

# # Print the explained variance for each component
# print(f"Explained Variance for {num_components} Components: {explained_variance}")
# print(f"Total Variance Explained: {total_variance_explained}")

In [75]:
# principal_components = pca.fit_transform(features)

In [76]:
# principal_components

In [77]:
# principal_components[:, 0]

In [78]:
# principal_components[:, 1]

In [79]:
# pca_df = pd.DataFrame({
#     'TTHERM_ID': labels,
#     'x': principal_components[:, 0],
#     'y': principal_components[:, 1]
# })
# pca_df

In [80]:
# import umap

# umap_mapper = umap.UMAP(random_state=42, n_components=2, n_neighbors=5).fit(features)
# embedding = bokeh_ui_utils._get_umap_embedding(umap_mapper)

# umap_df = pd.DataFrame(np.array(embedding), columns=('x', 'y'))
# umap_df

In [81]:
# from sklearn.manifold import TSNE

# tsne = TSNE(n_components=2)
# tsne_components = tsne.fit_transform(features)

# tsne_df = pd.DataFrame({
#     'x': tsne_components[:, 0],
#     'y': tsne_components[:, 1]
# })

In [82]:
# tsne_df

((range of x values) * (range of y values)) * (const) = (optimal radius value)

In [83]:
# ((max(umap_df['x'].values) - min(umap_df['x'].values)) * (max(umap_df['y'].values) - min(umap_df['y'].values))) / 0.07

In [84]:
# ((((max(umap_df['x'].values) - min(umap_df['x'].values))**2) * ((max(umap_df['y'].values) - min(umap_df['y'].values))**2))**(0.5)) / 3998.827689034598

((range of x values)^2 + (range of y values)^2)^(0.5) * (const) = (optimal radius value)

In [85]:
# ((((max(umap_df['x'].values) - min(umap_df['x'].values))**2) + ((max(umap_df['y'].values) - min(umap_df['y'].values))**2))**(0.5)) / 0.07

In [86]:
# ((((max(umap_df['x'].values) - min(umap_df['x'].values))**2) + ((max(umap_df['y'].values) - min(umap_df['y'].values))**2))**(0.5)) / 339.30587926495537

In [87]:
# # ((range of x values)^2 + (range of y values)^2)^(0.5) * (const) = (optimal radius value)
# def compute_2d_embedding_point_radius(embedding_df):
#     """Computes a suitable radius value for data points based on a given embedding.

#     Parameters
#     ----------
#     embedding_df : pandas.DataFrame
#         A pandas.DataFrame containing a 2D embedding with the two dimensions having columns labels of 'x' and 'y' respectively.

#     Returns
#     -------
#     float
#         The computed radius value.
#     """
#     return ((((max(embedding_df['x'].values) - min(embedding_df['x'].values))**2) + ((max(embedding_df['y'].values) - min(embedding_df['y'].values))**2))**(0.5)) / 339.30587926495537

# # ((range of x values) * (range of y values)) * (const) = (optimal radius value)
# def compute_2d_embedding_point_radius(embedding_df):
#     return ((max(embedding_df['x'].values) - min(embedding_df['x'].values)) * (max(embedding_df['y'].values) - min(embedding_df['y'].values))) / 3998.827689034598