In [None]:
import numpy as np
import os
import pandas as pd
import sys
from matplotlib.cm import get_cmap
from matplotlib.lines import Line2D
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib import cm
import matplotlib.patches as mpatches
import json

import h5py
import subprocess
import pandas as pd
import matplotlib.pyplot as plt
import csv

bnsi_path = '/scicore/home/nimwegen/degroo0000/Bonsai-data-representation'
sys.path.append(bnsi_path)
from bonsai_scout.bonsai_scout_helpers import Bonvis_figure, Bonvis_settings, Bonvis_metadata, Annotation_info, get_celltype_colors_new

In [None]:
# Display the working directory; the relative "figures/..." paths written by the
# plotting helpers in this notebook are resolved against it.
os.getcwd()

In [None]:
# Colours of the two NK subgroups and of all remaining cells, each given twice:
# as an HTML hex string and as an RGBA tuple with alpha 0.5.

# red
color_NK_g1_html = "#FF0000"
color_NK_g1_rgba = (1.0, 0.0, 0.0, 0.5)

# blue
color_NK_g2_html = "#0000FF"
color_NK_g2_rgba = (0.0, 0.0, 1.0, 0.5)

# grey (0x80 / 255 per channel)
color_other_html = '#808080'
color_other_rgba = (0.5019607843137255, 0.5019607843137255, 0.5019607843137255, 0.5)

In [None]:
# Hex colour for NK cells. NOTE(review): not referenced in the visible part of
# this notebook — confirm whether it is still needed.
NK_color = "#c5b0d5"

In [None]:
# RGBA colour per NK_group value; used for the tree annotation colours.
colors_rgba = {"NK_group1": color_NK_g1_rgba, "NK_group2": color_NK_g2_rgba, "other":color_other_rgba}

In [None]:
# Named matplotlib colour per NK_group value; used by the scatter and histogram plots.
NK_colors = {"NK_group1": "red", "NK_group2": "blue", "other":"grey"}

In [None]:
# Display names shown in legends and histogram labels, keyed by NK_group value.
# ("myeloid" was previously misspelled as "myleoid" in the figure labels.)
labels = {"NK_group1": "NK_lymphoid", "NK_group2": "NK_myeloid", "other": "rest"}

## Path to data

In [None]:
# Root folder of this run. This is an absolute cluster path: adjust it when
# running the notebook elsewhere. All other inputs are located relative to it.
path_to_output = "/scicore/home/nimwegen/GROUP/Projects/bonsai_runs/paper_figures_datasets/hao_satija_2021_paper_figure"
# Per-cell annotation table (CSV).
celltype_annot_file = os.path.join(path_to_output, "annotation/cell_annot_sub.csv")
# Sanity output folder (only referenced by the commented-out cells further down).
sanity_out_path = os.path.join(path_to_output, "Sanity")
# Folder holding bonsai_vis_data.hdf and bonsai_vis_settings.json.
bonsai_results_folders = os.path.join(path_to_output, "Bonsai")


In [None]:
# Load the per-cell annotation table and keep an independent copy of the
# barcode column plus the RNA- and protein-based annotations. Later cells add
# grouping columns to `df`.
celltype_annot = pd.read_csv(celltype_annot_file)
annot_columns = ["cellbarcode_full", "rna_annotations", "protein_annotations"]
df = celltype_annot[annot_columns].copy()
df

## Bonsai tree

In [None]:
# %%capture  
# Load the Bonsai visualisation data and settings, build the Bonvis figure
# object and draw the tree coloured by the RNA cell-type annotation.
print(bonsai_results_folders)
data_path = os.path.join(bonsai_results_folders, 'bonsai_vis_data.hdf')
settings_path = os.path.join(bonsai_results_folders, 'bonsai_vis_settings.json')

print("read in file: {}".format(data_path))
bonvis_metadata = Bonvis_metadata(data_path)
bonvis_settings = Bonvis_settings(load_settings_path=settings_path)
# The HDF file is opened read-only and deliberately kept open: later cells read
# antibody and expression datasets from this handle.
bonvis_data_hdf = h5py.File(data_path, 'r')

# Set darker edges (RGBA grey, line width 0.6)
bonvis_settings.edge_style = {"color": [0.4117647058823529, 0.4117647058823529, 0.4117647058823529, 1.0],
                             'linewidth':0.6}


bonvis_fig = Bonvis_figure(bonvis_data_hdf, 
                           bonvis_metadata, 
                           bonvis_data_path=data_path,
                           bonvis_settings=bonvis_settings)
celltype_info = bonvis_fig.bonvis_settings.celltype_info



# First draw with the settings as loaded (its return value is not kept).
bonvis_fig.create_figure(figsize=(6, 6))

# Here, we set the desired celltype-annotation for the dataset
node_style = celltype_info.annot_infos['annot_rna_annotations'].label

# Visualize the tree in the ladderized dendrogram layout, with the correct celltype-annotation
geometry = 'flat'
ly_type = "ly_dendrogram_ladderized"

# The same tweak_inds / flipped_node_ids values are repeated in every later
# update_figure call of this notebook so that all trees share one orientation.
bonvis_fig.update_figure(ly_type=ly_type, 
                         geometry=geometry, 
                         node_style=node_style, 
                         tweak_inds=303, # change root
                         flipped_node_ids=['internal_451'], 
                         new_flip_id=True
                        )


# Redraw after the update; this is the figure with the darker edges.
bonsai_fig_darker_lines = bonvis_fig.create_figure(figsize=(6, 6))

## Get NK subgroups

In [None]:
# Collect the cell ids of two hand-picked subtrees of the Bonsai tree.
# NOTE(review): the 'info' tuples are passed through to get_cell_inds_in_subset
# unchanged; presumably they are the node indices delimiting each subtree for
# the current tree/root — confirm they still match if the tree is re-run.

## lymphoid
curr_subset = {}
curr_subset['type'] = "subtree"
curr_subset['info'] = (1009, 1965) 
# Only the first element of the returned value is used (the cell indices).
nk_cell_inds_group1 = bonvis_fig.get_cell_inds_in_subset(curr_subset)[0]
nk_cell_ids_group1 = np.array(bonvis_metadata.cell_ids)[nk_cell_inds_group1]

## myeloid
curr_subset = {}
curr_subset['type'] = "subtree"
curr_subset['info'] = (8110, 8440) 
nk_cell_inds_group2 = bonvis_fig.get_cell_inds_in_subset(curr_subset)[0]
nk_cell_ids_group2 = np.array(bonvis_metadata.cell_ids)[nk_cell_inds_group2]


In [None]:
my_celltype = "NK"
group1 = "NK_group1"
group2 = "NK_group2"
group3 = "other"

# Membership is tested once per cell. `in` on a NumPy array scans the whole
# array each time, so the ids are converted to sets for constant-time lookups.
nk_ids_group1_set = set(np.asarray(nk_cell_ids_group1).ravel().tolist())
nk_ids_group2_set = set(np.asarray(nk_cell_ids_group2).ravel().tolist())

# A cell is NK_group1 / NK_group2 only if it is annotated as NK *and* lies in
# the corresponding subtree; everything else (including NK cells outside both
# subtrees) is labelled "other".
df["NK_group"] = [group3 if ct != my_celltype
                  else group1 if cb in nk_ids_group1_set
                  else group2 if cb in nk_ids_group2_set
                  else group3
                  for cb, ct in zip(df.cellbarcode_full, df.rna_annotations)]
df

In [None]:
# Describe the NK_group column as a categorical annotation for Bonvis.
# The categories are taken in order of first appearance in df.NK_group; a group
# without any cell will therefore be missing from `cats`.
cats = list(df.NK_group.unique())
# On the tree, "other" is drawn in light grey rather than colors_rgba["other"].
annot_to_color = {'NK_group1': colors_rgba["NK_group1"], 
                  'NK_group2':colors_rgba["NK_group2"],
                  'other': (0.8274509803921568, 0.8274509803921568, 0.8274509803921568, 0.5)} # lightgray

# No colorbar settings are given for this categorical annotation.
cbar_info = {'cmap': None, 'vmin': None, 'vmax': None, 'log': None}
label = "NK_group"
info_key = "annot_NK_group"
annot_type = 'cellstates' 
info_object = 'cs_info_dict'

cl_annot = Annotation_info(cats=cats, annot_to_color=annot_to_color, label=label,
                                   cbar_info=cbar_info, annot_type=annot_type,
                                   color_type='categorical', info_object=info_object,
                                   info_key=info_key)

In [None]:
# Store the per-cell NK_group labels under the annotation key, then make the
# annotation the active one and refresh the cell -> category mapping.
bonvis_metadata.cs_info['cs_info_dict'][info_key] = df['NK_group'].tolist()

bonvis_settings.set_annot(annot_info=cl_annot)
bonvis_settings.cell_to_celltype, _ = bonvis_fig.get_color_info(annot_info=bonvis_fig.bonvis_settings.node_style['annot_info'])


# Register the annotation so it can be selected by key later on. The guard
# keeps the cell idempotent: re-running it must not list the key twice.
bonvis_settings.celltype_info.annot_infos[info_key] = cl_annot
if info_key not in bonvis_settings.celltype_info.annot_alts:
    bonvis_settings.celltype_info.annot_alts.append(info_key)

celltype_info = bonvis_fig.bonvis_settings.celltype_info

## NK tree

In [None]:
# Build 'annot'-type subset descriptions for the two NK subgroups.
myannot = "annot_NK_group"
sub1='NK_group1'
sub2='NK_group2'

# Position of each subgroup in the annotation's category list. `[0][0]` raises
# IndexError if the category is absent (i.e. no cell was assigned to it).
sub_idx1 = np.argwhere(np.array(celltype_info.annot_infos[myannot].cats) == sub1)[0][0]
sub_idx2 = np.argwhere(np.array(celltype_info.annot_infos[myannot].cats) == sub2)[0][0]

print(sub_idx1)
print(sub_idx2)

# Looking the category up by its index returns the same names as sub1 / sub2.
selected_annot1 = celltype_info.annot_infos[myannot].cats[sub_idx1]
selected_annot2 = celltype_info.annot_infos[myannot].cats[sub_idx2]
print(selected_annot1)
print(selected_annot2)

selected_subset1 = {'type': 'annot', 
                       'info': selected_annot1, 
                       'mask_is_on': True}
selected_subset2 = {'type': 'annot', 
                       'info': selected_annot2, 
                       'mask_is_on': True}

# NOTE(review): this list is not used by any cell visible in this notebook.
selected_subset = [selected_subset1,selected_subset2 ]

In [None]:
# Tree coloured by the NK_group annotation.
# NOTE(review): `selected_subset` from the previous cell is not passed to
# update_figure here, so no subset mask is applied by this call — confirm
# whether that is intended.
# Here, we set the desired celltype-annotation for the dataset
node_style = celltype_info.annot_infos['annot_NK_group'].label

# Visualize the tree in the ladderized dendrogram layout, with the NK_group annotation
geometry = 'flat'
ly_type = "ly_dendrogram_ladderized"

bonvis_fig.update_figure(ly_type=ly_type, 
                         geometry=geometry, 
                         node_style=node_style, 
                         tweak_inds=303, # change root
                          flipped_node_ids=['internal_451'], new_flip_id=True,
                        )


nk_new_bonsai_fig_darker_lines = bonvis_fig.create_figure(figsize=(6, 6))


### Legends

In [None]:
# Stand-alone legend listing the cell types with the first 13 tab20 colours.
categories = [
    "B",
    "CD8 T",
    "CD14+ Mono",
    "CD16+ Mono",
    "HSC",
    "DC",
    "Erythrocytes",
    "Memory CD4 T",
    "MK",
    "NK",
    "Naive CD4 T",
    "T/Mono doublets",
    "pDC"
]

# tab20 is a 20-entry listed colormap. Integer input indexes its entries
# directly, whereas a float such as i / 20 is rescaled and truncated and so
# depends on floating-point round-off to land on the intended entry.
# plt.get_cmap replaces the deprecated matplotlib.cm.get_cmap.
cmap = plt.get_cmap("tab20")
colors_ = [cmap(i) for i in range(len(categories))]

legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=category)
    for color, category in zip(colors_, categories)
]

# An empty, axis-less figure serves as the canvas for the legend alone.
fig, ax = plt.subplots(figsize=(6, 3))
ax.axis('off')
legend = ax.legend(handles=legend_elements, loc='center', frameon=False, ncol=1, fontsize=10)

plt.show()

In [None]:
# Stand-alone legend for the NK subgroups.
# ("myeloid" was previously misspelled as "myleoid" in this legend.)
categories = [
    "NK_lymphoid",
    "NK_myeloid",
    "rest"
]

# One HTML colour per legend entry, in the same order as `categories`.
colors_nk_subgroups = [color_NK_g1_html, color_NK_g2_html, color_other_html]

legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=category, alpha=0.8)
    for color, category in zip(colors_nk_subgroups, categories)
]

# An empty, axis-less figure serves as the canvas for the legend alone.
fig, ax = plt.subplots(figsize=(6, 3))
ax.axis('off')
legend = ax.legend(handles=legend_elements, loc='center', frameon=False, ncol=1, fontsize=10)
plt.show()

## Antibody information

In [None]:
# Feature names of the transformed antibody data, stored as a JSON string in the
# 'gene_ids' attribute of the HDF group (the printed count is the number of antibodies).
all_antibodies_list = json.loads(bonvis_data_hdf["data/antibodies_trnsfm"].attrs['gene_ids'])
print("number of genes: {}".format(len(all_antibodies_list)))

In [None]:
# create new bonvisfig
gene = "CD11c"
print(gene)
bonvis_fig.bonvis_settings.node_style['feature_path'] = "data/antibodies_trnsfm"
geometry = 'flat'
ly_type = "ly_dendrogram_ladderized"

bonvis_fig.update_figure(ly_type=ly_type, 
                         geometry=geometry, 
                         node_style=gene, 
                         tweak_inds=303, 
                         flipped_node_ids=['internal_451'], 
                         new_flip_id=True
                        )

outfig = bonvis_fig.create_figure(figsize=(6, 6))

In [None]:
# create new bonvisfig
gene = "CD3"
print(gene)


bonvis_fig.bonvis_settings.node_style['feature_path'] = "data/antibodies_trnsfm"
geometry = 'flat'
ly_type = "ly_dendrogram_ladderized"

bonvis_fig.update_figure(ly_type=ly_type, 
                         geometry=geometry, 
                         node_style=gene, 
                         tweak_inds=303, 
                         flipped_node_ids=['internal_451'], 
                         new_flip_id=True
                        )

outfig = bonvis_fig.create_figure(figsize=(6, 6))

#### Get colorbar

In [None]:
# Fetch the colorbar settings of the currently active node style.
# NOTE(review): presumably this reflects the most recent update_figure call
# (CD3 when the notebook is run top to bottom) — confirm.
gene = "CD3"
print(gene)
annot_info = bonvis_settings.node_style['annot_info']
cbar_info = annot_info.cbar_info
cbar_info

In [None]:
def cbar_to_numb(val, vmin=None, vmax=None, log=None):
    """Convert a normalised colorbar position into the value it stands for.

    Parameters
    ----------
    val : float
        Position on the colorbar; 0 maps to ``vmin`` and 1 to ``vmax``.
    vmin, vmax : float, optional
        Ends of the value range. When omitted, the notebook globals
        ``min_val`` and ``max_val`` are used (the original behaviour).
    log : bool, optional
        If true, the rescaled value is exponentiated. When omitted,
        ``cbar_info['log']`` from the notebook globals is used.

    Returns
    -------
    float
        ``val * (vmax - vmin) + vmin``, passed through ``np.exp`` when ``log``.
    """
    # The globals are looked up only for arguments that were not supplied, so
    # the function can also be called without any notebook state.
    if vmin is None:
        vmin = min_val
    if vmax is None:
        vmax = max_val
    if log is None:
        log = cbar_info['log']

    rescaled = val * (vmax - vmin) + vmin
    return np.exp(rescaled) if log else rescaled

# Stand-alone vertical colorbar for the feature named in `gene`, using the
# colormap and value range from `cbar_info`.
fig_cbar = plt.figure(figsize=(2, 4))
ax_cbar = fig_cbar.add_subplot(111)

# The colormap itself is drawn on a normalised [0, 1] scale; the tick labels
# are translated back to data values with cbar_to_numb below.
norm = colors.Normalize(vmin=0, vmax=1)
mappable = cm.ScalarMappable(norm=norm, cmap=cbar_info['cmap'])
cbar = plt.colorbar(mappable, cax=ax_cbar, orientation='vertical')

print(cbar.ax.get_yticks())
# Five evenly spaced ticks across the normalised range.
tick_list = [mappable.colorbar.vmin + t * (mappable.colorbar.vmax - mappable.colorbar.vmin) for t in [0,0.25, 0.5,0.75,1]]

# cbar_to_numb reads these two globals.
min_val = cbar_info['vmin']
max_val = cbar_info['vmax']

# Two significant digits per label. (An earlier '{:.2e}' variant was computed
# and immediately overwritten; that dead assignment has been removed.)
tick_labels = ['{:.2}'.format(cbar_to_numb(tick)) for tick in tick_list]
cbar.set_ticks(tick_list)
cbar.set_ticklabels(tick_labels)
cbar.set_label('transformed surface protein expression', fontsize=12, labelpad=10)
plt.title(gene)
plt.tight_layout()

In [None]:
# Stand-alone vertical colorbar titled with `gene`.
# NOTE(review): cbar_info is read from the currently active node style, which
# is presumably the one set by the most recent update_figure call. When the
# notebook is run top to bottom that call was for CD3, not CD11c — confirm that
# the value range shown here really belongs to CD11c (e.g. re-run the CD11c
# tree cell directly before this one).
gene = "CD11c"
print(gene)
annot_info = bonvis_settings.node_style['annot_info']
cbar_info = annot_info.cbar_info

fig_cbar = plt.figure(figsize=(2, 4))
ax_cbar = fig_cbar.add_subplot(111)

# The colormap itself is drawn on a normalised [0, 1] scale; the tick labels
# are translated back to data values with cbar_to_numb below.
norm = colors.Normalize(vmin=0, vmax=1)
mappable = cm.ScalarMappable(norm=norm, cmap=cbar_info['cmap'])
cbar = plt.colorbar(mappable, cax=ax_cbar, orientation='vertical')

print(cbar.ax.get_yticks())
# Five evenly spaced ticks across the normalised range.
tick_list = [mappable.colorbar.vmin + t * (mappable.colorbar.vmax - mappable.colorbar.vmin) for t in [0,0.25, 0.5,0.75,1]]

# cbar_to_numb reads these two globals.
min_val = cbar_info['vmin']
max_val = cbar_info['vmax']

# Two significant digits per label. (An earlier '{:.2e}' variant was computed
# and immediately overwritten; that dead assignment has been removed.)
tick_labels = ['{:.2}'.format(cbar_to_numb(tick)) for tick in tick_list]
cbar.set_ticks(tick_list)
cbar.set_ticklabels(tick_labels)
cbar.set_label('transformed surface protein expression', fontsize=12, labelpad=10)
plt.title(gene)
plt.tight_layout()

## Antibody expression

In [None]:
# Row positions of the CD56 and CD16 antibodies in the transformed antibody data.
antibody_names = json.loads(bonvis_data_hdf["data/antibodies_trnsfm"].attrs['gene_ids'])
# np.where(...)[0] yields index *arrays* (one element per match), not scalars;
# they are empty if the name does not occur in antibody_names.
cd56_idx = np.where(np.array(antibody_names) == "CD56")[0]
cd16_idx = np.where(np.array(antibody_names) == "CD16")[0]

#### scatter of antibodies with histogram on the side

In [None]:
# CD56 vs CD16 scatter of all cells coloured by NK_group, with per-group
# density histograms along the top (CD56) and the right-hand side (CD16).
#
# NOTE(review): the columns of this matrix are masked with df.NK_group as-is,
# whereas the expression matrices further down are first reordered with
# cell_ind_to_vert_ind. Confirm that the antibody columns are already in the
# same cell order as df.
trnfs_antibodies = bonvis_data_hdf["data/antibodies_trnsfm/means"][:]

s = 15

fig = plt.figure(figsize=(8, 8))
grid = fig.add_gridspec(4, 4, hspace=0.1, wspace=0.1)

group1 = "NK_group1"
group2 = "NK_group2"
group3 = "other"

# Main scatter plot. "other" is drawn first so the NK subgroups end up on top.
main_ax = fig.add_subplot(grid[1:, :-1])
for grp, grp_alpha in ((group3, 0.3), (group1, 0.3), (group2, 0.4)):
    grp_mask = df.NK_group == grp
    main_ax.scatter(x=trnfs_antibodies[cd56_idx, grp_mask],
                    y=trnfs_antibodies[cd16_idx, grp_mask],
                    label=labels[grp],
                    color=NK_colors[grp],
                    s=s,
                    alpha=grp_alpha)
main_ax.set_xlabel("CD56")
main_ax.set_ylabel("CD16")

# Both marginal histograms use this draw order and these opacities.
hist_order = ((group1, 0.5), (group2, 0.5), (group3, 0.6))

# Top histogram (X-axis, CD56); shares the x-axis with the scatter plot.
x_hist = fig.add_subplot(grid[0, :-1], sharex=main_ax)
for grp, grp_alpha in hist_order:
    x_hist.hist(trnfs_antibodies[cd56_idx, df.NK_group == grp],
                bins=20,
                label=labels[grp],
                color=NK_colors[grp],
                alpha=grp_alpha,
                density=True)
x_hist.axis("off")

# Side histogram (Y-axis, CD16); shares the y-axis with the scatter plot.
y_hist = fig.add_subplot(grid[1:, -1], sharey=main_ax)
for grp, grp_alpha in hist_order:
    y_hist.hist(trnfs_antibodies[cd16_idx, df.NK_group == grp],
                bins=20,
                label=labels[grp],
                color=NK_colors[grp],
                alpha=grp_alpha,
                orientation="horizontal",
                density=True)
y_hist.axis("off")

plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

plt.show()


## Marker score: NK vs rest

In [None]:
from scipy.special import ndtr
def compute_scores_loop(a_mu, a_sig, ncells, indices1, indices2):
    """
    Compute a per-feature marker score comparing two groups of cells.

    For every feature (row of ``a_mu``) the score is the weighted average, over
    all pairs (i, j) with i in group 1 and j in group 2, of

        ndtr((mu_i - mu_j) / sqrt(sig_i**2 + sig_j**2)),

    with pair weight ``ncells[i] * ncells[j]``. Scores close to 1 mean the
    feature is higher in group 1, scores close to 0 that it is higher in
    group 2.

    Parameters:
        a_mu: [features x cells] array of mean values.
        a_sig: [features x cells] array of standard deviations (they are
            squared inside this function).
        ncells: [cells] array-like of per-cell weights for the full dataset.
        indices1, indices2: positional cell indices (or boolean masks over the
            cells) selecting group 1 and group 2.
    Returns:
        A 1-D NumPy array with one score per feature. All entries are NaN when
        the total pair weight is zero, e.g. when either group is empty.
    """
    # Work with cells along the first axis; asarray also admits plain lists.
    a_mu = np.asarray(a_mu).T
    a_sig = np.asarray(a_sig).T
    ncells = np.asarray(ncells)

    a1_mu = a_mu[indices1, :]
    a2_mu = a_mu[indices2, :]

    # Variances are loop-invariant, so they are squared once up front.
    a1_var = a_sig[indices1, :] ** 2
    a2_var = a_sig[indices2, :] ** 2

    ncells1 = ncells[indices1]
    ncells2 = ncells[indices2]

    num_cells1, num_motifs = a1_mu.shape

    weighted_sum = np.zeros(num_motifs)
    total_weight = 0.0

    # One group-1 cell at a time: this keeps the intermediate arrays at
    # [group-2 cells x features] instead of materialising all pairs at once.
    for i in range(num_cells1):
        z_matrix = (a1_mu[i, :] - a2_mu) / np.sqrt(a1_var[i, :] + a2_var)
        prob_matrix = ndtr(z_matrix)

        weight_row = ncells1[i] * ncells2

        weighted_sum += np.sum(prob_matrix * weight_row[:, None], axis=0)
        total_weight += np.sum(weight_row)

    # No pairs (or zero total weight): the score is undefined. Return NaN
    # explicitly instead of triggering a 0/0 runtime warning.
    if total_weight == 0:
        return np.full(num_motifs, np.nan)

    return weighted_sum / total_weight


In [None]:
def make_hist_plot(col_name, group1, group2, group3, data, df_sig, gene, xlabel, title):
    """Plot overlaid density histograms of one feature for three cell groups.

    The figure is saved as ``figures/hist_<title>.svg`` and ``.png``.

    Parameters:
        col_name: column of the notebook-level ``df`` that holds the group labels.
        group1, group2, group3: group labels to compare. They are also used to
            look up the legend label in ``labels`` and the colour in
            ``NK_colors``; a group missing from those dicts falls back to its
            own name and to matplotlib's default colour.
        data: [features x cells] array, with cells in the same order as ``df``.
        df_sig: DataFrame indexed by feature name with an ``idx`` column giving
            the feature's row in ``data``.
        gene: feature name to plot (a key of ``df_sig.index``).
        xlabel: x-axis label.
        title: figure title, also used in the output file names.
    """
    feature_row = df_sig.loc[gene].idx

    fig = plt.figure(dpi=150, figsize=(6, 4))

    # The passed-in groups are used directly. Previously they were overwritten
    # by hard-coded names, so labels and colours ignored the arguments.
    for group in (group1, group2, group3):
        in_group = (df[col_name] == group).to_numpy()
        plt.hist(data[feature_row, :][..., in_group],
                 bins=20,
                 label=labels.get(group, group),
                 color=NK_colors.get(group),
                 alpha=0.5,
                 density=True)

    plt.title(title)
    plt.ylabel("density")
    plt.xlabel(xlabel)

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    plt.savefig("figures/hist_{}.svg".format(title))
    plt.savefig("figures/hist_{}.png".format(title), dpi=300)

In [None]:
# Binary NK-vs-rest label per cell, derived from the RNA annotation.
# Note: this rebinds the notebook globals group1 / group3.
group1 = "NK"
group3 = "other"

df["group"] = np.where(df.rna_annotations == group1, group1, group3)
df

In [None]:
# mu_df = pd.read_csv(os.path.join(sanity_out_path, "mu_vmax.txt"), header=None, names=["mu"])
# all_genes_df = pd.read_csv(os.path.join(sanity_out_path, "geneID.txt"), header=None, names=["geneName"])
# all_genes_df["idx_sanity"] = all_genes_df.index
# genes_bonsai_df = pd.DataFrame({"geneName": bonvis_metadata.gene_ids})
# genes_bonsai_df["idx_bonsai"] = genes_bonsai_df.index
# subgenes_df = pd.merge(genes_bonsai_df, all_genes_df)
# subgenes_df = subgenes_df.sort_values("idx_bonsai")
# subgenes_df

In [None]:
# mu_sub = mu_df.loc[subgenes_df.idx_sanity]
# mu_sub

In [None]:
# Note: this analysis was originally run before April 2025, when
# "data/normalized/means" still held the deltas. Since then the dataset holds
# the log transcription quotients (ltqs) directly, so shifting by mu is no
# longer needed; the variable names below still say "deltas".
deltas = bonvis_data_hdf["data/normalized/means"][:] 
# Reorder the columns with cell_ind_to_vert_ind so that column k belongs to cell k.
deltas = deltas[:, np.array(bonvis_metadata.cell_info["cell_info_dict"]["cell_ind_to_vert_ind"])]

# NOTE(review): this dataset is named "vars" but is later passed as `a_sig`
# (standard deviations) to compute_scores_loop, which squares it. Confirm
# whether it holds variances or standard deviations.
d_deltas = bonvis_data_hdf["data/normalized/vars"][:]
d_deltas = d_deltas[:, np.array(bonvis_metadata.cell_info["cell_info_dict"]["cell_ind_to_vert_ind"])]
deltas.shape, d_deltas.shape

In [None]:
# Marker score of every feature for NK cells versus all other cells, with
# uniform cell weights. The groups are passed as positional indices derived
# from the boolean masks: df.index labels are only valid positions while df
# carries a default RangeIndex.
mscores = compute_scores_loop(a_mu=deltas,
                    a_sig=d_deltas,
                    ncells=np.ones(deltas.shape[1]),
                    indices1=np.flatnonzero((df.group == "NK").to_numpy()),
                    indices2=np.flatnonzero((df.group == "other").to_numpy()))

In [None]:
# Table of NK-vs-rest marker scores, indexed by gene name.
geneNames = json.loads(bonvis_data_hdf["data/normalized/"].attrs["gene_ids"])
mscores_df = pd.DataFrame({"marker_score": mscores, 
                           "geneName": geneNames})
# Keep each gene's row position in the data matrix before switching the index
# to gene names; the plotting helpers read it from the `idx` column.
mscores_df["idx"] = mscores_df.index
mscores_df.index = mscores_df.geneName
mscores_df.sort_values("marker_score", ascending=False).head(10)

In [None]:
# Histograms of two NK marker genes for the NK subgroups and the remaining cells.
# `deltas` already holds the ltqs, so no shift by mu is applied.
ltqs =  deltas # + mu_sub.to_numpy()

make_hist_plot(col_name = "NK_group",
group1 = "NK_group1",
group2 = "NK_group2",
group3 = "other",
data = ltqs,
df_sig = mscores_df,
gene = "GNLY_trscrpt1",
title = "GNLY",
xlabel = "log transcription quotients")

make_hist_plot(col_name = "NK_group",
group1 = "NK_group1",
group2 = "NK_group2",
group3 = "other",
data = ltqs,
df_sig = mscores_df,
gene = "NKG7_trscrpt1",
title = "NKG7",
xlabel = "log transcription quotients")

## Marker score: NK subgroups

In [None]:
def make_hist_plot_subgroups(col_name, group1, group2, data,df_sig, gene, xlabel, title):
    """Plot overlaid density histograms of one feature for two cell groups.

    The figure is saved as ``figures/hist_subgroup_<title>.svg`` and ``.png``.

    Parameters:
        col_name: column of the notebook-level ``df`` that holds the group labels.
        group1, group2: group labels to compare. They are also used to look up
            the legend label in ``labels`` and the colour in ``NK_colors``; a
            group missing from those dicts falls back to its own name and to
            matplotlib's default colour.
        data: [features x cells] array, with cells in the same order as ``df``.
        df_sig: DataFrame indexed by feature name with an ``idx`` column giving
            the feature's row in ``data``.
        gene: feature name to plot (a key of ``df_sig.index``).
        xlabel: x-axis label.
        title: figure title, also used in the output file names.
    """
    feature_row = df_sig.loc[gene].idx

    fig = plt.figure(dpi=150)

    # The passed-in groups are used directly. Previously they were overwritten
    # by hard-coded names, so labels and colours ignored the arguments.
    for group in (group1, group2):
        in_group = (df[col_name] == group).to_numpy()
        plt.hist(data[feature_row, :][..., in_group],
                 bins=20,
                 label=labels.get(group, group),
                 color=NK_colors.get(group),
                 alpha=0.5,
                 density=True)

    plt.title(title)
    plt.ylabel("density")
    plt.xlabel(xlabel)

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    plt.savefig("figures/hist_subgroup_{}.svg".format(title))
    plt.savefig("figures/hist_subgroup_{}.png".format(title), dpi=300)

In [None]:
# Marker score of every feature for NK_group1 versus NK_group2, with uniform
# cell weights. Scores near 1 mean higher in NK_group1, scores near 0 higher
# in NK_group2. The groups are passed as positional indices derived from the
# boolean masks: df.index labels are only valid positions while df carries a
# default RangeIndex.
mscores_NK_subgroups = compute_scores_loop(a_mu=deltas,
                    a_sig=d_deltas,
                    ncells=np.ones(deltas.shape[1]),
                    indices1=np.flatnonzero((df.NK_group == "NK_group1").to_numpy()),
                    indices2=np.flatnonzero((df.NK_group == "NK_group2").to_numpy()))


mscores_NK_subgroups.shape

In [None]:
# Table of NK_group1-vs-NK_group2 marker scores, indexed by gene name.
mscores_NK_subgroups_df = pd.DataFrame({"marker_score": mscores_NK_subgroups, 
                           "geneName": geneNames})


# Keep each gene's row position in the data matrix before switching the index
# to gene names; the plotting helpers read it from the `idx` column.
mscores_NK_subgroups_df["idx"] = mscores_NK_subgroups_df.index
mscores_NK_subgroups_df.index = mscores_NK_subgroups_df.geneName

In [None]:
# 20 genes with the highest marker score (highest in NK_group1 relative to NK_group2).
mscores_NK_subgroups_df.sort_values("marker_score", ascending=False).head(20)


In [None]:
# 20 genes with the lowest marker score (highest in NK_group2 relative to NK_group1).
mscores_NK_subgroups_df.sort_values("marker_score", ascending=True).head(20)

In [None]:
# Histogram of FTL in the two NK subgroups.
# `deltas` already holds the ltqs, so no shift by mu is applied.
ltqs =  deltas # + mu_sub.to_numpy()

make_hist_plot_subgroups(col_name = "NK_group",
group1 = "NK_group1",
group2 = "NK_group2",
data = ltqs,
df_sig = mscores_NK_subgroups_df,
gene = "FTL",
title="FTL",
xlabel = "log transcription quotients")

In [None]:
# Histogram of HLA-A in the two NK subgroups.
# `deltas` already holds the ltqs, so no shift by mu is applied.
ltqs =  deltas # + mu_sub.to_numpy()

make_hist_plot_subgroups(col_name = "NK_group",
group1 = "NK_group1",
group2 = "NK_group2",
data = ltqs,
df_sig = mscores_NK_subgroups_df,
gene = "HLA-A",
title="HLA-A",
xlabel = "log transcription quotients")

### Supplementary figures

In [None]:
# Colour and label lookup tables for the cell types, used by the
# supplementary marker plots.
categories = [
    "B",
    "CD8 T",
    "CD14+ Mono",
    "CD16+ Mono",
    "HSC",
    "DC",
    "Erythrocytes",
    "Memory CD4 T",
    "MK",
    "NK",
    "Naive CD4 T",
    "T/Mono doublets",
    "pDC"
]

# Use the first 13 colours of tab20. Integer input indexes the colormap entries
# directly, whereas a float such as i / 20 is rescaled and truncated and so
# depends on floating-point round-off to land on the intended entry.
# plt.get_cmap replaces the deprecated matplotlib.cm.get_cmap.
cmap = plt.get_cmap("tab20")
colors_ = [cmap(i) for i in range(len(categories))]
celltype_col_dict = dict(zip(categories, colors_))

# Each cell type is displayed under its own name.
celltype_labels = dict(zip(categories, categories))

In [None]:
def make_plot_for_markers_v2(df_sig, data, col_name , genes , group1 , group2, label1, label2, col1, col2, xlabel="log transcription quotient", alpha=0.5, figname=None):
    """Draw a grid of two-group density histograms, one panel per gene.

    The figure is saved as ``figures/si_all_markers_<figname>.svg`` and ``.png``.

    Parameters:
        df_sig: DataFrame indexed by gene name with columns ``idx`` (row of the
            gene in ``data``) and ``marker_score`` (shown above each panel).
        data: [features x cells] array, with cells in the same order as the
            notebook-level ``df``.
        col_name: column of ``df`` that holds the group labels.
        genes: gene names to plot, one panel each.
        group1, group2: the two group labels to compare.
        label1, label2: legend labels for the two groups.
        col1, col2: colours for the two groups.
        xlabel: shared x-axis label.
        alpha: histogram opacity.
        figname: suffix of the output file names (the literal ``None`` is used
            when it is not given).
    """
    cells_group1 = df[col_name] == group1
    cells_group2 = df[col_name] == group2

    deltas_group1 = data[:, cells_group1]
    deltas_group2 = data[:, cells_group2]

    n_genes = len(genes)
    print("Number of genes: {}".format(n_genes))

    # Grid layout by number of genes. For the large layout the row count is
    # rounded *up*; rounding to nearest could leave fewer panels than genes
    # (e.g. 37 genes -> 6 rows x 6 columns = 36 panels).
    if n_genes >= 37:
        nrows, ncols, figsize = int(np.ceil(n_genes / 6)), 6, (20, 16)
    elif n_genes <= 12:
        nrows, ncols, figsize = 3, 4, (10, 8)
    elif n_genes < 16:
        nrows, ncols, figsize = 4, 4, (10, 8)
    elif n_genes < 26:
        nrows, ncols, figsize = 5, 5, (10, 8)
    else:
        nrows, ncols, figsize = 6, 6, (10, 8)
    fig, axs = plt.subplots(nrows, ncols, figsize=figsize, dpi=300)
    axxes = list(axs.flat)

    # The gene name sits slightly higher above the panel in the denser layouts.
    gene_title_y = 1.20 if n_genes < 16 else 1.25

    for ax, gene in zip(axxes, genes):
        gene_row = df_sig.loc[gene]

        ax.hist(deltas_group1[gene_row.idx, :],
                bins=20, label=group1,
                color=col1,
                alpha=alpha,
                density=True)
        ax.hist(deltas_group2[gene_row.idx, :],
                bins=20, label=group2,
                alpha=alpha,
                color=col2,
                density=True)

        ax.text(0.5, gene_title_y, gene, fontsize=14, ha='center', transform=ax.transAxes)
        ax.text(0.5, 1.04, "MS= {}".format(np.round(gene_row.marker_score, 5)), fontsize=10, ha='center', transform=ax.transAxes)

    # Remove the panels that were not used.
    for ax in axxes:
        if not ax.has_data():
            fig.delaxes(ax)
    plt.tight_layout()

    # Shared legend, placed to the right of the grid.
    handles = [mpatches.Patch(color=c, label=l, alpha=a) for c, l, a in zip([col1, col2], [label1, label2], [alpha, alpha])]
    fig.legend(handles=handles,
               loc='center left',
               bbox_to_anchor=(1.01, 0.5),
               frameon=False)

    # Set common labels
    fig.text(0.5, 0.0, xlabel, ha='center', va='center')
    fig.text(0, 0.5, 'density', ha='center', va='center', rotation='vertical')
    plt.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    plt.savefig("figures/si_all_markers_{}.svg".format(figname), bbox_inches='tight')
    plt.savefig("figures/si_all_markers_{}.png".format(figname), dpi=300, bbox_inches='tight')

def make_plot_for_markers_v4(df_sig, data, col_name , genes , groups_list, label_dict, col_dict, xlabel="log transcription quotient", alpha=0.5, histtype="stepfilled"):
    """Draw a grid of multi-group density histograms, one panel per gene.

    The NK subgroups ("NK_group1", "NK_group2") are always selected from
    ``df["NK_group"]`` and drawn as filled histograms with opacity 0.2, using
    the notebook-level ``labels`` and ``NK_colors``. Every other group is
    selected from ``df[col_name]`` and drawn as an outline ("step") with
    opacity ``alpha``, using ``label_dict`` and ``col_dict``.

    Parameters:
        df_sig: DataFrame indexed by gene name with columns ``idx`` (row of the
            gene in ``data``) and ``marker_score`` (shown above each panel).
        data: [features x cells] array, with cells in the same order as the
            notebook-level ``df``.
        col_name: column of ``df`` holding the labels of the non-NK-subgroup groups.
        genes: gene names to plot, one panel each.
        groups_list: group labels to draw, in drawing and legend order.
        label_dict, col_dict: legend label and colour per non-NK-subgroup group.
        xlabel: shared x-axis label.
        alpha: opacity of the non-NK-subgroup histograms.
        histtype: accepted for backward compatibility but not used; the
            histogram types are fixed as described above.
    """
    nk_subgroups = ("NK_group1", "NK_group2")

    # Resolve each group's data and drawing style once. The legend is built
    # from the same entries, so it is also defined when `genes` is empty.
    group_entries = []
    for group in groups_list:
        if group in nk_subgroups:
            cell_mask = df["NK_group"] == group
            hist_style = dict(label=labels[group], color=NK_colors[group],
                              alpha=0.2, histtype="stepfilled")
        else:
            cell_mask = df[col_name] == group
            hist_style = dict(label=label_dict[group], color=col_dict[group],
                              alpha=alpha, histtype="step", linewidth=1.5)
        group_entries.append((data[:, cell_mask], hist_style))

    n_genes = len(genes)
    print("Number of genes: {}".format(n_genes))

    # Grid layout by number of genes. For the large layout the row count is
    # rounded *up*; rounding to nearest could leave fewer panels than genes
    # (e.g. 37 genes -> 6 rows x 6 columns = 36 panels).
    if n_genes >= 37:
        nrows, ncols, figsize = int(np.ceil(n_genes / 6)), 6, (20, 16)
    elif n_genes <= 12:
        nrows, ncols, figsize = 3, 4, (10, 8)
    elif n_genes < 16:
        nrows, ncols, figsize = 4, 4, (10, 8)
    elif n_genes < 26:
        nrows, ncols, figsize = 5, 5, (10, 8)
    else:
        nrows, ncols, figsize = 6, 6, (10, 8)
    fig, axs = plt.subplots(nrows, ncols, figsize=figsize, dpi=300)
    axxes = list(axs.flat)

    # The gene name sits slightly higher above the panel in the denser layouts.
    gene_title_y = 1.20 if n_genes < 16 else 1.25

    for ax, gene in zip(axxes, genes):
        gene_row = df_sig.loc[gene]

        for group_data, hist_style in group_entries:
            ax.hist(group_data[gene_row.idx, :],
                    bins=20,
                    density=True,
                    **hist_style)

        ax.text(0.5, gene_title_y, gene, fontsize=14, ha='center', transform=ax.transAxes)
        ax.text(0.5, 1.04, "MS= {}".format(np.round(gene_row.marker_score, 5)), fontsize=10, ha='center', transform=ax.transAxes)

    # Remove the panels that were not used.
    for ax in axxes:
        if not ax.has_data():
            fig.delaxes(ax)

    plt.tight_layout()

    # Set common labels
    fig.text(0.5, 0.0, xlabel, ha='center', va='center')
    fig.text(0, 0.5, 'density', ha='center', va='center', rotation='vertical')

    # Shared legend, placed to the right of the grid.
    handles = [mpatches.Patch(color=style["color"], label=style["label"], alpha=style["alpha"])
               for _, style in group_entries]
    fig.legend(handles=handles,
               loc='center left',
               bbox_to_anchor=(1.01, 0.5),
               frameon=False)
    plt.show()

In [None]:
# Genes that separate the two NK subgroups almost perfectly:
#   marker score > 0.99 -> higher in NK_group1 (lymphoid-like),
#   marker score < 0.01 -> higher in NK_group2 (myeloid-like).
# Each mask is computed on the sorted frame it filters. A mask taken from the
# unsorted frame only works through index alignment, which emits a warning and
# fails if gene names are duplicated.
gene_df = mscores_NK_subgroups_df.sort_values("marker_score", ascending=False)
genes_ly = list(gene_df[gene_df.marker_score > 0.99].geneName)
gene_df = mscores_NK_subgroups_df.sort_values("marker_score", ascending=True)
genes_my = list(gene_df[gene_df.marker_score < 0.01].geneName)

my_genes_list = genes_ly + genes_my
print("n genes: {}".format(len(my_genes_list)))
print(my_genes_list)
print("n genes lymphoid: {}".format(len(genes_ly)))
print(genes_ly)
print("n genes myleoid: {}".format(len(genes_my)))
print(genes_my)

In [None]:
# Supplementary figure: histograms of all genes in genes_ly (marker score > 0.99)
# for the two NK subgroups.
group1 = "NK_group1"
group2 = "NK_group2"

# `deltas` already holds the ltqs, so no shift by mu is applied.
ltqs =  deltas # + mu_sub.to_numpy()

make_plot_for_markers_v2(df_sig=mscores_NK_subgroups_df, data=ltqs, col_name="NK_group",
                      xlabel="log transcription quotient",
                      group1=group1 , 
                      group2=group2, 
                      col1=NK_colors[group1], 
                      col2=NK_colors[group2],
                     label1 = labels[group1],
                     label2 = labels[group2],
                     genes = genes_ly, figname='ly_markers')


In [None]:
# Supplementary figure: histograms of all genes in genes_my (marker score < 0.01)
# for the two NK subgroups.
group1 = "NK_group1"
group2 = "NK_group2"

# `deltas` already holds the ltqs, so no shift by mu is applied.
ltqs =  deltas # + mu_sub.to_numpy()

make_plot_for_markers_v2(df_sig=mscores_NK_subgroups_df, data=ltqs, col_name="NK_group",
                      xlabel="log transcription quotient",
                      group1=group1 , 
                      group2=group2, 
                      col1=NK_colors[group1], 
                      col2=NK_colors[group2],
                     label1 = labels[group1],
                     label2 = labels[group2],
                     genes = genes_my, figname='my_markers')

In [None]:
# Supplementary figure: the genes_ly markers in both NK subgroups, overlaid with
# two reference cell types (Naive CD4 T and CD14+ Mono).
group3 = "Naive CD4 T"
group4 = "CD14+ Mono"

# Alternative reference cell types; defined but not included in `groups` below.
group5 = "Memory CD4 T"
group6 = "CD16+ Mono"


group1 = "NK_group1"
group2 = "NK_group2"

groups = [group1, group2, group3, group4]

# `deltas` already holds the ltqs, so no shift by mu is applied.
ltqs =  deltas # + mu_sub.to_numpy()

# The reference cell types are looked up in the "rna_annotations" column.
make_plot_for_markers_v4(df_sig=mscores_NK_subgroups_df, 
                         data=ltqs, 
                         col_name="rna_annotations",
                         xlabel="log transcription quotient",
                         groups_list = groups,
                         col_dict=celltype_col_dict, 
                         label_dict = celltype_labels,
                         genes = genes_ly,
                         alpha=0.9,
                         histtype="step")


In [None]:
# Supplementary figure: the genes_my markers in both NK subgroups, overlaid with
# the same two reference cell types. group1..group4 come from the previous cell,
# so that cell has to be run first.
groups = [group1, group2, group3, group4]

# `deltas` already holds the ltqs, so no shift by mu is applied.
ltqs =  deltas # + mu_sub.to_numpy()

# The reference cell types are looked up in the "rna_annotations" column.
make_plot_for_markers_v4(df_sig=mscores_NK_subgroups_df, 
                         data=ltqs, 
                         col_name="rna_annotations",
                         xlabel="log transcription quotient",
                         groups_list = groups,
                         col_dict=celltype_col_dict, 
                         label_dict = celltype_labels,
                         genes = genes_my,
                         alpha=0.9,
                         histtype="step")