## Result 7: VSI at the locations of transcripts from markers


In [None]:
from pathlib import Path

import sys
import os
sys.path.append(os.path.abspath("../src"))
import importlib
import plot
importlib.reload(plot)
import utils
importlib.reload(utils)

from utils import order_neuron_clusters, get_cluster_boundaries
from plot import plot_neuron_cluster_heatmap

import numpy as np
import pandas as pd

### data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn "RdYlBu_r" palette as a matplotlib colormap object.
heat_cmap = sns.color_palette("RdYlBu_r", as_cmap=True)
# seaborn "ch:" (cubehelix) palette as a matplotlib colormap object.
# NOTE(review): the name looks like a typo for `vsi_cmap`; left unchanged
# because cells outside this view may reference it.
vsi_camp = sns.color_palette("ch:s=.25,rot=-.25", as_cmap=True)

#### Signals in the Tissue Section

In [None]:
# Folder from which the example MERFISH barcode table is read in the next cell.
MERFISH_data_folder_path = Path("../../data/mouse_hypothalamus/MERFISH/")

In [None]:
# Columns to read from the barcode table.
columns = [
    "Centroid_X",
    "Centroid_Y",
    "Centroid_Z",
    "Gene_name",
    "Cell_name",
    "Total_brightness",
    "Area",
    "Error_bit",
    "Error_direction",
]

# Load the transcripts and shorten the coordinate / gene column names.
signal_coordinate_df = pd.read_csv(
    MERFISH_data_folder_path / "merfish_barcodes_example.csv", usecols=columns
).rename(
    columns={
        "Centroid_X": "x",
        "Centroid_Y": "y",
        "Centroid_Z": "z",
        "Gene_name": "gene",
    }
)

# Remove dummy molecules (gene names containing "Blank" or "NegControl").
# The copy is taken here, before any column is assigned, so the assignments
# below act on an independent frame; copying only at the end of the cell is
# too late to avoid SettingWithCopyWarning.
signal_coordinate_df = signal_coordinate_df.loc[
    ~signal_coordinate_df["gene"].str.contains("Blank|NegControl")
].copy()

signal_coordinate_df["gene"] = signal_coordinate_df["gene"].astype("category")

# Shift the coordinates so the smallest x and y become 0 (no negative values).
# The offsets are kept as globals because later cells apply the same shift
# to the other tables.
coordinate_x_m = signal_coordinate_df['x'].min()
coordinate_y_m = signal_coordinate_df['y'].min()
signal_coordinate_df['x'] = signal_coordinate_df['x'] - coordinate_x_m
signal_coordinate_df['y'] = signal_coordinate_df['y'] - coordinate_y_m

#### Results of Ovrlpy

results:  
- signal integrity  
- signal strength  

In [None]:
# Folder holding the ovrlpy result files loaded in this and the following cells.
ovrlpy_result_folder = Path("../../data/results/barcodes_xmpl")
# Signal integrity and signal strength maps, stored as comma-separated text.
# These are the maps later plotted under the label "All Signals".
signal_integrity = np.loadtxt(ovrlpy_result_folder/"barcodes_signal_integrity.csv", delimiter=',')
signal_strength = np.loadtxt(ovrlpy_result_folder/"barcodes_signal_strength.csv", delimiter=',')

In [None]:
# exclude MOD marker signals
# (maps read from the "nonMOD" sub-folder of the ovrlpy results)
nonMOD_signal_integrity = np.loadtxt(ovrlpy_result_folder/"nonMOD/barcodes_signal_integrity.csv", delimiter=',')
nonMOD_signal_strength = np.loadtxt(ovrlpy_result_folder/"nonMOD/barcodes_signal_strength.csv", delimiter=',')

In [None]:
# exclude MOD1 marker signals
# (maps read from the "nonMOD1" sub-folder; plotted later as "Excluding MOD1 Marker")
nonMOD1_signal_integrity = np.loadtxt(ovrlpy_result_folder/"nonMOD1/barcodes_signal_integrity.csv", delimiter=',')
nonMOD1_signal_strength = np.loadtxt(ovrlpy_result_folder/"nonMOD1/barcodes_signal_strength.csv", delimiter=',')

In [None]:
# exclude MOD2 marker signals
# (maps read from the "nonMOD2" sub-folder; plotted later as "Excluding MOD2 Marker")
nonMOD2_signal_integrity = np.loadtxt(ovrlpy_result_folder/"nonMOD2/barcodes_signal_integrity.csv", delimiter=',')
nonMOD2_signal_strength = np.loadtxt(ovrlpy_result_folder/"nonMOD2/barcodes_signal_strength.csv", delimiter=',')

#### Signals in the Tissue Section

In [None]:
# Re-points the MERFISH folder to "raw/"; the barcode table is re-read from
# here in the next cell and the cell-boundary file is read from here later.
# NOTE(review): this overrides the "MERFISH/" path set earlier in the notebook — confirm which folder is intended.
MERFISH_data_folder_path = Path("../../data/mouse_hypothalamus/raw/")

In [None]:
# Columns to read from the barcode table.
# This cell repeats the barcode loading done earlier in the notebook, now
# using the folder currently stored in `MERFISH_data_folder_path`.
columns = [
    "Centroid_X",
    "Centroid_Y",
    "Centroid_Z",
    "Gene_name",
    "Cell_name",
    "Total_brightness",
    "Area",
    "Error_bit",
    "Error_direction",
]

# Load the transcripts and shorten the coordinate / gene column names.
signal_coordinate_df = pd.read_csv(
    MERFISH_data_folder_path / "merfish_barcodes_example.csv", usecols=columns
).rename(
    columns={
        "Centroid_X": "x",
        "Centroid_Y": "y",
        "Centroid_Z": "z",
        "Gene_name": "gene",
    }
)

# Remove dummy molecules (gene names containing "Blank" or "NegControl").
# The copy is taken here, before any column is assigned, so the assignments
# below act on an independent frame; copying only at the end of the cell is
# too late to avoid SettingWithCopyWarning.
signal_coordinate_df = signal_coordinate_df.loc[
    ~signal_coordinate_df["gene"].str.contains("Blank|NegControl")
].copy()

signal_coordinate_df["gene"] = signal_coordinate_df["gene"].astype("category")

# Shift the coordinates so the smallest x and y become 0 (no negative values).
# The offsets are kept as globals because later cells apply the same shift
# to the other tables.
coordinate_x_m = signal_coordinate_df['x'].min()
coordinate_y_m = signal_coordinate_df['y'].min()
signal_coordinate_df['x'] = signal_coordinate_df['x'] - coordinate_x_m
signal_coordinate_df['y'] = signal_coordinate_df['y'] - coordinate_y_m

#### Results of BANKSY

In [None]:
# Folder containing the BANKSY cluster table read in the next cell.
banksy_folder_path = Path("../../data/banksy_results/")

In [None]:
# Columns to read from the BANKSY output table.
columns = [
    "Centroid_X",
    "Centroid_Y",
    "Bregma",
    "lam0.2",
]

# Tab-separated table; the "lam0.2" column is used as the cluster label.
banksy_result = pd.read_csv(
    banksy_folder_path / 'banksy_cluster.txt', usecols=columns, sep='\t'
).rename(
    columns={
        "Centroid_X": "x",
        "Centroid_Y": "y",
        "lam0.2": "banksy_cluster",
    }
)

# Keep the Bregma -0.24 rows only. Copy immediately so the coordinate
# assignments below modify an independent frame (copying afterwards is too
# late to avoid SettingWithCopyWarning).
banksy_result = banksy_result[banksy_result['Bregma'] == -0.24].copy()

# Apply the same coordinate shift as for the transcript table.
banksy_result['x'] = banksy_result['x'] - coordinate_x_m
banksy_result['y'] = banksy_result['y'] - coordinate_y_m

#### Segmentation Dataset

In [None]:
# Folder containing the segmented-cell table read in the next cell.
seg_data_path = Path("../../data/mouse_hypothalamus/all_cells")

In [None]:
# Load the segmented-cell table and shorten the centroid column names.
merfish_data = pd.read_csv(
    seg_data_path / "merfish_all_cells.csv"
).rename(
    columns={
        "Centroid_X": "x",
        "Centroid_Y": "y",
    }
)

# Drop the 'Fos' column and every column whose name starts with 'Blank_'.
merfish_data = merfish_data.drop(columns=[col for col in merfish_data.columns if col == 'Fos' or col.startswith('Blank_')])

# Keep non-ambiguous cells of animal 1 in the Bregma -0.24 section. Copy
# immediately so the column assignments below modify an independent frame
# (copying afterwards is too late to avoid SettingWithCopyWarning).
merfish_data = merfish_data[
    (merfish_data["Cell_class"] != "Ambiguous")
    & (merfish_data['Animal_ID'] == 1)
    & (merfish_data['Bregma'] == -0.24)
].copy()

# Apply the same coordinate shift as for the transcript table.
merfish_data['x'] = merfish_data['x'] - coordinate_x_m
merfish_data['y'] = merfish_data['y'] - coordinate_y_m

# NOTE(review): the BANKSY labels are attached by position, not by key. This
# assumes `banksy_result` has exactly the same rows in the same order as the
# filtered `merfish_data` (it was filtered by Bregma only) — confirm.
merfish_data['banksy'] = banksy_result['banksy_cluster'].values

#### Cell boundaries

In [None]:
# Load the cell-boundary table from the current MERFISH folder and drop rows
# that lack either boundary coordinate column.
boundaries_df = pd.read_csv(MERFISH_data_folder_path/'cellboundaries_example_animal.csv')
boundaries_df = boundaries_df.dropna(subset=['boundaryX', 'boundaryY'])

In [None]:
# Restrict the boundaries to cells present in `merfish_data` and attach each
# cell's shifted centroid and BANKSY label (matched on feature_uID == Cell_ID).
cell_ids = merfish_data['Cell_ID']
boundaries_df = boundaries_df[boundaries_df['feature_uID'].isin(cell_ids)]
boundaries_df = boundaries_df.merge(
    merfish_data[['Cell_ID', 'x', 'y', 'banksy']],
    left_on='feature_uID',
    right_on='Cell_ID',
    how='inner'
)
# Cell_ID duplicates feature_uID after the merge.
boundaries_df = boundaries_df.drop(columns=['Cell_ID'])

# Parse the ';'-separated coordinate strings into lists of floats
# (values that are not strings are left untouched).
boundaries_df['boundaryX'] = boundaries_df['boundaryX'].apply(lambda x: [float(i) for i in x.split(';')] if isinstance(x, str) else x)
boundaries_df['boundaryY'] = boundaries_df['boundaryY'].apply(lambda x: [float(i) for i in x.split(';')] if isinstance(x, str) else x)

# Apply the same coordinate shift as for the transcript table to every vertex.
boundaries_df['boundaryX'] = boundaries_df['boundaryX'].apply(lambda x: [i - coordinate_x_m for i in x] if isinstance(x, list) else x)
boundaries_df['boundaryY'] = boundaries_df['boundaryY'].apply(lambda x: [i - coordinate_y_m for i in x] if isinstance(x, list) else x)

boundaries_df = boundaries_df.copy()

#### Marker Genes

Differentially expressed (DE) genes identified by BANKSY.

In [None]:
# all differentially expressed genes
# (the DE_genes_gm entries followed by the DE_genes_wm entries)
DE_genes = ['Mlc1', 'Dgkk', 'Cbln2', 'Syt4', 'Gad1', 'Plin3', 'Gnrh1', 'Sln', 'Gjc3', 'Mbp', 'Lpar1', 'Trh', 'Ucn3', 'Cck']
# DE_genes_gm: 7
# NOTE(review): "7" is not the list length (9 genes). It presumably refers to
# BANKSY cluster 7: this list is used for the MOD2 markers and MOD2 boundaries
# select banksy == 7 — confirm.
DE_genes_gm = ['Mlc1', 'Dgkk', 'Cbln2', 'Syt4', 'Gad1', 'Plin3', 'Gnrh1', 'Sln', 'Gjc3']
# DE_genes_wm: 8
# NOTE(review): "8" is not the list length (5 genes). It presumably refers to
# BANKSY cluster 8: this list is used for the MOD1 markers and MOD1 boundaries
# select banksy == 8 — confirm.
DE_genes_wm = ['Mbp', 'Lpar1', 'Trh', 'Ucn3', 'Cck']

In [None]:
# Boundaries of cells in BANKSY cluster 8 (MOD1), cluster 7 (MOD2), or either (MOD).
MOD_boundaries = boundaries_df[boundaries_df['banksy'].isin([8, 7])]
MOD1_boundaries = boundaries_df.loc[boundaries_df['banksy'].eq(8)]
MOD2_boundaries = boundaries_df.loc[boundaries_df['banksy'].eq(7)]

### VSI at marker transcripts

#### function

In [None]:
import numpy as np
import pandas as pd

def marker_transcripts_vsi(signal_df, signal_strength, signal_integrity, gene):
    """
    Restrict the strength / integrity maps to pixels holding marker transcripts.

    Parameters:
        signal_df: DataFrame with columns 'x', 'y' and 'gene'.
        signal_strength: 2D array indexed as [y, x].
        signal_integrity: 2D array used together with signal_strength.
        gene: A single gene name (str) or a list-like of gene names.

    Returns:
        (gene_strength, gene_integrity): copies of the two maps in which every
        pixel that holds no transcript of the requested gene(s) is set to 0.
        Transcripts whose coordinates fall outside the strength map are ignored.
    """
    # `Series.isin` rejects a bare string, so treat it as a one-gene list.
    genes = [gene] if isinstance(gene, str) else list(gene)

    # Keep only the transcripts of the requested gene(s).
    gene_signal = signal_df[signal_df['gene'].isin(genes)]

    # Discard transcripts that would index outside the maps.
    n_rows, n_cols = signal_strength.shape[0], signal_strength.shape[1]
    in_bounds = (
        (gene_signal['x'] >= 0) & (gene_signal['x'] < n_cols) &
        (gene_signal['y'] >= 0) & (gene_signal['y'] < n_rows)
    )
    gene_signal = gene_signal[in_bounds]

    # Truncate the coordinates to integer pixel indices.
    xs = gene_signal['x'].astype(int).values
    ys = gene_signal['y'].astype(int).values

    # Mark every pixel that holds at least one transcript.
    mask = np.zeros_like(signal_strength, dtype=bool)
    mask[ys, xs] = True

    gene_strength = np.where(mask, signal_strength, 0)
    gene_integrity = np.where(mask, signal_integrity, 0)

    return gene_strength, gene_integrity


In [None]:
def plot_histogram(ax, cell_integrity, cell_strength, signal_threshold, cmap, label):
    """
    Draw a colormap-shaded density histogram of signal integrity on ``ax``.

    Only entries whose strength exceeds ``signal_threshold`` are counted.
    The histogram uses 50 bins over [0, 1]; the y-axis is fixed to (0, 7).

    Parameters:
        ax: matplotlib axes object to draw on.
        cell_integrity: array of signal integrity values.
        cell_strength: array of signal strength values, same shape as
            cell_integrity (used as a boolean mask on it).
        signal_threshold: strength value an entry must exceed to be counted.
        cmap: callable mapping a value in [0, 1) to a colour.
        label: label for the x-axis.

    Returns:
        (vals, bins): bin densities and the 51 bin edges.
    """
    selected = cell_integrity[cell_strength > signal_threshold]
    binning = dict(bins=50, range=(0, 1), density=True)

    # Densities returned to the caller.
    vals, _ = np.histogram(selected, **binning)

    # The drawn histogram; its bin edges are the ones returned.
    _, bins, patches = ax.hist(selected, edgecolor='black', alpha=0.8, **binning)

    # Colour the bars from left to right along the colormap.
    n_patches = len(patches)
    for position, patch in enumerate(patches):
        patch.set_facecolor(cmap(position / n_patches))

    # Axes cosmetics.
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 7)
    ax.set_ylabel("Density")
    ax.set_xlabel(label)
    ax.spines[["top", "right"]].set_visible(False)
    ax.yaxis.set_tick_params(labelright=False)
    ax.xaxis.set_tick_params(labelsize=8)

    return vals, bins

#### MOD all markers

In [None]:
# Strength / integrity maps kept only at pixels holding DE_genes_wm transcripts.
MOD1_strength, MOD1_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, DE_genes_wm)

with plt.style.context("default"):
    # NOTE(review): `_BIH_CMAP` is not defined or imported in the cells visible
    # above — confirm it is in scope before this cell runs.
    cmap = _BIH_CMAP
    fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=600)
    # Histogram of integrity where strength > 2; densities and edges are kept for the ratio plot.
    vals1, bins1 = plot_histogram(
            ax, MOD1_integrity, MOD1_strength, 2, cmap, label="MOD1 Signal Integrity"
        )

In [None]:
# Strength / integrity maps kept only at pixels holding DE_genes_gm transcripts.
MOD2_strength, MOD2_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, DE_genes_gm)

with plt.style.context("default"):
    # NOTE(review): `_BIH_CMAP` is not defined or imported in the cells visible
    # above — confirm it is in scope before this cell runs.
    cmap = _BIH_CMAP
    fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=600)
    # Histogram of integrity where strength > 2; densities and edges are kept for the ratio plot.
    vals2, bins2 = plot_histogram(
            ax, MOD2_integrity, MOD2_strength, 2, cmap, label="MOD2 Signal Integrity"
        )

In [None]:
# Per-bin ratio of the MOD2 density to the MOD1 density; epsilon keeps the
# division defined where the MOD1 density is 0.
epsilon = 1e-10
vals = vals2 / (vals1 + epsilon)
# Both histograms were built with the same binning arguments, so the MOD1 edges are reused.
bins = bins1

In [None]:
# Bar plot of the MOD2/MOD1 density ratio per integrity bin, on a log y-axis.
# NOTE(review): `_BIH_CMAP` is not defined or imported in the cells visible above — confirm it is in scope.
cmap = _BIH_CMAP
bin_centers = (bins[:-1] + bins[1:]) / 2
fig, ax = plt.subplots(figsize=(6, 6), dpi=600)
# Create the histogram bars
bars = ax.bar(bin_centers, vals, width=np.diff(bins), edgecolor="black", alpha=0.7, linewidth=0.3)
# Apply colormap
for i, bar in enumerate(bars):
    bar.set_facecolor(cmap(i / len(bars)))  # Set color based on the colormap

ax.set_ylabel("VSI Density of MOD2/MOD1 markers")
ax.set_xlabel("Signal Integrity")
ax.spines[["top", "right"]].set_visible(False)
ax.yaxis.set_tick_params(labelright=False)
ax.xaxis.set_tick_params(labelsize=8)

# Set the y-axis scale to log
ax.set_yscale('log')
# Add a horizontal dashed line at y = 1 (10^0), i.e. where both densities are equal
ax.axhline(y=1, color='black', linestyle='--', linewidth=0.5)
plt.show()


#### Markers vsi

In [None]:
import numpy as np
from matplotlib.ticker import LogLocator

def plot_hist(ax, si, ss, signal_thr, cmap, label, xlim=(0.125, 64), log=False, ylabel=False, xticks=None):
    """
    Draw a horizontal, colormap-shaded density histogram of signal integrity.

    Entries of ``si`` are counted only where ``ss > signal_thr``. The density
    is computed on 50 bins over [0, 1]; the first bin is not drawn.

    Parameters:
        ax: matplotlib axes object to draw on.
        si: array of signal integrity values.
        ss: array of signal strength values, same shape as ``si``.
        signal_thr: strength value an entry must exceed to be counted.
        cmap: colormap called with the bar positions to obtain bar colours.
        label: title of the axes.
        xlim: limits of the (density) x-axis.
        log: if true, use a base-2 logarithmic x-axis.
        ylabel: if true, label the y-axis "Signal Integrity" on the right.
        xticks: optional explicit x tick positions, only applied when ``log`` is true.
    """
    density, edges = np.histogram(si[ss > signal_thr], bins=50, range=(0, 1), density=True)

    # One bar per bin except the first, placed at the interior bin edges
    # and coloured by its position on the integrity axis.
    positions = edges[1:-1]
    bar_colors = cmap(positions)
    drawn = ax.barh(positions, density[1:], height=0.01)
    for bar, color in zip(drawn, bar_colors):
        bar.set_color(color)

    # X-axis scaling and limits.
    if log:
        ax.set_xscale('log', base=2)
    ax.set_xlim(xlim)
    if log:
        ax.xaxis.set_major_locator(LogLocator(base=2, subs=[1], numticks=10))
        if xticks:
            ax.set_xticks(xticks)

    # Y-axis settings.
    ax.set_ylim(0, 1)
    if ylabel:
        ax.set_ylabel("Signal Integrity", fontsize=13)
        ax.yaxis.set_label_position("right")

    # Mirror the plot: density grows to the left, ticks sit on the right.
    ax.invert_xaxis()
    ax.yaxis.tick_right()
    ax.spines[["top", "left"]].set_visible(False)
    ax.set_title(label, fontsize=13)

    

In [None]:
# Strength / integrity maps kept only at pixels holding Gad1 transcripts.
Gad1_strength, Gad1_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, ['Gad1'])

In [None]:
# Strength / integrity maps kept only at pixels holding Plin3 transcripts.
Plin3_strength, Plin3_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, ['Plin3'])

In [None]:
# Strength / integrity maps kept only at pixels holding Gjc3 transcripts.
Gjc3_strength, Gjc3_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, ['Gjc3'])

In [None]:
# Strength / integrity maps kept only at pixels holding Dgkk transcripts.
Dgkk_strength, Dgkk_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, ['Dgkk'])

In [None]:
# Strength / integrity maps kept only at pixels holding Cbln2 transcripts.
Cbln2_strength, Cbln2_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, ['Cbln2'])

In [None]:
# Strength / integrity maps kept only at pixels holding Syt4 transcripts.
Syt4_strength, Syt4_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, ['Syt4'])

In [None]:
# Strength / integrity maps kept only at pixels holding Mlc1 transcripts.
Mlc1_strength, Mlc1_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, ['Mlc1'])

In [None]:
# Strength / integrity maps kept only at pixels holding Lpar1 transcripts.
Lpar1_strength, Lpar1_integrity = marker_transcripts_vsi(signal_coordinate_df, signal_strength, signal_integrity, ['Lpar1'])

#### vsi distribution

In [None]:
# One row of four integrity histograms (strength > 3, linear x-axis limited to 0-16),
# one panel per marker gene; only the last panel carries the y-axis label.
# NOTE(review): `_BIH_CMAP` is not defined or imported in the cells visible above — confirm it is in scope.
fig, ax = plt.subplots(1, 4, figsize=(16, 9), dpi=600)
plot_hist(ax[0], Lpar1_integrity, Lpar1_strength, signal_thr=3, cmap=_BIH_CMAP, label="Lpar1", xlim=(0,16), log=False)
plot_hist(ax[1], Plin3_integrity, Plin3_strength, signal_thr=3, cmap=_BIH_CMAP, label="Plin3", xlim=(0,16), log=False)
plot_hist(ax[2], Gjc3_integrity, Gjc3_strength, signal_thr=3, cmap=_BIH_CMAP, label="Gjc3", xlim=(0,16), log=False)
plot_hist(ax[3], Mlc1_integrity, Mlc1_strength, signal_thr=3, cmap=_BIH_CMAP, label="Mlc1", xlim=(0,16), log=False, ylabel=True)

plt.tight_layout()
plt.show()

In [None]:
# One row of four integrity histograms (strength > 3, linear x-axis limited to 0-16),
# one panel per marker gene; only the last panel carries the y-axis label.
# NOTE(review): `_BIH_CMAP` is not defined or imported in the cells visible above — confirm it is in scope.
fig, ax = plt.subplots(1, 4, figsize=(16, 9), dpi=600)

plot_hist(ax[0], Syt4_integrity, Syt4_strength, signal_thr=3, cmap=_BIH_CMAP, label="Syt4", xlim=(0,16), log=False)
plot_hist(ax[1], Gad1_integrity, Gad1_strength, signal_thr=3, cmap=_BIH_CMAP, label="Gad1", xlim=(0,16), log=False)
plot_hist(ax[2], Cbln2_integrity, Cbln2_strength, signal_thr=3, cmap=_BIH_CMAP, label="Cbln2", xlim=(0,16), log=False)
plot_hist(ax[3], Dgkk_integrity, Dgkk_strength, signal_thr=3, cmap=_BIH_CMAP, label="Dgkk", xlim=(0,16), log=False, ylabel=True)

plt.tight_layout()
plt.show()

### VSI under Exclusive Conditions

In [None]:
# extract_cell_integrity_within_boundary: extracts cell integrity and signal strength based on given boundaries.

from shapely.geometry import Polygon, Point

def extract_cell_integrity_within_boundary(
    boundary_df, 
    sig_integrity,
    signal_strength,
    integrity_size=1800
):
    """
    Copy signal integrity and strength values that lie inside cell outlines.

    Parameters:
        boundary_df: DataFrame with one cell per row; the 'boundaryX' and
            'boundaryY' entries are sequences of outline vertex coordinates.
        sig_integrity: 2D signal integrity map indexed as [y, x].
        signal_strength: 2D signal strength map indexed as [y, x].
        integrity_size: side length of the square output grids.

    Returns:
        (cell_integrity, cell_strength): arrays of shape
        (integrity_size, integrity_size) that hold the input values at pixels
        inside (or within 0.7 of the outline of) a cell and 0 elsewhere.
    """
    cell_integrity = np.zeros((integrity_size, integrity_size))
    cell_strength = np.zeros((integrity_size, integrity_size))

    # Pixels closer than this to the outline are counted as part of the cell.
    epsilon = 0.7

    for _, row in boundary_df.iterrows():
        polygon = Polygon(zip(row['boundaryX'], row['boundaryY']))
        if polygon.is_empty:
            # No outline, hence no bounding box to scan.
            continue

        # Integer bounding box of the outline, clipped to the output grid.
        min_x, min_y, max_x, max_y = polygon.bounds
        x_min = max(0, int(np.floor(min_x)))
        x_max = min(integrity_size, int(np.ceil(max_x)))
        y_min = max(0, int(np.floor(min_y)))
        y_max = min(integrity_size, int(np.ceil(max_y)))

        # A cell entirely outside the grid (or with a degenerate box) covers
        # no pixel; without this guard the empty mask below would be a float
        # array and fail when used as an index.
        if x_min >= x_max or y_min >= y_max:
            continue

        # All pixel coordinates of the bounding box, as (x, y) pairs in row-major order.
        y_indices, x_indices = np.meshgrid(range(y_min, y_max), range(x_min, x_max), indexing='ij')
        points = np.column_stack([x_indices.ravel(), y_indices.ravel()])

        # Pixel belongs to the cell if it is inside, on, or within epsilon of the outline.
        mask = np.array(
            [
                polygon.contains(Point(x, y))
                or polygon.touches(Point(x, y))
                or polygon.boundary.distance(Point(x, y)) < epsilon
                for x, y in points
            ],
            dtype=bool,
        ).reshape(y_indices.shape)

        # Copy the selected pixels of both maps into the output grids.
        subgrid_int = sig_integrity[y_min:y_max, x_min:x_max]
        subgrid_str = signal_strength[y_min:y_max, x_min:x_max]
        cell_integrity[y_min:y_max, x_min:x_max][mask] = subgrid_int[mask]
        cell_strength[y_min:y_max, x_min:x_max][mask] = subgrid_str[mask]

    return cell_integrity, cell_strength


In [None]:
def plot_hist(ax, si, ss, signal_thr, cmap, label, xlim, log, ylabel=False):
    """
    Draw a horizontal, colormap-shaded density histogram of signal integrity.

    This definition replaces the `plot_hist` defined earlier in the notebook.
    Entries of ``si`` are counted only where ``ss > signal_thr``. The density
    is computed on 50 bins over [0, 1]; the first bin is not drawn.

    Parameters:
        ax: matplotlib axes object to draw on.
        si: array of signal integrity values.
        ss: array of signal strength values, same shape as ``si``.
        signal_thr: strength value an entry must exceed to be counted.
        cmap: colormap called with the bar positions to obtain bar colours.
        label: title of the axes.
        xlim: limits of the x-axis; only applied when ``log`` is false.
        log: if true, use a base-2 logarithmic x-axis instead of ``xlim``.
        ylabel: if true, label the y-axis "Signal Integrity" on the right.
    """
    density, edges = np.histogram(si[ss > signal_thr], bins=50, range=(0, 1), density=True)

    # One bar per bin except the first, placed at the interior bin edges.
    positions = edges[1:-1]
    bar_colors = cmap(positions)
    drawn = ax.barh(positions, density[1:], height=0.01)
    for bar, color in zip(drawn, bar_colors):
        bar.set_color(color)

    if not log:
        ax.set_xlim(xlim)
    else:
        ax.set_xscale('log', base=2)

    if ylabel:
        ax.set_ylabel("Signal Integrity", fontsize=13)
        ax.yaxis.set_label_position("right")

    # Mirror the plot: density grows to the left, ticks sit on the right.
    ax.set_ylim(0, 1)
    ax.invert_xaxis()
    ax.yaxis.tick_right()
    ax.spines[["top", "left"]].set_visible(False)
    ax.set_title(label, fontsize=13)
    

def histogram_comparison(si1, ss1, si2, ss2, si3, ss3, si4, ss4, signal_threshold, cmap, xlim,log):
    """
    Show four integrity histograms side by side, one per exclusion condition.

    Each (si, ss) pair is an integrity / strength map drawn with `plot_hist`;
    all panels share ``signal_threshold``, ``cmap``, ``xlim`` and ``log``.
    Only the last panel gets the y-axis label.
    """
    panels = (
        (si1, ss1, "All Signals"),
        (si2, ss2, "Excluding MOD1 Marker"),
        (si3, ss3, "Excluding MOD2 Marker"),
        (si4, ss4, "Excluding MOD Marker"),
    )
    fig, axes = plt.subplots(1, 4, figsize=(16, 9), dpi=600)  # one row of 4 subplots
    last = len(panels) - 1
    for position, (integrity, strength, panel_title) in enumerate(panels):
        plot_hist(
            axes[position],
            integrity,
            strength,
            signal_threshold,
            cmap,
            label=panel_title,
            xlim=xlim,
            log=log,
            ylabel=(position == last),
        )
    plt.tight_layout()  # avoid overlapping panels
    plt.show()

In [None]:
# Compare the whole-section maps of the four conditions (strength > 3, linear x-axis 0-8).
# NOTE(review): `_BIH_CMAP` is not defined or imported in the cells visible above — confirm it is in scope.
histogram_comparison(signal_integrity, signal_strength, 
                     nonMOD1_signal_integrity, nonMOD1_signal_strength, 
                     nonMOD2_signal_integrity, nonMOD2_signal_strength, 
                     nonMOD_signal_integrity, nonMOD_signal_strength, 
                     signal_threshold=3, cmap=_BIH_CMAP, xlim=(0,8), log=False)

In [None]:
# Restrict each condition's maps to pixels inside the outlines of all cells in `boundaries_df`.
cell_si, cell_ss = extract_cell_integrity_within_boundary(boundary_df=boundaries_df, sig_integrity=signal_integrity, signal_strength=signal_strength, integrity_size=1800)
cell_si_nonMOD1, cell_ss_nonMOD1 = extract_cell_integrity_within_boundary(boundary_df=boundaries_df, sig_integrity=nonMOD1_signal_integrity, signal_strength=nonMOD1_signal_strength, integrity_size=1800)
cell_si_nonMOD2, cell_ss_nonMOD2 = extract_cell_integrity_within_boundary(boundary_df=boundaries_df, sig_integrity=nonMOD2_signal_integrity, signal_strength=nonMOD2_signal_strength, integrity_size=1800)
cell_si_nonMOD, cell_ss_nonMOD = extract_cell_integrity_within_boundary(boundary_df=boundaries_df, sig_integrity=nonMOD_signal_integrity, signal_strength=nonMOD_signal_strength, integrity_size=1800)

In [None]:
# Same four-condition comparison, restricted to pixels inside cell outlines.
# NOTE(review): `_BIH_CMAP` is not defined or imported in the cells visible above — confirm it is in scope.
histogram_comparison(cell_si, cell_ss, 
                     cell_si_nonMOD1, cell_ss_nonMOD1, 
                     cell_si_nonMOD2, cell_ss_nonMOD2,
                     cell_si_nonMOD, cell_ss_nonMOD, 
                     signal_threshold=3, cmap=_BIH_CMAP, xlim=(0,8), log=False)

### normalized hist: MOD2/MOD1

#### functions

In [None]:
# plot_integrity_comparison: compare the signal-integrity histograms of two datasets.

import matplotlib.ticker as ticker
from matplotlib import scale

def plot_histogram_comp(ax, cell_integrity, cell_strength, signal_threshold, cmap, label, ylim=(1e-1,64)):
    """
    Draw a colormap-shaded density histogram of signal integrity on a log2 y-axis.

    Only entries whose strength exceeds ``signal_threshold`` are counted.
    The histogram uses 50 bins over [0, 1].

    Parameters:
        ax: matplotlib axes object to draw on.
        cell_integrity: array of signal integrity values.
        cell_strength: array of signal strength values, same shape as
            cell_integrity (used as a boolean mask on it).
        signal_threshold: strength value an entry must exceed to be counted.
        cmap: callable mapping a value in [0, 1) to a colour.
        label: label for the x-axis.
        ylim: value passed to ``ax.set_ylim`` after the log scale is set.

    Returns:
        (vals, bins): bin densities and the 51 bin edges.
    """
    selected = cell_integrity[cell_strength > signal_threshold]
    binning = dict(bins=50, range=(0, 1), density=True)

    # Densities returned to the caller.
    vals, _ = np.histogram(selected, **binning)

    # The drawn histogram; its bin edges are the ones returned.
    _, bins, patches = ax.hist(selected, edgecolor='black', alpha=0.8, **binning)

    # Colour the bars from left to right along the colormap.
    n_patches = len(patches)
    for position, patch in enumerate(patches):
        patch.set_facecolor(cmap(position / n_patches))

    # Axes cosmetics.
    ax.set_xlim(0, 1)
    ax.set_ylabel("Density")
    ax.set_yscale('log', base=2)
    ax.set_ylim(ylim)
    ax.set_xlabel(label)
    ax.spines[["top", "right"]].set_visible(False)
    ax.yaxis.set_tick_params(labelright=False)
    ax.xaxis.set_tick_params(labelsize=8)

    return vals, bins

def plot_integrity_comparison(
    cell_integrity_1,
    cell_strength_1,
    cell_integrity_2,
    cell_strength_2,
    title,
    signal_threshold=2.0,
    figure_height=8,
    cmap="BIH",
    ylim=None
):
    """
    Plot the signal-integrity histograms of two datasets one above the other.

    Parameters:
        cell_integrity_1, cell_strength_1: 2D numpy arrays for dataset 1
            (drawn in the upper panel, labelled "MOD1 Signal Integrity").
        cell_integrity_2, cell_strength_2: 2D numpy arrays for dataset 2
            (drawn in the lower panel, labelled "MOD2 Signal Integrity").
        title: title placed above the upper panel.
        signal_threshold: strength an entry must exceed to be counted.
        figure_height: used for both the width and the height of the figure.
        cmap: colormap for the histogram gradient; the string "BIH" selects
            the global `_BIH_CMAP`.
        ylim: forwarded unchanged to `plot_histogram_comp` for both panels
            (including the default None).

    Returns:
        (vals1, bins1, vals2, bins2): densities and bin edges of both histograms.

    Raises:
        ValueError: if an input is not a 2D numpy array, or if cmap is "BIH"
            and `_BIH_CMAP` is not defined.
    """
    # Validate inputs
    for data, name in [
        (cell_integrity_1, "cell_integrity_1"),
        (cell_strength_1, "cell_strength_1"),
        (cell_integrity_2, "cell_integrity_2"),
        (cell_strength_2, "cell_strength_2"),
    ]:
        if not (isinstance(data, np.ndarray) and data.ndim == 2):
            raise ValueError(f"{name} must be a 2D numpy array.")

    with plt.style.context("default"):
        # Resolve the colormap
        if cmap == "BIH":
            try:
                cmap = _BIH_CMAP
            except NameError as err:
                raise ValueError("BIH colormap is not defined.") from err

        # Create figure and subplots
        fig, ax = plt.subplots(2, 1, figsize=(figure_height, figure_height), dpi=600)

        # Plot histograms
        vals1, bins1 = plot_histogram_comp(
            ax[0], cell_integrity_1, cell_strength_1, signal_threshold, cmap, label="MOD1 Signal Integrity", ylim=ylim
        )
        vals2, bins2 = plot_histogram_comp(
            ax[1], cell_integrity_2, cell_strength_2, signal_threshold, cmap, label="MOD2 Signal Integrity", ylim=ylim
        )

    # Set the title before tight_layout so the layout reserves room for it.
    ax[0].set_title(title)
    plt.tight_layout()  # Adjust spacing to prevent overlap
    plt.show()

    return vals1, bins1, vals2, bins2

In [None]:
def normalized_hist(vals1, bins1, vals2, bins2, title):
    """
    Plot the per-bin ratio ``vals2 / vals1`` as a bar chart with a log y-axis.

    Parameters:
        vals1, bins1: Histogram values and bin edges of the reference dataset
            (the denominator). ``bins1`` defines the bars' positions/widths.
        vals2, bins2: Histogram values and bin edges of the compared dataset
            (the numerator). ``bins2`` is only checked against ``bins1``.
        title: Title of the figure.

    Notes:
        A small epsilon is added to ``vals1`` to avoid division by zero, so a
        bin that is empty in dataset 1 but not in dataset 2 yields a very
        large ratio. A warning is emitted when the two sets of bin edges
        differ, because the ratio then compares different intervals.
    """
    import warnings  # local import so the cell does not depend on other cells

    bins = np.asarray(bins1)
    other_bins = np.asarray(bins2)
    # bins2 is not used for drawing; make a silent mismatch visible.
    if bins.shape != other_bins.shape or not np.allclose(bins, other_bins):
        warnings.warn(
            "normalized_hist: bins1 and bins2 differ; the ratio is drawn on "
            "bins1 and compares values from different intervals.",
            stacklevel=2,
        )

    epsilon = 1e-10  # guards against division by zero in empty bins
    vals = np.asarray(vals2) / (np.asarray(vals1) + epsilon)

    cmap = _BIH_CMAP

    bin_centers = (bins[:-1] + bins[1:]) / 2

    fig, ax = plt.subplots(figsize=(8, 8), dpi=600)

    # Create the histogram bars
    bars = ax.bar(bin_centers, vals, width=np.diff(bins), edgecolor="black", alpha=0.7, linewidth=0.3)

    # Colour each bar by its position along the x-axis
    n_bars = len(bars)
    for i, bar in enumerate(bars):
        bar.set_facecolor(cmap(i / n_bars))

    ax.set_ylabel("VSI Density of MOD2/MOD1")
    ax.set_xlabel("Signal Integrity")
    ax.set_title(title)
    ax.spines[["top", "right"]].set_visible(False)
    ax.yaxis.set_tick_params(labelright=False)
    ax.xaxis.set_tick_params(labelsize=8)

    # Set the y-axis scale to log
    ax.set_yscale("log")

    # Reference line at ratio 1 (10^0): equal density in both datasets
    ax.axhline(y=1, color="black", linestyle="--", linewidth=0.5)

    plt.show()


In [None]:
# Integrity/strength restricted to MOD_boundaries, taken from the full maps
# and from each of the nonMOD1 / nonMOD2 / nonMOD maps.
MOD_si, MOD_ss = extract_cell_integrity_within_boundary(
    boundary_df=MOD_boundaries,
    sig_integrity=signal_integrity,
    signal_strength=signal_strength,
    integrity_size=1800,
)
MOD_si_nonMOD1, MOD_ss_nonMOD1 = extract_cell_integrity_within_boundary(
    boundary_df=MOD_boundaries,
    sig_integrity=nonMOD1_signal_integrity,
    signal_strength=nonMOD1_signal_strength,
    integrity_size=1800,
)
MOD_si_nonMOD2, MOD_ss_nonMOD2 = extract_cell_integrity_within_boundary(
    boundary_df=MOD_boundaries,
    sig_integrity=nonMOD2_signal_integrity,
    signal_strength=nonMOD2_signal_strength,
    integrity_size=1800,
)
MOD_si_nonMOD, MOD_ss_nonMOD = extract_cell_integrity_within_boundary(
    boundary_df=MOD_boundaries,
    sig_integrity=nonMOD_signal_integrity,
    signal_strength=nonMOD_signal_strength,
    integrity_size=1800,
)

In [None]:
# Integrity/strength restricted to MOD1_boundaries, taken from the full maps
# and from each of the nonMOD1 / nonMOD2 / nonMOD maps.
MOD1_si, MOD1_ss = extract_cell_integrity_within_boundary(
    boundary_df=MOD1_boundaries,
    sig_integrity=signal_integrity,
    signal_strength=signal_strength,
    integrity_size=1800,
)
MOD1_si_nonMOD1, MOD1_ss_nonMOD1 = extract_cell_integrity_within_boundary(
    boundary_df=MOD1_boundaries,
    sig_integrity=nonMOD1_signal_integrity,
    signal_strength=nonMOD1_signal_strength,
    integrity_size=1800,
)
MOD1_si_nonMOD2, MOD1_ss_nonMOD2 = extract_cell_integrity_within_boundary(
    boundary_df=MOD1_boundaries,
    sig_integrity=nonMOD2_signal_integrity,
    signal_strength=nonMOD2_signal_strength,
    integrity_size=1800,
)
MOD1_si_nonMOD, MOD1_ss_nonMOD = extract_cell_integrity_within_boundary(
    boundary_df=MOD1_boundaries,
    sig_integrity=nonMOD_signal_integrity,
    signal_strength=nonMOD_signal_strength,
    integrity_size=1800,
)

In [None]:
# Integrity/strength restricted to MOD2_boundaries, taken from the full maps
# and from each of the nonMOD1 / nonMOD2 / nonMOD maps.
MOD2_si, MOD2_ss = extract_cell_integrity_within_boundary(
    boundary_df=MOD2_boundaries,
    sig_integrity=signal_integrity,
    signal_strength=signal_strength,
    integrity_size=1800,
)
MOD2_si_nonMOD1, MOD2_ss_nonMOD1 = extract_cell_integrity_within_boundary(
    boundary_df=MOD2_boundaries,
    sig_integrity=nonMOD1_signal_integrity,
    signal_strength=nonMOD1_signal_strength,
    integrity_size=1800,
)
MOD2_si_nonMOD2, MOD2_ss_nonMOD2 = extract_cell_integrity_within_boundary(
    boundary_df=MOD2_boundaries,
    sig_integrity=nonMOD2_signal_integrity,
    signal_strength=nonMOD2_signal_strength,
    integrity_size=1800,
)
MOD2_si_nonMOD, MOD2_ss_nonMOD = extract_cell_integrity_within_boundary(
    boundary_df=MOD2_boundaries,
    sig_integrity=nonMOD_signal_integrity,
    signal_strength=nonMOD_signal_strength,
    integrity_size=1800,
)

#### MOD1 and MOD2

In [None]:
# MOD1 region: full maps followed by the nonMOD1, nonMOD2 and nonMOD variants.
histogram_comparison(
    MOD1_si, MOD1_ss,
    MOD1_si_nonMOD1, MOD1_ss_nonMOD1,
    MOD1_si_nonMOD2, MOD1_ss_nonMOD2,
    MOD1_si_nonMOD, MOD1_ss_nonMOD,
    signal_threshold=3,
    cmap=_BIH_CMAP,
    xlim=(1e-1, 64),
    log=True,
)

In [None]:
# MOD2 region: full maps followed by the nonMOD1, nonMOD2 and nonMOD variants.
histogram_comparison(
    MOD2_si, MOD2_ss,
    MOD2_si_nonMOD1, MOD2_ss_nonMOD1,
    MOD2_si_nonMOD2, MOD2_ss_nonMOD2,
    MOD2_si_nonMOD, MOD2_ss_nonMOD,
    signal_threshold=3,
    cmap=_BIH_CMAP,
    xlim=(1e-1, 64),
    log=True,
)

#### exclusive distribution: MOD1 and MOD2

In [None]:
# Upper panel: MOD1 region; lower panel: MOD2 region (full signal maps).
vals1, bins1, vals2, bins2 = plot_integrity_comparison(
    MOD1_si, MOD1_ss,
    MOD2_si, MOD2_ss,
    title="All Signals",
    signal_threshold=3.0,
    ylim=(1e-1, 64),
)

In [None]:
# Per-bin ratio of the two histograms returned by the preceding comparison.
normalized_hist(vals1, bins1, vals2, bins2, title="All Signals")

In [None]:
# Upper panel: MOD1 region; lower panel: MOD2 region (nonMOD1 maps).
vals1, bins1, vals2, bins2 = plot_integrity_comparison(
    MOD1_si_nonMOD1, MOD1_ss_nonMOD1,
    MOD2_si_nonMOD1, MOD2_ss_nonMOD1,
    title="Excluding MOD1 Markers",
    signal_threshold=3.0,
    ylim=(1e-1, 64),
)

In [None]:
# Per-bin ratio of the two histograms returned by the preceding comparison.
normalized_hist(vals1, bins1, vals2, bins2, title="Excluding MOD1 Markers")

In [None]:
# Upper panel: MOD1 region; lower panel: MOD2 region (nonMOD2 maps).
vals1, bins1, vals2, bins2 = plot_integrity_comparison(
    MOD1_si_nonMOD2, MOD1_ss_nonMOD2,
    MOD2_si_nonMOD2, MOD2_ss_nonMOD2,
    title="Excluding MOD2 Markers",
    signal_threshold=3.0,
    ylim=(1e-1, 64),
)

In [None]:
# Per-bin ratio of the two histograms returned by the preceding comparison.
normalized_hist(vals1, bins1, vals2, bins2, title="Excluding MOD2 Markers")

In [None]:
# Upper panel: MOD1 region; lower panel: MOD2 region (nonMOD maps).
vals1, bins1, vals2, bins2 = plot_integrity_comparison(
    MOD1_si_nonMOD, MOD1_ss_nonMOD,
    MOD2_si_nonMOD, MOD2_ss_nonMOD,
    title="Excluding MOD Markers",
    signal_threshold=3.0,
    ylim=(1e-1, 64),
)

In [None]:
# Per-bin ratio of the two histograms returned by the preceding comparison.
normalized_hist(vals1, bins1, vals2, bins2, title="Excluding MOD Markers")

### Marker VSI in cell

#### extract vsi

In [None]:
# One marker_transcripts_vsi call per marker gene on the cell-level maps;
# each call yields a (strength, integrity) pair.
Gad1_cell_ss, Gad1_cell_si = marker_transcripts_vsi(
    signal_coordinate_df, cell_ss, cell_si, ["Gad1"]
)
Plin3_cell_ss, Plin3_cell_si = marker_transcripts_vsi(
    signal_coordinate_df, cell_ss, cell_si, ["Plin3"]
)
Gjc3_cell_ss, Gjc3_cell_si = marker_transcripts_vsi(
    signal_coordinate_df, cell_ss, cell_si, ["Gjc3"]
)
Dgkk_cell_ss, Dgkk_cell_si = marker_transcripts_vsi(
    signal_coordinate_df, cell_ss, cell_si, ["Dgkk"]
)
Cbln2_cell_ss, Cbln2_cell_si = marker_transcripts_vsi(
    signal_coordinate_df, cell_ss, cell_si, ["Cbln2"]
)
Syt4_cell_ss, Syt4_cell_si = marker_transcripts_vsi(
    signal_coordinate_df, cell_ss, cell_si, ["Syt4"]
)
Mlc1_cell_ss, Mlc1_cell_si = marker_transcripts_vsi(
    signal_coordinate_df, cell_ss, cell_si, ["Mlc1"]
)
Lpar1_cell_ss, Lpar1_cell_si = marker_transcripts_vsi(
    signal_coordinate_df, cell_ss, cell_si, ["Lpar1"]
)

In [None]:
# One marker_transcripts_vsi call per marker gene on the MOD-region maps;
# each call yields a (strength, integrity) pair.
Gad1_MOD_ss, Gad1_MOD_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD_ss, MOD_si, ["Gad1"]
)
Plin3_MOD_ss, Plin3_MOD_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD_ss, MOD_si, ["Plin3"]
)
Gjc3_MOD_ss, Gjc3_MOD_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD_ss, MOD_si, ["Gjc3"]
)
Dgkk_MOD_ss, Dgkk_MOD_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD_ss, MOD_si, ["Dgkk"]
)
Cbln2_MOD_ss, Cbln2_MOD_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD_ss, MOD_si, ["Cbln2"]
)
Syt4_MOD_ss, Syt4_MOD_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD_ss, MOD_si, ["Syt4"]
)
Mlc1_MOD_ss, Mlc1_MOD_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD_ss, MOD_si, ["Mlc1"]
)
Lpar1_MOD_ss, Lpar1_MOD_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD_ss, MOD_si, ["Lpar1"]
)

In [None]:
# One marker_transcripts_vsi call per marker gene on the MOD1-region maps;
# each call yields a (strength, integrity) pair.
Gad1_MOD1_ss, Gad1_MOD1_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD1_ss, MOD1_si, ["Gad1"]
)
Plin3_MOD1_ss, Plin3_MOD1_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD1_ss, MOD1_si, ["Plin3"]
)
Gjc3_MOD1_ss, Gjc3_MOD1_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD1_ss, MOD1_si, ["Gjc3"]
)
Dgkk_MOD1_ss, Dgkk_MOD1_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD1_ss, MOD1_si, ["Dgkk"]
)
Cbln2_MOD1_ss, Cbln2_MOD1_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD1_ss, MOD1_si, ["Cbln2"]
)
Syt4_MOD1_ss, Syt4_MOD1_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD1_ss, MOD1_si, ["Syt4"]
)
Mlc1_MOD1_ss, Mlc1_MOD1_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD1_ss, MOD1_si, ["Mlc1"]
)
Lpar1_MOD1_ss, Lpar1_MOD1_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD1_ss, MOD1_si, ["Lpar1"]
)

In [None]:
# One marker_transcripts_vsi call per marker gene on the MOD2-region maps;
# each call yields a (strength, integrity) pair.
Gad1_MOD2_ss, Gad1_MOD2_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD2_ss, MOD2_si, ["Gad1"]
)
Plin3_MOD2_ss, Plin3_MOD2_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD2_ss, MOD2_si, ["Plin3"]
)
Gjc3_MOD2_ss, Gjc3_MOD2_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD2_ss, MOD2_si, ["Gjc3"]
)
Dgkk_MOD2_ss, Dgkk_MOD2_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD2_ss, MOD2_si, ["Dgkk"]
)
Cbln2_MOD2_ss, Cbln2_MOD2_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD2_ss, MOD2_si, ["Cbln2"]
)
Syt4_MOD2_ss, Syt4_MOD2_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD2_ss, MOD2_si, ["Syt4"]
)
Mlc1_MOD2_ss, Mlc1_MOD2_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD2_ss, MOD2_si, ["Mlc1"]
)
Lpar1_MOD2_ss, Lpar1_MOD2_si = marker_transcripts_vsi(
    signal_coordinate_df, MOD2_ss, MOD2_si, ["Lpar1"]
)

#### Cells distribution

In [None]:
# Cell-level histograms, one panel per marker: Lpar1, Plin3, Gjc3, Mlc1.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(16, 9), dpi=600)
plot_hist(
    ax[0], Lpar1_cell_si, Lpar1_cell_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Lpar1", xlim=(0, 19), log=False,
)
plot_hist(
    ax[1], Plin3_cell_si, Plin3_cell_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Plin3", xlim=(0, 19), log=False,
)
plot_hist(
    ax[2], Gjc3_cell_si, Gjc3_cell_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Gjc3", xlim=(0, 19), log=False,
)
# Only the last panel is called with ylabel=True.
plot_hist(
    ax[3], Mlc1_cell_si, Mlc1_cell_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Mlc1", xlim=(0, 19), log=False, ylabel=True,
)

plt.tight_layout()
plt.show()

In [None]:
# Cell-level histograms, one panel per marker: Syt4, Gad1, Cbln2, Dgkk.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(16, 9), dpi=600)

plot_hist(
    ax[0], Syt4_cell_si, Syt4_cell_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Syt4", xlim=(0, 19), log=False,
)
plot_hist(
    ax[1], Gad1_cell_si, Gad1_cell_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Gad1", xlim=(0, 19), log=False,
)
plot_hist(
    ax[2], Cbln2_cell_si, Cbln2_cell_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Cbln2", xlim=(0, 19), log=False,
)
# Only the last panel is called with ylabel=True.
plot_hist(
    ax[3], Dgkk_cell_si, Dgkk_cell_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Dgkk", xlim=(0, 19), log=False, ylabel=True,
)

plt.tight_layout()
plt.show()

#### MOD distribution

In [None]:
# MOD-region histograms, one panel per marker: Lpar1, Plin3, Gjc3, Mlc1.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(16, 9), dpi=600)
plot_hist(
    ax[0], Lpar1_MOD_si, Lpar1_MOD_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Lpar1", xlim=(1e-1, 64), log=True,
)
plot_hist(
    ax[1], Plin3_MOD_si, Plin3_MOD_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Plin3", xlim=(1e-1, 64), log=True,
)
plot_hist(
    ax[2], Gjc3_MOD_si, Gjc3_MOD_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Gjc3", xlim=(1e-1, 64), log=True,
)
# Only the last panel is called with ylabel=True.
plot_hist(
    ax[3], Mlc1_MOD_si, Mlc1_MOD_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Mlc1", xlim=(1e-1, 64), log=True, ylabel=True,
)

plt.tight_layout()
plt.show()

In [None]:
# MOD-region histograms, one panel per marker: Syt4, Gad1, Cbln2, Dgkk.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(16, 9), dpi=600)

plot_hist(
    ax[0], Syt4_MOD_si, Syt4_MOD_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Syt4", xlim=(1e-1, 64), log=True,
)
plot_hist(
    ax[1], Gad1_MOD_si, Gad1_MOD_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Gad1", xlim=(1e-1, 64), log=True,
)
plot_hist(
    ax[2], Cbln2_MOD_si, Cbln2_MOD_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Cbln2", xlim=(1e-1, 64), log=True,
)
# Only the last panel is called with ylabel=True.
plot_hist(
    ax[3], Dgkk_MOD_si, Dgkk_MOD_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Dgkk", xlim=(1e-1, 64), log=True, ylabel=True,
)

plt.tight_layout()
plt.show()

#### MOD1 distribution

In [None]:
# MOD1-region histograms, one panel per marker: Lpar1, Plin3, Gjc3, Mlc1.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(16, 9), dpi=600)
plot_hist(
    ax[0], Lpar1_MOD1_si, Lpar1_MOD1_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Lpar1", xlim=(1e-1, 64), log=True,
)
plot_hist(
    ax[1], Plin3_MOD1_si, Plin3_MOD1_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Plin3", xlim=(1e-1, 64), log=True,
)
plot_hist(
    ax[2], Gjc3_MOD1_si, Gjc3_MOD1_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Gjc3", xlim=(1e-1, 64), log=True,
)
# Only the last panel is called with ylabel=True.
plot_hist(
    ax[3], Mlc1_MOD1_si, Mlc1_MOD1_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Mlc1", xlim=(1e-1, 64), log=True, ylabel=True,
)

plt.tight_layout()
plt.show()

In [None]:
# MOD1-region histograms, one panel per marker: Syt4, Gad1, Cbln2, Dgkk.
# NOTE(review): this cell previously passed log=False together with
# xlim=(1e-1, 64), unlike the other MOD1 cell and the MOD cells, which use
# that x-range with log=True. It is set to log=True here so all MOD1 panels
# share one scale; revert if the linear scale was intentional.
fig, ax = plt.subplots(1, 4, figsize=(16, 9), dpi=600)

plot_hist(
    ax[0], Syt4_MOD1_si, Syt4_MOD1_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Syt4", xlim=(1e-1, 64), log=True,
)
plot_hist(
    ax[1], Gad1_MOD1_si, Gad1_MOD1_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Gad1", xlim=(1e-1, 64), log=True,
)
plot_hist(
    ax[2], Cbln2_MOD1_si, Cbln2_MOD1_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Cbln2", xlim=(1e-1, 64), log=True,
)
# Only the last panel is called with ylabel=True.
plot_hist(
    ax[3], Dgkk_MOD1_si, Dgkk_MOD1_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Dgkk", xlim=(1e-1, 64), log=True, ylabel=True,
)

plt.tight_layout()
plt.show()

#### MOD2 distribution

In [None]:
# MOD2-region histograms, one panel per marker: Lpar1, Plin3, Gjc3, Mlc1.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(16, 9), dpi=600)
plot_hist(
    ax[0], Lpar1_MOD2_si, Lpar1_MOD2_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Lpar1", xlim=(1e-1, 32), log=True,
)
plot_hist(
    ax[1], Plin3_MOD2_si, Plin3_MOD2_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Plin3", xlim=(1e-1, 32), log=True,
)
plot_hist(
    ax[2], Gjc3_MOD2_si, Gjc3_MOD2_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Gjc3", xlim=(1e-1, 32), log=True,
)
# Only the last panel is called with ylabel=True.
plot_hist(
    ax[3], Mlc1_MOD2_si, Mlc1_MOD2_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Mlc1", xlim=(1e-1, 32), log=True, ylabel=True,
)

plt.tight_layout()
plt.show()

In [None]:
# MOD2-region histograms, one panel per marker: Syt4, Gad1, Cbln2, Dgkk.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(16, 9), dpi=600)

plot_hist(
    ax[0], Syt4_MOD2_si, Syt4_MOD2_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Syt4", xlim=(1e-1, 32), log=True,
)
plot_hist(
    ax[1], Gad1_MOD2_si, Gad1_MOD2_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Gad1", xlim=(1e-1, 32), log=True,
)
plot_hist(
    ax[2], Cbln2_MOD2_si, Cbln2_MOD2_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Cbln2", xlim=(1e-1, 32), log=True,
)
# Only the last panel is called with ylabel=True.
plot_hist(
    ax[3], Dgkk_MOD2_si, Dgkk_MOD2_ss,
    signal_thr=3, cmap=_BIH_CMAP, label="Dgkk", xlim=(1e-1, 32), log=True, ylabel=True,
)

plt.tight_layout()
plt.show()