In [None]:
import anndata as ad
import matplotlib.pyplot as plt
import mudata as md
import muon
import scanpy as sc
import scvi
import seaborn as sns
import torch
import pandas as pd
import numpy as np
import json
from matplotlib.colors import to_hex

In [None]:
# Set the scvi-tools global seed and record the library version used for this run.
scvi.settings.seed = 0
print("Last run with scvi-tools version:", scvi.__version__)

In [None]:
# Global plotting defaults: scanpy figure parameters first, then the seaborn theme.
sc.set_figure_params(figsize=(6, 6), frameon=False)
sns.set_theme()
# Select PyTorch's "high" float32 matmul precision mode.
torch.set_float32_matmul_precision("high")
# NOTE(review): save_dir is not referenced anywhere else in the visible notebook.
save_dir = './'

%config InlineBackend.print_figure_kwargs={"facecolor": "w"}
%config InlineBackend.figure_format="retina"

## Read raw adata

In [None]:
# Load the starting AnnData object.
# NOTE(review): hard-coded absolute path; the file name suggests .X is already
# normalised/log-transformed/scaled — confirm before re-normalising below.
adata = ad.read_h5ad('/media/Lynn/data/Integrated_data/adata/run2_3_codex_raw_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad')

In [None]:
# Display the AnnData summary (dimensions and available obs/var/layers/obsm keys).
adata

## Remove trash cells from the 1st run (clusters 18 and 22)

In [None]:
# Text file with the IDs of the cells to discard, one cell ID per line.
input_path = '/media/Lynn/data/totalVI/1st_run/cells_in_clusters_18_22.txt'

with open(input_path, 'r') as f:
    # Skip blank lines (e.g. a trailing newline at the end of the file) so they
    # are neither counted nor treated as an (empty) cell ID.
    cells_in_clusters = [line.strip() for line in f if line.strip()]

print(f"Loaded {len(cells_in_clusters)} cell IDs")

In [None]:
# Flag the cells listed in `cells_in_clusters`, then keep every other cell as
# an independent copy (not a view) of the original object.
flagged = adata.obs_names.isin(cells_in_clusters)
adata = adata[~flagged].copy()

In [None]:
# Number of cells remaining after the filter above.
adata.n_obs

## Normalize Xenium counts

In [None]:
# Keep an untouched copy of the 'xenium_counts' layer under 'counts'; this is
# the layer later handed to totalVI via rna_layer="counts".
adata.layers['counts']= adata.layers['xenium_counts'].copy() #raw xenium data

In [None]:
# Normalise each cell to a total of 100 and log1p-transform, in place on adata.X.
# NOTE(review): both calls operate on adata.X, not on the 'counts' layer created
# above. If adata.X in the input file is not raw counts (the input file name
# mentions "norm100_log_scale"), this normalises already-processed values —
# confirm, or set adata.X from the 'counts' layer first.
sc.pp.normalize_total(adata, target_sum=100)
sc.pp.log1p(adata)

## Background-correct CD4 intensities

In [None]:
# Raw per-cell CD4 intensities as stored in .obs
cd4_raw = adata.obs["CD4"].values

# Background level = 10th percentile of the raw intensities
bg = np.quantile(cd4_raw, 0.10)

# Subtract the background and floor the result at zero
cd4_corrected = np.maximum(cd4_raw - bg, 0)

# Keep the raw column untouched; the corrected values go into a new column
adata.obs["CD4_corrected"] = cd4_corrected

## Remove CODEX markers that didn't work / that aren't present in both runs. 

In [None]:
# CODEX channels that produced usable signal (21 markers).
codex_channels_working = [
    'aSMA',
    'CD8',
    'CD31',
    'CD11c',
    'IFNG',
    'Pan-Cytokeratin',
    'CD68',
    'CD20',
    'CD66b',
    'CD45RO',
    'CD11b',
    'Vimentin',
    'CD45',
    'CCR7',
    'CD38',
    'CD4_corrected',
    'Podoplanin',
    'ECP',
    'MPO',
    'MIP-3',
    'CD16',
]

# CODEX channels that are excluded (11 markers). Raw 'CD4' is listed here
# because the background-corrected 'CD4_corrected' column is used instead.
codex_channels_not_working = [
    'DAPI',
    'FoxP3',
    'CD4',
    'TNFa',
    'CD163',
    'PDGFRA',
    'PNAd',
    'IL10',
    'CXCL13',
    'CD14',
    'CD69',
]

# Working channels that are present in both runs (17 markers). The order of
# this list defines the column order of the protein matrix built from it.
codex_channels_working_both_runs = [
    'aSMA',
    'CD8',
    'CD31',
    'CD11c',
    'IFNG',
    'Pan-Cytokeratin',
    'CD68',
    'CD20',
    'CD66b',
    'CD45RO',
    'CD11b',
    'Vimentin',
    'CD4_corrected',
    'CD45',
    'CCR7',
    'CD38',
    'Podoplanin',
]

In [None]:
# Extract protein columns from obs as a DataFrame
protein_df = adata.obs[codex_channels_working_both_runs]

# Store in obsm as a DataFrame (not a numpy array): the "Get starting mdata"
# cell reads it back with `.values` and reuses the DataFrame as protein obs.
adata.obsm['protein_expression'] = protein_df

In [None]:
# Inspect the protein table that was just stored in .obsm.
adata.obsm['protein_expression'] 

In [None]:
# All CODEX columns, working or not, are removed from .obs in one go.
cols_to_remove = [*codex_channels_working, *codex_channels_not_working]

# In-place drop; raises KeyError if any listed column is absent from .obs
# (for example when this cell is run a second time).
adata.obs.drop(columns=cols_to_remove, inplace=True)

In [None]:
# Display the AnnData summary after the CODEX columns were removed from .obs.
adata

In [None]:
# Persist the filtered AnnData (still carrying obsm['protein_expression']) to disk.
adata.write('/media/Lynn/data/totalVI/4th_run_with_cd4/filtered_starting_adata_xenium_norm100_log1p_codex_raw.h5ad')

## Get starting mdata

In [None]:
# Protein intensities as a plain array, rounded to the nearest integer.
# NOTE(review): astype(int) on NaN values yields undefined integers — assumes
# the protein table has no missing values; confirm.
X_codex= adata.obsm["protein_expression"].values
X_codex_rounded = np.round(X_codex).astype(int)

# Create new AnnData for the protein modality. Note that its .obs is the
# (unrounded) protein table itself, i.e. one obs column per marker, and its
# var index is the list of marker names.
protein_adata = ad.AnnData(X=X_codex_rounded, obs=adata.obsm["protein_expression"].copy(), var=pd.DataFrame(index=codex_channels_working_both_runs))

protein_adata.obs_names = adata.obs_names

# The protein table now lives in its own modality, so remove it from the RNA
# object. This makes the cell non-re-runnable without re-creating the obsm entry.
del adata.obsm["protein_expression"]

# Bundle both modalities; the last expression displays the MuData summary.
mdata = md.MuData({"rna": adata, "protein": protein_adata})
mdata

In [None]:
# Suffix every protein var_name with "_CDX".
protein_mod = mdata.mod['protein']
protein_mod.var_names = [name + "_CDX" for name in protein_mod.var_names]

In [None]:
# Persist the starting MuData object.
# NOTE(review): the protein var_names were renamed in the previous cell without
# a following mdata.update(); confirm the global mdata.var is in sync on write.
mdata.write('/media/Lynn/data/totalVI/4th_run_with_cd4/final_filtered_starting_mdata_rna_norm100_log1p_protein_raw.h5mu')

## EULER

### Prepare and run the model

In [None]:
# Register the MuData with totalVI: RNA is read from the 'counts' layer of the
# 'rna' modality, protein from .X of the 'protein' modality (protein_layer=None),
# and the batch label is the 'slide_str' column looked up in the 'rna' modality.
scvi.model.TOTALVI.setup_mudata(
    mdata,
    rna_layer="counts",
    protein_layer=None,
    batch_key="slide_str",
    modalities={
        "rna_layer": "rna",
        "protein_layer": "protein",
        "batch_key": "rna",
    },
)

In [None]:
# Instantiate totalVI with default hyper-parameters on the registered MuData.
model = scvi.model.TOTALVI(mdata)

In [None]:
# Train with early stopping on the validation ELBO, then save the model.
# NOTE(review): the save path points to a "3rd_run" directory on the cluster
# while the data written above goes to "4th_run_with_cd4" — confirm.
model.train(
    train_size=0.9,               # 10% validation set
    max_epochs=400,               # upper bound; early stopping may end training sooner
    accelerator="gpu",            # NOTE(review): original comment said "Apple GPU", but the save path is a cluster — confirm target device
    devices=1,
    early_stopping=True,          # enable early stopping
    early_stopping_patience=20,   # stop after 20 epochs with no improvement
    early_stopping_monitor="elbo_validation",  # monitor validation ELBO
)
model.save("/cluster/scratch/lyarab/model/3rd_run_model_400_epochs", overwrite=True)

### Get latent representation

In [None]:
# Short handles to the two modalities (these reference the objects inside mdata).
rna = mdata.mod['rna']
protein = mdata.mod['protein']

# arbitrarily store latent in rna modality
TOTALVI_LATENT_KEY = "X_totalVI"
rna.obsm[TOTALVI_LATENT_KEY] = model.get_latent_representation()

### Get denoised RNA and protein, and protein foreground probabilities

In [None]:
# Denoised expression for both modalities, averaged over 25 posterior samples,
# stored as layers on the respective modality.
rna_denoised, protein_denoised = model.get_normalized_expression(
    n_samples=25, return_mean=True
)
rna.layers["denoised_rna"] = rna_denoised
protein.layers["denoised_protein"] = protein_denoised

# Foreground probability, multiplied by 100 before being stored.
protein.layers["protein_foreground_prob"] = 100 * model.get_protein_foreground_probability(
    n_samples=25, return_mean=True
)
# "Clean" name = text before the first underscore of each protein var_name.
# Note that this also truncates names that contain an underscore themselves
# (e.g. "CD4_corrected_CDX" becomes "CD4").
parsed_protein_names = [p.split("_")[0] for p in protein.var_names]
protein.var["clean_names"] = parsed_protein_names

# Sync the MuData container with the modified modalities before writing.
mdata.update()
mdata.write("/cluster/scratch/lyarab/data/3rd_run/mdata_denoised.h5mu")

### Cluster

In [None]:
# obs column that receives the Leiden labels computed on the totalVI latent space.
TOTALVI_CLUSTERS_KEY = "leiden_totalVI"

# Neighbour graph on the latent representation, then UMAP and Leiden clustering.
sc.pp.neighbors(rna, use_rep=TOTALVI_LATENT_KEY)
sc.tl.umap(rna)
sc.tl.leiden(rna, key_added=TOTALVI_CLUSTERS_KEY)

# Sync the MuData container with the modified modality before writing.
mdata.update()
mdata.write("/cluster/scratch/lyarab/data/3rd_run/mdata_leiden.h5mu")

### Dendrogram

In [None]:
# Cluster dendrogram on the RNA modality, computed from the totalVI latent space.
sc.tl.dendrogram(rna, groupby=TOTALVI_CLUSTERS_KEY, use_rep=TOTALVI_LATENT_KEY)
# This is a bit of a hack to be able to use scanpy dendrogram with the protein data:
# copy the cluster labels and the latent representation over to the protein
# modality, then compute the same dendrogram there.
protein.obs[TOTALVI_CLUSTERS_KEY] = rna.obs[TOTALVI_CLUSTERS_KEY]
protein.obsm[TOTALVI_LATENT_KEY] = rna.obsm[TOTALVI_LATENT_KEY]
sc.tl.dendrogram(protein, groupby=TOTALVI_CLUSTERS_KEY, use_rep=TOTALVI_LATENT_KEY)

## Read updated mdata

In [None]:
# Reload a previously saved MuData (replaces the in-memory `mdata`).
# NOTE(review): this is the same file that the "Compare all markers vs filtered
# markers clusterings" section loads as `mdata_all_markers`; confirm this cell
# points at the run that is meant to be analysed here.
mdata = muon.read('/media/Lynn/data/totalVI/1st_run/mdata/mdata_leiden_dendogram.h5mu')

In [None]:
# Re-create the modality handles and key constants for the reloaded MuData so
# the analysis below can run without executing the training section.
rna = mdata.mod['rna']
protein = mdata.mod['protein']

TOTALVI_LATENT_KEY = "X_totalVI"
TOTALVI_CLUSTERS_KEY = "leiden_totalVI"

In [None]:
# Export each modality as a stand-alone .h5ad file.
rna.write('/media/Lynn/data/totalVI/1st_run/totalVI_rna_adata.h5ad')
protein.write('/media/Lynn/data/totalVI/1st_run/totalVI_protein_adata.h5ad')

In [None]:
# Spot check: latent vector of the second cell (index 1).
rna.obsm['X_totalVI'][1]

In [None]:
# Spot check: RNA .X row of the second cell (index 1).
rna.X[1]

In [None]:
# Save the latent representation as a .npy array.
# NOTE(review): written to the "3rd_run" directory while the neighbouring cells
# read from and write to "1st_run" — confirm the intended destination.
np.save("/media/Lynn/data/totalVI/3rd_run/X_totalVI.npy", rna.obsm["X_totalVI"])

In [None]:
# Save the .X matrices of both modalities as .npy arrays.
# NOTE(review): assumes both .X are dense arrays; np.save on a sparse matrix
# would store a pickled object array instead — confirm.
np.save("/media/Lynn/data/totalVI/1st_run/X_rna_totalVI.npy", rna.X)
np.save("/media/Lynn/data/totalVI/1st_run/X_protein_totalVI.npy", protein.X)

In [None]:
# Drop the cells listed in `cells_in_clusters` (loaded near the top of the notebook).
# After this line `rna` is an independent copy and no longer the object stored
# in mdata.mod['rna']; `protein` and `mdata` are NOT filtered, so the two
# modality handles may contain different sets of cells from here on.
rna = rna[~rna.obs_names.isin(cells_in_clusters)].copy()

In [None]:
# Save the latent representation of the (now filtered) RNA object.
np.save("/media/Lynn/data/totalVI/1st_run/X_totalVI_all_markers.npy", rna.obsm["X_totalVI"])

## Read in model

In [None]:
# Load the trained totalVI model and attach it to the in-memory `mdata`.
# NOTE(review): the model directory is under "3rd_run" while `mdata` was read
# from a "1st_run" file above — confirm that model and data belong together.
model = scvi.model.TOTALVI.load("/media/Lynn/data/totalVI/3rd_run/3rd_run_model_400_epochs", mdata)

In [None]:
# Flatten both loss curves to 1-D float arrays so every summary value below is
# a plain Python float. (Indexing the history tables directly yields 1-element
# arrays, which later makes the y-limit check in the plotting cell fail.)
elbo_train = np.asarray(model.history["elbo_train"], dtype=float).ravel()
elbo_valid = np.asarray(model.history["elbo_validation"], dtype=float).ravel()

# Loss at the last recorded epoch
last_val_valid = float(elbo_valid[-1])
last_val_train = float(elbo_train[-1])

# Extremes over the whole training run (NaN entries, if any, are ignored)
global_min_loss = float(min(np.nanmin(elbo_train), np.nanmin(elbo_valid)))
last_max_loss = max(last_val_train, last_val_valid)
global_max_loss = float(max(np.nanmax(elbo_train), np.nanmax(elbo_valid)))

In [None]:
# Compute the min and max of both train and validation losses.
# The inputs may be plain numbers or 1-element arrays; coerce everything to a
# Python float so that ylim_min / ylim_max are always scalars (the plotting
# cell only applies the limits when both are int/float).
_as_float = lambda value: float(np.ravel(value)[0])

min_loss = min(
    _as_float(last_val_train), _as_float(last_val_valid), _as_float(global_min_loss)
)
max_loss = max(
    _as_float(last_val_train), _as_float(last_val_valid), _as_float(global_max_loss)
)
ylim_min = 0.995 * min_loss  # 0.5% below the minimum (for positive losses)
# Upper limit: four times the distance from ylim_min to the larger last-epoch
# loss, capped at the global maximum.
ylim_max = min(
    _as_float(global_max_loss),
    ylim_min + (_as_float(last_max_loss) - ylim_min) * 4,
)

In [None]:
# Train and validation curves share one axis.
fig, ax = plt.subplots(1, 1)
for history_key, curve_label in (("elbo_train", "train"), ("elbo_validation", "validation")):
    model.history[history_key].plot(ax=ax, label=curve_label)

# Apply the pre-computed y-limits only when both are plain scalars.
axis_settings = {"title": "Negative ELBO over training epochs"}
if all(isinstance(limit, (int, float)) for limit in (ylim_min, ylim_max)):
    axis_settings["ylim"] = (ylim_min, ylim_max)
ax.set(**axis_settings)

ax.legend()
fig.savefig("/media/Lynn/notebooks/figures/elbo_training_plot.png", dpi=300, bbox_inches="tight")

## Visualize probability of foreground

In [None]:
# One UMAP panel per protein, coloured by the 'protein_foreground_prob' layer;
# the colour scale is capped at each panel's 99th percentile.
muon.pl.embedding(
    mdata,
    basis="rna:X_umap",
    layer="protein_foreground_prob",
    color=protein.var_names,
    frameon=False,
    ncols=3,
    vmax="p99",
    wspace=0.1,
    color_map="cividis",
    save = '_protein_foreground_prob.png'
)

## Visualize top DE genes by cluster

In [None]:
# Differential expression grouped by the totalVI Leiden clusters, with batch
# correction enabled; the downstream filter expects comparisons named
# "<cluster> vs Rest". Show the first rows as a sanity check.
de_df = model.differential_expression(
    groupby="rna:leiden_totalVI", delta=0.5, batch_correction=True
)
de_df.head(5)

In [None]:
# Persist the full (unfiltered) differential-expression table.
de_df.to_csv("/media/Lynn/data/totalVI/4th_run_with_cd4/differential_expression_results.csv")

#### We filter the results such that we retain features above a certain Bayes factor (which here is on the natural log scale) and transcripts with greater than 10% non-zero entries in the cluster of interest.

In [None]:
# Per cluster: the top up-regulated proteins (max 3) and transcripts (max 2).
filtered_pro, filtered_rna = {}, {}

for cluster in rna.obs[TOTALVI_CLUSTERS_KEY].cat.categories:
    # Rows of this cluster's one-vs-rest comparison, strongest effect first,
    # restricted to up-regulated features.
    ranked = de_df[de_df["comparison"] == f"{cluster} vs Rest"].sort_values(
        "lfc_median", ascending=False
    )
    ranked = ranked[ranked["lfc_median"] > 0]

    # Features whose index label contains "protein" are treated as proteins.
    is_protein = ranked.index.str.contains("protein")

    # Proteins: Bayes factor above 0.7
    protein_hits = ranked[is_protein]
    protein_hits = protein_hits[protein_hits["bayes_factor"] > 0.7]

    # Transcripts: Bayes factor above 3 and >10% non-zero entries in the cluster
    rna_hits = ranked[~is_protein]
    rna_hits = rna_hits[rna_hits["bayes_factor"] > 3]
    rna_hits = rna_hits[rna_hits["non_zeros_proportion1"] > 0.1]

    filtered_pro[cluster] = protein_hits.index[:3].tolist()
    filtered_rna[cluster] = rna_hits.index[:2].tolist()

In [None]:
# Save the per-cluster feature selections as JSON (cluster label -> feature names).
with open("/media/Lynn/data/totalVI/4th_run_with_cd4/filtered_pro.json", "w") as f:
    json.dump(filtered_pro, f)
with open("/media/Lynn/data/totalVI/4th_run_with_cd4/filtered_rna.json", "w") as f:
    json.dump(filtered_rna, f)

In [None]:
# Dot plot of the selected transcripts, grouped by totalVI cluster; each gene
# is scaled separately (standard_scale="var") and genes are drawn on the rows.
sc.pl.dotplot(
    rna,
    filtered_rna,
    groupby=TOTALVI_CLUSTERS_KEY,
    dendrogram=True,
    standard_scale="var",
    swap_axes=True,
    save = '_dendogram_filtered_rna.png'
)

In [None]:
# UMAP panels for the cluster labels and a hand-picked set of genes, coloured
# by the 'denoised_rna' layer.
sc.pl.umap(
    rna,
    color=[
        TOTALVI_CLUSTERS_KEY,
        "SH2D6",
        "TRPM5",
        "CHGA",
        "CHGB",
        "RETNLB",
        "FABP2",
        "MUC12",
        "TMIGD1",
        "HCAR3",
        "RRM2",
        "IL1B"
    ],
    legend_loc="on data",
    frameon=False,
    ncols=3,
    layer="denoised_rna",
    wspace=0.2,
    save = '_some_filtered_rna.png'
)

## Visualize mean protein expression by cluster

In [None]:
# Rename every protein.obs column that is named after a CODEX marker to
# "<marker>_CDX_protein"; markers without a matching obs column are skipped.
# Requires `codex_channels_working_both_runs` from the preprocessing section.
protein.obs.rename(
    columns={name: f"{name}_CDX_protein" for name in codex_channels_working_both_runs if name in protein.obs.columns},
    inplace=True)

In [None]:
# Matrix plot of the 'denoised_protein' layer per totalVI cluster; proteins are
# addressed through the 'clean_names' column of protein.var.
sc.pl.matrixplot(
    protein,
    protein.var["clean_names"],
    groupby=TOTALVI_CLUSTERS_KEY,
    gene_symbols="clean_names",
    dendrogram=True,
    swap_axes=True,
    layer="denoised_protein",
    cmap="Greens",
    standard_scale="var",
    save = '_dendogram_filtered_proteins.png'
)

In [None]:
# One UMAP panel per protein, coloured by the 'denoised_protein' layer; the
# colour scale is capped at each panel's 99th percentile.
muon.pl.embedding(
    mdata,
    basis="rna:X_umap",
    color=protein.var_names,
    frameon=False,
    ncols=3,
    vmax="p99",
    wspace=0.1,
    layer="denoised_protein",
    save = '_codex_markers.png'
)

## Visualize top DE genes and proteins by cluster

In [None]:
from itertools import chain

# Flatten the per-cluster feature dicts into flat lists, removing duplicates
# while preserving first-seen order.
rna_features = list(dict.fromkeys(chain.from_iterable(filtered_rna.values())))
protein_features = list(dict.fromkeys(chain.from_iterable(filtered_pro.values())))

# Remove the "_protein" marker so the names can be used to index the protein
# modality by var_name.
protein_features_clean = [f.replace('_protein', '') for f in protein_features]

# Extract protein values as a DataFrame (cells x features). Densify first if
# .X is sparse, as the other cells of this notebook do.
protein_values = mdata['protein'][:, protein_features_clean].X
if hasattr(protein_values, "toarray"):
    protein_values = protein_values.toarray()
protein_df = pd.DataFrame(
    protein_values,
    index=mdata['protein'].obs_names,
    columns=protein_features_clean
)

# Make sure the row order matches RNA obs
protein_df = protein_df.loc[mdata['rna'].obs_names]

# Add the protein columns to RNA obs so the dot plot can address them by name
for col in protein_df.columns:
    mdata['rna'].obs[col] = protein_df[col]

# Combine features for dotplot: transcripts first, then proteins
combined_features = rna_features + protein_features_clean

# Dotplot of transcripts and proteins together, grouped by totalVI cluster
sc.pl.dotplot(
    mdata['rna'],
    var_names=combined_features,
    groupby=TOTALVI_CLUSTERS_KEY,
    dendrogram=True,
    standard_scale='var',
    swap_axes=True,
    save='_dendogram_filtered_rna_protein_combined.png'
)


## Clustering by metadata

In [None]:
# UMAP coloured by the 'response_group' obs column.
sc.pl.umap(
    rna,
    color= 'response_group',
    frameon=False,
    wspace=0.2,
    save = '_by_response_group.png'
)

In [None]:
# UMAP coloured by the 'time_point' obs column.
sc.pl.umap(
    rna,
    color= 'time_point',
    frameon=False,
    wspace=0.2,
    save = '_by_timepoint.png'
)

In [None]:
# Cast 'year' and 'patient_ID' to strings before they are used as colour keys
# in the next two plots.
rna.obs[["year", "patient_ID"]] = rna.obs[["year", "patient_ID"]].astype(str)

In [None]:
# UMAP coloured by the 'year' obs column (cast to string above).
sc.pl.umap(
    rna,
    color= 'year',
    frameon=False,
    wspace=0.2,
    save = '_by_year.png'
)

In [None]:
# UMAP coloured by the 'patient_ID' obs column (cast to string above).
sc.pl.umap(
    rna,
    color= 'patient_ID',
    frameon=False,
    wspace=0.2,
    save = '_by_patient.png'
)

In [None]:
# UMAP coloured by the 'tissue' obs column.
sc.pl.umap(
    rna,
    color= 'tissue',
    frameon=False,
    wspace=0.2,
    save = '_by_tissue.png'
)

In [None]:
# Calculate the number of cells per cluster
cluster_counts = rna.obs[TOTALVI_CLUSTERS_KEY].value_counts()

# Give every cell the size of the cluster it belongs to.
# Mapping a categorical column returns another categorical whenever all
# cluster sizes are distinct; cast to int so the column is numeric and the
# plot below uses a continuous colour scale instead of discrete categories.
rna.obs['cluster_cell_count'] = (
    rna.obs[TOTALVI_CLUSTERS_KEY].map(cluster_counts).astype(int)
)

# Plot UMAP with a gradient based on the number of cells per cluster
sc.pl.umap(
    rna,
    color='cluster_cell_count',  # numeric cluster-size column created above
    wspace=0.4,
    save = '_cell_counts.png'
)

In [None]:
# UMAP coloured by totalVI cluster, with the cluster labels drawn on the data.
sc.pl.umap(
    rna,
    color=TOTALVI_CLUSTERS_KEY,  # totalVI Leiden cluster labels
    legend_loc="on data",
    wspace=0.4,
    save = '_overlayed_cluster_numbers.png'
)

In [None]:
# UMAP coloured by totalVI cluster, with the default legend placement.
sc.pl.umap(
    rna,
    color=TOTALVI_CLUSTERS_KEY,  # totalVI Leiden cluster labels
    wspace=0.4,
    save = '.png'
)

In [None]:
# UMAP coloured by the 'xenium_annotation' obs column.
sc.pl.umap(
    rna,
    color='xenium_annotation', 
    wspace=0.4,
    save = '_with_xenium_annotation.png'
)

In [None]:
# Build a combined "<cluster number>: <annotation>" label and plot it.
# Both source columns must exist in .obs; if one is missing, skip the plot
# with a clear message instead of failing on the missing combined column.
required_cols = ["xenium_leiden_0.7", "xenium_annotation"]
missing_cols = [col for col in required_cols if col not in rna.obs.columns]

if missing_cols:
    print(f"Skipping plot, missing rna.obs columns: {missing_cols}")
else:
    rna.obs["xenium_annotation_with_cluster_number"] = (
        rna.obs["xenium_leiden_0.7"].astype(str) + ": " + rna.obs["xenium_annotation"].astype(str)
    )

    sc.pl.umap(
        rna,
        color='xenium_annotation_with_cluster_number',
        wspace=0.4,
        save = '_with_xenium_annotation_with_numbers.png'
    )

In [None]:
# UMAP coloured by the 'xenium_leiden_0.7' obs column, labels drawn on the data.
sc.pl.umap(
    rna,
    color='xenium_leiden_0.7', 
    wspace=0.4,
    legend_loc = 'on data',
    save = '_with_overlayed_xenium_only_clustering.png'
)

In [None]:
# UMAP coloured by 'slide_str', the column used as batch key for totalVI.
sc.pl.umap(
    rna,
    color='slide_str', 
    wspace=0.4,
    save = '_by_batch.png'
)

In [None]:
# Define the mapping from 'xenium_leiden_0.7' cluster labels (as strings) to
# broader cell subsets. Clusters without a clear identity are 'Unassigned'.
cluster_to_subset_mapping = {
    '0': 'T cells',
    '1': 'Stroma',
    '2': 'B/ Plasma cells',
    '3': 'Epithelium',
    '4': 'Myeloid cells',
    '5': 'Myeloid cells',
    '6': 'Epithelium',
    '7': 'B/ Plasma cells',
    '8': 'Stroma',
    '9': 'T cells',
    '10': 'Epithelium',
    '11': 'Epithelium',
    '12': 'Stroma',
    '13': 'Epithelium',
    '14': 'Myeloid cells',
    '15': 'Epithelium',
    '16': 'Epithelium', 
    '17': 'Unassigned', 
    '18': 'Stroma',
    '19': 'Stroma',
    '20': 'Epithelium',
    '21': 'T cells',
    '22': 'Myeloid cells',
    '23': 'Myeloid cells',
    '24': 'Epithelium',
    '25': 'Epithelium',
    '26': 'Epithelium',
    '27': 'Epithelium',
    '28': 'Myeloid cells',
    '29': 'Myeloid cells',
    '30': 'Myeloid cells',
    '31': 'Unassigned', # Based on '??'
    '32': 'Unassigned', # Based on '??'
    '33': 'Unassigned', # Based on '??'
    '34': 'Unassigned', # Based on '??'
    '35': 'Stroma',
    '36': 'Stroma'
}

In [None]:
# Create the new 'xenium_cell_subset' column by mapping 'xenium_leiden_0.7';
# cluster labels that are missing from the mapping become NaN.
rna.obs['xenium_cell_subset'] = rna.obs['xenium_leiden_0.7'].map(cluster_to_subset_mapping)

In [None]:
# UMAP coloured by the broad 'xenium_cell_subset' labels created above.
sc.pl.umap(
    rna,
    color='xenium_cell_subset', 
    wspace=0.4,
    save = '_with_xenium_cell_subsets.png'
)

In [None]:
def plot_umap_with_cluster_labels(adata, color_by='xenium_cell_subset', label_by=TOTALVI_CLUSTERS_KEY, save_name=None):
    """
    Plot a UMAP coloured by `color_by` and overlay the labels of `label_by`.

    Each label is drawn at the mean UMAP position of the cells that carry it.

    Parameters:
        adata: AnnData object with UMAP coordinates in `adata.obsm['X_umap']`
        color_by: column in adata.obs used to colour the points
        label_by: column in adata.obs whose values are written onto the plot
        save_name: filename to save the figure to (optional; nothing is saved
            when it is None or empty)

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    # Create figure
    fig, ax = plt.subplots(figsize=(6,6))

    # Plot points coloured by `color_by`; show=False keeps the axis open so the
    # labels can be added afterwards.
    sc.pl.umap(
        adata,
        color=color_by,
        ax=ax,
        show=False,
        size=5,
        legend_loc='right margin'
    )

    # Per-cell UMAP coordinates next to the label column
    umap_coords = adata.obsm['X_umap']
    obs_df = adata.obs[[label_by]].copy()
    obs_df['UMAP1'] = umap_coords[:,0]
    obs_df['UMAP2'] = umap_coords[:,1]

    # Mean position per label. observed=True skips categories without cells,
    # which would otherwise produce NaN positions.
    centers = obs_df.groupby(label_by, observed=True)[['UMAP1','UMAP2']].mean()

    # Overlay cluster labels
    for cluster, row in centers.iterrows():
        ax.text(row['UMAP1'], row['UMAP2'], str(cluster),
                color='black', fontsize=12, fontweight='bold',
                ha='center', va='center')

    if save_name:
        # Save this figure explicitly rather than whichever figure is current.
        fig.savefig(save_name, bbox_inches='tight', dpi=150)
    plt.show()


# Example usage: colour by broad cell subset, label with the totalVI cluster
# numbers, and save the figure into the current working directory.
plot_umap_with_cluster_labels(
    rna,
    color_by='xenium_cell_subset',
    label_by=TOTALVI_CLUSTERS_KEY,
    save_name='umap_cell_subset_with_clusters.png'
)


## QC

In [None]:
import os 
# Directory that receives the per-slide CODEX intensity heatmaps
output_dir = "figures/totalVI_Run2+3/3rd_run/codex_intensities_raw_per_slide"
os.makedirs(output_dir, exist_ok=True)

# Cells x markers table built from protein.X
raw_df = pd.DataFrame(protein.X, index=protein.obs_names, columns=protein.var_names)

# Tag each cell with its slide; the assignment aligns on cell IDs
raw_df['slide_ID'] = rna.obs['slide_str']

# Per-slide sum and mean of every marker
per_slide = raw_df.groupby('slide_ID')[protein.var_names]
intensity_sum_raw = per_slide.sum()
intensity_mean_raw = per_slide.mean()

# One heatmap (markers x slides) per statistic
heatmap_specs = (
    (intensity_sum_raw, 'viridis',
     "Total Raw CODEX Channel Intensity per Slide",
     "total_raw_intensity_per_slide.png"),
    (intensity_mean_raw, 'plasma',
     "Mean Raw CODEX Channel Intensity per Cell per Slide",
     "mean_raw_intensity_per_slide.png"),
)
for per_slide_stats, cmap_name, plot_title, file_name in heatmap_specs:
    plt.figure(figsize=(16, 6))
    sns.heatmap(per_slide_stats.T, cmap=cmap_name)
    plt.title(plot_title)
    plt.xlabel("Slide ID")
    plt.ylabel("Marker")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, file_name), dpi=300)
    plt.show()

In [None]:
# Output directory for the per-slide latent-space heatmaps (created if missing).
# Relies on `os` having been imported by the previous cell.
output_dir = "figures/totalVI_Run2+3/3rd_run/X_totalVI_per_slide"
os.makedirs(output_dir, exist_ok=True)

# 1. Create DataFrame from X_totalVI (cells x latent dimensions)
totalvi_df = pd.DataFrame(
    rna.obsm["X_totalVI"], 
    index=rna.obs_names,
    columns=[f"totalVI_{i}" for i in range(rna.obsm["X_totalVI"].shape[1])]
)

# Add slide info
# NOTE(review): this reads rna.obs['slide_ID'], whereas the previous cell and
# the totalVI batch key use rna.obs['slide_str'] — confirm which is intended.
totalvi_df['slide_ID'] = rna.obs['slide_ID']

# 2. Compute statistics per slide
intensity_sum_totalvi = totalvi_df.groupby('slide_ID').sum()
intensity_mean_totalvi = totalvi_df.groupby('slide_ID').mean()

# 3. Plot and save

# Plot 1: Sum of each latent dimension per slide
plt.figure(figsize=(16, 6))
sns.heatmap(intensity_sum_totalvi.T, cmap='viridis')
plt.title("Total X_totalVI Latent Dimension Sum per Slide")
plt.xlabel("Slide ID")
plt.ylabel("Latent Dimension")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "total_X_totalVI_per_slide.png"), dpi=300)
plt.show()

# Plot 2: Mean of each latent dimension per slide
plt.figure(figsize=(16, 6))
sns.heatmap(intensity_mean_totalvi.T, cmap='plasma')
plt.title("Mean X_totalVI Latent Dimension per Cell per Slide")
plt.xlabel("Slide ID")
plt.ylabel("Latent Dimension")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "mean_X_totalVI_per_slide.png"), dpi=300)
plt.show()


In [None]:
# Latent representation as a cells x dimensions table
latent_values = rna.obsm["X_totalVI"]
latent = pd.DataFrame(
    latent_values,
    index=rna.obs_names,
    columns=[f"totalVI_{k}" for k in range(latent_values.shape[1])],
)

# Dense cells x features tables for both modalities (sparse .X is densified)
rna_values = rna.X.toarray() if hasattr(rna.X, "toarray") else rna.X
rna_matrix = pd.DataFrame(rna_values, index=rna.obs_names, columns=rna.var_names)

prot_values = protein.X.toarray() if hasattr(protein.X, "toarray") else protein.X
prot_matrix = pd.DataFrame(prot_values, index=protein.obs_names, columns=protein.var_names)

# Correlation of every RNA feature with every latent dimension:
# {latent dimension -> Series indexed by feature}
corrs_rna = {}
for dim in latent.columns:
    corrs_rna[dim] = rna_matrix.corrwith(latent[dim])

# Same for the proteins (prot_matrix is always built above, so no guard is needed)
corrs_prot = {}
for dim in latent.columns:
    corrs_prot[dim] = prot_matrix.corrwith(latent[dim])


In [None]:
# Save correlations as CSV files in the current working directory.
# Each dict of Series becomes a features x latent-dimensions table.
corrs_rna_df = pd.DataFrame(corrs_rna)
corrs_rna_df.to_csv("corrs_rna.csv")

if prot_matrix is not None:
    corrs_prot_df = pd.DataFrame(corrs_prot)
    corrs_prot_df.to_csv("corrs_prot.csv")

In [None]:
# Example: top 10 RNA features by absolute correlation with totalVI_0
corrs_rna["totalVI_0"].abs().sort_values(ascending=False).head(10)

In [None]:
# Example: top 10 protein features by absolute correlation with totalVI_0
corrs_prot["totalVI_0"].abs().sort_values(ascending=False).head(10)

In [None]:
# Combine correlations into a single long DataFrame:
# one row per (latent dimension, feature) with its modality.
corr_df_list = []

for dim in latent.columns:
    # RNA
    df_rna = pd.DataFrame({
        "feature": rna_matrix.columns,
        "corr": corrs_rna[dim],
        "modality": "RNA",
        "latent_dim": dim
    })

    # Protein
    if prot_matrix is not None:
        df_prot = pd.DataFrame({
            "feature": prot_matrix.columns,
            "corr": corrs_prot[dim],
            "modality": "Protein",
            "latent_dim": dim
        })
        df_combined = pd.concat([df_rna, df_prot], axis=0)
    else:
        df_combined = df_rna

    corr_df_list.append(df_combined)

corr_df = pd.concat(corr_df_list, axis=0)

# Keep only the top N features per latent dimension.
# NOTE(review): nlargest ranks by signed correlation, so strongly negative
# correlations are never selected, whereas the "top 10" example cells rank by
# absolute value — confirm which ranking is intended here.
top_n = 50
top_features_df = (
    corr_df
    .groupby("latent_dim", group_keys=False)
    .apply(lambda x: x.nlargest(top_n, "corr"))
)

# Count the RNA vs Protein features per latent dimension
prop_df = (
    top_features_df
    .groupby(["latent_dim", "modality"])
    .size()
    .reset_index(name="count")
)

# Compute proportion
prop_df["proportion"] = prop_df.groupby("latent_dim")["count"].transform(lambda x: x / x.sum())

# Pivot to have modalities as columns for stacked bar plot
stacked_df = prop_df.pivot(index="latent_dim", columns="modality", values="proportion").fillna(0)

# The pivot sorts the modality columns alphabetically ("Protein" before "RNA"),
# so positional colours would end up swapped. Fix the column order explicitly
# and look the colours up by name, matching the palette of the per-dimension
# bar plots (RNA = skyblue, Protein = salmon).
modality_colors = {"RNA": "skyblue", "Protein": "salmon"}
modality_order = [m for m in modality_colors if m in stacked_df.columns]
stacked_df = stacked_df[modality_order]

# Plot
stacked_df.plot(
    kind="bar",
    stacked=True,
    figsize=(12, 6),
    color=[modality_colors[m] for m in modality_order]
)

plt.ylabel("Proportion of top features")
plt.xlabel("totalVI latent dimensions")
plt.title(f"Proportion of RNA vs Protein features in top {top_n} correlations per totalVI dimension")
plt.xticks(rotation=45)
plt.legend(title="Modality")
plt.tight_layout()
plt.show()


In [None]:
top_n = 10  # Number of top features per latent dimension

# Select top N features per latent dimension (by signed correlation)
top_features_df = (
    corr_df
    .groupby("latent_dim", group_keys=False)
    .apply(lambda x: x.nlargest(top_n, "corr"))
)

# Label shown on the y axis; currently just the feature name
top_features_df['feature_label'] = top_features_df['feature']

latent_dims = top_features_df['latent_dim'].unique()
n_cols = 3
# Ceiling division, with at least one row so plt.subplots never gets 0 rows
n_rows = max(1, (len(latent_dims) + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of axes, whatever the grid shape
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 4*n_rows), squeeze=False)
axes = axes.ravel()

# One horizontal bar chart per latent dimension
for ax, dim in zip(axes, latent_dims):
    df = top_features_df[top_features_df['latent_dim'] == dim]
    sns.barplot(
        data=df,
        x='corr',
        y='feature_label',
        hue='modality',
        dodge=False,
        palette={"RNA": "skyblue", "Protein": "salmon"},
        ax=ax
    )
    ax.set_title(dim)
    ax.set_xlabel("Correlation")
    ax.set_ylabel("")
    ax.legend().set_title("Modality")

# Hide any unused subplots (does not depend on the loop having run)
for unused_ax in axes[len(latent_dims):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
import os

# Create output directory for plots if it doesn't exist
output_dir = "/media/Lynn/notebooks/figures/totalVI_Run2+3/3rd_run/protein_intensity_per_cell_type"
os.makedirs(output_dir, exist_ok=True)


# Convert protein.X to a dense cells x markers DataFrame
df_protein = pd.DataFrame(
    protein.X.toarray() if hasattr(protein.X, "toarray") else protein.X,
    columns=protein.var_names,
    index=protein.obs_names
)

# Add xenium_annotation from RNA obs. `rna` may contain fewer cells than
# `protein` (cells are removed from `rna` only, earlier in the notebook), so
# align with reindex: cells missing from `rna` get NaN instead of raising a
# KeyError.
df_protein['xenium_annotation'] = rna.obs['xenium_annotation'].reindex(df_protein.index)

# One box plot per protein, saved as an individual PNG
for protein_name in protein.var_names:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=df_protein, x='xenium_annotation', y=protein_name, ax=ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(f"Expression of {protein_name} per Cell Type")
    ax.set_ylabel("Expression")
    ax.set_xlabel("Cell Type")
    # NOTE(review): on a log axis, values of exactly 0 cannot be drawn —
    # confirm this is acceptable for the integer protein intensities.
    ax.set_yscale('log')  # optional: log scale
    fig.tight_layout()

    # Save the plot
    filename = f"{protein_name}_expression_boxplot.png"
    filepath = os.path.join(output_dir, filename)
    fig.savefig(filepath, dpi=300, bbox_inches='tight')

    plt.show()
    # Release the figure so that looping over many proteins does not pile up
    # open figures in memory.
    plt.close(fig)
    print(f"Saved plot for {protein_name} to: {filepath}")

print(f"\nAll plots saved to directory: {output_dir}")


## Compare all markers vs filtered markers clusterings

In [None]:
# Load the MuData of the run that used all markers, for comparison.
# NOTE(review): this is the same file that the "Read updated mdata" section
# loads as `mdata`; if that is still the case, the comparison below compares a
# run with itself — confirm both paths.
mdata_all_markers = muon.read("/media/Lynn/data/totalVI/1st_run/mdata/mdata_leiden_dendogram.h5mu")

In [None]:
# Short handles to the two modalities of the all-markers run.
rna_all_markers = mdata_all_markers.mod['rna']
protein_all_markers = mdata_all_markers.mod['protein']

In [None]:
# Copy the current run's cluster labels onto the all-markers object. The
# assignment aligns on cell IDs, so cells absent from `rna` receive NaN.
rna_all_markers.obs['leiden_only_working_markers']=rna.obs[TOTALVI_CLUSTERS_KEY]

In [None]:
# UMAP of the all-markers run, coloured by the current run's cluster labels.
sc.pl.umap(
    rna_all_markers,
    color= 'leiden_only_working_markers',
    frameon=False,
    wspace=0.2,
    save = '_1st_run_with_overlayed_3rd_run_colors.png'
)

In [None]:
# Same plot as above, with the cluster labels drawn on the data.
sc.pl.umap(
    rna_all_markers,
    color= 'leiden_only_working_markers',
    frameon=False,
    wspace=0.2,
    legend_loc='on data',
    save = '_1st_run_with_overlayed_3rd_run_colors_and_numbers.png'
)

In [None]:
# Copy the all-markers run's cluster labels onto the current run. The
# assignment aligns on cell IDs, so cells absent from the all-markers object
# receive NaN.
rna.obs['leiden_all_markers']=rna_all_markers.obs[TOTALVI_CLUSTERS_KEY]

In [None]:
# UMAP of the current run, coloured by the all-markers run's cluster labels.
sc.pl.umap(
    rna,
    color= 'leiden_all_markers',
    frameon=False,
    wspace=0.2,
    save = '_3rd_run_with_overlayed_1st_run_colors.png'
)

In [None]:
# Same plot as above, with the cluster labels drawn on the data.
sc.pl.umap(
    rna,
    color= 'leiden_all_markers',
    frameon=False,
    wspace=0.2,
    legend_loc = 'on data',
    save = '_3rd_run_with_overlayed_1st_run_colors_and_numbers.png'
)