In [None]:
import math
import os

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import squidpy as sq
from anndata import AnnData
from scipy import sparse

In [None]:
full_adata = ad.read_h5ad("/media/Lynn/data/Integrated_data/adata/run2_3_codex_raw_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
full_adata

## Z-norm CODEX intensities

In [None]:
codex_channels = ['DAPI', 'FoxP3', 'aSMA', 'CD4', 'CD8', 'CD31', 
                 'CD11c', 'IFNG', 'Pan-Cytokeratin', 'CD68', 'CD20', 
                 'CD66b', 'TNFa', 'CD45RO', 'CD14', 'CD11b', 'Vimentin', 
                 'CD163', 'PDGFRA', 'CD45', 'CCR7', 'IL10', 'CD38', 'CD69', 
                 'Podoplanin', 'PNAd', 'ECP', 'MPO', 'MIP-3', 'CD16', 'CXCL13'
]

In [None]:
# Keep an untouched copy of every CODEX marker under '<marker>_original' before
# the marker columns are overwritten by the z-score normalization cell.
print("Storing original CODEX marker values...")
for marker in codex_channels:
    original_col = f'{marker}_original'
    # Back up only once: if this cell is re-run after normalization, `marker`
    # already holds z-scores and copying it again would silently replace the
    # raw backup with normalized values.
    if original_col not in full_adata.obs.columns:
        full_adata.obs[original_col] = full_adata.obs[marker].copy()

In [None]:
# Standard Z-score normalization, one marker at a time across all cells.
# The marker columns in .obs are overwritten in place.
print("\nApplying Z-score normalization...")
for marker in codex_channels:
    marker_values = full_adata.obs[marker]

    # Mean and sample standard deviation (pandas .std() uses ddof=1)
    mean_val = marker_values.mean()
    std_val = marker_values.std()

    # A constant marker has std == 0 and a single-value/all-NaN marker has
    # std == NaN; dividing by either would fill the column with NaN/inf.
    # In that case only centre the values instead of scaling them.
    if not np.isfinite(std_val) or std_val == 0:
        print(f"  WARNING: {marker} has std={std_val}; centring only, no scaling.")
        std_val = 1.0

    # Apply z-score normalization (overwrite original columns)
    full_adata.obs[marker] = (marker_values - mean_val) / std_val

## Check that each cell_id is paired with the correct CODEX intensities

In [None]:
adata_56777_core_1 = full_adata[(full_adata.obs['slide_str']=='ID_0056777') & (full_adata.obs['core_ID']=='X1Y1')].copy()

In [None]:
sq.pl.spatial_scatter(
    adata_56777_core_1,
    color="Pan-Cytokeratin",
    shape=None,
    size=2,
    save = 'Pan-CK_spatial_codex_56777_core_1'
)

In [None]:
adata_56764_X1_Y6 = full_adata[(full_adata.obs['slide_str']=='ID_0056764') & (full_adata.obs['core_ID']=='X1Y6')].copy()

In [None]:
# Spatial coordinates of the selected core; column 0 is treated as x and column 1 as y
coords = adata_56764_X1_Y6.obsm['spatial']

# Column-wise extremes, unpacked into the x and y bounds
lower_bounds = np.min(coords, axis=0)
upper_bounds = np.max(coords, axis=0)
x_min, y_min = lower_bounds
x_max, y_max = upper_bounds

print(f"x_min: {x_min}, x_max: {x_max}")
print(f"y_min: {y_min}, y_max: {y_max}")


In [None]:
size = 1000       # side length of the square
x_offset = 200    # move right from the left edge
y_offset = 200    # move down from the top edge

# Anchor the crop window to this core's actual extent instead of numbers
# copied by hand from a previous print-out, so the cell keeps working if the
# core selection or the coordinates change.
coords = adata_56764_X1_Y6.obsm['spatial']
core_x_min = np.min(coords[:, 0])
core_y_max = np.max(coords[:, 1])

# Compute crop coordinates
x_min_crop = core_x_min + x_offset
x_max_crop = x_min_crop + size

y_max_crop = core_y_max - y_offset
y_min_crop = y_max_crop - size

# Boolean mask of the cells that fall inside the square (bounds inclusive)
mask = (
    (coords[:, 0] >= x_min_crop) & (coords[:, 0] <= x_max_crop) &
    (coords[:, 1] >= y_min_crop) & (coords[:, 1] <= y_max_crop)
)

# Crop AnnData
adata_cropped = adata_56764_X1_Y6[mask].copy()


In [None]:
sq.pl.spatial_scatter(
    adata_cropped,
    color="Pan-Cytokeratin",
    shape=None,
    size=4,
    vmin=0,
    vmax=2.5,
    save = 'Pan-CK_spatial_codex_56764_X1_Y6_highres'
)

In [None]:
# Whole-core view of CD45 (CODEX protein) and PTPRC (Xenium transcript).
# The '_highres' suffix is reserved for the cropped plots: the cropped cell
# saves under exactly those names, so reusing them here would make the two
# sets of figures overwrite each other.
sq.pl.spatial_scatter(
    adata_56764_X1_Y6,
    color="CD45",
    shape=None,
    size=2,
    vmax = 0.3,
    save = 'CD45_codex_56764_X1_Y6'
)
sq.pl.spatial_scatter(
    adata_56764_X1_Y6,
    color="PTPRC",
    shape=None,
    size=2,
    vmax = 3,
    save = 'PTPRC_spatial_codex_56764_X1_Y6'
)

In [None]:
# Cropped (zoomed-in) view of the same two markers: (obs/var key, colour cap, output name)
cropped_panels = (
    ("CD45", 0.3, 'CD45_codex_56764_X1_Y6_highres'),
    ("PTPRC", 3, 'PTPRC_spatial_codex_56764_X1_Y6_highres'),
)
for marker_key, colour_cap, out_name in cropped_panels:
    sq.pl.spatial_scatter(
        adata_cropped,
        color=marker_key,
        shape=None,
        size=4,
        vmax=colour_cap,
        save=out_name,
    )

In [None]:
# PTPRC (transcript) comes from the expression matrix .X; CD45 (protein) comes
# from the pre-normalization CODEX column kept in .obs.
ptprc_matrix = full_adata[:, "PTPRC"].X
# .X may be sparse or dense; only sparse matrices expose .toarray()
if hasattr(ptprc_matrix, "toarray"):
    ptprc_matrix = ptprc_matrix.toarray()
ptprc = np.asarray(ptprc_matrix).ravel()
cd45 = full_adata.obs["CD45_original"].to_numpy()

# Define thresholds (adjust as needed!)
ptprc_thresh = -0.2
cd45_thresh  = 0

# Boolean masks, one entry per cell
ptprc_positive = ptprc > ptprc_thresh
cd45_positive  = cd45 > cd45_thresh

# Count cells that are PTPRC+ and CD45-
only_ptprc = np.sum(ptprc_positive & ~cd45_positive)

print(f"Number of cells PTPRC+ only: {only_ptprc}")

## Find low-signal transcripts whose protein markers can compensate for them (demonstrates the utility of CODEX)

In [None]:
# Work on a copy of the integrated object
adata = full_adata.copy()

# Densify the expression matrix when it is stored sparse
X = adata.X
if hasattr(X, "toarray"):
    X = X.toarray()

# A gene counts as "detected" in a cell when its value is strictly positive
detected = X > 0
detect_rate = detected.mean(axis=0)

# Mean expression restricted to the cells where the gene is detected
# (NaN when no cell detects it)
positive_means = []
for gene_idx in range(X.shape[1]):
    hits = detected[:, gene_idx]
    if hits.sum() > 0:
        positive_means.append(X[:, gene_idx][hits].mean())
    else:
        positive_means.append(np.nan)
mean_expr_pos = np.array(positive_means)

summary_rna = pd.DataFrame(
    {
        "gene": adata.var_names,
        "detect_rate": detect_rate,
        "mean_expr_pos": mean_expr_pos,
    }
)
summary_rna = summary_rna.sort_values("detect_rate")

# The 20 genes detected in the fewest cells
print(summary_rna.head(20))


In [None]:
# Genes whose detection statistics are pulled out of summary_rna
genes_of_interest = [
    'ACTA2', 'CD8A', 'CD8B', 'PECAM1', 'ITGAX', 'IFNG',
    'KRT8', 'KRT18', 'KRT19', 'KRT7', 'KRT14', 'KRT5',
    'CD68', 'MS4A1', 'CEACAM8', 'PTPRC', 'ITGAM',
    'VIM', 'CCR7', 'CD38', 'PDPN', 'CD4',
]

# Keep only those genes, then list them from least to most frequently detected
is_of_interest = summary_rna["gene"].isin(genes_of_interest)
summary_interest = summary_rna[is_of_interest]
print(summary_interest.sort_values("detect_rate"))

## Save z-norm adata

In [None]:
output_path = "/media/Lynn/data/Integrated_data/adata/run2_3_codex_znorm_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad"
full_adata.write(output_path)

## Plot each CODEX channel intensity per cell type

### Z-normalized

In [None]:
# Create output directory for plots if it doesn't exist
output_dir = "marker_intensity_plots_run2+3"
os.makedirs(output_dir, exist_ok=True)

# Extract obs data with annotations and CODEX intensities
df_plot = full_adata.obs[codex_channels + ['xenium_annotation']].copy()

# One boxplot per CODEX channel, grouped by Xenium cell-type annotation
for channel in codex_channels:
    fig = plt.figure(figsize=(12, 6))
    sns.boxplot(data=df_plot, x='xenium_annotation', y=channel)
    plt.xticks(rotation=90)
    plt.title(f"Intensity of {channel} per Cell Type")
    plt.ylabel("Intensity")
    plt.xlabel("Cell Type")
    # The z-normalized columns contain zero and negative values, which a
    # plain log axis cannot draw. 'symlog' is linear around zero and
    # logarithmic further out, so all cells stay visible.
    plt.yscale('symlog')
    plt.tight_layout()

    # Save the plot
    filename = f"{channel}_intensity_boxplot.png"
    filepath = os.path.join(output_dir, filename)
    plt.savefig(filepath, dpi=300, bbox_inches='tight')

    # Show the plot, then release it so figures do not pile up in memory
    plt.show()
    plt.close(fig)

    print(f"Saved plot for {channel} to: {filepath}")

print(f"\nAll plots saved to directory: {output_dir}")

### log1p-normalized

In [None]:
# Destination folder for the log1p boxplots (created on demand)
output_dir = "figures/QC_Run2+3/marker_intensity_plots_run2+3/log1p"
os.makedirs(output_dir, exist_ok=True)

# Names of the backed-up (pre-normalization) marker columns
raw_channels = [f"{ch}_original" for ch in codex_channels]

# Pull those columns plus the cell-type annotation, then log1p-transform the intensities
df_plot = full_adata.obs[raw_channels + ['xenium_annotation']].copy()
df_plot[raw_channels] = np.log1p(df_plot[raw_channels])

# One figure per marker; titles and file names use the marker name without the suffix
for raw_channel in raw_channels:
    channel_name = raw_channel.replace("_original", "")
    filepath = os.path.join(output_dir, f"{channel_name}_log1p_raw_intensity_boxplot.png")

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=df_plot, x='xenium_annotation', y=raw_channel)
    plt.title(f"{channel_name}")
    plt.xlabel("Cell Type")
    plt.ylabel("log1p(Intensity)")
    plt.xticks(rotation=90)
    plt.tight_layout()

    # Write to disk first, then display
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved plot for {channel_name} to: {filepath}")

print(f"\nAll plots saved to directory: {output_dir}")

In [None]:
import math

# Destination folder for the thesis figures
output_dir = "../pictures/thesis_pictures/marker_intensity_plots" 
os.makedirs(output_dir, exist_ok=True)

# Tidy up provisional cell-type labels (applied in this order, in place on full_adata.obs)
label_fixes = [
    ("??", "Other"),
    ("Eos? Enteroendocrine?", "Eos / Enteroendocrine Cells"),
    ("Neutrophils (M2?)", "Neutrophils / Macrophages (M2)"),
    ("Marophages", "Macrophages"),
]
for old_label, new_label in label_fixes:
    full_adata.obs["xenium_annotation"] = full_adata.obs["xenium_annotation"].replace(old_label, new_label)

# Backed-up (pre-normalization) marker columns, log1p-transformed for plotting
raw_channels = [f"{ch}_original" for ch in codex_channels]
df_plot = full_adata.obs[raw_channels + ['xenium_annotation']].copy()
df_plot[raw_channels] = np.log1p(df_plot[raw_channels])

# Markers are stacked vertically, this many per figure
markers_per_figure = 6
n_figures = math.ceil(len(raw_channels) / markers_per_figure)

for fig_idx in range(n_figures):
    # Slice out the markers that belong on this figure
    start = fig_idx * markers_per_figure
    end = start + markers_per_figure
    subset_channels = raw_channels[start:end]
    n_panels = len(subset_channels)

    fig, axes = plt.subplots(
        nrows=n_panels,
        ncols=1,
        figsize=(12, 3 * n_panels),  # height grows with the number of panels
        sharex=True
    )
    # plt.subplots returns a bare Axes when there is a single panel; make it indexable
    axes = np.atleast_1d(axes)

    # One boxplot per marker, titled with the marker name without the suffix
    for panel, raw_channel in zip(axes, subset_channels):
        channel_name = raw_channel.replace("_original", "")
        sns.boxplot(data=df_plot, x='xenium_annotation', y=raw_channel, ax=panel)
        panel.set_title(f"{channel_name}", fontsize=11)
        panel.set_ylabel("log1p(Intensity)")

    # Only the bottom panel keeps an x-axis label; its tick labels are rotated
    for panel in axes[:-1]:
        panel.set_xlabel("")
    axes[-1].set_xlabel("Cell Type")
    axes[-1].tick_params(axis='x', rotation=90)

    fig.tight_layout()

    # Save, then close the figure to free memory
    filepath = os.path.join(output_dir, f"marker_intensity_boxplots_group_{fig_idx+1}.png")
    fig.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close(fig)

    print(f"Saved: {filepath}")

print(f"\nAll grouped marker intensity plots saved to: {output_dir}")


In [None]:
# Output directory
output_dir = "figures/QC_Run2+3/marker_intensity_plots_run2+3/log1p/cd4_by_response_group/"
os.makedirs(output_dir, exist_ok=True)

# Marker of interest
marker = "CD4"
# The plain marker column is overwritten with z-scores by the normalization
# cell, so the raw intensities this plot is labelled with live in the
# '<marker>_original' backup. log1p of z-scores would also yield NaN for every
# value <= -1.
raw_col = f"{marker}_original"

# Create dataframe with only the needed columns
# NOTE(review): 'response_group' is only created on `adata.obs` further down
# in this notebook - confirm it already exists on full_adata.obs here.
df_plot = full_adata.obs[[raw_col, 'xenium_annotation', 'response_group']].copy()

# Log1p transform
df_plot[raw_col] = np.log1p(df_plot[raw_col])

# Plot separately for each response group
for group in df_plot['response_group'].unique():
    subset = df_plot[df_plot['response_group'] == group]

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=subset, x='xenium_annotation', y=raw_col)
    plt.xticks(rotation=90)
    # \u2014 is an em dash; the original literal was mis-encoded
    plt.title(f"log1p(Raw Intensity) of {marker} \u2014 Response Group: {group}")
    plt.ylabel("log1p(Intensity)")
    plt.xlabel("Cell Type")
    plt.tight_layout()

    # Save the plot
    filename = f"{marker}_log1p_raw_intensity_boxplot_response_{group}.png"
    filepath = os.path.join(output_dir, filename)
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"Saved plot for response group {group} to: {filepath}")

print(f"\nAll plots saved to directory: {output_dir}")


### Check top markers per cell type

In [None]:
# Every CODEX channel except the nuclear stain
codex_channels_no_dapi = [ch for ch in codex_channels if ch != 'DAPI']

# Mean intensity of each marker within each annotated cell type
mean_per_annotation = full_adata.obs.groupby('xenium_annotation')[codex_channels_no_dapi].mean()

# The five highest-mean markers of one cell type (one row), best first
top5_of_row = lambda row: row.sort_values(ascending=False).head(5)

# Marker names and their mean values, kept as two parallel per-cell-type lists
top5_markers_per_type = mean_per_annotation.apply(
    lambda row: top5_of_row(row).index.tolist(), axis=1
)
top5_normalized_values_per_type = mean_per_annotation.apply(
    lambda row: top5_of_row(row).values.tolist(), axis=1
)

# Side-by-side summary table, ordered by cell type
top5_summary = pd.DataFrame({
    'Top 5 Markers': top5_markers_per_type,
    'Top 5 Normalized Values': top5_normalized_values_per_type
}).sort_index()

# Display
print("Top 5 markers per cell type in order (1->5, based on z-normalized intensities):")
print(top5_summary)

## Plot each CODEX channel intensity per slide

In [None]:
# Folder that receives the three per-slide heatmaps
output_dir = "./figures/QC_Run2+3/codex_intensities_per_slide"
os.makedirs(output_dir, exist_ok=True)

# 1. Backed-up and current marker columns side by side, plus the slide identifier
raw_channels = [f"{ch}_original" for ch in codex_channels]
df = full_adata.obs[raw_channels + codex_channels + ['slide_ID']].copy()

# 2. Per-slide aggregates
per_slide = df.groupby('slide_ID')
intensity_sum_raw = per_slide[raw_channels].sum()
intensity_mean_raw = per_slide[raw_channels].mean()
intensity_mean_z = per_slide[codex_channels].mean()

# 3. One heatmap per aggregate: (table, colormap, title, output file name)
heatmap_specs = [
    (intensity_sum_raw, 'viridis',
     "Total Raw CODEX Channel Intensity per Slide",
     "total_raw_intensity_per_slide.png"),
    (intensity_mean_raw, 'plasma',
     "Mean Raw CODEX Channel Intensity per Cell per Slide",
     "mean_raw_intensity_per_slide.png"),
    (intensity_mean_z, 'magma',
     "Mean Z-norm CODEX Channel Intensity per Cell per Slide",
     "mean_znorm_intensity_per_slide.png"),
]
for table, cmap_name, heading, png_name in heatmap_specs:
    plt.figure(figsize=(16, 6))
    # Transposed so markers run down the rows and slides across the columns
    sns.heatmap(table.T, cmap=cmap_name)
    plt.title(heading)
    plt.xlabel("Slide ID")
    plt.ylabel("Marker")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, png_name), dpi=300)
    plt.show()

In [None]:
dapi_col = 'DAPI_original'

# Subset DataFrame for DAPI and slide_ID
dapi_df = full_adata.obs[[dapi_col, 'slide_ID']].copy()

# 1. Compute summary statistics per slide.
# The bare 'quantile' aggregate is not requested: it only duplicated the
# median and was discarded by the column selection below.
dapi_per_slide = dapi_df.groupby('slide_ID')[dapi_col]
dapi_stats = dapi_per_slide.agg(['min', 'max', 'mean', 'median'])
# First and third quartiles
dapi_stats['Q1'] = dapi_per_slide.quantile(0.25)
dapi_stats['Q3'] = dapi_per_slide.quantile(0.75)

# Order the columns as a five-number summary with the mean added
dapi_stats = dapi_stats[['min', 'Q1', 'median', 'mean', 'Q3', 'max']]

# Display stats table if needed
print(dapi_stats)

# 2. Boxplot of the raw DAPI intensities per slide
plt.figure(figsize=(12, 6))
sns.boxplot(data=dapi_df, x='slide_ID', y=dapi_col)
plt.title("Distribution of Raw DAPI Intensities per Slide")
plt.xlabel("Slide ID")
plt.ylabel("DAPI Raw Intensity")
plt.tight_layout()
# Make sure the target folder exists so this cell does not depend on an
# earlier cell having created it
os.makedirs("./figures/QC_Run2+3", exist_ok=True)
plt.savefig("./figures/QC_Run2+3/dapi_boxplot_per_slide.png", dpi=300)
plt.show()

## Xenium Clustering

In [None]:
adata = full_adata.copy()
adata.obsm['X_umap'] = full_adata.obsm['xenium_X_umap']
adata.obs["xenium_annotation"] = adata.obs["xenium_annotation"].replace("??", "Other")

In [None]:
adata.obsm['X_umap'] = full_adata.obsm['xenium_X_umap']
adata.obs["xenium_annotation"] = adata.obs["xenium_annotation"].replace("??", "Other")
sc.pl.umap(adata, color='xenium_annotation', save='_xenium_annotated_highres.png')

In [None]:
adata.obsm['X_umap'] = full_adata.obsm['xenium_X_umap']
sc.pl.umap(adata, color='pat')

In [None]:
import squidpy as sq
import scanpy as sc

# 1. Replace '??' with 'Other' in xenium_annotation (on the per-core copy)
adata_56764_X1_Y6.obs["xenium_annotation"] = adata_56764_X1_Y6.obs["xenium_annotation"].replace("??", "Other")

# 2. Set figure parameters: 100 dpi on screen, 500 dpi for saved files
#    (this changes the global scanpy settings for all later cells)
sc.settings.set_figure_params(dpi=100, dpi_save=500, facecolor='white')

# 3. Plot and save the spatial scatter coloured by cell-type annotation
sq.pl.spatial_scatter(
    adata_56764_X1_Y6,
    library_id="spatial",
    shape=None,
    color=["xenium_annotation"],
    title="Ileum",
    wspace=0.4,
    save="run2+3_spatial_xenium_annotation_on_ileum_highres.png"
)


In [None]:
adata_colon = adata[adata.obs['tissue'] == 'colon']

# View the subset
adata_colon.obs['core_ID']

In [None]:
adata_56764_X3Y1 = full_adata[(full_adata.obs['slide_str']=='ID_0056764') & (full_adata.obs['core_ID']=='X3Y1')].copy()

In [None]:
import squidpy as sq
import scanpy as sc

# 1. Replace '??' with 'Other' in xenium_annotation (on the per-core copy)
adata_56764_X3Y1.obs["xenium_annotation"] = adata_56764_X3Y1.obs["xenium_annotation"].replace("??", "Other")

# 2. Set figure parameters: 100 dpi on screen, 500 dpi for saved files
#    (this changes the global scanpy settings for all later cells)
sc.settings.set_figure_params(dpi=100, dpi_save=500, facecolor='white')

# 3. Plot and save the spatial scatter coloured by cell-type annotation
sq.pl.spatial_scatter(
    adata_56764_X3Y1,
    library_id="spatial",
    shape=None,
    color=["xenium_annotation"],
    title="Colon",
    wspace=0.4,
    save="run2+3_spatial_xenium_annotation_on_colon_highres.png"
)

In [None]:
adata_s1_c1 = full_adata[(full_adata.obs['core_ID']=='X1Y1') & (full_adata.obs['slide_str']=='ID_0022110') ].copy()

sq.pl.spatial_scatter(
    adata_s1_c1,
    library_id="spatial",
    shape=None,
    color=[
        "xenium_annotation",
    ],
    wspace=0.4,
    save = 'run2+3_spatial__xenium_22110_X1Y1.png'
)

In [None]:
adata_s2_c4 = full_adata[(full_adata.obs['core_ID']=='X4Y2')& (full_adata.obs['slide_str']=='ID_0022111')].copy()

sq.pl.spatial_scatter(
    adata_s2_c4,
    library_id="spatial",
    shape=None,
    color=[
        "xenium_annotation",
    ],
    wspace=0.4,
    save = 'run2+3_spatial__xenium_22111_X4Y2.png'
)

### Per Tissue, Time point, Patient, Year, Core

In [None]:
sc.pl.umap(
    adata,
    color=[
        "tissue",
    ],
    wspace=0.4,
    save = '_run2+3_by_tissue_xenium.png'
)

In [None]:
sc.pl.umap(
    adata,
    color=[
        "time_point",
    ],
    wspace=0.4,
    save = '_run_2+3_by_timepoint_xenium.png'
)

In [None]:
sc.pl.umap(
    adata,
    color=["slide_str"],
    wspace=0.4,
    title="slide_ID",   # Set custom title
    save='_run_2+3_by_slide_xenium.png'
)

In [None]:
adata.obsm['X_umap'] = full_adata.obsm['xenium_X_umap']
# Ensure patient IDs are strings
adata.obs["patient_ID"] = adata.obs["patient_ID"].astype(str)

# Standardize patient IDs: make '3_' and '03_' consistent
def standardize_patient_ids(pid):
    """Zero-pad the numeric prefix of a patient ID to at least two digits.

    The ID is split at its first underscore. If the part before it consists
    only of digits it is re-formatted as a two-digit number (e.g. '3_NR' ->
    '03_NR'); everything after the first underscore is kept unchanged. IDs
    whose prefix is not purely numeric are returned as they are.
    """
    prefix, separator, remainder = pid.partition('_')
    if prefix.isdigit():
        prefix = f"{int(prefix):02d}"  # e.g., '3' -> '03'
    return prefix + separator + remainder

# Normalise every patient ID so that e.g. '3_...' and '03_...' collapse to one label
normalised_ids = adata.obs["patient_ID"].map(standardize_patient_ids)
adata.obs["patient_ID"] = normalised_ids

# UMAP coloured by the cleaned patient IDs
sc.pl.umap(adata, color=["patient_ID"], wspace=0.4, save='_run_2+3_by_patient_xenium.png')


In [None]:
full_adata

In [None]:
# Derive the response group from the patient-ID suffix:
# '..._NR' -> Non-Responder, '..._R' -> Responder, anything else -> Unknown
ends_with_nr = adata.obs['patient_ID'].str.endswith('_NR')
ends_with_r = adata.obs['patient_ID'].str.endswith('_R')

responder_or_unknown = np.where(ends_with_r, 'Responder', 'Unknown')
adata.obs['response_group'] = np.where(ends_with_nr, 'Non-Responder', responder_or_unknown)

In [None]:
sc.pl.umap(
    adata,
    color=[
        "response_group",
    ],
    wspace=0.4,
    save = '_run2+3_by_response_group_xenium.png'
)

In [None]:
adata.obs["year"] = adata.obs["year"].astype(str)  

sc.pl.umap(
    adata,
    color=["year"],
    wspace=0.4,
    save='_run_2+3_by_year_xenium.png'
)

In [None]:
adata_ileum = adata[adata.obs['tissue']=='ileum'].copy()
adata_colon = adata[adata.obs['tissue']=='colon'].copy()

In [None]:
sc.pl.umap(
    adata_ileum,
    color=[
        "core_ID",
    ],
    wspace=0.4,
    save = '_run_2+3_ileum_by_core_xenium.png'
)

In [None]:
sc.pl.umap(
    adata_colon,
    color=[
        "core_ID",
    ],
    wspace=0.4,
    save = '_run_2+3_colon_by_core_xenium.png'
)

### LND cells 

In [None]:
adata.obsm['X_umap'] = full_adata.obsm['xenium_X_umap']
adata_LND = adata[adata.obs['xenium_annotation']=='LND cells'].copy()

In [None]:
adata_LND.obs["patient_ID"] = adata_LND.obs["patient_ID"].astype(str) 

In [None]:
sc.pl.umap(
    adata_LND,
    color=[
        "patient_ID",
    ],
    wspace=0.4,
    save = '_run_2+3_LND_by_patient.png'
)

In [None]:
sc.pp.pca(adata_LND)
sc.pp.neighbors(adata_LND)
sc.tl.umap(adata_LND)
sc.tl.leiden(adata_LND) 

In [None]:
sc.pl.umap(
    adata_LND[adata_LND.obs['time_point']=='during_treatment'],
    color=[
        "patient_ID",
    ],
    wspace=0.2,
    save = '_reclustered_during_treatment_LND_by_patient.png'
)

In [None]:
sc.pl.umap(
    adata_LND[adata_LND.obs['response_group']=='Non-Responder'],
    color=[
        "patient_ID",
    ],
    wspace=0.2,
    save = '_reclustered_non_responders_LND_by_patient.png'
)

In [None]:
sc.pl.umap(
    adata_LND[(adata_LND.obs['response_group']=='Non-Responder') & (adata_LND.obs['time_point']=='before_treatment')],
    color=[
        "patient_ID",
    ],
    wspace=0.2,
    save = '_reclustered_non_responders_before_treatment_LND_by_patient.png'
)

### CODEX markers on top

In [None]:
codex_channels_updated = ['DAPI', 'FoxP3', 'aSMA', 'CD4', 'CD8', 'CD31', 
                 'CD11c', 'IFNG', 'Pan-Cytokeratin', 'CD68', 'CD20', 
                 'CD66b', 'TNFa', 'CD45RO', 'CD14', 'CD11b', 'Vimentin', 
                 'CD163', 'CD45', 'CCR7', 'IL10', 'CD38', 'CD69', 
                 'Podoplanin', 'PNAd'] # 25 markers

In [None]:
# Tag the CODEX marker columns in .obs with a '_CDX' suffix.
# Names that already carry the suffix are skipped: once the marker list has
# been suffixed by the following cell, re-running this cell would otherwise
# rename 'X_CDX' to 'X_CDX_CDX'.
adata.obs.rename(
    columns={
        name: f"{name}_CDX"
        for name in codex_channels_updated
        if name in adata.obs.columns and not name.endswith("_CDX")
    },
    inplace=True)

In [None]:
# Mirror the '_CDX' column rename in the marker list. Already-suffixed names are
# left alone so that re-running this cell does not produce '..._CDX_CDX'.
codex_channels_updated = [
    ch if ch.endswith("_CDX") else ch + "_CDX" for ch in codex_channels_updated
]

In [None]:
sc.pl.embedding(
    adata,
    basis="xenium_X_umap",
    color=codex_channels_updated,
    frameon=False,
    ncols=3,
    vmax="p99",
    wspace=0.1,
    save = '_xenium_with_codex_markers.png'
)

### Try to fix markers with high background

In [None]:
sc.settings.set_figure_params(dpi=100, dpi_save=500, facecolor='white')

In [None]:
# Make sure the CD4 column carries the '_CDX' suffix.
# NOTE(review): in notebook order 'CD4' was already renamed to 'CD4_CDX' by the
# list-based rename above, in which case this call changes nothing - confirm
# whether it is still needed.
adata.obs.rename(columns={'CD4': 'CD4_CDX'}, inplace=True)

In [None]:
# Extract the (uncorrected) CD4 intensities from .obs
# NOTE(review): in notebook order `adata` is copied from full_adata after the
# z-score cell, so these look like z-scored values rather than raw
# intensities (raw ones are in 'CD4_original') - confirm.
cd4_raw = adata.obs["CD4_CDX"].values

# Compute background as 10th percentile
bg = np.percentile(cd4_raw, 10)

# Apply background subtraction; values below the background are clipped to 0
cd4_corrected = np.clip(cd4_raw - bg, 0, None)

# Store corrected values in a new column
adata.obs["CD4_CDX_corrected"] = cd4_corrected

# Plot before/after histograms side by side
plt.figure(figsize=(10,5))

# Left panel: uncorrected distribution with the background level marked in red
plt.subplot(1,2,1)
plt.hist(cd4_raw, bins=100, color="steelblue", alpha=0.7)
plt.axvline(bg, color="red", linestyle="--", label=f"bg = {bg:.2f}")
plt.xlabel("CD4 Intensity"); plt.ylabel("Cell count")
plt.legend()
plt.xlim(-2, 20) 

# Right panel: distribution after background subtraction (same x-range for comparison)
plt.subplot(1,2,2)
plt.hist(cd4_corrected, bins=100, color="seagreen", alpha=0.7)
plt.xlabel("CD4 Corrected intensity"); plt.ylabel("Cell count")
plt.xlim(-2, 20) 

plt.tight_layout()
# The file name says 'barplot', but the figure contains two histograms
plt.savefig('figures/barplot_cd4_bg_correction_10_percent.png', dpi=400)
plt.show()


In [None]:
sc.pl.embedding(
    adata,
    basis="xenium_X_umap",
    color=['CD4_CDX', 'CD4_CDX_corrected'],
    frameon=False,
    vmax="p99",
    wspace=0.1,
    ncols = 2,
    title = ['CD4', 'CD4 Corrected'],
    save = '_CD4_before_after_correction_10_percent.png'
)

In [None]:
# CD14 background correction, computed from the CD14 column itself.
# The earlier version re-plotted the CD4 arrays under CD14 titles and coloured
# the embedding by 'CD14_CDX_corrected' before that column had been created.
cd14_raw = adata.obs["CD14_CDX"].values

# Background = 10th percentile, as for the other markers in this section
cd14_bg = np.percentile(cd14_raw, 10)

# Subtract the background; values below it are clipped to 0
cd14_corrected = np.clip(cd14_raw - cd14_bg, 0, None)

# Store corrected values in a new column
adata.obs["CD14_CDX_corrected"] = cd14_corrected

# Plot before/after
plt.figure(figsize=(10,5))

# Left panel: uncorrected distribution with the background level marked in red
plt.subplot(1,2,1)
plt.hist(cd14_raw, bins=100, color="steelblue", alpha=0.7)
plt.axvline(cd14_bg, color="red", linestyle="--", label=f"bg = {cd14_bg:.2f}")
plt.title("CD14 raw intensity")
plt.xlabel("Intensity"); plt.ylabel("Cell count")
plt.legend()

# Right panel: distribution after background subtraction
plt.subplot(1,2,2)
plt.hist(cd14_corrected, bins=100, color="seagreen", alpha=0.7)
plt.title("CD14 after background subtraction")
plt.xlabel("Corrected intensity"); plt.ylabel("Cell count")

plt.tight_layout()
plt.show()

# Corrected vs. uncorrected CD14 on the Xenium UMAP
sc.pl.embedding(
    adata,
    basis="xenium_X_umap",
    color= ['CD14_CDX_corrected', 'CD14_CDX'],
    frameon=False,
    vmax="p99",
    ncols = 2,
    title = [' ', ' '],
    wspace=0.1,
)


In [None]:
# Extract the (uncorrected) PNAd intensities from .obs.
# The variable names still say 'cd4' because the cell was copied from the CD4
# one; the data in them is PNAd.
cd4_raw = adata.obs["PNAd_CDX"].values

# Compute background as 10th percentile
bg = np.percentile(cd4_raw, 10)

# Apply background subtraction; values below the background are clipped to 0
cd4_corrected = np.clip(cd4_raw - bg, 0, None)

# Store corrected values in a new column
adata.obs["PNAd_CDX_corrected"] = cd4_corrected

# Plot before/after histograms side by side
plt.figure(figsize=(10,5))

# Left panel: uncorrected distribution with the background level marked in red
plt.subplot(1,2,1)
plt.hist(cd4_raw, bins=100, color="steelblue", alpha=0.7)
plt.axvline(bg, color="red", linestyle="--", label=f"bg = {bg:.2f}")
plt.title("PNAd raw intensity")
plt.xlabel("Intensity"); plt.ylabel("Cell count")
plt.legend()

# Right panel: distribution after background subtraction
plt.subplot(1,2,2)
plt.hist(cd4_corrected, bins=100, color="seagreen", alpha=0.7)
plt.title("PNAd after background subtraction")
plt.xlabel("Corrected intensity"); plt.ylabel("Cell count")

plt.tight_layout()
plt.show()

# Corrected vs. uncorrected PNAd on the Xenium UMAP
sc.pl.embedding(
    adata,
    basis="xenium_X_umap",
    color= ['PNAd_CDX_corrected', 'PNAd_CDX'],
    frameon=False,
    vmax="p99",
    ncols = 2,
    wspace=0.1,
)

In [None]:
# Extract the (uncorrected) TNFa intensities from .obs.
# The variable names still say 'cd4' because the cell was copied from the CD4
# one; the data in them is TNFa.
cd4_raw = adata.obs["TNFa_CDX"].values

# Compute background as 10th percentile
bg = np.percentile(cd4_raw, 10)

# Apply background subtraction; values below the background are clipped to 0
cd4_corrected = np.clip(cd4_raw - bg, 0, None)

# Store corrected values in a new column
adata.obs["TNFa_CDX_corrected"] = cd4_corrected

# Plot before/after histograms side by side
plt.figure(figsize=(10,5))

# Left panel: uncorrected distribution with the background level marked in red
plt.subplot(1,2,1)
plt.hist(cd4_raw, bins=100, color="steelblue", alpha=0.7)
plt.axvline(bg, color="red", linestyle="--", label=f"bg = {bg:.2f}")
plt.title("TNFa raw intensity")
plt.xlabel("Intensity"); plt.ylabel("Cell count")
plt.legend()

# Right panel: distribution after background subtraction
plt.subplot(1,2,2)
plt.hist(cd4_corrected, bins=100, color="seagreen", alpha=0.7)
plt.title("TNFa after background subtraction")
plt.xlabel("Corrected intensity"); plt.ylabel("Cell count")

plt.tight_layout()
plt.show()

# Corrected vs. uncorrected TNFa on the Xenium UMAP
sc.pl.embedding(
    adata,
    basis="xenium_X_umap",
    color= ['TNFa_CDX_corrected', 'TNFa_CDX'],
    frameon=False,
    vmax="p99",
    ncols = 2,
    wspace=0.1,
)


In [None]:
def background_correct_cdx(adata, percentile=5):
    """
    Background-correct all CODEX markers stored in adata.obs
    (columns ending with '_CDX') by subtracting a percentile value.

    For every such column the given percentile is taken as the background
    level, subtracted from all cells, and negative results are clipped to 0.
    The result is written to a new '<column>_corrected' column; the source
    columns are left untouched. Missing values (NaN) are ignored when the
    background is estimated and stay NaN in the corrected column.

    Parameters
    ----------
    adata : AnnData
        AnnData object with marker intensities in .obs
    percentile : int, optional (default=5)
        Percentile to use for background estimation

    Returns
    -------
    AnnData
        The same object that was passed in (it is modified in place).
    """
    # find all CDX markers (already-corrected columns end in '_corrected' and are skipped)
    cdx_markers = [col for col in adata.obs.columns if col.endswith("_CDX")]

    for marker in cdx_markers:
        values = adata.obs[marker]
        # nanpercentile: a single NaN must not turn the background (and with it
        # the whole corrected column) into NaN
        bg = np.nanpercentile(values, percentile)
        # subtract background, clip negatives to 0, save in new column
        adata.obs[f"{marker}_corrected"] = (values - bg).clip(lower=0)

    print(f"Background corrected {len(cdx_markers)} markers with {percentile}th percentile subtraction.")
    return adata

adata = background_correct_cdx(adata, percentile=10)

In [None]:
# Uncorrected CODEX columns are the ones ending in '_CDX' ...
cdx_markers = [name for name in adata.obs.columns if name.endswith("_CDX")]

# ... and each has a background-corrected companion column
cdx_corrected = [name + "_corrected" for name in cdx_markers]

# All corrected markers on the Xenium UMAP, three panels per row, colour capped at the 99th percentile
sc.pl.embedding(
    adata,
    basis="xenium_X_umap",
    color=cdx_corrected,
    ncols=3,
    vmax="p99",
    frameon=False,
    wspace=0.1,
    save='_xenium_with_codex_markers_bg_corrected_10_percent.png',
)

## Xenium QC table (for each metadata feature)

In [None]:
# Use the raw Xenium counts layer when it exists, otherwise fall back to .X
if 'xenium_counts' in adata.layers:
    counts = adata.layers['xenium_counts']
else:
    counts = adata.X

# Total transcripts and number of detected genes per cell.
# The previous `hasattr(counts, 'A1')` check looked at the count matrix rather
# than at the result of .sum(). np.asarray(...).ravel() gives a 1-D array
# whether the row sums come back as an (n, 1) np.matrix or a plain ndarray.
adata.obs['n_transcripts'] = np.asarray(counts.sum(axis=1)).ravel()
adata.obs['n_genes'] = np.asarray((counts > 0).sum(axis=1)).ravel()

# Metadata columns for which a QC table is produced
grouping_vars = ['slide_str', 'core_ID', 'patient_ID', 'time_point', 'response_group', 'year', 'tissue']

# One summary table per grouping variable, keyed by the variable name
summary_stats = {}

for var in grouping_vars:
    grouped = adata.obs.groupby(var)

    # Cell and transcript statistics from the per-cell totals computed above
    stats = grouped.agg(
        total_cells=('n_transcripts', 'count'),
        total_transcripts=('n_transcripts', 'sum'),
        avg_transcripts_per_cell=('n_transcripts', 'mean'),
        avg_genes_per_cell=('n_genes', 'mean'),
    )

    # Negative-control rates, expressed relative to the summed 'total_counts' column of .obs
    control_df = grouped.agg(
        total_transcripts=('total_counts', 'sum'),
        control_probe_counts=('control_probe_counts', 'sum'),
        control_codeword_counts=('control_codeword_counts', 'sum')
    )
    control_df['negative_dna_pct'] = 100 * control_df['control_probe_counts'] / control_df['total_transcripts']
    control_df['negative_decoding_pct'] = 100 * control_df['control_codeword_counts'] / control_df['total_transcripts']

    summary_stats[var] = stats.join(control_df[['negative_dna_pct', 'negative_decoding_pct']])

In [None]:
summary_stats['response_group'].head()

In [None]:
summary_stats['time_point'].head()

## CODEX clustering - all markers present in both runs

In [None]:
# Exclude markers that are not shared between runs
codex_channels_updated = ['DAPI', 'FoxP3', 'aSMA', 'CD4', 'CD8', 'CD31', 
                 'CD11c', 'IFNG', 'Pan-Cytokeratin', 'CD68', 'CD20', 
                 'CD66b', 'TNFa', 'CD45RO', 'CD14', 'CD11b', 'Vimentin', 
                 'CD163', 'CD45', 'CCR7', 'IL10', 'CD38', 'CD69', 
                 'Podoplanin', 'PNAd'] # 25 markers

### Before z-norm

In [None]:
old_adata = ad.read_h5ad("/media/Lynn/data/Integrated_data/adata/run2_3_codex_raw_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Extract only CODEX intensity data
old_X_codex = old_adata.obs[codex_channels_updated].values

# Create new AnnData with the same obs
old_adata_codex = ad.AnnData(X=old_X_codex, obs=old_adata.obs.copy(), var=pd.DataFrame(index=codex_channels_updated))

In [None]:
sc.pp.pca(old_adata_codex)
sc.pp.neighbors(old_adata_codex)
sc.tl.umap(old_adata_codex)
sc.tl.leiden(old_adata_codex) 

In [None]:
sc.pl.umap(old_adata_codex, color="leiden", wspace=0.4, save = '_run2+3_codex_raw.png')

In [None]:
sc.pl.umap(old_adata_codex, color="xenium_annotation", wspace=0.4, save = '_run2+3_codex_raw_with_xenium_annotation.png')

### After z-norm

In [None]:
# Reload the z-normalized object (replaces the in-memory full_adata).
# NOTE(review): this reads from /scratch/lyarab/adata/..., whereas the
# "Save z-norm adata" cell writes the same file name under
# /media/Lynn/data/Integrated_data/adata/ - confirm both locations hold the same file.
full_adata = ad.read_h5ad("/scratch/lyarab/adata/run2_3_codex_znorm_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Extract only CODEX intensity data
X_codex = full_adata.obs[codex_channels_updated].values

# Create new AnnData with the same obs
adata_codex = ad.AnnData(X=X_codex, obs=full_adata.obs.copy(), var=pd.DataFrame(index=codex_channels_updated))

In [None]:
sc.pp.pca(adata_codex)
sc.pp.neighbors(adata_codex)
sc.tl.umap(adata_codex)  

In [None]:
sc.tl.leiden(adata_codex)

In [None]:
# Leiden clusters on the z-normalized CODEX UMAP. scanpy appends `save` to
# its default file name, so the suffix starts with '_' like the other UMAP cells.
sc.pl.umap(adata_codex, color="leiden", wspace=0.4, save = '_run2+3_codex_znorm.png')

In [None]:
sc.pl.umap(adata_codex, color="xenium_annotation", wspace=0.4, save = '_run2+3_codex_znorm_with_xenium_annotation.png')

In [None]:
adata_codex.write("/media/Lynn/data/Integrated_data/adata/run2_3_codex_znorm_leiden.h5ad")

### With sc.pp.scale

In [None]:
old_adata = ad.read_h5ad("/scratch/lyarab/adata/run2_3_codex_raw_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Extract only CODEX intensity data
old_X_codex = old_adata.obs[codex_channels_updated].values

# Create new AnnData with the same obs
old_adata_codex = ad.AnnData(X=old_X_codex, obs=old_adata.obs.copy(), var=pd.DataFrame(index=codex_channels_updated))

In [None]:
old_adata_codex_scaled = old_adata_codex.copy()

In [None]:
sc.pp.scale(old_adata_codex_scaled)
sc.pp.pca(old_adata_codex_scaled)
sc.pp.neighbors(old_adata_codex_scaled)
sc.tl.umap(old_adata_codex_scaled)
sc.tl.leiden(old_adata_codex_scaled) 

In [None]:
old_adata_codex_scaled.write("/media/Lynn/data/Integrated_data/adata/run2_3_codex_scaled_leiden.h5ad")

In [None]:
# Leiden clusters on the sc.pp.scale-d CODEX UMAP. scanpy appends `save` to
# its default file name, so the suffix starts with '_' like the other UMAP cells.
sc.pl.umap(old_adata_codex_scaled, color="leiden", wspace=0.4, save = '_run2+3_codex_scaled.png')

In [None]:
sc.pl.umap(old_adata_codex_scaled, color="xenium_annotation", wspace=0.4, save = '_run2+3_codex_scaled_with_xenium_annotation.png')

## CODEX clustering - only working markers

In [None]:
# Exclude markers that didn't work
codex_channels_updated_working = ['DAPI', 'FoxP3', 'aSMA', 'CD4', 'CD8', 'CD31', 
                 'CD11c', 'IFNG', 'Pan-Cytokeratin', 'CD68', 'CD20', 
                 'CD66b', 'TNFa', 'CD45RO', 'CD14', 'CD11b', 'Vimentin', 
                 'CD45', 'CCR7', 'IL10', 'CD38', 'Podoplanin'] # 22 markers

In [None]:
old_adata = ad.read_h5ad("/media/Lynn/data/Integrated_data/adata/run2_3_codex_raw_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Extract only CODEX intensity data
old_X_codex = old_adata.obs[codex_channels_updated_working].values

# Create new AnnData with the same obs
old_adata_codex = ad.AnnData(X=old_X_codex, obs=old_adata.obs.copy(), var=pd.DataFrame(index=codex_channels_updated_working))

In [None]:
sc.pp.scale(old_adata_codex)
sc.pp.pca(old_adata_codex)
sc.pp.neighbors(old_adata_codex)
sc.tl.umap(old_adata_codex)
sc.tl.leiden(old_adata_codex) 

In [None]:
old_adata_codex.write("/media/Lynn/data/Integrated_data/adata/run2_3_codex_scaled_leiden_only_working_markers.h5ad")

In [None]:
# Leiden clusters (labels drawn on the data) for the working-markers-only run.
# scanpy appends `save` to its default file name, so the suffix starts with '_'
# like the other UMAP cells.
sc.pl.umap(old_adata_codex, color="leiden", wspace=0.4, legend_loc="on data", save = '_run2+3_codex_scaled(only_working_markers)_overlayed_cluster_numbers.png')

In [None]:
sc.pl.umap(old_adata_codex, color="xenium_annotation", wspace=0.4, save = '_run2+3_codex_scaled_with_xenium_annotation(only_working_markers).png')

### Check marker genes and proteins in each cluster

In [None]:
sc.tl.rank_genes_groups(old_adata_codex, groupby='leiden', method='wilcoxon', key_added='rank_genes_groups')

In [None]:
# Plot top genes/proteins across clusters
sc.pl.rank_genes_groups(old_adata_codex, n_genes=10, key='rank_genes_groups', sharey=False)

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import math

# Names of all Leiden clusters, in categorical order
clusters = old_adata_codex.obs['leiden'].cat.categories.tolist()

# Number of clusters drawn per figure
batch_size = 8
n_batches = math.ceil(len(clusters) / batch_size)

# plt.savefig does not create missing folders, so make sure the target exists
# before the loop instead of failing on the first save.
# NOTE(review): the folder name contains an unbalanced "(" — kept unchanged so
# existing paths stay valid; confirm whether that is intended.
output_dir = "figures/QC_Run2+3/marker_intensity_plots_run2+3/only_codex(scaled_and_only_working_markers"
os.makedirs(output_dir, exist_ok=True)

# One figure per batch of clusters
for i in range(n_batches):
    batch_clusters = clusters[i * batch_size:(i + 1) * batch_size]

    sc.pl.rank_genes_groups(
        old_adata_codex,
        groups=batch_clusters,
        n_genes=10,
        key='rank_genes_groups',
        sharey=False,
        show=False  # keep the figure open so it can be saved below
    )

    # The file name carries the nominal index range of the batch; the last
    # batch may contain fewer clusters than that range suggests.
    plt.savefig(f"{output_dir}/ranked_genes_clusters_with_cdx_suffix_{i*batch_size}_{(i+1)*batch_size - 1}.png", dpi=300, bbox_inches='tight')
    plt.close()

## Concatenated Xenium + CODEX clustering

In [None]:
# Load the integrated object used for the joint Xenium + CODEX clustering.
# NOTE(review): the file name indicates z-normalised CODEX values — confirm.
adata = ad.read_h5ad("/media/Lynn/data/Integrated_data/adata/run2_3_codex_znorm_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

### Remove trash cells from the 1st totalVI run

In [None]:
# Text file with the cell IDs to drop, one ID per line
input_path = '/media/Lynn/data/totalVI/1st_run/cells_in_clusters_18_22.txt'

# Read every line and trim surrounding whitespace (including the newline)
with open(input_path, 'r') as f:
    cells_in_clusters = list(map(str.strip, f))

print(f"Loaded {len(cells_in_clusters)} cell IDs")

In [None]:
# Keep only the cells whose ID is NOT in the exclusion list; .copy() turns the
# resulting view into an independent AnnData. `adata` is rebound to the filtered object.
adata = adata[~adata.obs_names.isin(cells_in_clusters)].copy()

### Fix CD4 signal

In [None]:
# CD4 values exactly as stored in .obs
# NOTE(review): the loaded file name mentions z-normalised CODEX, so these may
# not be raw intensities — confirm.
cd4_raw = adata.obs["CD4"].values

# Background level = 10th percentile of the CD4 values
bg = np.quantile(cd4_raw, 0.10)

# Subtract the background and floor the result at zero
cd4_corrected = np.maximum(cd4_raw - bg, 0)

# Keep the original column untouched; the corrected signal gets its own column
adata.obs["CD4_corrected"] = cd4_corrected

In [None]:
# Display the AnnData summary (e.g. to check that the CD4_corrected column is present).
adata

In [None]:
# Show the corrected CD4 signal on the `xenium_X_umap` embedding;
# the colour scale is capped at vmax=3.
sc.pl.embedding(
    adata,
    basis="xenium_X_umap",
    color='CD4_corrected',
    frameon=False,
    vmax=3,
    wspace=0.1,
    ncols = 2,
)

In [None]:
# Restrict to the two listed slides (this is a view of `adata`, not a copy)
subset = adata[adata.obs['slide_str'].isin(['ID_0056764', 'ID_0056777'])]

# ECP signal of those slides on the `xenium_X_umap` embedding; colour scale capped at vmax=1
sc.pl.embedding(
    subset,
    basis="xenium_X_umap",
    color='ECP',
    frameon=False,
    vmax=1,
    wspace=0.1,
    ncols=2,
    save='ECP_on_xenium_clustering.png'
)

In [None]:
# Build "<cluster number>: <annotation>" labels, but only when both source
# columns are present in .obs (otherwise nothing is added).
required_columns = {"xenium_annotation", "xenium_leiden_0.7"}
if required_columns.issubset(adata.obs.columns):
    cluster_labels = adata.obs["xenium_leiden_0.7"].astype(str)
    annotation_labels = adata.obs["xenium_annotation"].astype(str)
    adata.obs["xenium_annotation_with_cluster_number"] = cluster_labels + ": " + annotation_labels

### Cluster using only working markers

In [None]:
# CODEX columns fed into the joint clustering; CD4 is represented by the
# background-corrected column 'CD4_corrected'.
# NOTE: this rebinds `codex_channels`, which held the full marker panel at the
# top of the notebook — cells run after this one see the shorter list.
codex_channels = ['aSMA', 'CD8', 'CD31', 
                 'CD11c', 'IFNG', 'Pan-Cytokeratin', 'CD68', 'CD20', 
                 'CD66b', 'CD45RO', 'CD11b', 'Vimentin', 'CD4_corrected',
                 'CD45', 'CCR7', 'CD38', 'Podoplanin'] # 17 markers

In [None]:
# Work on an independent copy so the column renaming below does not touch `adata`.
adata_znorm = adata.copy()

In [None]:
# 1) Give every selected CODEX column in .obs a "_CDX" suffix
codex_renamed = [ch + "_CDX" for ch in codex_channels]
rename_map = {old: new for old, new in zip(codex_channels, codex_renamed)}
adata_znorm.obs.rename(columns=rename_map, inplace=True)

# 2) Expression matrix from .X, densified when it is stored sparse
if sparse.issparse(adata_znorm.X):
    X_expr = adata_znorm.X.toarray()
else:
    X_expr = adata_znorm.X

# 3) CODEX values of the renamed columns, taken from .obs
X_codex = adata_znorm.obs[codex_renamed].to_numpy()

# 4) Place both feature sets side by side (cells stay on the rows)
X_combined = np.concatenate((X_expr, X_codex), axis=1)

# 5) Variable names: the existing var names followed by the renamed CODEX columns
combined_var_names = [*adata_znorm.var_names, *codex_renamed]

# 6) Assemble the joint AnnData with a copy of .obs, then label its variables
adata_joint = AnnData(X=X_combined, obs=adata_znorm.obs.copy())
adata_joint.var_names = combined_var_names

In [None]:
# Joint clustering on the concatenated matrix; all steps use scanpy's defaults
# (no arguments are passed). Order matters: PCA -> neighbour graph -> Leiden / UMAP.
sc.pp.pca(adata_joint)
sc.pp.neighbors(adata_joint)
sc.tl.leiden(adata_joint)
sc.tl.umap(adata_joint)
# Save the clustered joint object to disk
adata_joint.write("/media/Lynn/data/Integrated_data/adata/concatenated_clustering_run2_3_codex_znorm_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Joint UMAP coloured by the `leiden` clusters computed above.
sc.pl.umap(adata_joint, color="leiden", wspace=0.4, save = '_run2+3_concatenated.png')

In [None]:
# Relabel the placeholder annotation "??" as "Other" before plotting.
# String columns loaded from h5ad are usually categorical, and Series.replace
# on a categorical column is deprecated in recent pandas, so the category is
# renamed instead; plain (non-categorical) columns still go through replace().
annotation = adata_joint.obs["xenium_annotation"]
if isinstance(annotation.dtype, pd.CategoricalDtype):
    categories = annotation.cat.categories
    if "??" in categories and "Other" not in categories:
        # Pure rename: category order and codes are preserved
        annotation = annotation.cat.rename_categories({"??": "Other"})
    elif "??" in categories:
        # "Other" already exists, so the two labels must be merged; do that on
        # an object column and convert back to categorical afterwards
        annotation = annotation.astype(object).replace("??", "Other").astype("category")
else:
    annotation = annotation.replace("??", "Other")
adata_joint.obs["xenium_annotation"] = annotation

sc.pl.umap(adata_joint, color="xenium_annotation", title = 'Xenium-only Annotation', wspace=0.4, save = '_run2+3_concatenated_overlayed_xenium_annotation.png')

In [None]:
# Reload the joint object that was written right after clustering.
# NOTE(review): this replaces the in-memory object, so changes made after that
# write (e.g. the "??" -> "Other" relabelling) are not part of it — confirm this is intended.
adata_joint = ad.read_h5ad("/media/Lynn/data/Integrated_data/adata/concatenated_clustering_run2_3_codex_znorm_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Build "<cluster number>: <annotation>" labels on the joint object, but only
# when both source columns are present in .obs (otherwise nothing is added).
required_columns = {"xenium_annotation", "xenium_leiden_0.7"}
if required_columns.issubset(adata_joint.obs.columns):
    cluster_labels = adata_joint.obs["xenium_leiden_0.7"].astype(str)
    annotation_labels = adata_joint.obs["xenium_annotation"].astype(str)
    adata_joint.obs["xenium_annotation_with_cluster_number"] = cluster_labels + ": " + annotation_labels

In [None]:
# Joint UMAP coloured by the combined "<cluster>: <annotation>" labels.
sc.pl.umap(adata_joint, color="xenium_annotation_with_cluster_number", wspace=0.4, save = '_run2+3_concatenated_overlayed_xenium_annotation_with_numbers.png')

In [None]:
# Overwrite the saved joint object with the current in-memory state
# (same path as the file it was loaded from).
adata_joint.write("/media/Lynn/data/Integrated_data/adata/concatenated_clustering_run2_3_codex_znorm_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Joint UMAP coloured by the `xenium_leiden_0.7` column, labels drawn on the embedding.
sc.pl.umap(adata_joint, color="xenium_leiden_0.7", legend_loc= 'on data', wspace=0.4, save = '_run2+3_concatenated_overlayed_xenium_only_cluster_numbers.png')

In [None]:
# Re-cluster the joint object at Leiden resolution 0.7; labels go to obs['leiden_0.7'].
sc.tl.leiden(adata_joint, resolution=0.7, key_added='leiden_0.7')
# NOTE(review): `key_added` on sc.tl.umap needs a scanpy version that supports it,
# and the sc.pl.umap calls further down do not select this embedding explicitly —
# confirm which embedding those plots are meant to show.
sc.tl.umap(adata_joint, key_added='umap_0.7')
# Saved under a new file name (contains "codex_znorm_leiden_0_7"), so the earlier file is kept
adata_joint.write("/media/Lynn/data/Integrated_data/adata/concatenated_clustering_run2_3_codex_znorm_leiden_0_7_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Figure defaults for the high-resolution plots below: 6x6 inch figures,
# no frame, saved at 600 dpi; then apply seaborn's default theme.
sc.set_figure_params(figsize=(6, 6), frameon=False, dpi_save=600)
sns.set_theme()

# IPython inline-backend options: white figure background, "retina" figure format
%config InlineBackend.print_figure_kwargs={"facecolor": "w"}
%config InlineBackend.figure_format="retina"

In [None]:
# Joint UMAP coloured by the resolution-0.7 clusters; the title is a single space (i.e. blank).
sc.pl.umap(adata_joint, color="leiden_0.7", title = ' ', wspace=0.4, save = '_run2+3_concatenated_res_07_highres.png')

In [None]:
# Same plot with the cluster labels drawn directly on the embedding.
sc.pl.umap(adata_joint, color="leiden_0.7", title = ' ', legend_loc = 'on data', wspace=0.4, save = '_run2+3_concatenated_res_07_legend_on_data_highres.png')

In [None]:
# Joint UMAP coloured by the `tissue` column.
sc.pl.umap(adata_joint, color="tissue", title = ' ', wspace=0.4, save = '_run2+3_concatenated_by_tissue_highres.png')

In [None]:
# Rank all features (genes and "_CDX" protein columns) per 'leiden_0.7' cluster
# with the Wilcoxon method; results are stored under the key 'rank_genes_groups'.
sc.tl.rank_genes_groups(adata_joint, groupby='leiden_0.7', method='wilcoxon', key_added='rank_genes_groups')

In [None]:
# Ranking results written by sc.tl.rank_genes_groups
result = adata_joint.uns['rank_genes_groups']

# The field names of the structured 'names' array are the cluster labels
groups = result['names'].dtype.names

# Build one long-format frame per cluster, then stack them under a fresh 0..n-1 index
per_cluster_frames = []
for group in groups:
    columns = {'cluster': group}
    for field in ('names', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj'):
        columns[field] = result[field][group]
    per_cluster_frames.append(pd.DataFrame(columns))

de_table = pd.concat(per_cluster_frames, ignore_index=True)

de_table.head()

In [None]:
# Export the full ranking table as CSV (row index omitted).
de_table.to_csv("/media/Lynn/for_Lisa/concatenated_adata_ranked_gene_groups/rank_genes_groups.csv", index=False)

In [None]:
# Overwrite the resolution-0.7 file so it also contains the ranking results.
adata_joint.write("/media/Lynn/data/Integrated_data/adata/concatenated_clustering_run2_3_codex_znorm_leiden_0_7_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Reload the saved joint object (entry point when resuming from this file).
adata_joint = ad.read_h5ad("/media/Lynn/data/Integrated_data/adata/concatenated_clustering_run2_3_codex_znorm_leiden_0_7_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad")

In [None]:
# Ranking results stored under 'rank_genes_groups' in .uns
rank_genes = adata_joint.uns['rank_genes_groups']
groups = rank_genes['names'].dtype.names  # cluster names

# Top-ranked feature names per cluster
top_n = 5
data = {
    group: rank_genes['names'][group][:top_n].tolist()
    for group in groups
}

# One row per cluster, with its top features joined into a single string
df_top = pd.DataFrame({
    "Cluster": list(data.keys()),
    "Top 5 Genes/Proteins": [
        ", ".join(genes) for genes in data.values()
    ]
})

# Convert to a LaTeX table.
# escape=True because feature names contain "_" (e.g. the "_CDX" suffix), which
# is invalid LaTeX outside math mode. The caption string is written with its
# underscores escaped by hand, and it now names the function that was actually
# used (rank_genes_groups).
latex_table = df_top.to_latex(
    index=False,
    caption=r"Top 5 marker genes or proteins per cluster identified by rank\_genes\_groups().",
    label="tab:top5_markers",
    escape=True
)

print(latex_table)


In [None]:
import pandas as pd 
import re

# Load the run 2+3 sample metadata sheet; the columns patient_ID, slide_ID,
# core_ID, tissue and time_point are used below.
df = pd.read_excel('/media/Lynn/data/metadata/run2_3_metadata.xlsx')

def normalize_patient_id(pid):
    """Return `pid` with its leading number zero-padded to two digits.

    The string form of `pid` is matched from its start against
    "<digits>_<UPPERCASE letters>". On a match the result is
    "<number, at least 2 digits>_<letters>"; anything after the matched
    letters is not part of the result. Values that do not match are
    returned unchanged (same object, no string conversion).
    """
    parsed = re.match(r'(\d+)_([A-Z]+)', str(pid))
    if parsed is None:
        # Not in the expected shape: hand the value back untouched
        return pid
    number = int(parsed.group(1))
    letters = parsed.group(2)
    return "{:02d}_{}".format(number, letters)

# Bring every patient ID into the zero-padded form
df['patient_ID'] = df['patient_ID'].apply(normalize_patient_id)

# sample_ID = "<slide_ID>_<core_ID>", built from the string forms of both columns
slide_part = df['slide_ID'].astype(str)
core_part = df['core_ID'].astype(str)
df['sample_ID'] = slide_part + "_" + core_part

# Count distinct samples and distinct patients per (tissue, time_point) pair
grouped = df.groupby(['tissue', 'time_point'])
summary = grouped.agg(
    n_samples=('sample_ID', 'nunique'),
    n_patients=('patient_ID', 'nunique'),
).reset_index()

print(summary)

In [None]:
# Export the array stored in obsm["X_pca"] of the joint object as a .npy file.
np.save("/media/Lynn/data/Integrated_data/X_pca_concatenated.npy", adata_joint.obsm["X_pca"])