In [None]:
# Import libraries
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.sparse import issparse
from datetime import datetime
from tqdm import tqdm
from sklearn import metrics
from scipy import stats
import warnings
import sys


# Set plotting settings
# Verbosity level 3 is passed to scanpy's logging configuration.
sc.settings.verbosity = 3
# Default figure parameters for every scanpy plot produced below:
# 100 dpi and no frame around the axes.
sc.settings.set_figure_params(dpi=100, frameon=False)

# Root of the project tree. The default is the original cluster location; set
# the SRF_LINDA_RNA_BASE_DIR environment variable to run the notebook against
# a different checkout without editing this cell.
BASE_DIR = os.environ.get(
    "SRF_LINDA_RNA_BASE_DIR",
    "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Linda_RNA/",
)

In [None]:
# Install a blanket "ignore" filter for all Python warnings.
# NOTE(review): this applies to every later cell as well, so it also hides
# warnings raised by this notebook's own `warnings.warn(...)` calls unless a
# cell explicitly overrides the filter.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Short sample label -> name of the corresponding Cell Ranger output directory.
samples = dict(
    Emx1_Ctrl="cellranger_counts_R26_Emx1_Ctrl_adult_0",
    Emx1_Mut="cellranger_counts_R26_Emx1_Mut_adult_1",
    Nestin_Ctrl="cellranger_counts_R26_Nestin_Ctrl_adult_2",
    Nestin_Mut="cellranger_counts_R26_Nestin_Mut_adult_3",
)

In [None]:
# This cell will be parameterized by the script
# The placeholder string below is substituted textually before execution, so
# the assignment line must keep its exact form.
# The value is later used as a key into `samples`, so it has to be one of the
# labels defined there.
SAMPLE_NAME = "SAMPLE_PLACEHOLDER"  # This will be replaced with the actual sample name
# For manual/interactive runs, uncomment the line below instead:
# SAMPLE_NAME = "Emx1_Ctrl"
print(f"Processing sample: {SAMPLE_NAME}")

# %% [markdown]
# # 1. Setup and Data Loading

In [None]:
# Resolve the Cell Ranger directory for the requested sample. Fail early with
# an explicit message when the placeholder was never substituted or the label
# is not one of the configured samples (same exception type as a plain lookup).
if SAMPLE_NAME not in samples:
    raise KeyError(
        f"Unknown SAMPLE_NAME {SAMPLE_NAME!r}; expected one of {sorted(samples)}"
    )
SAMPLE = samples[SAMPLE_NAME]

# Directory that holds the shared helper module and one sub-directory per sample
ANALYSIS_ROOT = os.path.join(BASE_DIR, "post_analysis", "individual_data_analysis_opt_clusters")

WORKING_DIR = os.path.join(ANALYSIS_ROOT, SAMPLE)
os.makedirs(WORKING_DIR, exist_ok=True)

CELL_DATA_DIR = "cellranger_final_count_data"
matrix_dir = os.path.join(BASE_DIR, CELL_DATA_DIR, SAMPLE, "outs", "filtered_feature_bc_matrix")

# Later cells pass relative output paths to the clustering helper and then read
# the results back via OUTPUT_DIR, so the working directory must be WORKING_DIR.
os.chdir(WORKING_DIR)
OUTPUT_DIR = WORKING_DIR

# Make the project-local `functions` module importable (added once, even if
# this cell is re-run in the same kernel).
if ANALYSIS_ROOT not in sys.path:
    sys.path.append(ANALYSIS_ROOT)
# Explicit import instead of `import *`: this is the only name from `functions`
# that the notebook uses.
from functions import analyze_and_select_best_clustering

# Load the data from the filtered matrix
try:
    adata = sc.read_10x_mtx(
        matrix_dir,
        var_names='gene_symbols',
        cache=True
    )
    print(f"Shape of loaded data: {adata.shape}")  # cells × genes
except ValueError as e:
    print(f"Error loading data: {e}")
    # Retry without the on-disk cache, which bypasses a cached copy that no
    # longer matches the matrix files.
    adata = sc.read_10x_mtx(
        matrix_dir,
        var_names='gene_symbols',
        cache=False
    )
    print(f"Shape of loaded data after retry: {adata.shape}")  # cells × genes

# %% [markdown]
# # 2. Basic Pre-processing

In [None]:
# Snapshot the freshly loaded matrix in .raw before any filtering or transformation
adata.raw = adata.copy()

# Drop near-empty barcodes and genes detected in almost no cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Flag mitochondrial genes by their 'mt-' prefix and compute per-cell QC metrics
adata.var['mt'] = adata.var_names.str.startswith('mt-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# One histogram per QC metric: (obs column, panel title)
qc_panels = [
    ('n_genes_by_counts', 'Genes per cell'),
    ('total_counts', 'UMI counts per cell'),
    ('pct_counts_mt', 'Percent mitochondrial'),
]
fig, axs = plt.subplots(1, 3, figsize=(15, 4))
for panel_ax, (obs_column, panel_title) in zip(axs, qc_panels):
    sns.histplot(adata.obs[obs_column], kde=False, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()

# Write the figure next to the other outputs of this sample, then show it
qc_plot_path = os.path.join(OUTPUT_DIR, 'qc_metrics.png')
plt.savefig(qc_plot_path)
plt.show()

# %% [markdown]
# # 3. Filtering Based on QC Metrics

In [None]:
# QC thresholds (all bounds are exclusive)
max_genes = 15000   # upper bound on detected genes per cell
min_genes = 500     # lower bound on detected genes per cell
max_mt_pct = 20     # upper bound on percent mitochondrial counts

# Build one boolean mask for all three criteria and materialise the result with
# .copy(). Chained `adata = adata[mask]` subsetting leaves `adata` as a view of
# the unfiltered object, which the in-place normalisation steps below would
# then have to convert implicitly.
qc_pass = (
    (adata.obs['n_genes_by_counts'] < max_genes)
    & (adata.obs['n_genes_by_counts'] > min_genes)
    & (adata.obs['pct_counts_mt'] < max_mt_pct)
).to_numpy()
adata = adata[qc_pass, :].copy()

print(f"Number of cells after filtering: {adata.n_obs}")
print(f"Number of genes after filtering: {adata.n_vars}")

# %% [markdown]
# # 4. Normalization and Log Transformation

In [None]:
# Normalize every cell to a total of 10,000 counts
sc.pp.normalize_total(adata, target_sum=1e4)

# Log transform (log1p)
sc.pp.log1p(adata)

# Identify highly variable genes; the flag is stored in adata.var['highly_variable']
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
print(f"Number of highly variable genes: {int(adata.var['highly_variable'].sum())}")

# Plot highly variable genes.
# No plt.figure() beforehand: scanpy opens its own figure for this plot, so a
# pre-created one would only be rendered as an empty canvas.
sc.pl.highly_variable_genes(adata, show=False)
plt.tight_layout()
plt.show()

In [None]:
# Save the current normalized and log-transformed data to a new layer BEFORE scaling
# A copy is stored so that later in-place changes to adata.X do not alter the layer.
adata.layers['for_cell_typist'] = adata.X.copy()

In [None]:
# Quick check that the data in the layer is correctly normalized:
# undo log1p and confirm that each cell sums to roughly the 10k target.
if issparse(adata.layers['for_cell_typist']):
    # For sparse storage only the stored (non-zero) entries need transforming,
    # because expm1(0) == 0.
    counts_in_layer = adata.layers['for_cell_typist'].copy()
    counts_in_layer.data = np.expm1(counts_in_layer.data)
else:
    counts_in_layer = np.expm1(adata.layers['for_cell_typist'])

# Sum counts per cell
total_counts_layer = np.asarray(counts_in_layer.sum(axis=1)).flatten()
mean_total_counts = total_counts_layer.mean()

print("\nVerifying normalization in 'for_cell_typist' layer:")
print(f"  Mean total counts (reversed log1p): {mean_total_counts:.2f}")
print(f"  Median total counts (reversed log1p): {np.median(total_counts_layer):.2f}")

# Basic QC check for the layer (accept a 1% tolerance around 10k; a NaN mean
# also counts as a failure).
if not (9900 <= mean_total_counts <= 10100):
    # The notebook installs a global "ignore" warnings filter earlier on, which
    # would silently drop this message. Override it locally so that a failed
    # check is actually reported.
    with warnings.catch_warnings():
        warnings.simplefilter("always")
        warnings.warn(f"Normalization in 'for_cell_typist' layer may not be exactly 10k (Mean: {mean_total_counts:.2f}). Check normalization step.")
else:
    print("  Normalization in 'for_cell_typist' layer appears correct (around 10k).")

# %% [markdown]
# # 5. Dimensionality Reduction

In [None]:
# Scale adata.X to unit variance and zero mean AFTER saving the normalized layer
# This step modifies adata.X but leaves adata.layers['for_cell_typist'] untouched
# max_value=10 clips the scaled values at 10.
sc.pp.scale(adata, max_value=10)

# Run PCA
# The result is consumed by the neighbourhood-graph step in the next cell.
sc.tl.pca(adata, svd_solver='arpack')

# Determine number of significant PCs
# Plots the variance ratio of the first 50 components on a log scale; use it to
# choose the number of PCs for the downstream steps.
sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)
plt.show()

In [None]:
# Choose number of PCs for downstream analyses
n_pcs = 30  # Adjust based on the variance ratio plot

# Compute neighborhood graph on the first n_pcs principal components
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=n_pcs)

# Run UMAP
sc.tl.umap(adata)

# Plot UMAP coloured by the per-cell QC metrics.
# No plt.figure() beforehand: for several `color` keys scanpy builds its own
# multi-panel figure, so a pre-created one would only show up as an empty canvas.
sc.pl.umap(adata, color=['total_counts', 'n_genes_by_counts', 'pct_counts_mt'],
           use_raw=False, color_map='viridis', show=False)
plt.tight_layout()
plt.show()

# %% [markdown]
# # 6. Clustering Resolution Selection and Marker Gene Identification

In [None]:
# Candidate Leiden resolutions: n_resolutions evenly spaced values across
# resolution_range (both ends included), rounded to two decimals.
resolution_range = [0.05, 0.8]
n_resolutions = 10
resolutions = [
    round(value, 2)
    for value in np.linspace(resolution_range[0], resolution_range[1], n_resolutions)
]

In [None]:
# Sanity-check the three representations of the expression data by printing
# the first five values of the first cell from each of them.
def _preview_first_cell(matrix, n_values=5):
    """Return the first `n_values` entries of row 0 as a flat dense array."""
    head = matrix[0, :n_values]
    return head.toarray().flatten() if issparse(head) else np.asarray(head).flatten()

# adata.X has already been through sc.pp.scale at this point, so it holds
# scaled values rather than log1p values.
print("X matrix values (first cell):", _preview_first_cell(adata.X))
print("Should be scaled values (zero mean / unit variance, clipped at 10)")

# The log-normalized values were preserved in a layer before scaling.
if 'for_cell_typist' in adata.layers:
    print("'for_cell_typist' layer values (first cell):",
          _preview_first_cell(adata.layers['for_cell_typist']))
    print("Should be log1p transformed values (~0-5 range)")

# Check raw values if raw exists
# NOTE(review): .raw was stored before gene filtering, so its first five
# columns are not necessarily the same genes as above — confirm if comparing.
if adata.raw is not None:
    print("Raw values:", _preview_first_cell(adata.raw.X))
    print("Should be original counts (integers)")


In [None]:
# Evaluate every candidate resolution with the project helper and keep the
# resolution it returns.
# NOTE(review): output_dir is relative, so the results land under the current
# working directory; later cells read them back via OUTPUT_DIR.
optimal_resolution = analyze_and_select_best_clustering(
    adata,
    resolutions=resolutions,
    leiden_key='leiden',               # Base name for cluster labels
    run_marker_analysis=True,          # Run marker gene analysis
    output_dir="my_cluster_analysis",  # Output directory
)

# Make sure the labels for the chosen resolution exist in adata.obs; compute
# them here only when the helper has not stored them already.
best_clustering = f"leiden_{optimal_resolution}"
clustering_missing = best_clustering not in adata.obs.columns
if clustering_missing:
    sc.tl.leiden(adata, key_added=best_clustering, resolution=optimal_resolution)

In [None]:
# Load the per-resolution quality metrics written by the clustering analysis
df = pd.read_csv(os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'evaluation', 'clustering_quality_metrics.csv'))

# Sort the dataframe by overall_score in descending order
sorted_df = df.sort_values(by='overall_score', ascending=False)

# Read both columns from the sorted frame in one pass. The two lists are
# aligned by position, so no per-resolution lookup (and no float equality
# comparison) is needed to pair a resolution with its score.
ordered_resolutions = sorted_df['resolution'].tolist()
scores = sorted_df['overall_score'].tolist()

print("Resolutions ordered by overall_score (highest to lowest):")
for i, (res, score) in enumerate(zip(ordered_resolutions, scores), 1):
    print(f"{i}. Resolution: {res}, Overall Score: {score}")

In [None]:
# Show the UMAP for the three best-scoring resolutions side by side
best_resolutions = ordered_resolutions[:3]
print(best_resolutions)

# squeeze=False always yields a 2-D axes grid, so the loop below also works
# when fewer than two resolutions are available (plt.subplots would otherwise
# return a bare Axes that cannot be indexed).
fig, axes_grid = plt.subplots(1, len(best_resolutions), figsize=(20, 5), squeeze=False)
axes = axes_grid[0]

# `scores` is ordered like `ordered_resolutions`, so the first entries belong
# to `best_resolutions`.
for ax, res, score in zip(axes, best_resolutions, scores):
    sc.pl.umap(adata, color=f'leiden_{res}', title=f'Resolution {res}, score {score}',
               frameon=True, legend_loc='on data', legend_fontsize=10, ax=ax, show=False)

# Ensure proper spacing between subplots
plt.tight_layout()
plt.show()

# %% [markdown]
# # 7. Save Processed Data

In [None]:
# Destination of the processed object for this sample
output_adata_file = os.path.join(OUTPUT_DIR, f"{SAMPLE_NAME}_processed.h5ad")

# Report every Leiden labelling stored in adata.obs with its cluster count
print("Clustering assignments stored in the AnnData object:")
leiden_columns = list(filter(lambda name: name.startswith('leiden_'), adata.obs.columns))
for col in leiden_columns:
    n_clusters = len(adata.obs[col].unique())
    print(f"  - {col}: {n_clusters} clusters")

# Write the AnnData object; a failure is reported but does not stop the notebook
print(f"\nSaving processed AnnData object to: {output_adata_file}")
try:
    adata.write(output_adata_file)
except Exception as e:
    print(f"Error saving AnnData object: {e}")
else:
    print("Successfully saved AnnData object with all clustering assignments.")

# %% [markdown]
# # 8. Visualize Clustering Results and Quality Metrics

In [None]:
# Display the optimal clustering on UMAP.
# The axes are created explicitly and handed to scanpy: without `ax`, scanpy
# opens a figure of its own, so a preceding plt.figure(figsize=...) would be
# left empty and the requested size would never reach the saved plot.
fig, ax = plt.subplots(figsize=(12, 10))
sc.pl.umap(adata, color=f'leiden_{optimal_resolution}',
           title=f'Optimal Clustering (Resolution={optimal_resolution})',
           legend_loc='on data', frameon=True, ax=ax, show=False)
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, 'optimal_clustering_umap.png'), dpi=150)
plt.show()

# %% [markdown]
# ## 8.1 Clustering Quality Metrics Analysis

In [None]:
# Read the clustering quality metrics and show the headline columns
metrics_csv_path = os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'evaluation', 'clustering_quality_metrics.csv')
metrics_df = pd.read_csv(metrics_csv_path)

headline_columns = [
    'resolution',
    'n_clusters',
    'silhouette_score',
    'davies_bouldin_score',
    'marker_gene_score',
    'overall_score',
]
print("Clustering quality metrics summary:")
display(metrics_df[headline_columns])

In [None]:
# Display the main clustering quality metrics visualization
# `Image` imported here is also used by the cells below, so this cell has to
# run before them.
from IPython.display import Image, display

print("Clustering quality metrics across resolutions:")
# The PNG is read from the evaluation sub-directory of the analysis output.
metrics_img = Image(os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'evaluation', 'clustering_quality_metrics.png'))
display(metrics_img)

In [None]:
# Display the metric detail figures if available.
# Each file is checked individually: the directory can exist without containing
# both images, and loading a missing file would abort the cell.
metric_details_path = os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'evaluation', 'metric_details')
if os.path.exists(metric_details_path):
    # (file name, caption printed above the image)
    detail_figures = [
        ('metric_contributions.png', "Contribution of each metric to the overall score:"),
        ('individual_metrics.png', "Individual metrics across resolutions:"),
    ]
    for file_name, caption in detail_figures:
        figure_path = os.path.join(metric_details_path, file_name)
        if not os.path.exists(figure_path):
            print(f"Skipping missing figure: {figure_path}")
            continue
        print(caption)
        display(Image(figure_path))

In [None]:
# Load and display the metric contribution summary
# The table is optional: nothing is shown when the CSV does not exist.
contribution_summary_path = os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'evaluation', 'metric_details', 'metric_contribution_summary.csv')
if os.path.exists(contribution_summary_path):
    contribution_df = pd.read_csv(contribution_summary_path)
    print("Metric contribution summary:")
    display(contribution_df)

# %% [markdown]
# ## 8.2 Marker Genes for Optimal Clustering

In [None]:
# Display marker heatmap for optimal clustering.
# When ranked marker genes are available in adata.uns, an improved heatmap is
# drawn from them; otherwise the PNG written by the clustering analysis is shown.
optimal_heatmap_path = os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'evaluation', 'optimal_clustering_heatmap.png')
if os.path.exists(optimal_heatmap_path):
    leiden_key = f'leiden_{optimal_resolution}'
    rank_key = f"rank_genes_{optimal_resolution}"

    # Check if we have marker genes information
    if rank_key in adata.uns:
        # Number of top-ranked genes to show per cluster (adjust as needed)
        n_top_genes = 20

        # 'names' is a structured array with one field per cluster, so a plain
        # row slice is not a list of gene names. Build an explicit
        # {cluster: [genes]} mapping, which sc.pl.heatmap accepts as var_names
        # and uses to label the gene groups.
        # NOTE(review): a gene may be listed for several clusters and then
        # appears once per group — confirm this is acceptable.
        ranked_names = adata.uns[rank_key]['names']
        top_markers = {
            group: [str(gene) for gene in ranked_names[group][:n_top_genes]]
            for group in ranked_names.dtype.names
        }

        sc.tl.dendrogram(adata, groupby=leiden_key)

        # Create an improved heatmap with better formatting. The figure size is
        # passed to scanpy directly, because it opens its own figure and a
        # separate plt.figure() call would only leave an empty canvas.
        sc.pl.heatmap(adata, var_names=top_markers,
                      groupby=leiden_key,
                      swap_axes=True,              # Put genes on y-axis for better labels
                      show_gene_labels=True,       # Show gene names
                      dendrogram=True,             # Show the dendrogram
                      cmap='viridis',              # Use a perceptually uniform colormap
                      vmin=0, vmax=None,           # Set minimum value to 0
                      standard_scale='var',        # Scale expression by gene
                      use_raw=True,                # Take expression values from adata.raw
                      figsize=(14, 10),
                      show=False)

        # The title goes on the figure as a whole; plt.title() would attach it
        # to whichever of the heatmap's several axes happens to be current.
        heatmap_fig = plt.gcf()
        heatmap_fig.suptitle(f"Top Markers for Optimal Clustering (Resolution={optimal_resolution})", fontsize=16)

        # Save the improved heatmap (bbox_inches='tight' crops the surrounding margins)
        improved_heatmap_path = os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'evaluation', 'improved_clustering_heatmap.png')
        heatmap_fig.savefig(improved_heatmap_path, dpi=150, bbox_inches='tight')
        plt.show()

        print(f"Improved marker gene heatmap saved to: {improved_heatmap_path}")
    else:
        # If we don't have marker genes, display the original heatmap
        optimal_heatmap_img = Image(optimal_heatmap_path)
        print(f"Marker gene heatmap for optimal clustering (resolution={optimal_resolution}):")
        display(optimal_heatmap_img)

In [None]:
# Print the ten most significant markers of every cluster at the optimal
# resolution, read from the CSV written by the marker analysis (if present).
markers_file = os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'marker_analysis', f'cluster_markers_res{optimal_resolution}.csv')
if os.path.exists(markers_file):
    markers_df = pd.read_csv(markers_file)

    # cluster -> list of (gene, rounded log fold change, formatted adjusted p-value)
    top_markers_by_cluster = {}
    for cluster in sorted(markers_df['cluster'].unique()):
        in_cluster = markers_df['cluster'] == cluster
        cluster_markers = markers_df[in_cluster].sort_values('pvals_adj').head(10)
        top_markers_by_cluster[cluster] = [
            (gene, lfc, f"{padj:.2e}")
            for gene, lfc, padj in zip(
                cluster_markers['names'],
                cluster_markers['logfoldchanges'].round(2),
                cluster_markers['pvals_adj'],
            )
        ]

    # Numbered listing, one section per cluster
    print(f"Top marker genes for each cluster at resolution {optimal_resolution}:")
    for cluster, markers in top_markers_by_cluster.items():
        print(f"\nCluster {cluster}:")
        for i, (gene, lfc, pval) in enumerate(markers, 1):
            print(f"  {i}. {gene} (log2FC: {lfc}, adj.p-val: {pval})")

# %% [markdown]
# ## 8.3 Resolution Comparison Summary

In [None]:
# Show the resolution comparison table and two bar charts derived from it
# (only when the summary CSV exists).
summary_file = os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'marker_analysis', 'resolution_comparison_summary.csv')
if os.path.exists(summary_file):
    summary_df = pd.read_csv(summary_file)
    print("Resolution comparison summary:")
    display(summary_df)

    # One panel per metric: (column, y-axis label, panel title)
    comparison_panels = [
        ('Clusters', 'Number of clusters', 'Clusters by resolution'),
        ('Avg_markers_per_cluster', 'Avg. significant markers per cluster', 'Marker genes by resolution'),
    ]
    resolution_labels = summary_df['Resolution'].astype(str)

    comparison_fig, comparison_axes = plt.subplots(1, 2, figsize=(12, 6))
    for panel_ax, (column, y_label, panel_title) in zip(comparison_axes, comparison_panels):
        panel_ax.bar(resolution_labels, summary_df[column])
        panel_ax.set_xlabel('Resolution')
        panel_ax.set_ylabel(y_label)
        panel_ax.set_title(panel_title)

    plt.tight_layout()
    plt.show()

# %% [markdown]
# ## 8.4 Interactive Visualization of Optimal Clustering

In [None]:
# UMAP of the optimal clustering plus an optional Panel widget for browsing
# the top marker genes of a selected cluster.
leiden_key = f'leiden_{optimal_resolution}'

# Plot UMAP coloured by the optimal clustering
sc.pl.umap(adata, color=leiden_key, legend_loc='on data',
           title=f'Interactive UMAP (resolution={optimal_resolution})',
           frameon=True, return_fig=True)

# Create an interactive panel to explore top markers for each cluster
try:
    # Only import panel libraries if needed to avoid dependencies
    import panel as pn
    pn.extension()

    def _cluster_sort_key(label):
        """Order purely numeric labels by value; any other label sorts after them."""
        return (0, int(label), "") if label.isdigit() else (1, 0, label)

    # Get unique clusters. The labels are strings, so a plain sort would put
    # "10" before "2"; sort numeric labels by their integer value instead.
    clusters = sorted(adata.obs[leiden_key].unique().astype(str).tolist(),
                      key=_cluster_sort_key)

    # Create widgets
    cluster_selector = pn.widgets.Select(name='Select Cluster', options=clusters)
    n_genes_slider = pn.widgets.IntSlider(name='Number of Genes', start=5, end=20, value=10)

    @pn.depends(cluster_selector.param.value, n_genes_slider.param.value)
    def get_marker_table(cluster, n_genes):
        """Return a pane with the top `n_genes` ranked markers of `cluster`."""
        # Get markers for selected cluster
        if f"rank_genes_{optimal_resolution}" in adata.uns:
            markers_df = sc.get.rank_genes_groups_df(adata, group=cluster,
                                                    key=f"rank_genes_{optimal_resolution}")
            markers_df = markers_df.head(n_genes)
            return pn.pane.DataFrame(markers_df[['names', 'logfoldchanges', 'pvals_adj']],
                                     width=600, height=400)
        else:
            return pn.pane.Markdown("Marker gene information not available")

    # Create layout
    marker_panel = pn.Column(
        pn.pane.Markdown(f"# Marker Genes Explorer for Resolution {optimal_resolution}"),
        pn.Row(cluster_selector, n_genes_slider),
        get_marker_table
    )

    # Display panel
    display(marker_panel)
except ImportError:
    print("Panel library not available. Install with 'pip install panel' for interactive visualizations.")
except Exception as e:
    # The explorer is optional; report the problem and let the notebook continue.
    print(f"Error creating interactive visualization: {e}")

# %% [markdown]
# # 9. Summary and Conclusion

In [None]:
# Render the plain-text analysis summary (if one was written) as a fenced
# code block so that its layout is preserved.
summary_path = os.path.join(OUTPUT_DIR, 'my_cluster_analysis', 'analysis_summary.txt')
if os.path.exists(summary_path):
    with open(summary_path, 'r') as handle:
        summary_text = handle.read()

    from IPython.display import Markdown
    fenced_summary = Markdown(f"```\n{summary_text}\n```")
    display(fenced_summary)

In [None]:
# Closing banner with the key figures of this run
banner = '=' * 50
optimal_key = f'leiden_{optimal_resolution}'

print(f"\n{banner}")
print("CLUSTERING ANALYSIS COMPLETED")
print(banner)
print(f"Sample: {SAMPLE_NAME}")
print(f"Optimal resolution: {optimal_resolution}")
print(f"Number of clusters: {len(adata.obs[optimal_key].unique())}")
print(f"Total cells analyzed: {adata.n_obs}")
print(f"Results saved to: {os.path.abspath(OUTPUT_DIR)}")
print(banner)


