In [None]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# change the background of input cells
# NOTE(review): select=[2, 4] presumably picks the two "Fill in input data here"
# cells by position — confirm the indices whenever cells are added or removed.
bgcolor("PowderBlue", select=[2, 4])

# File name of this notebook; handed to the version comparison below
nb_name = "proportion_analysis.ipynb"

# NOTE(review): presumably checks this notebook's version against the installed sctoolbox — confirm
_compare_version(nb_name)

# Proportion Analysis
<hr style="border:2px solid black">

## 1 - Description

**Requires a clustered or otherwise categorized anndata object. A clustering can be generated with a clustering notebook (e.g. `rna_analysis/notebooks/04_clustering.ipynb`).**

**Move this notebook into the notebook folder (e.g. `rna_analysis/notebooks/`) of the respective analysis before using it!**

Differential proportion analysis aims to identify clusters whose composition differs between biological conditions. Scanpro offers a linear regression framework and empirical Bayes moderated statistical tests, taking sample-to-sample variation into account. Scanpro also generates pseudo-replicates automatically for unreplicated data.

In this notebook we will use [Scanpro](https://github.com/loosolab/scanpro). For more information, check the [documentation](https://scanpro.readthedocs.io/en/latest/).

<img src="https://raw.githubusercontent.com/loosolab/scanpro/main/docs/source/figures/scanpro_workflow.png" alt="image" width="65%" height="auto">


___

## 2 - Loading packages

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)  # no limit to the number of columns shown
from sctoolbox import settings  # holds the input/output paths and the log file used below
import sctoolbox.utils as utils  # provides utils.adata.load_h5ad / save_h5ad
import sctoolbox.utils.decorator as deco  # provides log_anndata, applied to scanpro in section 6

# We will use Scanpro for proportion analysis
from scanpro import scanpro

___

## 3 - Input/output settings

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# In/output paths
settings.adata_input_dir = "../adatas/"
settings.adata_output_dir = "../adatas/"
settings.figure_dir = "../figures/proportion_analysis/"
# This has to be an assignment ("="); written with ":" it is a bare annotation
# that never sets the log file path.
settings.log_file = "../logs/scanpro_analysis_log.txt"

# Input/Output
last_notebook_adata = "anndata_4_2D.h5ad"  # file name of the anndata loaded in section 4
output = "anndata_scanpro.h5ad"  # file name of the anndata saved in section 7
plot_suffix = "scanpro"  # leading part of the saved plot file names

___

## 4 - Load anndata

In [None]:
# Load the anndata named in the input/output settings
adata = utils.adata.load_h5ad(last_notebook_adata)

# Replace the spaces in these obs column names with underscores
# (condition_col in section 5 uses the underscore spelling).
obs_column_renames = {
    'meta age': 'meta_age',
    'meta age_dec': 'meta_age_dec',
    'meta sex': 'meta_sex',
    'meta c19_severity': 'meta_c19_severity',
    'meta cohort': 'meta_cohort',
    'atac plate': 'atac_plate',
}
adata.obs.rename(columns=obs_column_renames, inplace=True)

# Use the exact pandas option names: the option pattern is matched as a regular
# expression, so "display.max.rows" only worked because "." also matches "_".
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata)
    display(adata.obs)

___

## 5 - General Input

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Clustering or celltype annotation column
clustering_col = "clustering"

# Sample (replicate) column; set to None if not available
sample_col = None

# Column holding the conditions to compare
condition_col = "meta_c19_severity"
# Optional subset of conditions, e.g. ["mild", "severe"]; passed to Scanpro as `conditions`
specific_conds = None
# Transformation method
trans = 'arcsin'  # can be "logit" or "arcsin".

# Covariates to include in analysis
covariates = None

### For unreplicated data ###
# If sample_col=None, data is assumed unreplicated.
# Parameters for the bootstrapping if data is unreplicated
n_sims = 100  # number of bootstrapping simulations
n_reps = 8  # number of pseudo-replicates to generate for each condition

# P-value threshold to determine significance (values below it are flagged as significant)
significance_threshold = 0.12

### Plots ###
# Clusters to plot
specific_clusters = None  # specify clusters you want to plot: ["c1", "c2",...], None to plot all
# Number of plots per row
n_cols = 4

--------------

## 6 - Proportion analysis with Scanpro

In [None]:
# add decorator to scanpro
# Re-running this cell would otherwise wrap the already decorated function again.
# Decorators built with functools.wraps expose the undecorated function as
# `__wrapped__`, so unwrap first when that attribute exists; without it the
# function is decorated directly.
scanpro = deco.log_anndata(getattr(scanpro, "__wrapped__", scanpro))

In [None]:
# Map the settings from section 5 onto Scanpro's keyword arguments
scanpro_kwargs = {
    "clusters_col": clustering_col,
    "samples_col": sample_col,
    "conds_col": condition_col,
    "conditions": specific_conds,
    "covariates": covariates,
    "transform": trans,
    "n_sims": n_sims,
    "n_reps": n_reps,
}

# Run the proportion analysis on the loaded anndata
out = scanpro(adata, **scanpro_kwargs)

# Display the results
out.results

In [None]:
# Adjusted p-values are only present when sample information was given;
# otherwise fall back to the plain p-values.
if sample_col:
    sig_col = "adjusted_p_values"
else:
    sig_col = "p_values"

# Flag every row of the results whose p-value lies below the threshold
below_threshold = out.results[sig_col] < significance_threshold
significant_change = below_threshold.to_dict()
significant_change

In [None]:
# Strip plot of the selected clusters, saved as PDF in the figure directory
stripplot_path = f'{settings.figure_dir}{plot_suffix}_stripplot.pdf'
out.plot(
    kind='stripplot',
    clusters=specific_clusters,
    n_columns=n_cols,
    save=stripplot_path,
)

In [None]:
# Box plot of the selected clusters, saved as PDF in the figure directory
boxplot_path = f'{settings.figure_dir}{plot_suffix}_boxplot.pdf'
out.plot(
    kind='boxplot',
    clusters=specific_clusters,
    n_columns=n_cols,
    save=boxplot_path,
)

The above plots show the proportion (amount) of cells of each group (e.g. cell type) allocated to each of the conditions. The p-value on top of each plot indicates whether there is a significant change in proportion between any of the conditions. If there are no replicates, Scanpro creates simulated replicates (similar to random subsamples) to improve statistical robustness. The replicates are shown either as separate entities (upper plot) or as a box distribution (lower plot). For example, for a dataset where `clustering_col = "celltype"` and `condition_col = "injury"`, a plot with a low p-value can be interpreted as "Cell Type X shows a high change in the number of cells between injured and healthy", and a plot with a high p-value can be interpreted as "Cell Type Y shows a low change in the number of cells between injured and healthy".

In [None]:
# Collect the outputs of the Scanpro run together with the transformation used
scanpro_summary = {
    "results": out.results,
    "significance": significant_change,
    "proportions": out.props,
    "counts": out.counts,
    "transformation": trans,
    "conditions": out.conditions,
}

# Store the collection in adata.uns under the "scanpro" key
scanpro_uns_dict = {"scanpro": scanpro_summary}
adata.uns.update(scanpro_uns_dict)

___

## 7 - Saving adata

In [None]:
# Save the anndata, which now carries the Scanpro results in `.uns`, under the file name in `output`
# NOTE(review): presumably written into settings.adata_output_dir — confirm
utils.adata.save_h5ad(adata, output)

In [None]:
# Close the sctoolbox log file as the final step of the notebook
settings.close_logfile()