In [None]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# change the background of input cells
# NOTE(review): select=[2, 4] presumably addresses the two "fill in" cells
# (input/output settings and general input) - re-check the indices whenever
# code cells are added or removed above them.
bgcolor("PowderBlue", select=[2, 4])

# File name of this notebook; handed to the version check below.
nb_name = "proportion_analysis.ipynb"

# NOTE(review): presumably compares the version of this notebook with the
# installed sctoolbox version - confirm against sctoolbox.utils.jupyter.
_compare_version(nb_name)

# Proportion Analysis
<hr style="border:2px solid black"> </hr>

## 1 - Description

**Requires a clustered or otherwise categorized anndata object. A clustering can be generated with a clustering notebook (e.g. `rna_analysis/notebooks/04_clustering.ipynb`).**

**Move this notebook into the notebook folder (e.g. `rna_analysis/notebooks/`) of the respective analysis before using it!**

Differential proportion analysis aims to identify clusters showing differential composition between different biological conditions. Scanpro offers a linear regression framework and empirical Bayes moderated statistical tests, taking sample-to-sample variation into account. Scanpro also generates pseudo-replicates automatically for unreplicated data.

In this notebook we will use [Scanpro](https://github.com/loosolab/scanpro). For more information, check the [documentation](https://scanpro.readthedocs.io/en/latest/).

<img src="https://raw.githubusercontent.com/loosolab/scanpro/main/docs/source/figures/scanpro_workflow.png" alt="image" width="65%" height="auto">


___

## 2 - Setup

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)  # no limit to the number of columns shown
from sctoolbox import settings
import sctoolbox.utils as utils
import sctoolbox.utils.decorator as deco
from sctoolbox.plotting.general import plot_table
import matplotlib.pyplot as plt
from pathlib import Path

# We will use Scanpro for proportion analysis
from scanpro import scanpro

# Load the settings stored under the "proportion_analysis" key of config.yaml.
settings.settings_from_config("config.yaml", key="proportion_analysis")

# Show the version report without truncating rows or cell contents.
# The exact option name "display.max_rows" is used on purpose: the previous
# spelling "display.max.rows" is not a registered option and was only resolved
# through pandas' regex fallback, which fails as soon as more than one option
# matches the pattern.
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    display(utils.general.get_version_report(report="versions.yml"))

___

## 3 - Input/output settings

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Input/Output
# File that is loaded as `adata` in the "Load anndata" section.
last_notebook_adata = "anndata_4.h5ad"
# File the annotated `adata` is written to in the "Saving adata" section.
output = "anndata_scanpro.h5ad"
# Name component shared by all tables/plots this notebook writes
# (e.g. f'01_{plot_suffix}_results.png', f'{plot_suffix}_stripplot.pdf').
plot_suffix = "scanpro"

___

## 4 - Load anndata

In [None]:
# Load the anndata object configured in the input/output settings.
adata = utils.adata.load_h5ad(last_notebook_adata)

# Preview the object and its obs table: at most 5 rows, but every column.
# Exact option names are used ("display.max_rows", "display.max_columns");
# the former dotted spellings only worked through pandas' regex fallback and
# would raise once a pattern matches more than one option.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata)
    display(adata.obs)

---

## 5 - General Input
<hr style="border:2px solid black"> </hr>

### 5.1 - Parameter Overview

| Parameter | Description | Options |
|-----------|-------------|---------|
| clustering_col | The name of a column in `adata.obs` with clustering or celltype annotation | Column should be categorical |
| sample_col | The name of a column in `adata.obs` with sample information | Set to None if not available |
| condition_col | The name of a column in `adata.obs` with conditions | Column should be categorical |
| specific_conds | Specify conditions to compare | e.g. `["cond1", "cond2",...]` If None, all conditions are compared |
| pairwise | Set to True to do pairwise comparisons between all conditions. Pairwise comparisons give more insight into changes between each pair of conditions, which can be hard to distinguish with normal comparison | True or False |
| trans | Transformation method to normalize proportions | `'logit'` or `'arcsin'` |
| covariates | The name of a column in `adata.obs` with covariates to include in analysis | 
| n_sims | Number of bootstrapping simulations for unreplicated data | `int` |
| n_reps | Number of pseudo-replicates to generate for each condition | `int` or `'auto'` to specify automatically |
| significance_threshold | P-value threshold to determine significance | `float`, e.g. `0.05` |
| specific_clusters | Specify the clusters you want to plot | e.g. `["c1", "c2",...]`, set to None to plot all |
| n_cols | Number of plots per row | `int` |

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Column in adata.obs holding the cluster / cell type annotation
# (passed to scanpro as `clusters_col`).
clustering_col = "celltype"

# Column in adata.obs holding the sample information (passed as `samples_col`).
# Set to None if there is no sample information.
sample_col = "sample"

# Column in adata.obs holding the conditions (passed as `conds_col`).
condition_col = "chamber"
# Conditions to compare, e.g. ["cond1", "cond2"]; None compares all conditions.
specific_conds = None
# True runs pairwise comparisons between all conditions.
pairwise = False

# Transformation applied to the proportions: 'logit' or 'arcsin'.
trans = 'logit'

# Column(s) in adata.obs with covariates to include in the analysis (None = no covariates).
covariates = None

### For unreplicated data ###
# If sample_col=None, data is assumed unreplicated.
# Parameters for the bootstrapping if data is unreplicated
n_sims = 100  # number of bootstrapping simulations
n_reps = 8  # pseudo-replicates per condition; an int or 'auto'

# P-value threshold below which a change is called significant.
significance_threshold = 0.05

### Plots ###
# Clusters to plot
specific_clusters = None  # e.g. ["c1", "c2"]; None plots all clusters
n_cols = 4  # number of plots per row

--------------

## 6 - Proportion analysis with Scanpro

In [None]:
# Add the anndata logging decorator to scanpro.
# The undecorated function is re-imported under a private alias and wrapped
# from there, so re-running this cell always yields exactly one decorator
# layer. Wrapping the notebook-level name `scanpro` itself would add another
# layer on every execution of the cell.
from scanpro import scanpro as _scanpro_undecorated

scanpro = deco.log_anndata(_scanpro_undecorated)

In [None]:
# Collect the user-defined parameters under the keyword names scanpro expects.
scanpro_kwargs = {
    "clusters_col": clustering_col,
    "samples_col": sample_col,
    "conds_col": condition_col,
    "conditions": specific_conds,
    "covariates": covariates,
    "transform": trans,
    "n_sims": n_sims,
    "n_reps": n_reps,
    "pairwise": pairwise,
}
out = scanpro(adata, **scanpro_kwargs)

# Display the simulation results when the result object carries them,
# otherwise the regular results table.
if hasattr(out, 'sim_results'):
    results = out.sim_results
else:
    results = out.results

# Write the results table to the automated report section.
# NOTE(review): the report always receives `out.results`, while the table shown
# below may be `out.sim_results` - confirm that this difference is intended.
_ = plot_table(out.results, report=f'01_{plot_suffix}_results.png', crop=None, round=6)

results

In [None]:
# Determine which p-value column(s) the results table provides.
# Adjusted p-values are only present in combination with sample information,
# so fall back to the unadjusted column when they are missing.
# NOTE(review): the fallback name "p_values" is assumed from Scanpro's output
# for unreplicated data - confirm against the Scanpro documentation.
# Pairwise runs suffix the column name with the condition pair, hence the
# prefix match.
_pvalue_candidates = ("adjusted_p_values", "p_values")
sig_col = next(
    (name for name in _pvalue_candidates
     if any(str(col).startswith(name) for col in out.results.columns)),
    None,
)
if sig_col is None:
    raise KeyError(
        f"None of the expected p-value columns {_pvalue_candidates} found in the Scanpro results. "
        f"Available columns: {list(out.results.columns)}"
    )

if not out.pairwise:
    # One flag per cluster: is the p-value below the threshold?
    significant_change = (out.results[sig_col] < significance_threshold).to_dict()
else:
    # One flag per cluster and condition pair, keyed as "<cond A> vs <cond B>".
    significant_change = {}
    for ct in out.results.index:
        significant_change[ct] = {
            f"{pair[0]} vs {pair[1]}": out.results.loc[ct, f'{sig_col}_{pair[0]}_{pair[1]}'] < significance_threshold
            for pair in out.condition_pairs
        }
significant_change

In [None]:
# Arguments shared by both renderings of the strip plot.
stripplot_kwargs = dict(kind='stripplot',
                        clusters=specific_clusters,
                        n_columns=n_cols)

# Copy for the automated report: saved as PNG into the report directory
# without showing it; the figure is closed afterwards.
report_png = Path(settings.report_dir) / f'02_{plot_suffix}_stripplot.png'
out.plot(show=False, verbosity=0, save=report_png, **stripplot_kwargs)
plt.close()

# Copy for the figure directory: saved as PDF with the default display settings.
figure_pdf = Path(settings.figure_dir) / f'{plot_suffix}_stripplot.pdf'
out.plot(save=figure_pdf, **stripplot_kwargs)

In [None]:
# Arguments shared by both renderings of the box plot.
boxplot_kwargs = dict(kind='boxplot',
                      clusters=specific_clusters,
                      n_columns=n_cols)

# Copy for the automated report: saved as PNG into the report directory
# without showing it; the figure is closed afterwards.
report_png = Path(settings.report_dir) / f'03_{plot_suffix}_boxplot.png'
out.plot(show=False, verbosity=0, save=report_png, **boxplot_kwargs)
plt.close()

# Copy for the figure directory: saved as PDF with the default display settings.
figure_pdf = Path(settings.figure_dir) / f'{plot_suffix}_boxplot.pdf'
out.plot(save=figure_pdf, **boxplot_kwargs)

The above plots show the proportion (amount) of cells of each group (e.g. cell type) allocated to each of the conditions. The p-value on top of each plot describes whether there is a significant change in proportion between any of the conditions. In case there are no replicates Scanpro will create simulated replicates (similar to random subsamples) to improve statistical robustness. The replicates are either shown as separate entities (upper plot) or as a box-distribution (lower plot). E.g. for a dataset where `clustering_col = "celltype"` and `condition_col = "injury"` a plot with low p-value can be interpreted as "Cell Type X shows a high change in the number of cells between injured and healthy" and a high p-value can be interpreted as "Cell Type Y shows a low change in the number of cells between injured and healthy".

In [None]:
# Everything kept from the Scanpro run: result table, significance flags,
# proportions, counts, the transformation used and the compared conditions.
scanpro_summary = {
    "results": out.results,
    "significance": significant_change,
    "proportions": out.props,
    "counts": out.counts,
    "transformation": trans,
    "conditions": out.conditions,
}
scanpro_uns_dict = {"scanpro": scanpro_summary}

# Store the summary in adata.uns under the "scanpro" key.
adata.uns.update(scanpro_uns_dict)

___

## 7 - Saving adata

In [None]:
# Record which cluster and condition columns were used in method.yml.
method_info = {"cluster": clustering_col, "condition": condition_col}
utils.io.update_yaml(d=method_info, yml="method.yml", path_prefix="report")

# Write the annotated anndata object to the configured output file.
utils.adata.save_h5ad(adata, output)

In [None]:
# Close the sctoolbox log file; keep this as the last executed cell.
settings.close_logfile()