# Summary Statistics Plots

In [5]:
# All imports first (standard library, then third-party), so the notebook's
# dependencies are visible in one place before any statement has side effects.
import os

import numpy as np
import scanpy as sc

# Relative paths used later in the notebook are resolved against the project
# checkout under the user's home directory, so make it the working directory.
homepath = os.path.join(os.path.expanduser("~"), 'dev/data-portal-summary-stats')
os.chdir(homepath)

## Create a `ScanPy` data object

In [6]:
# Resolution (dots per inch) used for the figures rendered in this notebook.
FIGURE_DPI = 200
sc.settings.set_figure_params(dpi=FIGURE_DPI)

In [7]:
# Despite the `.mtx` suffix this is a directory: `sc.read_10x_mtx` looks inside it
# for the matrix and its companion tables (e.g. `features.tsv.gz`).
mtxfile = 'notebook/e7d811e2-832a-4452-85a5-989e2f8267bf.mtx'

# The path is relative to the current working directory. Fail early and report the
# resolved absolute path, so a wrong working directory or a missing download is
# obvious instead of surfacing as a missing nested file inside the reader.
if not os.path.isdir(mtxfile):
    raise FileNotFoundError(
        'Matrix directory not found: {}'.format(os.path.abspath(mtxfile)))

adata = sc.read_10x_mtx(
    mtxfile,                   # the directory with the `.mtx` file
    var_names='gene_symbols',  # use gene symbols for variable names (variables-axis index)
    cache=True,                # reuse scanpy's on-disk cache on subsequent reads
)
# Gene symbols can repeat; make them unique so they are usable as an index.
adata.var_names_make_unique()

FileNotFoundError: [Errno 2] No such file or directory: 'notebook/e7d811e2-832a-4452-85a5-989e2f8267bf.mtx/features.tsv.gz'

## Highest-expressed genes

In [None]:
# Number of top-expressed genes to include in the plot.
N_TOP_GENES = 10
sc.pl.highest_expr_genes(adata, n_top=N_TOP_GENES)

In [None]:
# Quality-control thresholds, applied in place to `adata`.
MIN_GENES_PER_CELL = 1200  # drop cells with fewer detected genes than this
MIN_CELLS_PER_GENE = 10    # drop genes detected in fewer cells than this

# NOTE(review): `filter_cells` is presumably also what creates the
# `adata.obs['n_genes']` column plotted further down - confirm against scanpy docs.
sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)

In [None]:
# Boolean mask over genes: True where the gene symbol starts with 'MT-'.
# NOTE(review): 'MT-' is the human naming convention for mitochondrial genes;
# confirm it matches the species of this dataset.
mito_genes = adata.var_names.str.startswith('MT-')

# Per-cell sums over the gene axis. Summing a sparse matrix along axis 1 returns an
# (n_cells, 1) np.matrix, whereas a dense array returns a 1-D array;
# np.asarray(...).ravel() flattens either one, so this cell does not depend on X
# being sparse (unlike `.A1`, which exists only on np.matrix).
counts_per_cell = np.asarray(adata.X.sum(axis=1)).ravel()
mito_counts_per_cell = np.asarray(adata[:, mito_genes].X.sum(axis=1)).ravel()

# Fraction (0..1) of each cell's counts that fall in mitochondrial genes.
adata.obs['percent_mito'] = mito_counts_per_cell / counts_per_cell
# Total counts per cell, stored as an observation annotation (computed once above).
adata.obs['n_counts'] = counts_per_cell

In [None]:
# Per-cell quality-control metrics stored in `adata.obs`, one violin panel each.
qc_metrics = ['n_genes', 'n_counts', 'percent_mito']
sc.pl.violin(adata, qc_metrics, jitter=0.4, multi_panel=True)

Violin plots are similar to box plots, but additionally show the probability density of the data, smoothed by a kernel density estimator.
* `n_genes` is the number of genes detected in each cell. Not every gene is expressed in every cell, and some cells express more genes than others.
* `n_counts` is the total number of counts in each cell.
* `percent_mito` is the fraction of each cell's total counts that comes from mitochondrial genes (a value between 0 and 1, not a percentage).

In [None]:
# Plot each QC metric against the total counts per cell, one figure per metric.
for y_metric in ('percent_mito', 'n_genes'):
    sc.pl.scatter(adata, x='n_counts', y=y_metric)

## Identify highly variable genes.

In [None]:
# Total count every cell is scaled to, so that cells become comparable.
COUNTS_PER_CELL_TARGET = 1e4

sc.pp.normalize_per_cell(adata, counts_per_cell_after=COUNTS_PER_CELL_TARGET)
# Log-transform the normalized values (log(1 + x)).
sc.pp.log1p(adata)
# Keep the current state in `.raw` for later use. At this point that is the
# normalized, log-transformed data, not the original counts.
adata.raw = adata

In [None]:
# Cut-offs on mean expression and dispersion used to flag highly variable genes.
hvg_thresholds = dict(min_mean=0.05, max_mean=30, min_disp=1.9)

sc.pp.highly_variable_genes(adata, **hvg_thresholds)
# Plot the result of the selection computed above.
sc.pl.highly_variable_genes(adata)

## Principal Component Analysis

In [None]:
# Gene whose expression colors the PCA scatter plot.
pca_color_gene = 'CST3'

# Compute the principal components with the ARPACK solver, then plot them.
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color=pca_color_gene)

## UMAP embedding and Louvain clustering

In [None]:
# Neighborhood-graph parameters: neighbors per cell and principal components used.
N_NEIGHBORS = 10
N_PCS = 40
# One UMAP panel per entry: the Louvain cluster labels and the CST3 gene.
embedding_colors = ['louvain', 'CST3']

sc.pp.neighbors(adata, n_neighbors=N_NEIGHBORS, n_pcs=N_PCS)
sc.tl.umap(adata)      # compute the UMAP embedding
sc.tl.louvain(adata)   # compute the Louvain clustering
sc.pl.umap(adata, color=embedding_colors)

In [None]:
# Rank genes for each group in the 'louvain' annotation using the 'logreg' method.
cluster_key = 'louvain'
sc.tl.rank_genes_groups(adata, cluster_key, method='logreg')

# Show the top 25 genes per group; each panel gets its own y-axis.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)