# Generating summary statistics for each group of cells and each gene

This notebook demonstrates how to use memento to generate summary statistics for every combination of cell group and gene.

In [6]:
# Load IPython's autoreload extension so that `%autoreload` can be configured.
%load_ext autoreload

In [7]:
# Mode 2: pick up edits to imported modules live, without restarting the kernel.
%autoreload 2

In [21]:
import scanpy as sc
import scipy.stats as stats
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import string
import random
import estimators as memento

### Generate dataset

In [92]:
# Parameters of the simulated count distribution used in the next cell:
# `mu` is the mean and `sigma` the standard deviation (it enters as sigma**2).
mu, sigma = 1, 5

# Dimensions of the simulated count matrix (cells x genes).
num_cells, num_genes = 1000, 100

In [93]:
# Seed both random sources so that the synthetic dataset (and everything
# computed from it) is reproducible under Restart Kernel -> Run All.
seed = 0
np.random.seed(seed)  # np.random.choice and scipy.stats' default random state
random.seed(seed)     # random.choices

# Counts drawn from a negative binomial parameterized to have mean `mu` and
# variance `sigma**2`: n = mu^2 / (sigma^2 - mu), p = mu / sigma^2.
# Stored as a sparse (num_cells x num_genes) CSR matrix.
X = sparse.csr_matrix(
    stats.nbinom.rvs(
        n=mu**2 / (sigma**2 - mu),
        p=mu / sigma**2,
        size=(num_cells, num_genes),
    )
)

# Per-cell metadata: a random cell type and study label for every cell,
# indexed by a random 15-character string over the alphabet 'AGCT'.
obs = pd.DataFrame(
    data=zip(
        np.random.choice(['cell_A', 'cell_B'], size=num_cells),
        np.random.choice(['study_A', 'study_B'], size=num_cells)),
    index=[''.join(random.choices('AGCT', k=15)) for _ in range(num_cells)],
    columns=['celltype', 'study'])

# Per-gene metadata: no columns, only an index of random 5-letter gene names.
var = pd.DataFrame(
    index=[''.join(random.choices(string.ascii_uppercase, k=5)) for _ in range(num_genes)])

# NOTE(review): `dtype=X.dtype` is presumably passed so the counts keep their
# original dtype; newer anndata releases may deprecate this argument -- confirm.
adata = sc.AnnData(X=X, obs=obs, var=var, dtype=X.dtype)

### Some preprocessing 

How would this work in the cellxgene context?

By default, memento derives size factors from the least variable genes, but it is flexible enough to accommodate other normalizations.
Here we simply use total-count normalization.

In [94]:
# Total-count size factor: sum each cell's counts across all genes and
# flatten the result to a 1-D array.
total_counts_per_cell = adata.X.sum(axis=1).A1
adata.obs['size_factor'] = total_counts_per_cell

# Binned ("approximate") size factors as returned by memento.bin_size_factor.
adata.obs['approx_size_factor'] = memento.bin_size_factor(
    adata.obs['size_factor'].values
)

In [95]:
# Preview the first 10 cells, including the two size-factor columns added above.
adata.obs.head(10)

Unnamed: 0,celltype,study,size_factor,approx_size_factor
ATTGGGTAGGATTGT,cell_A,study_B,180,182.24
AATAAGCTTATCGCG,cell_A,study_B,176,170.571429
GACCATGGACGTCCC,cell_A,study_A,60,60.008929
CTAAACAGTACGCTG,cell_A,study_A,38,34.87234
CCACATGTACACCTT,cell_B,study_B,85,84.083333
TTACTTAACGTTTCA,cell_B,study_B,92,96.293103
GCTGGCAATCAGATT,cell_B,study_B,110,108.329412
ATCTCCCTGAGTCCT,cell_B,study_B,71,71.65625
TCTTTCTCCATGTCC,cell_A,study_A,100,96.293103
AACAGAGGAGTGAGG,cell_B,study_B,147,145.840909


### Generate summary statistics

In [96]:
# Levels of the two grouping variables; each (cell type, study) pair is one group of cells.
celltypes = ['cell_A', 'cell_B']
studies = ['study_A', 'study_B']

# Gene names in the same order as the columns of adata.X.
genes = list(adata.var.index)

# RNA capture efficiency; depends on the sequencing technology.
q = 0.1

In [97]:
# Compute memento summary statistics for every (cell type, study, gene) combination.
# The combinations are processed independently of one another, so this loop can be
# parallelized to whatever degree is needed.
num_boot = 10000  # bootstrap iterations passed to memento.compute_sev

summary_list = []
for ct in celltypes:

    for study in studies:

        # Cells belonging to the current (cell type, study) group.
        subset = adata[(adata.obs['celltype'] == ct) & (adata.obs['study'] == study)]

        # Convert to CSC once per group: column (gene) slicing is more efficient.
        # The converted matrix is kept in a local variable instead of being
        # assigned back to `subset.X`, since `subset` is a slice of `adata` and
        # writing to it would force an implicit copy of the whole subset.
        group_counts = subset.X.tocsc()
        size_factor = subset.obs['approx_size_factor'].values

        for idx, gene in enumerate(genes):

            # Single-gene column of counts. Deliberately not named `X`, so the
            # notebook-level count matrix `X` is not overwritten by this loop.
            gene_counts = group_counts[:, idx]

            mean = memento.compute_mean(gene_counts, q, size_factor)
            sem = memento.compute_sem(gene_counts, q, size_factor)
            # Only the second return value is kept as the variance.
            _, variance = memento.compute_variance(gene_counts, q, size_factor)
            # Only the last two return values (SEV, SELV) are kept.
            _, _, sev, selv = memento.compute_sev(
                gene_counts, q, size_factor, num_boot=num_boot)

            summary_list.append(
                (ct, study, gene, mean, sem, variance, sev, selv)
            )

# One row per (cell type, study, gene); built in a single call from the collected records.
summary = pd.DataFrame(
    data=summary_list,
    columns=['celltype', 'study', 'gene', 'mean', 'SEM', 'variance', 'SEV', 'SELV'])

In [98]:
# Display the summary table (one row per cell type x study x gene).
summary

Unnamed: 0,celltype,study,gene,mean,SEM,variance,SEV,SELV
0,cell_A,study_A,DPROO,0.019060,0.005018,0.006319,0.002423,0.430679
1,cell_A,study_A,RBWHQ,0.005173,0.001173,0.000345,0.000183,0.574223
2,cell_A,study_A,KLJVY,0.005586,0.001163,0.000340,0.000142,0.478284
3,cell_A,study_A,EFDAQ,0.015482,0.003179,0.002536,0.000972,0.409071
4,cell_A,study_A,ITOEA,0.008931,0.002161,0.001172,0.000462,0.432632
...,...,...,...,...,...,...,...,...
395,cell_B,study_B,DZWUF,0.010766,0.002682,0.002029,0.001135,0.599446
396,cell_B,study_B,RFIGW,0.009962,0.002397,0.001620,0.000598,0.414875
397,cell_B,study_B,LECNU,0.015523,0.004119,0.004785,0.002149,0.499948
398,cell_B,study_B,ZFLOA,0.009193,0.001866,0.000982,0.000392,0.419434
