# Generating summary statistics for each group of cells and each gene

This notebook demonstrates how to use memento to generate summary statistics for every combination of cell group and gene.

In [29]:
# Load IPython's autoreload extension; its mode is set in the next cell.
%load_ext autoreload

In [30]:
# Autoreload mode 2 — presumably enabled so that edits to the imported
# `estimators` module are picked up without restarting the kernel.
%autoreload 2

In [31]:
import scanpy as sc
import scipy.stats as stats
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import string
import random
import estimators as memento

### Generate dataset

In [38]:
# Settings for the synthetic count matrix generated in the next cell.
num_cells, num_genes = 1000, 100  # matrix shape: cells x genes
mu, sigma = 1, 5                  # mean and standard deviation of the simulated counts

In [39]:
# Seed dedicated generators so the simulated dataset (and every output derived
# from it) is identical on each Restart & Run All.
SEED = 0
np_rng = np.random.default_rng(SEED)
py_rng = random.Random(SEED)

# Negative-binomial counts with mean `mu` and variance `sigma ** 2`, converted
# by method of moments to scipy's (n, p) parametrisation.
# NOTE: despite its name, `X_single_gene` holds the full cells x genes matrix here;
# the name is kept because later cells reuse it.
nb_n = mu ** 2 / (sigma ** 2 - mu)
nb_p = mu / sigma ** 2
X_single_gene = sparse.csr_matrix(
    stats.nbinom.rvs(n=nb_n, p=nb_p, size=(num_cells, num_genes), random_state=np_rng))

# Per-cell metadata: a random cell type and study label for every cell,
# indexed by a random 15-character barcode over the alphabet AGCT.
obs = pd.DataFrame(
    {
        'celltype': np_rng.choice(['cell_A', 'cell_B'], size=num_cells),
        'study': np_rng.choice(['study_A', 'study_B'], size=num_cells),
    },
    index=[''.join(py_rng.choices('AGCT', k=15)) for _ in range(num_cells)])

# Per-gene metadata: only an index of random 5-letter uppercase gene names.
var = pd.DataFrame(
    index=[''.join(py_rng.choices(string.ascii_uppercase, k=5)) for _ in range(num_genes)])

adata = sc.AnnData(X=X_single_gene, obs=obs, var=var, dtype=X_single_gene.dtype)

### Some preprocessing 

How would this work in the cellxgene context?

By default, memento uses the least variable genes to compute size factors, but it also works with other normalizations.
Here we simply use total-count normalization.

In [40]:
# Total counts per cell act as the size factor; the binned version collapses
# those totals onto a smaller set of distinct values.
cell_totals = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs['size_factor'] = cell_totals
adata.obs['approx_size_factor'] = memento.bin_size_factor(adata.obs['size_factor'].to_numpy())

In [41]:
# Number of distinct size factors before and after binning.
n_exact = adata.obs['size_factor'].nunique(dropna=False)
n_binned = adata.obs['approx_size_factor'].nunique(dropna=False)
(n_exact, n_binned)

(203, 26)

In [42]:
# Preview the first ten cells, including the exact and binned size factors.
adata.obs.head(10)

Unnamed: 0,celltype,study,size_factor,approx_size_factor
GGTACGTGGTCGAGA,cell_A,study_A,140,138.705882
CAGTGCGGCCGAAGG,cell_B,study_B,154,150.555556
CAAGATTCTCCGTTT,cell_B,study_B,136,138.705882
ATTCAAATCCTTTCG,cell_A,study_A,183,188.04
TCAACGTACAGATGT,cell_B,study_B,46,40.876712
ATGGTCCATTTTTCA,cell_B,study_A,143,138.705882
TAGTTCGCGTAGGGG,cell_B,study_A,127,127.409091
GGATTCTGTCTCCTT,cell_B,study_A,98,101.44898
TGATGGTCACTGCAG,cell_B,study_A,228,223.461538
CTGTGGGTGGAAGCG,cell_B,study_A,166,164.085714


### Generate summary statistics

In [43]:
# Groups to summarise, the genes to report on, and the capture efficiency
# handed to the memento estimators.
celltypes = ['cell_A', 'cell_B']
studies = ['study_A', 'study_B']
genes = list(adata.var_names)
q = 0.1  # RNA capture efficiency; depends on the technology

In [45]:
# Build one summary row per (celltype, study, gene).
# Every combination is independent of the others, so this loop could be
# parallelized to whatever degree is convenient.
NUM_BOOT = 10000  # value passed as `num_boot` to memento.compute_sev
summary_list = []
for ct in celltypes:
    for study in studies:

        adata_subset = adata[(adata.obs['celltype'] == ct) & (adata.obs['study'] == study)]
        # Convert to CSC once per group so column slicing is cheaper. The result is
        # kept in a local variable rather than assigned to `adata_subset.X`: the
        # subset is a view of `adata`, and writing to a view makes anndata copy
        # the whole subset implicitly (and emit a warning).
        X_subset = adata_subset.X.tocsc()
        approx_size_factor = adata_subset.obs['approx_size_factor'].values

        for idx, gene in enumerate(genes):
            # `genes` was taken from adata.var.index, so position `idx` is this gene's column.
            X_single_gene = X_subset[:, idx]

            mean = memento.compute_mean(X_single_gene, q, approx_size_factor)
            sem = memento.compute_sem(X_single_gene, q, approx_size_factor)
            # Only the second value returned by compute_variance is kept.
            _, variance = memento.compute_variance(X_single_gene, q, approx_size_factor)
            # Only the last two values returned by compute_sev are kept
            # (presumably standard errors of the variance and of the log-variance — TODO confirm).
            _, _, sev, selv = memento.compute_sev(
                X_single_gene, q, approx_size_factor, num_boot=NUM_BOOT)

            summary_list.append((ct, study, gene, mean, sem, variance, sev, selv))

# Assemble all rows at once; building the frame from a list avoids growing it row by row.
summary = pd.DataFrame(
    data=summary_list,
    columns=['celltype', 'study', 'gene', 'mean', 'SEM', 'variance', 'SEV', 'SELV'])

  (2, 0)	6
  (9, 0)	4
  (12, 0)	22
  (28, 0)	1
  (39, 0)	24
  (68, 0)	1
  (78, 0)	1
  (98, 0)	1
  (99, 0)	3
  (103, 0)	28
  (104, 0)	6
  (125, 0)	7
  (128, 0)	5
  (139, 0)	2
  (142, 0)	4
  (144, 0)	1
  (147, 0)	1
  (149, 0)	11
  (170, 0)	16
  (175, 0)	1
  (177, 0)	1
  (189, 0)	1
  (205, 0)	2
  (220, 0)	14
  (223, 0)	1
  (238, 0)	1
  (240, 0)	1
  (1, 0)	15
  (3, 0)	2
  (4, 0)	31
  (16, 0)	9
  (27, 0)	8
  (35, 0)	21
  (62, 0)	13
  (68, 0)	1
  (84, 0)	18
  (90, 0)	1
  (103, 0)	6
  (104, 0)	2
  (107, 0)	3
  (110, 0)	21
  (111, 0)	1
  (112, 0)	2
  (119, 0)	1
  (136, 0)	24
  (144, 0)	32
  (153, 0)	1
  (157, 0)	9
  (159, 0)	1
  (170, 0)	12
  (175, 0)	2
  (182, 0)	7
  (206, 0)	8
  (212, 0)	8
  (235, 0)	2
  (236, 0)	8
  (237, 0)	1
  (242, 0)	5
  (243, 0)	12
  (3, 0)	30
  (5, 0)	1
  (10, 0)	3
  (28, 0)	9
  (34, 0)	19
  (36, 0)	18
  (66, 0)	7
  (81, 0)	1
  (91, 0)	2
  (99, 0)	9
  (104, 0)	4
  (125, 0)	1
  (127, 0)	1
  (130, 0)	2
  (135, 0)	5
  (153, 0)	53
  (157, 0)	2
  (173, 0)	43
  (196, 0)	18


KeyboardInterrupt: 

In [98]:
# Display the summary table (one row per celltype, study and gene).
summary

Unnamed: 0,celltype,study,gene,mean,SEM,variance,SEV,SELV
0,cell_A,study_A,DPROO,0.019060,0.005018,0.006319,0.002423,0.430679
1,cell_A,study_A,RBWHQ,0.005173,0.001173,0.000345,0.000183,0.574223
2,cell_A,study_A,KLJVY,0.005586,0.001163,0.000340,0.000142,0.478284
3,cell_A,study_A,EFDAQ,0.015482,0.003179,0.002536,0.000972,0.409071
4,cell_A,study_A,ITOEA,0.008931,0.002161,0.001172,0.000462,0.432632
...,...,...,...,...,...,...,...,...
395,cell_B,study_B,DZWUF,0.010766,0.002682,0.002029,0.001135,0.599446
396,cell_B,study_B,RFIGW,0.009962,0.002397,0.001620,0.000598,0.414875
397,cell_B,study_B,LECNU,0.015523,0.004119,0.004785,0.002149,0.499948
398,cell_B,study_B,ZFLOA,0.009193,0.001866,0.000982,0.000392,0.419434
