# Generating summary statistics for each cell group and gene

This notebook demonstrates how to use memento to generate summary statistics for every combination of cell group and gene.

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import scanpy as sc
import scipy.stats as stats
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import string
import random
import estimators as memento

### Generate dataset

In [4]:
# Parameters of the synthetic count matrix.
# mu and sigma parameterize the negative binomial used to simulate counts
# (mean mu, variance sigma**2 under the n/p formulas used in the generation cell).
mu, sigma = 1, 5
# Shape of the simulated matrix: cells are rows, genes are columns.
num_cells, num_genes = 1000, 100

In [5]:
# Seed both random number generators used below so that Restart & Run All
# regenerates the same dataset. scipy's `rvs` (with no explicit random_state)
# and `np.random.choice` draw from NumPy's global state; `random.choices`
# draws from the stdlib `random` module.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# Negative-binomial counts, parameterized so the mean is mu and the variance
# is sigma**2, stored as a sparse cells x genes matrix.
X = sparse.csr_matrix(
    stats.nbinom.rvs(
        n=mu**2 / (sigma**2 - mu),
        p=mu / sigma**2,
        size=(num_cells, num_genes)))

# Per-cell metadata: a random cell type and study label for every cell,
# indexed by a random 15-letter barcode over the alphabet AGCT.
obs = pd.DataFrame(
    data={
        'celltype': np.random.choice(['cell_A', 'cell_B'], size=num_cells),
        'study': np.random.choice(['study_A', 'study_B'], size=num_cells),
    },
    index=[''.join(random.choices('AGCT', k=15)) for _ in range(num_cells)])

# Per-gene metadata: no columns, only random 5-letter uppercase gene names as the index.
var = pd.DataFrame(
    index=[''.join(random.choices(string.ascii_uppercase, k=5)) for _ in range(num_genes)])

# NOTE(review): newer anndata releases may deprecate the `dtype` argument -
# confirm against the installed version before removing it.
adata = sc.AnnData(X=X, obs=obs, var=var, dtype=X.dtype)

### Some preprocessing 

How would this work in the cellxgene context?

By default, memento uses the least variable genes as size factors, but it is flexible enough to accommodate other normalizations.
Here we simply use total-count normalization.

In [6]:
# Total counts per cell (row sums of the count matrix) serve as the size factor.
total_counts = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs['size_factor'] = total_counts

# Binned version of the size factor, as produced by memento.bin_size_factor.
approx_size_factor = memento.bin_size_factor(adata.obs['size_factor'].to_numpy())
adata.obs['approx_size_factor'] = approx_size_factor

In [7]:
# Preview the first ten rows of the per-cell metadata table.
adata.obs.head(10)

Unnamed: 0,celltype,study,size_factor,approx_size_factor
CGATGGATCTCTCAT,cell_A,study_B,185,189.0
ACTGAGCGGTAACAG,cell_B,study_A,106,106.684932
CGCCAGTGCTAACAT,cell_A,study_A,33,34.421053
GTTTAATATAGTTAA,cell_B,study_A,160,158.962963
CACCTTTGGGCATTA,cell_B,study_A,157,158.962963
CGGGCAGCAATTTTG,cell_A,study_B,29,25.6
TCGTGCCTATTATTT,cell_B,study_A,58,55.518987
GGAGGGACAGGAACC,cell_B,study_B,50,45.283784
GTAGGAATTCGATTA,cell_A,study_A,16,15.1
AGACTAGGCAACGAT,cell_B,study_B,162,158.962963


### Generate summary statistics

In [8]:
# Groups to summarize: every (cell type, study) pair is processed separately.
celltypes = ['cell_A', 'cell_B']
studies = ['study_A', 'study_B']

# Gene names, in the same order as the columns of adata.X.
genes = list(adata.var.index)

# RNA capture efficiency; depends on the sequencing technology.
q = 0.1

In [9]:
# Each (cell type, study, gene) combination is computed independently of the
# others, so this loop can be parallelized to whatever degree is convenient.

# Number of bootstrap replicates passed to memento.compute_sev.
NUM_BOOT = 10000

summary_list = []
for ct in celltypes:

    for study in studies:

        # Select the cells of this group. Copy explicitly so that assigning to
        # `.X` below modifies a real AnnData object rather than a view of `adata`.
        group_mask = (adata.obs['celltype'] == ct) & (adata.obs['study'] == study)
        subset = adata[group_mask].copy()
        subset.X = subset.X.tocsc()  # makes column indexing a bit more efficient
        size_factor = subset.obs['approx_size_factor'].values

        # `idx` is the column position of `gene`; this relies on `genes` being in
        # the same order as the columns of adata.X.
        for idx, gene in enumerate(genes):

            # Counts of this gene across the cells of the group. A local name is
            # used so the notebook-level `X` (the full count matrix) is not clobbered.
            gene_counts = subset.X[:, idx]

            mean = memento.compute_mean(gene_counts, q, size_factor)
            sem = memento.compute_sem(gene_counts, q, size_factor)
            # Only the second return value is kept as the variance.
            _, variance = memento.compute_variance(gene_counts, q, size_factor)
            # Only the last two return values (SEV and SELV) are kept.
            _, _, sev, selv = memento.compute_sev(
                gene_counts, q, size_factor, num_boot=NUM_BOOT)

            summary_list.append(
                (
                    ct,
                    study,
                    gene,
                    mean,
                    sem,
                    variance,
                    sev,
                    selv,
                )
            )

# Build the table once from the accumulated records: one row per
# (cell type, study, gene).
summary = pd.DataFrame(
    data=summary_list,
    columns=['celltype', 'study', 'gene', 'mean', 'SEM', 'variance', 'SEV', 'SELV'])

In [11]:
# Display the summary table (one row per cell type, study and gene).
summary

Unnamed: 0,celltype,study,gene,mean,SEM,variance,SEV,SELV
0,cell_A,study_A,AAWNM,0.011078,0.002662,0.001680,0.000662,0.453293
1,cell_A,study_A,PNFBR,0.007578,0.001581,0.000593,0.000215,0.410049
2,cell_A,study_A,HBQPB,0.009016,0.001985,0.000934,0.000475,0.557987
3,cell_A,study_A,KELFA,0.004870,0.001663,0.000655,0.000335,0.679576
4,cell_A,study_A,ZFPJZ,0.010668,0.002645,0.001658,0.000642,0.432235
...,...,...,...,...,...,...,...,...
395,cell_B,study_B,MLIAS,0.007314,0.001842,0.000872,0.000325,0.411151
396,cell_B,study_B,XGWUD,0.002556,0.000765,0.000150,0.000066,0.510557
397,cell_B,study_B,DYJPY,0.004594,0.001476,0.000560,0.000300,0.688119
398,cell_B,study_B,RRAYX,0.007144,0.001963,0.000991,0.000447,0.525260


In [21]:
# NOTE(review): `json` is not referenced anywhere in this cell; the import is
# kept only to leave the notebook namespace unchanged.
import json

# Persist the synthetic dataset so it can be reused as a sample input.
adata.write_h5ad('sample_input.h5ad')

# Persist the summary table as JSON (pandas' default `to_json` layout).
with open('sample_result.json', 'w') as result_file:
    serialized_summary = summary.to_json()
    result_file.write(serialized_summary)