# Linear Covariance Testing

Use linear mixed-effects models to test the effect of the guide (knockout) on the covariance between the transcription factors IRF4 and BATF and their target genes.

### Import

In [6]:
# `IPython.display` is the public home of display/HTML; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any emitted while fitting the models below.
warnings.filterwarnings('ignore')
# Inject CSS so the notebook's `.container` element spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [7]:
import os

# Base directories for the code checkouts and the input data.
# The original per-user locations remain the defaults; set SCVI_REPO_PATH /
# SCVI_DATA_PATH to run the notebook on another machine without editing it.
# Joining with '' guarantees a trailing separator, which the plain string
# concatenations further down (e.g. `repo_path + 'scVI'`) depend on.
repo_path = os.path.join(
    os.environ.get('SCVI_REPO_PATH', '/Users/mincheolkim/Github/'), '')
data_path = os.path.join(
    os.environ.get('SCVI_DATA_PATH', '/Users/mincheolkim/Documents/'), '')

In [8]:
import sys

# Put both local checkouts under `repo_path` on the import path so the
# `scvi_extensions` imports further down can be resolved.
for checkout_name in ('scVI', 'scVI-extensions'):
    sys.path.append(repo_path + checkout_name)

In [14]:
import importlib

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [10]:
import scvi_extensions.dataset.supervised_data_loader as sdl
import scvi_extensions.dataset.cropseq as cs
import scvi_extensions.inference.supervised_variational_inference as svi
import scvi_extensions.hypothesis_testing.mean as mn
import scvi_extensions.hypothesis_testing.variance as vr
import scvi_extensions.dataset.label_data_loader as ldl

### Load a dataset

In [11]:
# Input files, both located directly under `data_path`: the .h5 matrix file
# and the per-cell metadata text file.
h5_filename, metadata_filename = (
    data_path + 'raw_gene_bc_matrices_h5.h5',
    data_path + 'nsnp20.raw.sng.km_vb1_default.norm.meta.txt',
)

In [13]:
# Re-import the cropseq module so edits made since the first import take effect.
# `importlib.reload` replaces `imp.reload`: `imp` was never imported in this
# notebook and the module no longer exists in Python 3.12+.
importlib.reload(cs)
# Load the dataset
# NOTE(review): `batch='wells'` / `use_labels='gene'` presumably select the
# metadata fields used for batches and labels - confirm against CropseqDataset.
gene_dataset = cs.CropseqDataset(
    filename=h5_filename,
    metadata_filename=metadata_filename,
    batch='wells',
    use_labels='gene',
    save_path='')

Preprocessing CROP-seq dataset
Number of cells kept after filtering with metadata: 283634
Number of cells kept after removing all zero cells: 283634
Finished preprocessing CROP-seq dataset


### Create a dataframe with relevant cells

In [143]:
# Guide labels whose cells are kept for the analysis.
ko_genes = ['NO_GUIDE', 'BATF', 'IRF4', 'JUNB']
# Genes whose expression is pulled out of the count matrix.
genes_of_interest = ['BATF', 'IRF4', 'JUNB', 'RORC', 'BCL6', 'MAF', 'IL10']

# Position of each gene of interest within `gene_dataset.gene_names`
# (first match; raises IndexError if a gene is absent).
goi_indices = []
for goi_name in genes_of_interest:
    goi_matches = np.where(gene_dataset.gene_names == goi_name)[0]
    goi_indices.append(goi_matches[0])

In [230]:
# Build one frame per knockout label, then stack them into a single `df`
# with one indicator column per knockout (`ko_gene_<NAME>`).
dfs = []
for ko_gene in ko_genes:
    # Boolean mask over cells whose label equals this knockout's lookup position.
    ko_label = np.where(gene_dataset.ko_gene_lookup == ko_gene)[0][0]
    indices = (gene_dataset.labels == ko_label).reshape(-1)
    print(ko_gene, indices.sum())
    expr = gene_dataset.X[indices, :].astype(float)
    # Per-cell total counts; not used further in this cell.
    umi_counts = expr.sum(axis=1)
    ko_df = pd.DataFrame(expr[:, goi_indices].todense(), columns=genes_of_interest)
    ko_df['ko_gene'] = ko_gene
    ko_df['donor'] = pd.Series(gene_dataset.donor_batches[indices].reshape(-1)).astype(int)
    ko_df['louvain'] = gene_dataset.louvain[indices].reshape(-1)
    dfs.append(ko_df)

# ignore_index=True: every per-knockout frame starts its index at 0, so a plain
# concat would leave duplicated row labels that break label-based alignment.
# dtype=int: keeps the indicators numeric on pandas >= 2.0 (where the default
# became bool), so the model formulas keep treating them as 0/1 regressors.
df = pd.get_dummies(pd.concat(dfs, ignore_index=True), columns=['ko_gene'], dtype=int)

NO_GUIDE 87336
BATF 915
IRF4 961
JUNB 856


### Compute covariances of interest and choose the grouping variable

In [231]:
def compute_point_cov(s1, s2, data=None):
    """Return the per-row product of mean-centred values of two columns.

    For every row this is ``(x - mean(x)) * (y - mean(y))``; the average of the
    result is the (population) covariance of the two columns. The means are
    taken over all rows of the frame, not within any subgroup.

    Parameters
    ----------
    s1, s2 : str
        Column names to combine.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level ``df``,
        which preserves the original two-argument behaviour.

    Returns
    -------
    pandas.Series
        One value per row, indexed like the input frame.
    """
    frame = df if data is None else data
    centred_1 = frame[s1] - frame[s1].mean()
    centred_2 = frame[s2] - frame[s2].mean()
    return centred_1 * centred_2

In [233]:
# Add one per-cell covariance column per transcription factor, each paired with RORC.
for cov_column, tf_gene in (('cov_BATF_RORC', 'BATF'), ('cov_IRF4_RORC', 'IRF4')):
    df[cov_column] = compute_point_cov(tf_gene, 'RORC')

In [290]:
# Name of the `df` column passed as `groups=` to the mixed models below.
groups = 'donor'

### Get some general sense of how sparse these genes are

IRF4 isn't detected in most cells, so I'm going to test BATF (rather than IRF4) covariance using the IRF4 KO cells.

In [291]:
# Shape of the sub-frame of cells with a non-zero BATF count.
df.loc[df['BATF'] > 0].shape

(14205, 17)

In [292]:
# Shape of the sub-frame of cells with a non-zero IRF4 count.
df.loc[df['IRF4'] > 0].shape

(1147, 17)

### Test differential BATF/RORC covariance between IRF4 KO cells and NO_GUIDE cells

The p-value for the `ko_gene_IRF4` coefficient is 0.006, which shows some significance.

In [298]:
# Keep only IRF4-knockout and NO_GUIDE cells, then regress the per-cell
# BATF/RORC covariance term on the IRF4-knockout indicator, grouping by `groups`.
irf4_vs_no_guide = df.query('ko_gene_IRF4 > 0 | ko_gene_NO_GUIDE > 0')
irf4_vs_no_guide_model = smf.mixedlm(
    'cov_BATF_RORC ~ ko_gene_IRF4',
    irf4_vs_no_guide,
    groups=irf4_vs_no_guide[groups],
)
irf4_vs_no_guide_fit = irf4_vs_no_guide_model.fit(maxiter=1000, method='nm')
print(irf4_vs_no_guide_fit.summary())

           Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: cov_BATF_RORC
No. Observations: 88297   Method:             REML         
No. Groups:       9       Scale:              0.0123       
Min. group size:  7565    Likelihood:         69046.8944   
Max. group size:  11814   Converged:          Yes          
Mean group size:  9810.8                                   
------------------------------------------------------------
               Coef.  Std.Err.    z    P>|z|  [0.025  0.975]
------------------------------------------------------------
Intercept      0.002     0.000  4.702  0.000   0.001   0.003
ko_gene_IRF4   0.010     0.004  2.758  0.006   0.003   0.017
Group Var      0.000     0.000                              





### Test differential BATF/RORC covariance between BATF KO cells and IRF4 KO cells

The p-value for the `ko_gene_IRF4` coefficient is 0.165; the significance goes away.

In [299]:
# Keep only BATF-knockout and IRF4-knockout cells, then regress the per-cell
# BATF/RORC covariance term on the IRF4-knockout indicator, grouping by `groups`.
batf_vs_irf4 = df.query('ko_gene_BATF > 0 | ko_gene_IRF4 > 0')
batf_vs_irf4_model = smf.mixedlm(
    'cov_BATF_RORC ~ ko_gene_IRF4',
    batf_vs_irf4,
    groups=batf_vs_irf4[groups],
)
batf_vs_irf4_fit = batf_vs_irf4_model.fit(maxiter=1000, method='nm')
print(batf_vs_irf4_fit.summary())

           Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: cov_BATF_RORC
No. Observations: 1876    Method:             REML         
No. Groups:       9       Scale:              0.0293       
Min. group size:  149     Likelihood:         641.3140     
Max. group size:  266     Converged:          Yes          
Mean group size:  208.4                                    
------------------------------------------------------------
               Coef.  Std.Err.    z    P>|z|  [0.025  0.975]
------------------------------------------------------------
Intercept      0.001     0.006  0.194  0.846  -0.011   0.013
ko_gene_IRF4   0.011     0.008  1.388  0.165  -0.005   0.026
Group Var      0.000     0.001                              





### Boxplot