# 10X PBMC datasets

## Imports

In [1]:
import glob
import pandas as pd
import numpy as np
import os
from IPython.display import HTML
import scipy
import torch
import pickle
import re
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
from inspect import signature

In [2]:
from scvi.models import AutoZIVAE
from scvi.inference import UnsupervisedTrainer
from datasets.dataset10Xcelltype import Dataset10XCellTypes
import torch
import pickle
import argparse
import re
import numpy as np
import time
from scvi.models.log_likelihood import compute_marginal_log_likelihood_scvi, compute_marginal_log_likelihood_autozi
import os
from autozi_simulate_tools import retrieve_rates_dropouts_per_cell_type, restrict_to_common_nonzero_genes_cell_types
from classification_metrics import *
from functools import partial

[2019-10-11 17:54:35,830] INFO - scvi._settings | Added StreamHandler with custom formatter to 'scvi' logger.
  from numpy.core.umath_tests import inner1d


In [3]:
# Select matplotlib's "TkAgg" backend, then enable inline figure rendering through the IPython magic.
# NOTE(review): the inline magic presumably replaces the backend chosen on the line above, and
# "TkAgg" presumably needs a Tk installation / display to be available -- confirm whether the
# switch_backend call is still required.
plt.switch_backend("TkAgg")
%matplotlib inline

## Preprocessing datasets

Before running AutoZI, we preprocess the three datasets (pbmc3k, pbmc8k, pbmc10k). Each dataset is restricted to the cell types of interest and to the set of genes expressed in all cell types and all datasets. In addition, for each cell type, we save the set of genes with sufficient expression in each dataset.

In [4]:
# Identifiers of the three 10X PBMC datasets (one per chemistry version v1 / v2 / v3).
tenxv1_id, tenxv2_id, tenxv3_id = 'pbmc3k', 'pbmc8k', 'pbmc_10k_protein_v3'

# Metadata pickle associated with each dataset identifier.
dataset_name_to_metadata_file_name = {
    tenxv1_id: 'pbmc3k_metadata.pickle',
    tenxv2_id: 'pbmc8k_metadata.pickle',
    tenxv3_id: 'pbmc10k_metadata.pickle',
}

# Load the datasets in v1, v2, v3 order.
tenxv1, tenxv2, tenxv3 = (
    Dataset10XCellTypes(dataset_id, dataset_name_to_metadata_file_name[dataset_id])
    for dataset_id in (tenxv1_id, tenxv2_id, tenxv3_id)
)

# Keep only the cell types of interest, in this exact order, in every dataset.
cell_types_of_interest = np.array(['B cells', 'CD14+ Monocytes', 'CD4 T cells', 'CD8 T cells'])
for tenx in (tenxv1, tenxv2, tenxv3):
    tenx.filter_cell_types(cell_types_of_interest)
    tenx.reorder_cell_types(cell_types_of_interest)

# Reduce every dataset to 1000 genes (the selection rule is the one implemented by subsample_genes).
for tenx in (tenxv1, tenxv2, tenxv3):
    tenx.subsample_genes(new_n_genes=1000)

# Genes retained for the downstream analysis, as computed jointly over the three datasets.
genes_to_study = restrict_to_common_nonzero_genes_cell_types(tenxv1, tenxv2, tenxv3)

# Datasets keyed by "<chemistry version>-<dataset id>", in v1, v2, v3 order.
datasets = dict(zip(
    ('10x-v1-' + tenxv1_id, '10x-v2-' + tenxv2_id, '10x-v3-' + tenxv3_id),
    (tenxv1, tenxv2, tenxv3),
))

[2019-10-11 17:54:36,419] INFO - scvi.dataset.dataset | File /media/storage/Documents/2. Professionnel/UC Berkeley Internship 2019/scVI-C/autozi_reproducibility/data/10X/pbmc3k/filtered_gene_bc_matrices.tar.gz already downloaded
[2019-10-11 17:54:39,319] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2019-10-11 17:54:39,320] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2019-10-11 17:54:39,329] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2019-10-11 17:54:39,341] INFO - scvi.dataset.dataset | Downsampled from 2700 to 2700 cells
[2019-10-11 17:54:39,345] INFO - scvi.dataset.dataset | File /media/storage/Documents/2. Professionnel/UC Berkeley Internship 2019/scVI-C/autozi_reproducibility/data/10X/pbmc8k/filtered_gene_bc_matrices.tar.gz already downloaded
[2019-10-11 17:54:52,037] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2019-10-11 17:54:52,038] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2019-10

## Train AutoZI on datasets

For each dataset under scrutiny, we retrieve the posterior parameters $\alpha^g, \beta^g$ of $q(\delta_g)$ for each gene $g$.

In [10]:
# One entry per dataset: AutoZI's posterior parameters plus the bookkeeping needed downstream.
results_autozi_outputs = []

for ind_dataset, dataset_name in enumerate(datasets):
    data = datasets[dataset_name]

    # Each run deliberately uses a fresh, time-based seed. Read the clock once so that numpy and
    # torch are guaranteed to receive the same value, and report it so the run can be reproduced.
    seed = int(time.time())
    np.random.seed(seed)
    torch.manual_seed(seed)
    print('Training AutoZI on {} with seed {}'.format(dataset_name, seed))

    model = AutoZIVAE(
        n_input=data.nb_genes,
        alpha_prior=0.5,
        beta_prior=0.5,
        minimal_dropout=0.01,
        dispersion='gene-label',
        zero_inflation='gene-label',
        n_labels=data.n_labels,
    )

    trainer = UnsupervisedTrainer(model, data)
    trainer.train(n_epochs=600, lr=1e-2)
    # Dictionary of posterior parameters returned by the trained model; enriched in place below.
    outputs = trainer.model.get_alphas_betas()

    outputs['ind_dataset'] = ind_dataset
    outputs['cell_types'] = data.cell_types
    outputs['labels'] = data.labels
    outputs['genes_to_study'] = genes_to_study

    # Empirical mean expression of every gene, computed separately over the cells of each label.
    outputs['means_emp'] = {}
    for label in np.unique(data.labels):
        outputs['means_emp'][label] = np.array(
            data.X[data.labels.reshape(-1) == label, :].mean(axis=0).reshape(-1)
        )

    outputs['dataset_name'] = dataset_name

    results_autozi_outputs.append(outputs)

training: 100%|██████████| 600/600 [02:40<00:00,  3.05it/s]
training: 100%|██████████| 600/600 [07:28<00:00,  1.45it/s]
training: 100%|██████████| 600/600 [06:24<00:00,  1.55it/s]


## Compute metrics from AutoZI's outputs

In [11]:
# Create a mask for a flattened gene-label entry matrix for a given label and, optionally, another label
def make_label_mask(shape, label, mask_other=None):
    """Build a flattened boolean mask selecting one label column of a gene-label matrix.

    Parameters
    ----------
    shape : tuple
        Shape of the 2-D (gene, label) matrix the mask refers to.
    label : int
        Index of the column (label) to select.
    mask_other : array-like of bool, optional
        Extra mask, broadcastable to ``shape``; when given, only entries that are
        True in both masks remain selected.

    Returns
    -------
    numpy.ndarray
        1-D boolean array obtained by flattening the 2-D mask.
    """
    label_mask = np.zeros(shape, dtype=bool)
    label_mask[:, label] = True
    if mask_other is None:
        return label_mask.reshape(-1)
    return np.logical_and(label_mask, mask_other).reshape(-1)


For each dataset under scrutiny, we use these $\alpha^g, \beta^g$ to estimate the ZI probabilities $q(\delta_g < 0.5)$, and we compute classification metrics for the default decision rule $q(\delta_g < 0.5) > 0.5$ using tools from `classification_metrics.py`.

In [17]:
# One dictionary of metrics/scores per dataset; turned into a DataFrame at the end of the cell.
results_autozi_data_list = []

for outputs in results_autozi_outputs:

    labels = outputs['labels']
    cell_types = outputs['cell_types']
    genes_to_study = outputs['genes_to_study']
    ind_dataset = outputs['ind_dataset']

    # Boolean (gene, cell type) mask: in each cell-type column, the entries set to True are the
    # indices of the genes of interest determined above for that cell type and this dataset
    means_emp = outputs["means_emp"]
    mask_means_emp = np.zeros((means_emp[0].size, len(means_emp)), dtype=bool)
    for label in means_emp:
        mask_means_emp[genes_to_study[cell_types[label]][ind_dataset], label] = True

    # The ground-truth is set to ZI for all genes for computational purposes but we do NOT assume
    # all genes are ZI
    is_zinb_gt = np.ones(outputs['alpha_posterior'].shape, dtype=bool)
    # Dictionary associating each cell type (name stripped of spaces) to a flattened gene-label mask
    # holding the adequate gene indices for that cell type
    masks = {
        cell_types[label].replace(' ', ''): make_label_mask(is_zinb_gt.shape, label, mask_means_emp)
        for label in np.unique(labels)
    }

    # Flatten only after the masks have been built from the 2-D shape
    is_zinb_gt = is_zinb_gt.ravel()

    metric_list_bernoulli = [ConfusionMatrixMetric(is_zinb_gt, masks=masks)]

    model_score_evals = [
        AutoZIBernoulliThresholdEval(
            'bernoullithreshold50', outputs, metric_list_bernoulli, threshold=0.50
        ),
    ]

    results_autozi_data = {}
    for model_score_eval in model_score_evals:
        # Metrics computed for AutoZI on this dataset (here confusion matrix metrics);
        # positives are ZI genes
        results_autozi_data.update(model_score_eval.compute_all_metrics())
        # Keep the raw scores the metrics were computed from, stored under the evaluator's name
        results_autozi_data[model_score_eval.name] = model_score_eval.scores

    # Carry over the dataset-level bookkeeping (None when a key is absent)
    for key in ['means_emp', 'dataset_name']:
        results_autozi_data[key] = outputs.get(key)

    results_autozi_data_list.append(results_autozi_data)

results_autozi = pd.DataFrame(results_autozi_data_list)

In [18]:
# Sort the results by dataset name and use that name as the index.
# set_index moves 'dataset_name' out of the columns, so re-running the unguarded statement would
# raise a KeyError; the guard makes this cell safe to execute more than once.
if 'dataset_name' in results_autozi.columns:
    results_autozi = results_autozi.sort_values(by=['dataset_name']).set_index(['dataset_name'])

## Percentages of ZINB genes

In [19]:
# Maps each confusion-matrix metric prefix (one per cell type) to the cell-type name shown in tables.
dict_metrics_to_celltypes = {
    'bernoullithreshold50_confusionmatrix_' + cell_type_key: displayed_name
    for cell_type_key, displayed_name in [
        ('Bcells', 'B cells'),
        ('CD14+Monocytes', 'CD14+ monocytes'),
        ('CD4Tcells', 'CD4 T cells'),
        ('CD8Tcells', 'CD8 T cells'),
        ('NKcells', 'NK cells'),
    ]
}

In [20]:
def _confusion_matrix_table(results, entry,
                            cell_type_keys=('Bcells', 'CD14+Monocytes', 'CD4Tcells', 'CD8Tcells')):
    """Extract one confusion-matrix entry as a (cell type x dataset) table.

    Parameters
    ----------
    results : pandas.DataFrame
        Results frame with one row per dataset and columns named
        ``bernoullithreshold50_confusionmatrix_<cell type>_<entry>``.
    entry : str
        Confusion-matrix entry to extract (``'tp'`` and ``'total'`` are used below).
    cell_type_keys : sequence of str
        Space-free cell-type names, in the row order of the returned table.

    Returns
    -------
    pandas.DataFrame
        Rows are cell types (renamed through ``dict_metrics_to_celltypes``), columns are the
        rows of ``results``.
    """
    suffix = '_' + entry
    cols = ['bernoullithreshold50_confusionmatrix_' + cell_type + suffix
            for cell_type in cell_type_keys]
    return (
        results[cols]
        # Every selected column ends with the suffix by construction: strip it there only,
        # so an identical substring elsewhere in a name would be left alone
        .rename(columns=lambda col: col[:-len(suffix)])
        .T
        .rename(index=dict_metrics_to_celltypes)
    )


# With the all-ZI ground truth used for the metrics, the 'tp' entry counts the genes predicted ZINB
results_autozi_restricted = _confusion_matrix_table(results_autozi, 'tp')

print('Numbers of ZINB genes per cell type and dataset')
display(results_autozi_restricted)

results_autozi_restricted_denominator = _confusion_matrix_table(results_autozi, 'total')

print('Numbers of total genes under study per cell type and dataset')
display(results_autozi_restricted_denominator)

# Share of ZINB genes among the genes under study, in percent, rounded to two decimals
results_autozi_restricted_percentages = (
    results_autozi_restricted / results_autozi_restricted_denominator
).apply(lambda s: round(s * 100, 2))


print('Percentages of ZINB genes per cell type and dataset')
display(results_autozi_restricted_percentages)

Numbers of ZINB genes per cell type and dataset


dataset_name,10x-v1-pbmc3k,10x-v2-pbmc8k,10x-v3-pbmc_10k_protein_v3
B cells,79,62,25
CD14+ monocytes,77,57,31
CD4 T cells,84,77,31
CD8 T cells,78,58,41


Numbers of total genes under study per cell type and dataset


dataset_name,10x-v1-pbmc3k,10x-v2-pbmc8k,10x-v3-pbmc_10k_protein_v3
B cells,203,203,203
CD14+ monocytes,228,228,228
CD4 T cells,217,217,217
CD8 T cells,211,211,211


Percentages of ZINB genes per cell type and dataset


dataset_name,10x-v1-pbmc3k,10x-v2-pbmc8k,10x-v3-pbmc_10k_protein_v3
B cells,38.92,30.54,12.32
CD14+ monocytes,33.77,25.0,13.6
CD4 T cells,38.71,35.48,14.29
CD8 T cells,36.97,27.49,19.43
