In [1]:
from scvi.dataset import EbiData, MouseAtlas, UnionDataset, AnnDatasetFromAnnData
from Eval_basis import *
import scanpy as sc
import pandas as pd
import scipy.sparse as sparse
from tqdm import tqdm_notebook as tqdm

[2019-09-04 10:52:34,768] INFO - scvi._settings | Added StreamHandler with custom formatter to 'scvi' logger.


In [2]:
# Union container for the mouse data, configured with the protein-coding
# Ensembl mouse gene map named below.
# NOTE(review): "./data" is presumably the root the gene-map path is resolved
# against, and low_memory=False presumably keeps the joined data in RAM --
# confirm both against the UnionDataset signature.
mouse_union = UnionDataset(
    "./data",
    gene_map_load_filename="gene_maps/ensembl_mouse_genes-proteincoding",
    low_memory=False,
)

In [3]:
# Locations of the Mouse Atlas input files, keyed by the role each file plays.
fpaths_and_fnames = {
    "data": "./data/mouse_atlas/cleaned_data_sparse.npz",
    "cell": "./data/mouse_atlas/cell_annotation.csv",
    "gene": "./data/mouse_atlas/gene_annotation.csv",
    "pheno": "./data/mouse_atlas/phenotype_data.csv",
}

# Load the Mouse Atlas and merge it into the union, memory to memory.
# The dataset is constructed inline on purpose: binding it to a notebook-level
# name would keep a second reference to it alive after the join.
# NOTE(review): the two positional booleans are opaque from here -- check the
# MouseAtlas signature and pass them by keyword once their names are confirmed.
# This call must stay the last expression so the notebook displays its result.
mouse_union.join_datasets(
    data_source="memory",
    data_target="memory",
    gene_datasets=[
        MouseAtlas("./data/mouse_atlas", fpaths_and_fnames, True, False),
    ],
)

Concatenating datasets: 100%|██████████| 1/1 [13:31<00:00, 811.43s/it]
[2019-09-04 11:07:16,829] INFO - scvi.dataset.dataset | Union dataset is set to ignore batch annotation.
[2019-09-04 11:07:26,619] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2019-09-04 11:07:26,752] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2019-09-04 11:07:26,782] INFO - scvi.dataset.dataset | Union dataset is set to ignore batch annotation.
[2019-09-04 11:07:26,799] INFO - scvi.dataset.dataset | Joined 1 datasets to one of shape 1331984 x 22250.


GeneExpressionDataset object with n_cells x nb_genes = 1331984 x 22250
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'local_vars', 'local_means', 'labels', 'batch_indices'
    cell_categorical_attribute_names: 'labels', 'batch_indices'

In [4]:
# Training configuration.
n_epochs = 100
# No explicit colour mapping; this value is handed to plot_tsne further down.
colors = None

print("Training VAE")

# Train the VAE on the full mouse union.
# NOTE(review): the second and third arguments look like the data directory and
# the location/name under which the trained model is kept -- confirm against
# train_vae (presumably provided by `from Eval_basis import *`).
# The recorded run of this cell took roughly 3h50m for 100 epochs.
trainer = train_vae(mouse_union, "./data", "../trained_models/mouse_atlas", n_epochs=n_epochs)



Training VAE
Initializing training.
training: 100%|██████████| 100/100 [3:49:32<00:00, 157.67s/it] 


In [5]:
# Second union container, set up with the same protein-coding Ensembl mouse
# gene map as the atlas union, to hold the EBI data.
mouse_ebi_celltype_data = UnionDataset(
    "./data",
    gene_map_load_filename="gene_maps/ensembl_mouse_genes-proteincoding",
    low_memory=False,
)

# Load the EBI dataset and merge it into that container, memory to memory.
# This call must stay the last expression so the notebook displays its result.
mouse_ebi_celltype_data.join_datasets(
    data_source="memory",
    data_target="memory",
    gene_datasets=[EbiData("./data")],
)

[2019-09-04 15:00:25,276] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
INFO:scvi.dataset.dataset:Remapping labels to [0,N]
[2019-09-04 15:00:25,282] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
INFO:scvi.dataset.dataset:Remapping batch_indices to [0,N]
[2019-09-04 15:00:26,367] INFO - scvi.dataset.dataset | Computing the library size for the new data
INFO:scvi.dataset.dataset:Computing the library size for the new data
[2019-09-04 15:00:27,462] INFO - scvi.dataset.dataset | Downsampled from 50896 to 35577 cells
INFO:scvi.dataset.dataset:Downsampled from 50896 to 35577 cells
Concatenating datasets: 100%|██████████| 1/1 [00:34<00:00, 34.92s/it]
[2019-09-04 15:01:02,886] INFO - scvi.dataset.dataset | Union dataset is set to ignore batch annotation.
INFO:scvi.dataset.dataset:Union dataset is set to ignore batch annotation.
[2019-09-04 15:01:04,208] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
INFO:scvi.dataset.dataset:Remapping labels to [0,N]
[2019-

GeneExpressionDataset object with n_cells x nb_genes = 35577 x 22250
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'local_vars', 'local_means', 'labels', 'batch_indices'
    cell_categorical_attribute_names: 'labels', 'batch_indices'

In [10]:
# Marker size for the scatter plots: matplotlib's configured line marker size, squared.
# NOTE(review): `mpl` has no explicit import in the visible import cell; it is
# presumably supplied by `from Eval_basis import *` -- confirm, and prefer an
# explicit `import matplotlib as mpl` in the import cell.
dot_size = mpl.rcParams["lines.markersize"] ** 2.0

# Keyword arguments shared by both t-SNE figures, so the two calls cannot drift apart.
tsne_options = dict(image_datatype="pdf", colors=colors, s=dot_size, edgecolors="black")

# t-SNE of the full mouse atlas union, using the trainer's model.
posterior_big = plot_tsne(
    trainer,
    trainer.model,
    mouse_union,
    "./plots/mouse_atlas_tsne",
    **tsne_options,
)

# t-SNE of the EBI data, using the same trainer and model as above.
posterior_ebi_annotated = plot_tsne(
    trainer,
    trainer.model,
    mouse_ebi_celltype_data,
    "./plots/ebi_annotated_in_mouse_atlas",
    **tsne_options,
)