In [1]:
from scvi.dataset import EbiData, MouseAtlas, UnionDataset, AnnDatasetFromAnnData
from Eval_basis import *
import scanpy as sc
import pandas as pd
import scipy.sparse as sparse

[2019-08-22 17:15:08,109] INFO - scvi._settings | Added StreamHandler with custom formatter to 'scvi' logger.


In [None]:
# First EBI dataset, loaded with result_file="raw".
# NOTE(review): 'E-MTAB-6946' looks like an experiment accession id — confirm against EbiData.
ebi_1 = EbiData(
    "./data",
    "E-MTAB-6946",
    result_file="raw",
)

In [None]:
# Second EBI dataset, loaded with result_file="raw".
# NOTE(review): 'E-MTAB-7320' looks like an experiment accession id — confirm against EbiData.
ebi_2 = EbiData(
    "./data",
    "E-MTAB-7320",
    result_file="raw",
)

In [None]:
# Paths of the four mouse-atlas input files, keyed by the role names that are
# handed to MouseAtlas.
fpaths_and_fnames = dict(
    data="./data/mouse_atlas/cleaned_data_sparse.npz",
    cell="./data/mouse_atlas/cell_annotation.csv",
    gene="./data/mouse_atlas/gene_annotation.csv",
    pheno="./data/mouse_atlas/phenotype_data.csv",
)

# NOTE(review): the two trailing positional flags (True, False) are opaque
# from here — confirm their meaning against the MouseAtlas signature.
mouse_atlas = MouseAtlas("./data/mouse_atlas", fpaths_and_fnames, True, False)

In [None]:
# Gene conversion table. Its index is lower-cased so the per-dataset lookup
# below is case-insensitive; the "ensembl" column supplies the replacement ids.
# NOTE(review): presumably the index holds gene symbols — confirm against the CSV.
conv = pd.read_csv("./data/gene_maps/hugo_mouse_genes-proteincoding.csv", header=0, index_col=0)
conv.index = conv.index.str.lower()

data_path = "./data"
mouse_data_path = os.path.join(data_path, "mouse_data")

# os.listdir returns directory entries in arbitrary order and includes every
# entry. Sort the names so the datasets are always collected in the same
# order, and keep only .h5ad files because any other entry would make
# sc.read_h5ad fail.
h5ad_files = sorted(name for name in os.listdir(mouse_data_path) if name.endswith(".h5ad"))
if not h5ad_files:
    raise FileNotFoundError(f"No .h5ad files found in {mouse_data_path}")

dsets = []
for file in h5ad_files:
    dset = sc.read_h5ad(os.path.join(mouse_data_path, file))
    # Expose the "cell_ontology_class" annotation under the column name "cell_types".
    dset.obs.rename(columns={"cell_ontology_class": "cell_types"}, inplace=True)

    dset = AnnDatasetFromAnnData(dset)

    # Look up every (lower-cased) gene name in the conversion table. Names that
    # are absent from the table, or whose "ensembl" entry is empty, come back
    # as NaN and are dropped through the mask.
    gns_conved = conv.reindex(np.char.lower(dset.gene_names))["ensembl"]
    # Plain boolean array: positional masking that does not depend on index alignment.
    mask = (~gns_conved.isnull()).values

    # Work on a dense matrix, converting only when X is not already an ndarray.
    X = dset.X if isinstance(dset.X, np.ndarray) else dset.X.toarray()

    dset.gene_names = gns_conved[mask].values.astype(str)
    dset.X = X[:, mask]
    # Normalise the "ï" character in cell-type labels to a plain "i".
    dset.cell_types = np.array([ct.replace("ï", "i") for ct in dset.cell_types])

    dsets.append(dset)

In [None]:
# Union dataset that will receive the per-file datasets collected in `dsets`.
mouse_muris_senis = UnionDataset(
    "./data",
    gene_map_load_filename="gene_maps/ensembl_mouse_genes-proteincoding",
    low_memory=False,
)

In [None]:
# Join the collected datasets, with both source and target set to "memory",
# then label the result.
mouse_muris_senis.join_datasets(
    data_source="memory",
    data_target="memory",
    gene_datasets=dsets,
)
mouse_muris_senis.name = "Tabula Muris Senis"

In [None]:
# Release this name's reference to the list of per-file datasets.
dsets = None

# Replace "ï" with a plain "i" in every cell-type label of the joined dataset.
cleaned_cell_types = [label.replace("ï", "i") for label in mouse_muris_senis.cell_types]
mouse_muris_senis.cell_types = np.array(cleaned_cell_types)

In [None]:
# Union dataset that will combine every mouse data source loaded above.
complete_mouse = UnionDataset(
    "./data",
    gene_map_load_filename="gene_maps/ensembl_mouse_genes-proteincoding",
    low_memory=False,
)

In [None]:
# Join the four in-memory sources; the target is "hdf5" with output name
# "mouse_data_all".
all_mouse_sources = [mouse_muris_senis, ebi_1, ebi_2, mouse_atlas]
complete_mouse.join_datasets(
    data_source="memory",
    data_target="hdf5",
    out_filename="mouse_data_all",
    gene_datasets=all_mouse_sources,
)

In [None]:
# Display the cell-type labels of the Tabula Muris Senis union for inspection.
mouse_muris_senis.cell_types

In [None]:
# Rebind `complete_mouse` to a UnionDataset constructed with
# data_load_filename="mouse_data_all" and low_memory=True.
complete_mouse = UnionDataset(
    "./data",
    gene_map_load_filename="gene_maps/ensembl_mouse_genes-proteincoding",
    data_load_filename="mouse_data_all",
    low_memory=True,
)

In [None]:
# Display the gene names of `complete_mouse` for inspection.
complete_mouse.gene_names

In [None]:
# Training settings. `colors` is left unset here and is forwarded to the
# plotting call later in the notebook.
n_epochs, colors = 100, None

print("Training VAE")

# Train on the complete mouse union; "./data" and "max_data_model" are passed
# through to train_vae as its second and third arguments.
trainer = train_vae(complete_mouse, "./data", "max_data_model", n_epochs=n_epochs)



In [None]:
# EbiData built from the data directory alone (no accession id or result_file,
# unlike ebi_1 / ebi_2).
# NOTE(review): the variable name suggests EbiData's defaults yield a dataset
# with cell-type annotations — confirm against EbiData.
ebi_with_celltypes = EbiData("./data")

In [None]:
# Marker size for the plots: matplotlib's default line marker size, squared.
dot_size = mpl.rcParams['lines.markersize'] ** 2.0

# Plot the complete mouse dataset with the trained model.
posterior_big = plot_tsne(trainer, trainer.model, complete_mouse, "./max_data_model",
                          colors=colors, s=dot_size, edgecolors='black')

# Plot the EBI dataset with the same trained model.
# This call previously had its continuation line commented out, leaving an
# unclosed parenthesis that made the whole cell a SyntaxError. It also used
# `trainer_big` and `tissue`, neither of which is defined in this notebook;
# `trainer` is the only trainer created above, and the output path no longer
# depends on `tissue`.
# NOTE(review): confirm that the "./plots" directory exists and that
# ebi_with_celltypes is compatible with the genes the model was trained on.
posterior_ebi_annotated = plot_tsne(trainer, trainer.model, ebi_with_celltypes,
                                    "./plots/ebi_annotated_in_max_data_model",
                                    colors=colors, s=dot_size, edgecolors='black')
