In [None]:
import os

from scvi.dataset import EbiData, MouseAtlas, UnionDataset, AnnDatasetFromAnnData
# The star import stays ahead of the explicit third-party imports so that those explicit
# names take precedence over anything Eval_basis happens to export.
from Eval_basis import *
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sparse
from tqdm import tqdm_notebook as tqdm

In [None]:
# Gene map whose "ensembl" column provides the replacement gene identifiers for every
# dataset loaded below. The index is lower-cased so lookups can be done case-insensitively.
conv = pd.read_csv("./data/gene_maps/hugo_mouse_genes-proteincoding.csv", header=0, index_col=0)
conv.index = conv.index.str.lower()

data_path = "./data"
mouse_data_path = os.path.join(data_path, "mouse_data")
dsets = []
# Sorted so the order of `dsets` does not depend on the file system's listing order.
for file in sorted(os.listdir(mouse_data_path)):
    if not file.lower().endswith(".h5ad"):
        # Skip entries sc.read_h5ad is not meant for (hidden files, sub-directories, ...).
        continue

    dset = sc.read_h5ad(os.path.join(mouse_data_path, file))
    dset.obs.rename(columns={"cell_ontology_class": "cell_types"}, inplace=True)

    dset = AnnDatasetFromAnnData(dset)

    # Look the gene names up in lower case: `conv.index` was lower-cased above, so an
    # upper-cased lookup misses every name that contains a letter and masks out all genes.
    gns_conved = conv.reindex(np.char.lower(dset.gene_names))["ensembl"]

    # Work on a dense matrix so the column selection below is plain numpy indexing.
    X = dset.X if isinstance(dset.X, np.ndarray) else dset.X.toarray()

    # Plain boolean array (one entry per gene) marking the genes that have a mapping;
    # using an array instead of a Series avoids index alignment on repeated gene names.
    mask = gns_conved.notnull().values

    # Keep only the mapped genes, renamed to the values of the "ensembl" column.
    dset.gene_names = gns_conved.values[mask].astype(str)
    dset.X = X[:, mask]
    # Replace the non-ASCII "ï" in cell type labels with a plain "i".
    dset.cell_types = np.array([ct.replace("ï", "i") for ct in dset.cell_types])

    dsets.append(dset)

In [None]:
# Container for the combined mouse datasets. It is created with the "./data" directory,
# the protein-coding mouse gene map and low_memory switched off; the individual datasets
# are joined into it in a later cell.
mouse_muris_senis = UnionDataset(
    "./data",
    low_memory=False,
    gene_map_load_filename="gene_maps/ensembl_mouse_genes-proteincoding",
)

In [None]:
# Join the datasets collected in `dsets` into the union dataset, with both the data
# source and the data target set to "memory", then give the result a readable name.
mouse_muris_senis.join_datasets(
    gene_datasets=dsets,
    data_source="memory",
    data_target="memory",
)
mouse_muris_senis.name = "Tabula Muris Senis"

In [None]:
# Training / plotting configuration shared with the plotting cell further down.
n_epochs = 100
colors = None

print("Training VAE")

# Train on the joined Tabula Muris Senis dataset. The previous argument `complete_mouse`
# is not defined anywhere in this notebook; `mouse_muris_senis` is the dataset built above.
# NOTE(review): train_vae is not defined in this notebook (presumably it comes from the
# Eval_basis star import) -- confirm the meaning of its second and third positional arguments.
trainer = train_vae(mouse_muris_senis, "./data", "../trained_models/tabula_muris_data", n_epochs=n_epochs)



In [None]:
# Second union dataset, created with the same gene map as the Tabula Muris Senis one,
# into which a single EbiData dataset (constructed from "./data") is joined in memory.
mouse_ebi_celltype_data = UnionDataset(
    "./data",
    low_memory=False,
    gene_map_load_filename="gene_maps/ensembl_mouse_genes-proteincoding",
)
mouse_ebi_celltype_data.join_datasets(
    gene_datasets=[EbiData("./data")],
    data_source="memory",
    data_target="memory",
)

In [None]:
# Square of the configured line marker size, passed to plot_tsne as `s`.
# NOTE(review): `mpl` is not imported explicitly in this notebook (presumably matplotlib
# via the Eval_basis star import) -- confirm.
dot_size = mpl.rcParams['lines.markersize'] ** 2.0

# t-SNE plot of the dataset the model was trained on. The previous argument
# `complete_mouse` is not defined in this notebook; `mouse_muris_senis` is the dataset
# that was actually built.
posterior_big = plot_tsne(trainer, trainer.model, mouse_muris_senis, "./tabula_muris_tsne",
                          image_datatype="pdf", colors=colors, s=dot_size, edgecolors='black')

# t-SNE plot of the EBI dataset using the same trained model. The previous names
# `trainer_big` and `ebi_with_celltypes` are not defined in this notebook; the objects
# created above are `trainer` and `mouse_ebi_celltype_data`.
posterior_ebi_annotated = plot_tsne(trainer, trainer.model, mouse_ebi_celltype_data,
                                    "./plots/ebi_annotated_in_tabula-muris",
                                    image_datatype="pdf", colors=colors, s=dot_size, edgecolors='black')