In [1]:
from Eval_basis import *
import pandas as pd

[2019-08-13 12:08:34,318] INFO - scvi._settings | Added StreamHandler with custom formatter to 'scvi' logger.


In [2]:
np.random.seed(1)
%matplotlib inline

In [3]:
data_full = EbiData("./data", experiment="E-ENAD-15")

[2019-08-13 12:09:37,720] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2019-08-13 12:09:37,722] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2019-08-13 12:09:38,782] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2019-08-13 12:09:39,822] INFO - scvi.dataset.dataset | Downsampled from 50896 to 35577 cells


In [4]:
data_big_mapped = UnionDataset("./data", map_fname="ensembl_mouse_genes-proteincoding", low_memory=False)
data_big_mapped.union_from_memory([data_full])
data_big_mapped.filter_cell_types(np.array([ct for ct in data_big_mapped.cell_types if ct != "not available"]))

[2019-08-13 12:10:42,485] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2019-08-13 12:10:42,487] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2019-08-13 12:10:42,491] INFO - scvi.dataset.dataset | Joined 1 datasets to one of shape 35577 x 22250.
[2019-08-13 12:10:43,476] INFO - scvi.dataset.dataset | Downsampled from 35577 to 31394 cells


In [5]:
agg = data_full.obs.groupby(["Sample Characteristic[organism part]", "cell_types"]).size()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(agg.sort_index())

Sample Characteristic[organism part]      cell_types                                        
aorta                                     endothelial cell                                       188
                                          erythrocyte                                             91
                                          fibroblast                                              70
                                          not available                                          694
                                          professional antigen presenting cell                    59
ascending colon                           Brush cell of epithelium proper of large intestine      53
                                          enterocyte of epithelium of large intestine            761
                                          enteroendocrine cell                                    29
                                          epithelial cell of large intestine                    141

In [None]:
n_epochs = 100
colors=None
for tissue in np.unique(agg.index.get_level_values(0)): 
    print("Training VAE for tissue ", tissue)
    cutout_cts = agg[tissue].index.values
    if "not available" in cutout_cts:
        cutout_cts = np.delete(cutout_cts, np.where(cutout_cts == "not available")[0])

    rem_ct = data_big_mapped.cell_types[~np.isin(data_big_mapped.cell_types, cutout_cts)]
    data_small = copy.deepcopy(data_big_mapped)
    data_big = copy.deepcopy(data_big_mapped)

    data_small.filter_cell_types(cutout_cts)
    data_big.filter_cell_types(rem_ct)
    
    trainer_big = train_vae(data_big, "./data", f"big_{tissue}_data_portion", n_epochs=n_epochs)
    trainer_small = train_vae(data_small, "./data", f"small_{tissue}_data_portion", n_epochs=n_epochs)
    dot_size = (mpl.rcParams['lines.markersize'] ** 2.0)

    posterior_big = plot_tsne(trainer_big, trainer_big.model, data_big, f"./plots/big_{tissue}_data_portion",
                              colors=colors, s=dot_size, edgecolors='black')
    posterior_small = plot_tsne(trainer_small, trainer_small.model, data_small, f"./plots/small_{tissue}_data_portion",
                                colors=colors, s=dot_size, edgecolors='black')
    posterior_small_in_big = plot_tsne(trainer_big, trainer_big.model, data_small, f"./plots/small_{tissue}_data_portion_in_big",
                                       colors=colors, s=dot_size, edgecolors='black')

Training VAE for tissue  aorta


[2019-08-13 12:10:43,861] INFO - scvi.dataset.dataset | Downsampled from 31394 to 3985 cells
[2019-08-13 12:10:44,706] INFO - scvi.dataset.dataset | Downsampled from 31394 to 27409 cells


Training VAE for tissue  ascending colon


[2019-08-13 12:11:56,022] INFO - scvi.dataset.dataset | Downsampled from 31394 to 2725 cells
[2019-08-13 12:11:56,850] INFO - scvi.dataset.dataset | Downsampled from 31394 to 28669 cells


Training VAE for tissue  back skin


[2019-08-13 12:12:53,470] INFO - scvi.dataset.dataset | Downsampled from 31394 to 2818 cells
[2019-08-13 12:12:54,352] INFO - scvi.dataset.dataset | Downsampled from 31394 to 28576 cells


Training VAE for tissue  bone marrow


[2019-08-13 12:13:51,595] INFO - scvi.dataset.dataset | Downsampled from 31394 to 6015 cells
[2019-08-13 12:13:52,409] INFO - scvi.dataset.dataset | Downsampled from 31394 to 25379 cells


Training VAE for tissue  cerebellum


[2019-08-13 12:15:10,912] INFO - scvi.dataset.dataset | Downsampled from 31394 to 7502 cells
[2019-08-13 12:15:11,680] INFO - scvi.dataset.dataset | Downsampled from 31394 to 23892 cells


Training VAE for tissue  cerebral cortex


[2019-08-13 12:16:31,442] INFO - scvi.dataset.dataset | Downsampled from 31394 to 7482 cells
[2019-08-13 12:16:32,197] INFO - scvi.dataset.dataset | Downsampled from 31394 to 23912 cells


training: 100%|██████████| 100/100 [03:59<00:00,  2.76s/it]
training: 100%|██████████| 100/100 [01:09<00:00,  1.24it/s]
Training VAE for tissue  descending colon


[2019-08-13 12:23:03,685] INFO - scvi.dataset.dataset | Downsampled from 31394 to 2725 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 2725 cells
[2019-08-13 12:23:04,525] INFO - scvi.dataset.dataset | Downsampled from 31394 to 28669 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 28669 cells


training: 100%|██████████| 100/100 [04:48<00:00,  3.37s/it]
training: 100%|██████████| 100/100 [00:30<00:00,  2.83it/s]


  plt.figure(figsize=(16, 16))


Training VAE for tissue  diaphragm


[2019-08-13 12:29:21,361] INFO - scvi.dataset.dataset | Downsampled from 31394 to 3310 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 3310 cells
[2019-08-13 12:29:22,222] INFO - scvi.dataset.dataset | Downsampled from 31394 to 28084 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 28084 cells


training: 100%|██████████| 100/100 [04:46<00:00,  3.31s/it]
training: 100%|██████████| 100/100 [00:30<00:00,  2.73it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  endocrine pancreas


[2019-08-13 12:35:44,333] INFO - scvi.dataset.dataset | Downsampled from 31394 to 3746 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 3746 cells
[2019-08-13 12:35:45,190] INFO - scvi.dataset.dataset | Downsampled from 31394 to 27648 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 27648 cells


training: 100%|██████████| 100/100 [04:25<00:00,  3.08s/it]
training: 100%|██████████| 100/100 [00:35<00:00,  2.43it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  exocrine pancreas


[2019-08-13 12:41:55,498] INFO - scvi.dataset.dataset | Downsampled from 31394 to 3746 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 3746 cells
[2019-08-13 12:41:56,341] INFO - scvi.dataset.dataset | Downsampled from 31394 to 27648 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 27648 cells


training: 100%|██████████| 100/100 [04:33<00:00,  3.18s/it]
training: 100%|██████████| 100/100 [00:35<00:00,  2.43it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  gonadal fat pad


[2019-08-13 12:48:13,083] INFO - scvi.dataset.dataset | Downsampled from 31394 to 6770 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 6770 cells
[2019-08-13 12:48:13,836] INFO - scvi.dataset.dataset | Downsampled from 31394 to 24624 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 24624 cells


training: 100%|██████████| 100/100 [04:06<00:00,  2.82s/it]
training: 100%|██████████| 100/100 [01:02<00:00,  1.38it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  heart


[2019-08-13 12:54:47,515] INFO - scvi.dataset.dataset | Downsampled from 31394 to 4092 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 4092 cells
[2019-08-13 12:54:48,347] INFO - scvi.dataset.dataset | Downsampled from 31394 to 27302 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 27302 cells


training: 100%|██████████| 100/100 [04:28<00:00,  3.11s/it]
training: 100%|██████████| 100/100 [00:37<00:00,  2.27it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  heart left atrium


[2019-08-13 13:01:07,317] INFO - scvi.dataset.dataset | Downsampled from 31394 to 4684 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 4684 cells
[2019-08-13 13:01:08,185] INFO - scvi.dataset.dataset | Downsampled from 31394 to 26710 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 26710 cells


training: 100%|██████████| 100/100 [04:28<00:00,  3.12s/it]
training: 100%|██████████| 100/100 [00:45<00:00,  1.97it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  heart left atrium and heart right atrium


[2019-08-13 13:07:44,598] INFO - scvi.dataset.dataset | Downsampled from 31394 to 1643 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 1643 cells
[2019-08-13 13:07:45,560] INFO - scvi.dataset.dataset | Downsampled from 31394 to 29751 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 29751 cells


training: 100%|██████████| 100/100 [05:02<00:00,  3.41s/it]
training: 100%|██████████| 100/100 [00:17<00:00,  5.39it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  heart left ventricle


[2019-08-13 13:13:53,891] INFO - scvi.dataset.dataset | Downsampled from 31394 to 4684 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 4684 cells
[2019-08-13 13:13:54,783] INFO - scvi.dataset.dataset | Downsampled from 31394 to 26710 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 26710 cells


training: 100%|██████████| 100/100 [04:38<00:00,  3.27s/it]
training: 100%|██████████| 100/100 [00:45<00:00,  2.01it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  heart right atrium


[2019-08-13 13:20:39,615] INFO - scvi.dataset.dataset | Downsampled from 31394 to 4684 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 4684 cells
[2019-08-13 13:20:40,483] INFO - scvi.dataset.dataset | Downsampled from 31394 to 26710 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 26710 cells


training: 100%|██████████| 100/100 [04:35<00:00,  3.31s/it]
training: 100%|██████████| 100/100 [00:45<00:00,  1.93it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  heart right ventricle


[2019-08-13 13:27:25,251] INFO - scvi.dataset.dataset | Downsampled from 31394 to 4684 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 4684 cells
[2019-08-13 13:27:26,183] INFO - scvi.dataset.dataset | Downsampled from 31394 to 26710 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 26710 cells


training: 100%|██████████| 100/100 [04:29<00:00,  3.10s/it]
training: 100%|██████████| 100/100 [00:47<00:00,  1.91it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  hippocampus


[2019-08-13 13:34:05,327] INFO - scvi.dataset.dataset | Downsampled from 31394 to 7482 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 7482 cells
[2019-08-13 13:34:06,068] INFO - scvi.dataset.dataset | Downsampled from 31394 to 23912 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 23912 cells


training: 100%|██████████| 100/100 [03:57<00:00,  2.75s/it]
training: 100%|██████████| 100/100 [01:11<00:00,  1.26it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  interscapular brown adipose tissue


[2019-08-13 13:40:40,137] INFO - scvi.dataset.dataset | Downsampled from 31394 to 6770 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 6770 cells
[2019-08-13 13:40:40,940] INFO - scvi.dataset.dataset | Downsampled from 31394 to 24624 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 24624 cells


training: 100%|██████████| 100/100 [04:10<00:00,  2.84s/it]
training: 100%|██████████| 100/100 [01:03<00:00,  1.37it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  kidney


[2019-08-13 13:47:17,537] INFO - scvi.dataset.dataset | Downsampled from 31394 to 2936 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 2936 cells
[2019-08-13 13:47:18,410] INFO - scvi.dataset.dataset | Downsampled from 31394 to 28458 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 28458 cells


training: 100%|██████████| 100/100 [04:43<00:00,  3.29s/it]
training: 100%|██████████| 100/100 [00:26<00:00,  3.26it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  limb muscle


[2019-08-13 13:53:28,152] INFO - scvi.dataset.dataset | Downsampled from 31394 to 5312 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 5312 cells
[2019-08-13 13:53:28,970] INFO - scvi.dataset.dataset | Downsampled from 31394 to 26082 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 26082 cells


training: 100%|██████████| 100/100 [04:24<00:00,  3.06s/it]
training: 100%|██████████| 100/100 [00:47<00:00,  1.80it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  liver


[2019-08-13 14:00:05,375] INFO - scvi.dataset.dataset | Downsampled from 31394 to 1984 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 1984 cells
[2019-08-13 14:00:06,286] INFO - scvi.dataset.dataset | Downsampled from 31394 to 29410 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 29410 cells


training: 100%|██████████| 100/100 [04:49<00:00,  3.33s/it]
training: 100%|██████████| 100/100 [00:18<00:00,  4.75it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  lung


[2019-08-13 14:06:03,169] INFO - scvi.dataset.dataset | Downsampled from 31394 to 4930 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 4930 cells
[2019-08-13 14:06:04,008] INFO - scvi.dataset.dataset | Downsampled from 31394 to 26464 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 26464 cells


training: 100%|██████████| 100/100 [04:21<00:00,  3.02s/it]
training: 100%|██████████| 100/100 [00:45<00:00,  1.89it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  mammary gland


[2019-08-13 14:12:33,206] INFO - scvi.dataset.dataset | Downsampled from 31394 to 4262 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 4262 cells
[2019-08-13 14:12:34,056] INFO - scvi.dataset.dataset | Downsampled from 31394 to 27132 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 27132 cells


training: 100%|██████████| 100/100 [04:27<00:00,  3.11s/it]
training: 100%|██████████| 100/100 [00:40<00:00,  2.14it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  mesenteric adipose tissue


[2019-08-13 14:18:57,429] INFO - scvi.dataset.dataset | Downsampled from 31394 to 6770 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 6770 cells
[2019-08-13 14:18:58,214] INFO - scvi.dataset.dataset | Downsampled from 31394 to 24624 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 24624 cells


training: 100%|██████████| 100/100 [04:04<00:00,  2.82s/it]
training: 100%|██████████| 100/100 [01:02<00:00,  1.38it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  spleen


[2019-08-13 14:25:28,795] INFO - scvi.dataset.dataset | Downsampled from 31394 to 2257 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 2257 cells
[2019-08-13 14:25:29,698] INFO - scvi.dataset.dataset | Downsampled from 31394 to 29137 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 29137 cells


training: 100%|██████████| 100/100 [04:46<00:00,  3.31s/it]
training: 100%|██████████| 100/100 [00:20<00:00,  4.27it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  striatum


[2019-08-13 14:31:30,335] INFO - scvi.dataset.dataset | Downsampled from 31394 to 7502 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 7502 cells
[2019-08-13 14:31:31,105] INFO - scvi.dataset.dataset | Downsampled from 31394 to 23892 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 23892 cells


training: 100%|██████████| 100/100 [03:56<00:00,  2.74s/it]
training: 100%|██████████| 100/100 [01:09<00:00,  1.25it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  subcutaneous adipose tissue


[2019-08-13 14:37:59,898] INFO - scvi.dataset.dataset | Downsampled from 31394 to 6770 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 6770 cells
[2019-08-13 14:38:00,647] INFO - scvi.dataset.dataset | Downsampled from 31394 to 24624 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 24624 cells


training: 100%|██████████| 100/100 [04:03<00:00,  2.80s/it]
training: 100%|██████████| 100/100 [01:02<00:00,  1.36it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  thymus


[2019-08-13 14:44:31,176] INFO - scvi.dataset.dataset | Downsampled from 31394 to 1014 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 1014 cells
[2019-08-13 14:44:32,129] INFO - scvi.dataset.dataset | Downsampled from 31394 to 30380 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 30380 cells


training: 100%|██████████| 100/100 [05:06<00:00,  3.54s/it]
training: 100%|██████████| 100/100 [00:08<00:00, 11.14it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  tongue


[2019-08-13 14:50:27,543] INFO - scvi.dataset.dataset | Downsampled from 31394 to 1385 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 1385 cells
[2019-08-13 14:50:28,420] INFO - scvi.dataset.dataset | Downsampled from 31394 to 30009 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 30009 cells


training: 100%|██████████| 100/100 [04:51<00:00,  3.37s/it]
training: 100%|██████████| 100/100 [00:28<00:00,  2.97it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  trachea


[2019-08-13 14:56:31,704] INFO - scvi.dataset.dataset | Downsampled from 31394 to 3195 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 3195 cells
[2019-08-13 14:56:32,565] INFO - scvi.dataset.dataset | Downsampled from 31394 to 28199 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 28199 cells


training: 100%|██████████| 100/100 [04:37<00:00,  3.20s/it]
training: 100%|██████████| 100/100 [00:30<00:00,  2.81it/s]


  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))
  plt.figure(figsize=(16, 16))


Training VAE for tissue  urinary bladder


[2019-08-13 15:02:45,664] INFO - scvi.dataset.dataset | Downsampled from 31394 to 982 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 982 cells
[2019-08-13 15:02:46,576] INFO - scvi.dataset.dataset | Downsampled from 31394 to 30412 cells
INFO:scvi.dataset.dataset:Downsampled from 31394 to 30412 cells


training:  25%|██▌       | 25/100 [01:14<04:16,  3.42s/it]

In [None]:
data_big_mapped.X

In [None]:
data_big.gene_names

In [None]:
np.isin(data_full.gene_names, data_big_mapped.gene_names).sum() / len(data_full.gene_names)