In [16]:
import os
import sys

import numpy as np
import pandas as pd


In [17]:
import git
# Resolve the root of the enclosing git working tree by searching upward from the
# current directory; the paths used further down are built relative to it.
path_repo_root = git.Repo('.', search_parent_directories=True).working_tree_dir

In [18]:
# Put <repo root>/src/lib on the module search path (inserted at position 1) so that
# es_precalculation can be imported. The path must be inserted before the import runs.
path_lib = os.path.join(path_repo_root, "src/lib")
sys.path.insert(1, path_lib)
# NOTE(review): the wildcard import hides where names come from. The calls to
# ctc_log_normalize, calculate_anova_sporadically_expressed_genes and
# calculate_per_anno_summary_stats presumably resolve to this module -- confirm and
# consider importing them explicitly.
from es_precalculation import *

# Constants

In [20]:
### Constants
# Directory the input expression files are read from (relative to the repository root).
input_dir = os.path.join(path_repo_root, "tmp-data/expression")
# Directory used when building output prefixes for the precalculation results.
output_dir = os.path.join(path_repo_root, "tmp-data/expression-precalc")

# Functions

In [28]:
def utils_celldata_reader(input_dir, prefix):
    """Read the <prefix>.metadata.csv and <prefix>.umi.csv.gz files stored in input_dir.

    Parameters
    ----------
    input_dir : str
        Directory containing both files (e.g. tmp-data/expression).
    prefix : str
        Dataset prefix shared by the two file names.

    Returns
    -------
    list
        [df_metadata, df_data]. df_metadata is the metadata table as read from disk.
        df_data is the UMI table indexed by its "gene" column, so that its remaining
        columns are the cells.

    Raises
    ------
    AssertionError
        If the UMI column names are not identical (same number, same order) to the
        values of the metadata "cell_id" column.
    """

    ### Metadata
    file_metadata = os.path.join(input_dir, "{}.metadata.csv".format(prefix))
    df_metadata = pd.read_csv(file_metadata, index_col=False)
    ### Data
    print("Reading UMI data...")
    file_data = os.path.join(input_dir, "{}.umi.csv.gz".format(prefix))
    df_data = pd.read_csv(file_data, index_col=False) # this takes 12-14 min for tabula_muris! (Pandas is slow!)
    print("Done reading UMI data")
    df_data = df_data.set_index("gene") # set index
    ### [*IMPORTANT*] Check that all metadata cell_ids are identical to data column names.
    ### We need to ensure this before we can use the metadata cell-types as annotations.
    data_cell_ids = df_data.columns.values
    metadata_cell_ids = df_metadata["cell_id"].values
    # np.array_equal also compares shapes: a plain `==` would broadcast a length-1 array
    # and, for otherwise mismatching lengths, raise or warn instead of failing the check.
    assert np.array_equal(data_cell_ids, metadata_cell_ids), (
        "cell_ids in {} do not match the column names in {} "
        "({} metadata rows vs {} data columns)".format(
            file_metadata, file_data, len(metadata_cell_ids), len(data_cell_ids)
        )
    )
    return [df_metadata, df_data]

# Mousebrain

In [None]:
import loompy # version 2

In [42]:
### Variables
# NOTE(review): unlike the Tabula Muris and Campbell2017 sections, this prefix is not
# placed under output_dir -- confirm whether that is intended.
out_prefix = "mousebrain"
# Name of the column-index level that holds the per-cell annotation.
annotation_column = "ClusterName"

In [39]:
file_loom = os.path.join(input_dir, "mousebrain-l5_all.loom")
# file_loom = "/scratch/data-for_fast_access/pub-others/zeisel-biorxiv-2018/l5_all2.loom"
with loompy.connect(file_loom) as ds:
    ### Columns: two-level index carrying each cell's ID and its cluster name
    cell_columns = pd.MultiIndex.from_arrays(
        [ds.ca.CellID, ds.ca.ClusterName],
        names=["CellID", "ClusterName"],
    )
    ### Rows: Ensembl GeneIDs (ds.ra.Accession), because
    # ds.ra.Gene (Gene names) contains 66 duplicate gene names
    df_data = pd.DataFrame(
        ds[:, :].astype(int),
        index=ds.ra.Accession,
        columns=cell_columns,
    )

In [40]:
# Normalize the raw counts; the recorded output describes this step as common transcript
# count (ctc) normalization followed by log-transformation.
df_ctc_log = ctc_log_normalize(df_data)
# Drop the reference to the raw frame (presumably to free memory).
# NOTE(review): re-running this cell without re-loading df_data raises a NameError.
del df_data

Performning common transcript count (ctc) normalization and log-transformation on input data


In [43]:
# Per-cell annotation labels, read from the column-index level named by annotation_column.
annotations = df_ctc_log.columns.get_level_values(annotation_column)  # 1 x NCell

In [None]:
# Compute df_anova from the normalized expression and the per-cell annotations.
# NOTE(review): out_prefix is presumably used to name result files -- confirm in es_precalculation.
df_anova = calculate_anova_sporadically_expressed_genes(df_ctc_log, annotations, out_prefix)

Splitting data frame into annotation groups
Splitting annotation #1/#265 into group
Splitting annotation #2/#265 into group
Splitting annotation #3/#265 into group
Splitting annotation #4/#265 into group
Splitting annotation #5/#265 into group
Splitting annotation #6/#265 into group
Splitting annotation #7/#265 into group
Splitting annotation #8/#265 into group
Splitting annotation #9/#265 into group
Splitting annotation #10/#265 into group
Splitting annotation #11/#265 into group
Splitting annotation #12/#265 into group
Splitting annotation #13/#265 into group
Splitting annotation #14/#265 into group
Splitting annotation #15/#265 into group
Splitting annotation #16/#265 into group
Splitting annotation #17/#265 into group
Splitting annotation #18/#265 into group
Splitting annotation #19/#265 into group
Splitting annotation #20/#265 into group
Splitting annotation #21/#265 into group
Splitting annotation #22/#265 into group
Splitting annotation #23/#265 into group
Splitting annotation #

Splitting annotation #200/#265 into group
Splitting annotation #201/#265 into group
Splitting annotation #202/#265 into group
Splitting annotation #203/#265 into group
Splitting annotation #204/#265 into group
Splitting annotation #205/#265 into group
Splitting annotation #206/#265 into group
Splitting annotation #207/#265 into group
Splitting annotation #208/#265 into group
Splitting annotation #209/#265 into group
Splitting annotation #210/#265 into group
Splitting annotation #211/#265 into group
Splitting annotation #212/#265 into group
Splitting annotation #213/#265 into group
Splitting annotation #214/#265 into group
Splitting annotation #215/#265 into group
Splitting annotation #216/#265 into group
Splitting annotation #217/#265 into group
Splitting annotation #218/#265 into group
Splitting annotation #219/#265 into group
Splitting annotation #220/#265 into group
Splitting annotation #221/#265 into group
Splitting annotation #222/#265 into group
Splitting annotation #223/#265 int

  f = msb / msw


gene 2700 out of 27998
gene 2800 out of 27998
gene 2900 out of 27998
gene 3000 out of 27998
gene 3100 out of 27998
gene 3200 out of 27998
gene 3300 out of 27998
gene 3400 out of 27998
gene 3500 out of 27998
gene 3600 out of 27998
gene 3700 out of 27998
gene 3800 out of 27998
gene 3900 out of 27998
gene 4000 out of 27998
gene 4100 out of 27998
gene 4200 out of 27998
gene 4300 out of 27998
gene 4400 out of 27998
gene 4500 out of 27998
gene 4600 out of 27998
gene 4700 out of 27998
gene 4800 out of 27998
gene 4900 out of 27998
gene 5000 out of 27998
gene 5100 out of 27998
gene 5200 out of 27998
gene 5300 out of 27998
gene 5400 out of 27998
gene 5500 out of 27998
gene 5600 out of 27998
gene 5700 out of 27998
gene 5800 out of 27998
gene 5900 out of 27998
gene 6000 out of 27998
gene 6100 out of 27998
gene 6200 out of 27998
gene 6300 out of 27998
gene 6400 out of 27998
gene 6500 out of 27998
gene 6600 out of 27998
gene 6700 out of 27998
gene 6800 out of 27998
gene 6900 out of 27998
gene 7000 o

In [28]:
# Per-annotation summary statistics on the real (unpermuted) annotations.
# NOTE(review): the recorded output below lists 115 annotations (e.g. "Bladder.bladder cell"),
# whereas the ANOVA output above reports 265 -- the output looks stale; re-run to confirm.
df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
    df_ctc_log, annotations, out_prefix, permute_annotations=False
)

Running: #1/#115 | Bladder.bladder cell
Running: #2/#115 | Bladder.bladder urothelial cell
Running: #3/#115 | Brain_Myeloid.macrophage
Running: #4/#115 | Brain_Myeloid.microglial cell
Running: #5/#115 | Brain_Non-Myeloid.Bergmann glial cell
Running: #6/#115 | Brain_Non-Myeloid.astrocyte
Running: #7/#115 | Brain_Non-Myeloid.brain pericyte
Running: #8/#115 | Brain_Non-Myeloid.endothelial cell
Running: #9/#115 | Brain_Non-Myeloid.neuron
Running: #10/#115 | Brain_Non-Myeloid.oligodendrocyte
Running: #11/#115 | Brain_Non-Myeloid.oligodendrocyte precursor cell
Running: #12/#115 | Fat.B cell
Running: #13/#115 | Fat.T cell
Running: #14/#115 | Fat.endothelial cell
Running: #15/#115 | Fat.mesenchymal stem cell of adipose
Running: #16/#115 | Fat.myeloid cell
Running: #17/#115 | Fat.natural killer cell
Running: #18/#115 | Fat.unknown cell type
Running: #19/#115 | Heart.cardiac muscle cell
Running: #20/#115 | Heart.endocardial cell
Running: #21/#115 | Heart.endothelial cell
Running: #22/#115 | Hear

In [29]:
# Same summary statistics with permute_annotations=True; the recorded output describes
# this as the null computation with permuted labels.
df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
    df_ctc_log, annotations, out_prefix, permute_annotations=True
)

Doing null computation. Permuting labels with seed(1).
Running: #1/#115 | Bladder.bladder cell
Running: #2/#115 | Bladder.bladder urothelial cell
Running: #3/#115 | Brain_Myeloid.macrophage
Running: #4/#115 | Brain_Myeloid.microglial cell
Running: #5/#115 | Brain_Non-Myeloid.Bergmann glial cell
Running: #6/#115 | Brain_Non-Myeloid.astrocyte
Running: #7/#115 | Brain_Non-Myeloid.brain pericyte
Running: #8/#115 | Brain_Non-Myeloid.endothelial cell
Running: #9/#115 | Brain_Non-Myeloid.neuron
Running: #10/#115 | Brain_Non-Myeloid.oligodendrocyte
Running: #11/#115 | Brain_Non-Myeloid.oligodendrocyte precursor cell
Running: #12/#115 | Fat.B cell
Running: #13/#115 | Fat.T cell
Running: #14/#115 | Fat.endothelial cell
Running: #15/#115 | Fat.mesenchymal stem cell of adipose
Running: #16/#115 | Fat.myeloid cell
Running: #17/#115 | Fat.natural killer cell
Running: #18/#115 | Fat.unknown cell type
Running: #19/#115 | Heart.cardiac muscle cell
Running: #20/#115 | Heart.endocardial cell
Running: #21

# Tabula Muris

In [None]:
### Variables
# Output prefix located under output_dir. The path helper is os.path.join;
# the os module itself has no join attribute, so os.join raises AttributeError.
out_prefix = os.path.join(output_dir, "tabula_muris")
# Metadata column that holds the per-cell annotation.
annotation_column = "tissue_celltype"

In [11]:
# Load the Tabula Muris metadata table and UMI matrix from input_dir.
df_metadata, df_data = utils_celldata_reader(input_dir, prefix="tabula_muris")

True

In [12]:
# Normalize the raw counts; the recorded output describes this step as common transcript
# count (ctc) normalization followed by log-transformation.
df_ctc_log = ctc_log_normalize(df_data)
# Drop the reference to the raw frame (presumably to free memory).
# NOTE(review): re-running this cell without re-loading df_data raises a NameError.
del df_data

Performning common transcript count (ctc) normalization and log-transformation on input data


In [17]:
### Set annotations
# One label per metadata row, taken from the column named by annotation_column.
# These rows correspond to the data columns only if the cell_id check in
# utils_celldata_reader passed.
annotations = df_metadata[annotation_column].values

array(['Skin.epidermal cell', 'Skin.epidermal cell',
       'Skin.basal cell of epidermis', 'Skin.epidermal cell',
       'Skin.basal cell of epidermis'], dtype=object)

In [24]:
# Compute df_anova from the normalized expression and the per-cell annotations.
# NOTE(review): the recorded output is a NameError for df_ctc_log, i.e. this cell was last
# run before the normalization cell -- re-run the notebook top to bottom.
df_anova = calculate_anova_sporadically_expressed_genes(df_ctc_log, annotations, out_prefix)

NameError: name 'df_ctc_log' is not defined

In [28]:
# Per-annotation summary statistics on the real (unpermuted) annotations.
df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
    df_ctc_log, annotations, out_prefix, permute_annotations=False
)

Running: #1/#115 | Bladder.bladder cell
Running: #2/#115 | Bladder.bladder urothelial cell
Running: #3/#115 | Brain_Myeloid.macrophage
Running: #4/#115 | Brain_Myeloid.microglial cell
Running: #5/#115 | Brain_Non-Myeloid.Bergmann glial cell
Running: #6/#115 | Brain_Non-Myeloid.astrocyte
Running: #7/#115 | Brain_Non-Myeloid.brain pericyte
Running: #8/#115 | Brain_Non-Myeloid.endothelial cell
Running: #9/#115 | Brain_Non-Myeloid.neuron
Running: #10/#115 | Brain_Non-Myeloid.oligodendrocyte
Running: #11/#115 | Brain_Non-Myeloid.oligodendrocyte precursor cell
Running: #12/#115 | Fat.B cell
Running: #13/#115 | Fat.T cell
Running: #14/#115 | Fat.endothelial cell
Running: #15/#115 | Fat.mesenchymal stem cell of adipose
Running: #16/#115 | Fat.myeloid cell
Running: #17/#115 | Fat.natural killer cell
Running: #18/#115 | Fat.unknown cell type
Running: #19/#115 | Heart.cardiac muscle cell
Running: #20/#115 | Heart.endocardial cell
Running: #21/#115 | Heart.endothelial cell
Running: #22/#115 | Hear

In [29]:
# Same summary statistics with permute_annotations=True; the recorded output describes
# this as the null computation with permuted labels.
df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
    df_ctc_log, annotations, out_prefix, permute_annotations=True
)

Doing null computation. Permuting labels with seed(1).
Running: #1/#115 | Bladder.bladder cell
Running: #2/#115 | Bladder.bladder urothelial cell
Running: #3/#115 | Brain_Myeloid.macrophage
Running: #4/#115 | Brain_Myeloid.microglial cell
Running: #5/#115 | Brain_Non-Myeloid.Bergmann glial cell
Running: #6/#115 | Brain_Non-Myeloid.astrocyte
Running: #7/#115 | Brain_Non-Myeloid.brain pericyte
Running: #8/#115 | Brain_Non-Myeloid.endothelial cell
Running: #9/#115 | Brain_Non-Myeloid.neuron
Running: #10/#115 | Brain_Non-Myeloid.oligodendrocyte
Running: #11/#115 | Brain_Non-Myeloid.oligodendrocyte precursor cell
Running: #12/#115 | Fat.B cell
Running: #13/#115 | Fat.T cell
Running: #14/#115 | Fat.endothelial cell
Running: #15/#115 | Fat.mesenchymal stem cell of adipose
Running: #16/#115 | Fat.myeloid cell
Running: #17/#115 | Fat.natural killer cell
Running: #18/#115 | Fat.unknown cell type
Running: #19/#115 | Heart.cardiac muscle cell
Running: #20/#115 | Heart.endocardial cell
Running: #21

# Campbell2017

In [30]:
### Variables
# Maps each output prefix (under output_dir) to the metadata column that provides the
# annotations for that run. The path helper is os.path.join; the os module itself has
# no join attribute, so os.join raises AttributeError.
dict_run = {
    os.path.join(output_dir, "campbell2017_lvl1"): "cell_type_all_lvl1",
    os.path.join(output_dir, "campbell2017_lvl2"): "cell_type_all_lvl2",
}  # {out_prefix:annotation_column}

In [31]:
# Load the Campbell2017 metadata table and UMI matrix from input_dir.
df_metadata, df_data = utils_celldata_reader(input_dir, prefix="campbell2017")

In [None]:
# Normalize the raw counts into df_ctc_log, which the run loop over dict_run consumes.
df_ctc_log = ctc_log_normalize(df_data)
# Drop the reference to the raw frame (presumably to free memory).
# NOTE(review): re-running this cell without re-loading df_data raises a NameError.
del df_data

In [29]:
# Run the full precalculation once per {out_prefix: annotation column} entry of dict_run.
for out_prefix, anno_column in dict_run.items():
    print(out_prefix)
    # Annotation labels for this run, one per metadata row.
    annotations = df_metadata[anno_column].values
    df_anova = calculate_anova_sporadically_expressed_genes(df_ctc_log, annotations, out_prefix)
    # Summary statistics on the real annotations ...
    df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=False
    )
    # ... and again with permute_annotations=True.
    df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=True
    )

campbell2017_lvl1
['a18.Neurons6' 'a19.ParsTuber1' 'a13.Neurons1' 'a18.Neurons6'
 'a18.Neurons6']


NameError: name 'df_ctc_log' is not defined

# ChenXXX

# RomanovXXX

# Moffit