In [2]:
import os
import sys

import numpy as np
import pandas as pd


In [3]:
import git

# Locate the working-tree root of the enclosing git repository, searching upwards
# from the current working directory.
path_repo_root = git.Repo(
    ".",
    search_parent_directories=True,
).working_tree_dir

In [4]:
path_lib = os.path.join(path_repo_root, "src/lib")
# Make the project library importable. The membership guard keeps sys.path free of
# duplicate entries when this cell is re-run.
if path_lib not in sys.path:
    sys.path.insert(1, path_lib)
# Import only the names this notebook uses (instead of a wildcard import), so it is
# clear where each function comes from and nothing else leaks into the namespace.
from es_precalculation import (
    calculate_anova_sporadically_expressed_genes,
    calculate_per_anno_summary_stats,
    ctc_log_normalize,
)

# Constants

In [5]:
### Constants
input_dir = os.path.join(path_repo_root, "tmp-data/expression")
output_dir = os.path.join(path_repo_root, "tmp-data/expression-precalc")
# output_dir must already exist as a directory; it is not created here.
assert os.path.isdir(output_dir), "output_dir does not exist or is not a directory: {}".format(output_dir)

# Functions

In [6]:
def utils_celldata_reader(input_dir, prefix):
    """Read the metadata and UMI count files of one dataset from input_dir.

    Reads ``<prefix>.metadata.csv`` and ``<prefix>.umi.csv.gz`` (stored in
    tmp-data/expression) and checks that they describe the same cells in the
    same order.

    Parameters
    ----------
    input_dir : str
        Directory containing the two files.
    prefix : str
        Dataset prefix used to build both file names.

    Returns
    -------
    list
        ``[df_metadata, df_data]``. ``df_metadata`` is the metadata table as read;
        ``df_data`` is the UMI table indexed by its "gene" column, so its
        remaining columns are the cell ids.

    Raises
    ------
    AssertionError
        If the "cell_id" column of the metadata is not identical (same values,
        same order, same length) to the column names of the UMI data.
    """

    ### Metadata
    file_metadata = os.path.join(input_dir, "{}.metadata.csv".format(prefix))
    df_metadata = pd.read_csv(file_metadata, index_col=False)
    ### Data
    print("Reading UMI data...")
    file_data = os.path.join(input_dir, "{}.umi.csv.gz".format(prefix))
    df_data = pd.read_csv(file_data, index_col=False) # this takes 12-14 min for tabula_muris! (Pandas is slow!)
    print("Done reading UMI data")
    df_data = df_data.set_index("gene") # set index
    ### [*IMPORTANT*] Check that all metadata cell_ids are identical to data column names.
    ### We need to ensure this before we can use the metadata cell-types as annotations.
    ### np.array_equal also compares shapes: an elementwise `==` would broadcast a
    ### length-1 array, or raise instead of failing the check on mismatched lengths.
    assert np.array_equal(df_data.columns.values, df_metadata["cell_id"].values), (
        "cell_id values in {} do not match the column names of {} "
        "(the same values in the same order are required)".format(file_metadata, file_data)
    )
    return [df_metadata, df_data]

# Mousebrain

In [6]:
import loompy # version 2

In [7]:
### Variables (Mousebrain)
# out_prefix is handed to the pre-calculation functions in the cells below.
out_prefix = os.path.join(output_dir, "mousebrain")
# Name of the column-index level from which the per-cell annotations are read.
annotation_column = "ClusterName"

In [8]:
file_loom = os.path.join(input_dir, "mousebrain-l5_all.loom")
with loompy.connect(file_loom) as ds:
    ### Make data frame. Use Ensembl GeneIDs (ds.ra.Accession) as the row index,
    ### because ds.ra.Gene (Gene names) contains 66 duplicate gene names.
    ### The columns are a (CellID, <annotation_column>) MultiIndex, built once here;
    ### the second level is named by annotation_column so that it stays in sync
    ### with the later lookup of that level by name.
    df_data = pd.DataFrame(
        ds[:, :].astype(int),
        index=ds.ra.Accession,
        columns=pd.MultiIndex.from_arrays(
            [ds.ca.CellID, ds.ca[annotation_column]],
            names=["CellID", annotation_column],
        ),
    )

In [None]:
# Common transcript count (ctc) normalization + log-transformation (project helper).
df_ctc_log = ctc_log_normalize(df_data)
# Drop the raw counts (presumably to free memory). NOTE: after this, re-running
# this cell requires re-loading df_data first.
del df_data

Performning common transcript count (ctc) normalization and log-transformation on input data


In [None]:
# Per-cell annotation labels read from the column-index level named by annotation_column.
# NOTE(review): assumes ctc_log_normalize keeps the column MultiIndex of its input -- confirm.
annotations = df_ctc_log.columns.get_level_values(level=annotation_column) # 1 x NCell

In [None]:
# NOTE(review): implemented outside this notebook; out_prefix is presumably used to
# name the output files -- confirm in es_precalculation.
df_anova = calculate_anova_sporadically_expressed_genes(df_ctc_log, annotations, out_prefix)

In [None]:
# Per-annotation summary statistics for the observed (non-permuted) annotations.
df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
    df_ctc_log,
    annotations,
    out_prefix,
    permute_annotations=False,
)

In [None]:
# Same call with permute_annotations=True; results are kept under the *_null names.
df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
    df_ctc_log,
    annotations,
    out_prefix,
    permute_annotations=True,
)

# Tabula Muris

In [19]:
### Variables (Tabula Muris)
# out_prefix is handed to the pre-calculation functions in the cells below.
out_prefix = os.path.join(output_dir, "tabula_muris")
# Metadata column from which the per-cell annotations are read.
annotation_column = "tissue_celltype"

In [None]:
# Load metadata and UMI counts; the reader asserts that both describe the same cells in the same order.
df_metadata, df_data = utils_celldata_reader(
    input_dir,
    prefix="tabula_muris",
)

Reading UMI data...


In [None]:
# Common transcript count (ctc) normalization + log-transformation (project helper).
df_ctc_log = ctc_log_normalize(df_data)
# Drop the raw counts (presumably to free memory). NOTE: after this, re-running
# this cell requires re-loading df_data first.
del df_data

In [None]:
### Set annotations
# Row order of df_metadata matches the column order of the data: utils_celldata_reader
# asserts that metadata cell_id values equal the data column names.
annotations = df_metadata[annotation_column].values

In [None]:
# NOTE(review): implemented outside this notebook; out_prefix is presumably used to
# name the output files -- confirm in es_precalculation.
df_anova = calculate_anova_sporadically_expressed_genes(df_ctc_log, annotations, out_prefix)

In [None]:
# Per-annotation summary statistics for the observed (non-permuted) annotations.
df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
    df_ctc_log,
    annotations,
    out_prefix,
    permute_annotations=False,
)

In [None]:
# Same call with permute_annotations=True; results are kept under the *_null names.
df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
    df_ctc_log,
    annotations,
    out_prefix,
    permute_annotations=True,
)

# Campbell2017

In [None]:
### Variables (Campbell2017)
# One entry per run: the key is the out_prefix, the value is the metadata column
# that holds the annotations for that run (two annotation levels for this dataset).
dict_run = {os.path.join(output_dir, "campbell2017_lvl1"):"cell_type_all_lvl1",
            os.path.join(output_dir, "campbell2017_lvl2"):"cell_type_all_lvl2"} # {out_prefix:annotation_column}

In [None]:
# Load metadata and UMI counts; the reader asserts that both describe the same cells in the same order.
df_metadata, df_data = utils_celldata_reader(
    input_dir,
    prefix="campbell2017",
)

In [None]:
# Common transcript count (ctc) normalization + log-transformation (project helper).
df_ctc_log = ctc_log_normalize(df_data)
# Drop the raw counts (presumably to free memory). NOTE: after this, re-running
# this cell requires re-loading df_data first.
del df_data

In [None]:
# One full pre-calculation pass per (out_prefix, annotation column) entry of dict_run.
for out_prefix, anno_col in dict_run.items():
    print(out_prefix)
    annotations = df_metadata[anno_col].values  # get annotations
    df_anova = calculate_anova_sporadically_expressed_genes(
        df_ctc_log, annotations, out_prefix
    )
    # Observed annotations.
    df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=False
    )
    # Permuted annotations; results are kept under the *_null names.
    df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=True
    )

# Chen2017

In [12]:
### Variables (Chen2017)
# One entry per run: the key is the out_prefix, the value is the metadata column
# that holds the annotations for that run.
dict_run = {os.path.join(output_dir, "chen2017"):"SVM_clusterID"} # {out_prefix:annotation_column}

In [13]:
# Load metadata and UMI counts; the reader asserts that both describe the same cells in the same order.
df_metadata, df_data = utils_celldata_reader(
    input_dir,
    prefix="chen2017",
)

Reading UMI data...
Done reading UMI data


In [14]:
# Common transcript count (ctc) normalization + log-transformation (project helper).
df_ctc_log = ctc_log_normalize(df_data)
# Drop the raw counts (presumably to free memory). NOTE: after this, re-running
# this cell requires re-loading df_data first.
del df_data

Performning common transcript count (ctc) normalization and log-transformation on input data


In [15]:
# One full pre-calculation pass per (out_prefix, annotation column) entry of dict_run.
for out_prefix, anno_col in dict_run.items():
    print(out_prefix)
    annotations = df_metadata[anno_col].values  # get annotations
    df_anova = calculate_anova_sporadically_expressed_genes(
        df_ctc_log, annotations, out_prefix
    )
    # Observed annotations.
    df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=False
    )
    # Permuted annotations; results are kept under the *_null names.
    df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=True
    )

/nfsdata/projects/jonatan/pub-perslab/timshel-bmicelltypes2019/tmp-data/expression-precalc/chen2017
Splitting data frame into annotation groups
Splitting annotation #1/#45 into group
Splitting annotation #2/#45 into group
Splitting annotation #3/#45 into group
Splitting annotation #4/#45 into group
Splitting annotation #5/#45 into group
Splitting annotation #6/#45 into group
Splitting annotation #7/#45 into group
Splitting annotation #8/#45 into group
Splitting annotation #9/#45 into group
Splitting annotation #10/#45 into group
Splitting annotation #11/#45 into group
Splitting annotation #12/#45 into group
Splitting annotation #13/#45 into group
Splitting annotation #14/#45 into group
Splitting annotation #15/#45 into group
Splitting annotation #16/#45 into group
Splitting annotation #17/#45 into group
Splitting annotation #18/#45 into group
Splitting annotation #19/#45 into group
Splitting annotation #20/#45 into group
Splitting annotation #21/#45 into group
Splitting annotation #22/

  f = msb / msw


gene 100 out of 23284
gene 200 out of 23284
gene 300 out of 23284
gene 400 out of 23284
gene 500 out of 23284
gene 600 out of 23284
gene 700 out of 23284
gene 800 out of 23284
gene 900 out of 23284
gene 1000 out of 23284
gene 1100 out of 23284
gene 1200 out of 23284
gene 1300 out of 23284
gene 1400 out of 23284
gene 1500 out of 23284
gene 1600 out of 23284
gene 1700 out of 23284
gene 1800 out of 23284
gene 1900 out of 23284
gene 2000 out of 23284
gene 2100 out of 23284
gene 2200 out of 23284
gene 2300 out of 23284
gene 2400 out of 23284
gene 2500 out of 23284
gene 2600 out of 23284
gene 2700 out of 23284
gene 2800 out of 23284
gene 2900 out of 23284
gene 3000 out of 23284
gene 3100 out of 23284
gene 3200 out of 23284
gene 3300 out of 23284
gene 3400 out of 23284
gene 3500 out of 23284
gene 3600 out of 23284
gene 3700 out of 23284
gene 3800 out of 23284
gene 3900 out of 23284
gene 4000 out of 23284
gene 4100 out of 23284
gene 4200 out of 23284
gene 4300 out of 23284
gene 4400 out of 232

# Romanov2017

In [7]:
### Variables (Romanov2017)
# One entry per run: the key is the out_prefix, the value is the metadata column
# that holds the annotations for that run.
dict_run = {os.path.join(output_dir, "romanov2017"):"cell_type"} # {out_prefix:annotation_column}

In [8]:
# Load metadata and UMI counts; the reader asserts that both describe the same cells in the same order.
df_metadata, df_data = utils_celldata_reader(
    input_dir,
    prefix="romanov2017",
)

Reading UMI data...
Done reading UMI data


In [9]:
# Common transcript count (ctc) normalization + log-transformation (project helper).
df_ctc_log = ctc_log_normalize(df_data)
# Drop the raw counts (presumably to free memory). NOTE: after this, re-running
# this cell requires re-loading df_data first.
del df_data

Performning common transcript count (ctc) normalization and log-transformation on input data


In [10]:
# One full pre-calculation pass per (out_prefix, annotation column) entry of dict_run.
for out_prefix, anno_col in dict_run.items():
    print(out_prefix)
    annotations = df_metadata[anno_col].values  # get annotations
    df_anova = calculate_anova_sporadically_expressed_genes(
        df_ctc_log, annotations, out_prefix
    )
    # Observed annotations.
    df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=False
    )
    # Permuted annotations; results are kept under the *_null names.
    df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=True
    )

/nfsdata/projects/jonatan/pub-perslab/timshel-bmicelltypes2019/tmp-data/expression-precalc/romanov2017
Splitting data frame into annotation groups
Splitting annotation #1/#60 into group
Splitting annotation #2/#60 into group
Splitting annotation #3/#60 into group
Splitting annotation #4/#60 into group
Splitting annotation #5/#60 into group
Splitting annotation #6/#60 into group
Splitting annotation #7/#60 into group
Splitting annotation #8/#60 into group
Splitting annotation #9/#60 into group
Splitting annotation #10/#60 into group
Splitting annotation #11/#60 into group
Splitting annotation #12/#60 into group
Splitting annotation #13/#60 into group
Splitting annotation #14/#60 into group
Splitting annotation #15/#60 into group
Splitting annotation #16/#60 into group
Splitting annotation #17/#60 into group
Splitting annotation #18/#60 into group
Splitting annotation #19/#60 into group
Splitting annotation #20/#60 into group
Splitting annotation #21/#60 into group
Splitting annotation #

# Moffitt2018

In [16]:
### Variables (Moffitt2018)
# One entry per run: the key is the out_prefix, the value is the metadata column
# that holds the annotations for that run.
dict_run = {os.path.join(output_dir, "moffitt2018"):"cell_type"} # {out_prefix:annotation_column}

In [17]:
# Load metadata and UMI counts; the reader asserts that both describe the same cells in the same order.
df_metadata, df_data = utils_celldata_reader(
    input_dir,
    prefix="moffitt2018",
)

Reading UMI data...
Done reading UMI data


In [18]:
# Common transcript count (ctc) normalization + log-transformation (project helper).
df_ctc_log = ctc_log_normalize(df_data)
# Drop the raw counts (presumably to free memory). NOTE: after this, re-running
# this cell requires re-loading df_data first.
del df_data

Performning common transcript count (ctc) normalization and log-transformation on input data


In [19]:
# One full pre-calculation pass per (out_prefix, annotation column) entry of dict_run.
for out_prefix, anno_col in dict_run.items():
    print(out_prefix)
    annotations = df_metadata[anno_col].values  # get annotations
    df_anova = calculate_anova_sporadically_expressed_genes(
        df_ctc_log, annotations, out_prefix
    )
    # Observed annotations.
    df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=False
    )
    # Permuted annotations; results are kept under the *_null names.
    df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=True
    )

/nfsdata/projects/jonatan/pub-perslab/timshel-bmicelltypes2019/tmp-data/expression-precalc/moffitt2018
Splitting data frame into annotation groups
Splitting annotation #1/#87 into group
Splitting annotation #2/#87 into group
Splitting annotation #3/#87 into group
Splitting annotation #4/#87 into group
Splitting annotation #5/#87 into group
Splitting annotation #6/#87 into group
Splitting annotation #7/#87 into group
Splitting annotation #8/#87 into group
Splitting annotation #9/#87 into group
Splitting annotation #10/#87 into group
Splitting annotation #11/#87 into group
Splitting annotation #12/#87 into group
Splitting annotation #13/#87 into group
Splitting annotation #14/#87 into group
Splitting annotation #15/#87 into group
Splitting annotation #16/#87 into group
Splitting annotation #17/#87 into group
Splitting annotation #18/#87 into group
Splitting annotation #19/#87 into group
Splitting annotation #20/#87 into group
Splitting annotation #21/#87 into group
Splitting annotation #

  f = msb / msw


gene 100 out of 27998
gene 200 out of 27998
gene 300 out of 27998
gene 400 out of 27998
gene 500 out of 27998
gene 600 out of 27998
gene 700 out of 27998
gene 800 out of 27998
gene 900 out of 27998
gene 1000 out of 27998
gene 1100 out of 27998
gene 1200 out of 27998
gene 1300 out of 27998
gene 1400 out of 27998
gene 1500 out of 27998
gene 1600 out of 27998
gene 1700 out of 27998
gene 1800 out of 27998
gene 1900 out of 27998
gene 2000 out of 27998
gene 2100 out of 27998
gene 2200 out of 27998
gene 2300 out of 27998
gene 2400 out of 27998
gene 2500 out of 27998
gene 2600 out of 27998
gene 2700 out of 27998
gene 2800 out of 27998
gene 2900 out of 27998
gene 3000 out of 27998
gene 3100 out of 27998
gene 3200 out of 27998
gene 3300 out of 27998
gene 3400 out of 27998
gene 3500 out of 27998
gene 3600 out of 27998
gene 3700 out of 27998
gene 3800 out of 27998
gene 3900 out of 27998
gene 4000 out of 27998
gene 4100 out of 27998
gene 4200 out of 27998
gene 4300 out of 27998
gene 4400 out of 279