In [1]:
import os
import sys

import numpy as np
import pandas as pd


In [2]:
import git
# Search upwards from the current directory for the enclosing git working tree,
# so that every path below can be built relative to the repository root.
path_repo_root = git.Repo(
    '.',
    search_parent_directories=True,
).working_tree_dir

In [3]:
# Make the project's helper library (src/lib) importable from this notebook.
path_lib = os.path.join(path_repo_root, "src/lib")
if path_lib not in sys.path:
    # Guarded so that re-running this cell does not keep adding duplicate entries.
    sys.path.insert(1, path_lib)
# Import the helpers by name (rather than `import *`) so it is clear where each
# function called in this notebook comes from.
from es_precalculation import (
    calculate_anova_sporadically_expressed_genes,
    calculate_per_anno_summary_stats,
    ctc_log_normalize,
)

# Constants

In [4]:
### Constants
# Directory the expression inputs are read from.
input_dir = os.path.join(path_repo_root, "tmp-data/expression")
# Directory used as the base of every `out_prefix` handed to the precalculation helpers.
output_dir = os.path.join(path_repo_root, "tmp-data/expression-precalc")
# Fail fast, before any long-running step: output_dir must already exist, and it
# must be a directory (a plain file at that path would also satisfy os.path.exists).
assert os.path.isdir(output_dir), "output_dir must exist and be a directory: {}".format(output_dir)

# Functions

In [5]:
def utils_celldata_reader(input_dir, prefix):
    """Read the metadata and UMI tables of one dataset and check they agree.

    Loads ``<input_dir>/<prefix>.metadata.csv`` and
    ``<input_dir>/<prefix>.umi.csv.gz`` (e.g. from tmp-data/expression).

    Args:
        input_dir: Directory containing both files.
        prefix: Dataset name used as the file-name prefix.

    Returns:
        list: ``[df_metadata, df_data]``. ``df_metadata`` is the metadata table
        as read from disk; ``df_data`` is the UMI table indexed by its ``gene``
        column, so its remaining columns are named by cell id.

    Raises:
        AssertionError: If the column names of the UMI table are not identical
            (same length, same order, same values) to the metadata ``cell_id``
            column.
    """

    ### Metadata
    file_metadata = os.path.join(input_dir, "{}.metadata.csv".format(prefix))
    df_metadata = pd.read_csv(file_metadata, index_col=False)
    ### Data
    print("Reading UMI data...")
    file_data = os.path.join(input_dir, "{}.umi.csv.gz".format(prefix))
    df_data = pd.read_csv(file_data, index_col=False) # this takes 12-14 min for tabula_muris! (Pandas is slow!)
    print("Done reading UMI data")
    df_data = df_data.set_index("gene") # set index
    ### [*IMPORTANT*] Check that all metadata cell_ids are identical to data column names.
    ### We need to ensure this before we can use the metadata cell-types as annotations.
    data_cell_ids = df_data.columns.to_numpy()
    metadata_cell_ids = df_metadata["cell_id"].to_numpy()
    # np.array_equal compares shapes first: a plain `==` would broadcast a
    # length-1 side (letting a mismatch pass) or error out on unequal lengths.
    if not np.array_equal(data_cell_ids, metadata_cell_ids):
        raise AssertionError(
            "cell_id values in {} do not match the column names of {} "
            "({} metadata rows vs {} data columns)".format(
                file_metadata, file_data, len(metadata_cell_ids), len(data_cell_ids)
            )
        )
    return [df_metadata, df_data]

# Mousebrain

In [6]:
import loompy # version 2

In [7]:
### Variables
# Name of the annotation level looked up on the expression columns further down.
annotation_column = "ClusterName"
# Path prefix handed to the precalculation helpers for the mousebrain run.
out_prefix = os.path.join(output_dir, "mousebrain")

In [8]:
file_loom = os.path.join(input_dir, "mousebrain-l5_all.loom")
# file_loom = "/scratch/data-for_fast_access/pub-others/zeisel-biorxiv-2018/l5_all2.loom"
with loompy.connect(file_loom) as ds:
    ### Make data frame. Use Ensembl GeneIDs
    # Rows are labelled with ds.ra.Accession because ds.ra.Gene (Gene names)
    # contains 66 duplicate gene names.
    # Columns pair each CellID with its annotation. The annotation attribute and
    # level name both come from `annotation_column`, so the later
    # get_level_values(level=annotation_column) lookup cannot drift out of sync
    # with a hard-coded name here.
    cell_columns = pd.MultiIndex.from_arrays(
        [ds.ca.CellID, ds.ca[annotation_column]],
        names=["CellID", annotation_column],
    )
    df_data = pd.DataFrame(ds[:, :].astype(int), index=ds.ra.Accession, columns=cell_columns)

In [None]:
# Normalize the raw counts, then drop the raw frame since only the
# normalized one is used from here on.
df_ctc_log = ctc_log_normalize(
    df_data,
)
del df_data

Performning common transcript count (ctc) normalization and log-transformation on input data


In [None]:
# One label per cell (1 x NCell), read from the column level named by annotation_column.
annotations = df_ctc_log.columns.get_level_values(annotation_column)

In [None]:
# Run the ANOVA helper on the normalized data for the current annotations/out_prefix.
df_anova = calculate_anova_sporadically_expressed_genes(
    df_ctc_log,
    annotations,
    out_prefix,
)

In [None]:
# Per-annotation summary statistics using the real (unpermuted) annotations.
df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
    df_ctc_log,
    annotations,
    out_prefix,
    permute_annotations=False,
)

In [None]:
# Same summary statistics, this time with permute_annotations=True (the "_null" results).
df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
    df_ctc_log,
    annotations,
    out_prefix,
    permute_annotations=True,
)

# Tabula Muris

In [19]:
### Variables
# Metadata column that supplies the per-cell annotation for this dataset.
annotation_column = "tissue_celltype"
# Path prefix handed to the precalculation helpers for the tabula_muris run.
out_prefix = os.path.join(output_dir, "tabula_muris")

In [None]:
# Load the tabula_muris metadata and UMI tables from input_dir.
df_metadata, df_data = utils_celldata_reader(
    input_dir=input_dir,
    prefix="tabula_muris",
)

Reading UMI data...


In [None]:
# Normalize the raw counts, then drop the raw frame since only the
# normalized one is used from here on.
df_ctc_log = ctc_log_normalize(
    df_data,
)
del df_data

In [None]:
### Set annotations
# Per-cell labels as a plain array, taken from the chosen metadata column.
annotations = df_metadata.loc[:, annotation_column].values

In [None]:
# Run the ANOVA helper on the normalized data for the current annotations/out_prefix.
df_anova = calculate_anova_sporadically_expressed_genes(
    df_ctc_log,
    annotations,
    out_prefix,
)

In [None]:
# Per-annotation summary statistics using the real (unpermuted) annotations.
df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
    df_ctc_log,
    annotations,
    out_prefix,
    permute_annotations=False,
)

In [None]:
# Same summary statistics, this time with permute_annotations=True (the "_null" results).
df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
    df_ctc_log,
    annotations,
    out_prefix,
    permute_annotations=True,
)

# Campbell2017

In [None]:
### Variables
# Maps each run's out_prefix to the metadata column used as its annotation
# ({out_prefix: annotation_column}); entries are processed in insertion order.
dict_run = {}
dict_run[os.path.join(output_dir, "campbell2017_lvl1")] = "cell_type_all_lvl1"
dict_run[os.path.join(output_dir, "campbell2017_lvl2")] = "cell_type_all_lvl2"

In [None]:
# Load the campbell2017 metadata and UMI tables from input_dir.
df_metadata, df_data = utils_celldata_reader(
    input_dir=input_dir,
    prefix="campbell2017",
)

In [None]:
# Normalize the raw counts, then drop the raw frame since only the
# normalized one is used from here on.
df_ctc_log = ctc_log_normalize(
    df_data,
)
del df_data

In [None]:
# Run the full precalculation once per (out_prefix, annotation column) pair.
for out_prefix, anno_column in dict_run.items():
    print(out_prefix)
    # Per-cell labels for this annotation level.
    annotations = df_metadata[anno_column].values
    df_anova = calculate_anova_sporadically_expressed_genes(
        df_ctc_log, annotations, out_prefix
    )
    # Summary statistics with the real annotations ...
    df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=False
    )
    # ... and again with permute_annotations=True (the "_null" results).
    df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=True
    )

# ChenXXX

# RomanovXXX

# Moffitt