In [1]:
import loompy
import numpy as np
import pandas as pd

from dfply import *

### from ggplot import *
# from plotnine import *

### Make sure you use the Python 3 (py3_anaconda3_PT180510) environment
# import sys
# print(sys.executable) # /tools/anaconda/3-4.4.0/envs/py3_anaconda3_PT180510/bin/python
# print(sys.version) # 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 18:10:19) [GCC 7.2.0]

In [2]:
# IPython shell escape: list the contents of the current working directory.
!ls

explore_and_export_mousebrain_loom_agg.ipynb
explore_mousebrain.R
export_mousebrain_ctc_log_avg_data.ipynb
export_mousebrain_loom.ipynb
import_mousebrain_to_seurat.R
mousebrain_anova.ipynb
mousebrain_beta.ipynb
mousebrain_ges.ipynb
mousebrain_pre_calculation.ipynb
mousebrain-SEM_models-old_workflow.R
mousebrain-SEM_models.R
mousebrain-SEM_models-script.R
mousebrain-SEM_models-skene_quick_and_dirty.R
mousebrain.sem_obj.hier.Class.RData
mousebrain.sem_obj.hier.TaxonomyRank2.RData
mousebrain.sem_obj_mouse.RData
mousebrain.sem_obj.RData
nb-sem_heatmaps.nb.html
nb-sem_heatmaps.Rmd
nb-sem_heatmaps-script_complexheatmap_per_anno.R
nb-sem_heatmaps-script_complexheatmap.R
nb-sem_heatmaps-script.R
out.cell_prioritization.mousebrain.all_annotations.sem_meta_median.csv
out.cell_prioritization.mousebrain.Astrocytes.sem_meta_median.csv
out.cell_prioritization.mousebrain.Ependymal.sem_meta_median.csv
out.cell_prioritization.mousebrain.Immune.sem_meta_median.csv
out.cell_prio

### Connect to loompy and make data frame

In [3]:
file_loom = "/data/pub-others/zeisel-biorxiv-2018/data/l5_all.loom"
with loompy.connect(file_loom) as ds:
    ### Assemble the data frame while the loom connection is open
    # Rows are indexed by ds.ra.Accession; ds.ra.Gene (Gene names) contains 66 duplicate gene names
    # Columns carry a two-level index: (CellID, ClusterName)
    cell_columns = pd.MultiIndex.from_arrays(
        [ds.ca.CellID, ds.ca.ClusterName],
        names=["CellID", "ClusterName"],
    )
    df = pd.DataFrame(ds[:, :].astype(int), index=ds.ra.Accession, columns=cell_columns)

### common transcript count (ctc) normalization and log-transformation

In [None]:
### NOTE (memory): the arithmetic below builds a NEW data frame instead of transforming `df` in place.
cell_totals = df.sum(axis=0) # column sums, i.e. one total per column of df
df_ctc_log = np.log(1 + df / cell_totals * 1e4) # Seurat default scale.factor is '10000'
del df, cell_totals # drop the raw counts and the helper once the normalized frame exists

### Gene enrichment score: pre-calculation of f and mu

In [None]:
# OBS: here we calculate gene enrichment scores on the log-transformed, ctc-normalized data (scaled to 10,000 counts per cell, i.e. not strictly CPM)

In [None]:
def precalculate(df, annotations):
    """
    Pre-calculation of mean, variance and fraction expressed per annotation.

    Args:
        df:           DataFrame. genes x cells. Must have a row index with gene IDs.
                      The column index of df is never used for grouping (the annotations
                      argument alone defines the groups), so df can have any kind of
                      column index. df itself is not modified.
        annotations:  1d array-like. Annotation label of each column of df, in column
                      order. Must have the same length as the number of columns in df.
    Returns:
        Tuple (df_frac, df_mu, df_var, df_n):
        df_frac, df_mu, df_var: DataFrames with the fraction of non-zero values, the mean
            and the variance (normalized by N-1) per gene and annotation. All three have
            the genes of df as index (named "gene") and the unique annotations as columns.
        df_n: DataFrame with the number of cells per annotation. Single integer column
            ("n"); the index (named "annotation") holds the unique annotations.
        Annotations are alpha-numerically SORTED by name in all four outputs.
    Raises:
        ValueError: if annotations is not one-dimensional or its length differs from
            the number of columns in df.
    Notes:
        Pandas works fine with non-unique column index. See some of the behavior here: https://github.com/pandas-dev/pandas/pull/3683
    """
    ### RUNTIME 'smart way' vectorized: <2 min
    ### RUNTIME 'smart way' non-vectorized: 3h 8m 0s (~0.7 min per cell-type)
    ### RUNTIME {self + others} 'dum way': ~5-7 min per cell-type (if doing mean and frac) --> ESTIMATED TOTAL RUNTIME = ~20 hours

    ### The data frame is deliberately NOT re-ordered by annotation before the loop:
    ### both df.sort_index(axis=1) on an annotation column index and
    ### df.iloc[:, np.argsort(annotations)] copy the whole data frame, which is slow and memory hungry.
    # REF: https://stackoverflow.com/a/39237712/6639640

    # Convert once so that the per-annotation comparison below works on a plain array.
    annotations = np.asarray(annotations)
    if annotations.ndim != 1 or annotations.shape[0] != df.shape[1]:
        raise ValueError(
            "annotations must be 1d with one entry per column of df "
            "(got shape {} for {} columns)".format(annotations.shape, df.shape[1])
        )

    # np.unique returns SORTED unique elements, so the output column order is always the same.
    annotations_unique = np.unique(annotations)
    n_annotations = len(annotations_unique)

    # Results are filled column by column into preallocated arrays and wrapped in
    # DataFrames once at the end, instead of growing DataFrames one column at a time.
    frac = np.empty((df.shape[0], n_annotations), dtype=float)
    mu = np.empty_like(frac)
    var = np.empty_like(frac)
    n_cells = np.empty(n_annotations, dtype=int)

    for counter, annotation in enumerate(annotations_unique, start=1):
        print("Running: #{}/#{} | {}".format(counter, n_annotations, annotation))
        # Boolean indexing: extract the data once, then do all computations on the extract.
        # Data extraction is the slowest part of this function.
        df_tmp_cells_in_annotation = df.iloc[:, annotations == annotation]
        n_cells_in_annotation = df_tmp_cells_in_annotation.shape[1]
        col = counter - 1 # 0-based output column of this annotation
        n_cells[col] = n_cells_in_annotation
        # axis=1: count the non-zero values of each row (gene) across the selected cells.
        frac[:, col] = np.count_nonzero(df_tmp_cells_in_annotation.values, axis=1) / float(n_cells_in_annotation)
        # axis='columns': reduce across the columns, giving one value per row (gene).
        mu[:, col] = df_tmp_cells_in_annotation.mean(axis='columns').values
        var[:, col] = df_tmp_cells_in_annotation.var(axis='columns').values # Normalized by N-1 by default

    def _as_gene_frame(values):
        # Name a COPY of the index so the caller's df.index is not renamed as a side effect.
        gene_index = df.index.copy()
        gene_index.name = "gene"
        return pd.DataFrame(values, index=gene_index, columns=annotations_unique)

    df_frac = _as_gene_frame(frac)
    df_mu = _as_gene_frame(mu)
    df_var = _as_gene_frame(var)
    df_n = pd.DataFrame({"n": n_cells}, index=pd.Index(annotations_unique, name="annotation")) # obs different index

    return df_frac, df_mu, df_var, df_n

In [None]:
np.random.seed(1) # seed NumPy's global generator before drawing the permutation
annotations = df_ctc_log.columns.get_level_values("ClusterName") # 1 x NCell
annotations_null = np.random.permutation(annotations) # permute labels
# Quick sanity output: container types of both label vectors and the first permuted labels
for shown in (type(annotations), type(annotations_null), annotations_null[:6]):
    print(shown)

In [None]:
# Per-annotation statistics for the real cluster labels
df_frac, df_mu, df_var, df_n = precalculate(df_ctc_log, annotations)

In [None]:
# Per-annotation statistics for the permuted (null) cluster labels
df_frac_null, df_mu_null, df_var_null, df_n_null = precalculate(df_ctc_log, annotations_null)

In [None]:
# print(df_frac_self.head())
# print(df_mu_self.head())
# print(df_var_self.head())

### Write 'pre-calculation' files

In [None]:
# df_frac.to_csv("mousebrain.pre_calc.frac_expr.csv.gz", compression="gzip")
# df_mu.to_csv("mousebrain.pre_calc.mean.csv.gz", compression="gzip")
# df_var.to_csv("mousebrain.pre_calc.var.csv.gz", compression="gzip")
# df_n.to_csv("mousebrain.pre_calc.ncells.csv.gz", compression="gzip")

In [None]:
# df_frac_null.to_csv("mousebrain.pre_calc.frac_expr.null.csv.gz", compression="gzip")
# df_mu_null.to_csv("mousebrain.pre_calc.mean.null.csv.gz", compression="gzip")
# df_var_null.to_csv("mousebrain.pre_calc.var.null.csv.gz", compression="gzip")
# df_n_null.to_csv("mousebrain.pre_calc.ncells.null.csv.gz", compression="gzip")

### Loop

In [None]:
np.random.seed(1)
# 100 label permutations, numbered 1..100; each one gets its own set of four output files
for i in range(1, 101):
    annotations_null = np.random.permutation(annotations) # permute labels
    df_frac_null, df_mu_null, df_var_null, df_n_null = precalculate(df_ctc_log, annotations_null)
    null_outputs = (
        ("mousebrain.pre_calc.frac_expr.null_{}.csv.gz", df_frac_null),
        ("mousebrain.pre_calc.mean.null_{}.csv.gz", df_mu_null),
        ("mousebrain.pre_calc.var.null_{}.csv.gz", df_var_null),
        ("mousebrain.pre_calc.ncells.null_{}.csv.gz", df_n_null),
    )
    for file_template, frame in null_outputs:
        frame.to_csv(file_template.format(i), compression="gzip")


Running: #1/#265 | ABC
Running: #2/#265 | ACBG
Running: #3/#265 | ACMB
Running: #4/#265 | ACNT1
Running: #5/#265 | ACNT2
Running: #6/#265 | ACOB
Running: #7/#265 | ACTE1
Running: #8/#265 | ACTE2
Running: #9/#265 | CBGRC
Running: #10/#265 | CBINH1
Running: #11/#265 | CBINH2
Running: #12/#265 | CBNBL1
Running: #13/#265 | CBNBL2
Running: #14/#265 | CBPC
Running: #15/#265 | CHOR
Running: #16/#265 | COP1
Running: #17/#265 | COP2
Running: #18/#265 | CR
Running: #19/#265 | DECHO1
Running: #20/#265 | DECHO2
Running: #21/#265 | DEGLU1
Running: #22/#265 | DEGLU2
Running: #23/#265 | DEGLU3
Running: #24/#265 | DEGLU4
Running: #25/#265 | DEGLU5
Running: #26/#265 | DEINH1
Running: #27/#265 | DEINH2
Running: #28/#265 | DEINH3
Running: #29/#265 | DEINH4
Running: #30/#265 | DEINH5
Running: #31/#265 | DEINH6
Running: #32/#265 | DEINH7
Running: #33/#265 | DEINH8
Running: #34/#265 | DETPH
Running: #35/#265 | DGGRC1
Running: #36/#265 | DGGRC2
Running: #37/#265 | DGNBL1
Running: #38/#265 | DGNBL2
Running: #

Running: #39/#265 | ENMFB
Running: #40/#265 | ENT1
Running: #41/#265 | ENT2
Running: #42/#265 | ENT3
Running: #43/#265 | ENT4
Running: #44/#265 | ENT5
Running: #45/#265 | ENT6
Running: #46/#265 | ENT7
Running: #47/#265 | ENT8
Running: #48/#265 | ENT9
Running: #49/#265 | ENTG1
Running: #50/#265 | ENTG2
Running: #51/#265 | ENTG3
Running: #52/#265 | ENTG4
Running: #53/#265 | ENTG5
Running: #54/#265 | ENTG6
Running: #55/#265 | ENTG7
Running: #56/#265 | EPEN
Running: #57/#265 | EPMB
Running: #58/#265 | EPSC
Running: #59/#265 | HBADR
Running: #60/#265 | HBCHO1
Running: #61/#265 | HBCHO2
Running: #62/#265 | HBCHO3
Running: #63/#265 | HBCHO4
Running: #64/#265 | HBGLU1
Running: #65/#265 | HBGLU10
Running: #66/#265 | HBGLU2
Running: #67/#265 | HBGLU3
Running: #68/#265 | HBGLU4
Running: #69/#265 | HBGLU5
Running: #70/#265 | HBGLU6
Running: #71/#265 | HBGLU7
Running: #72/#265 | HBGLU8
Running: #73/#265 | HBGLU9
Running: #74/#265 | HBINH1
Running: #75/#265 | HBINH2
Running: #76/#265 | HBINH3
Runnin

Running: #77/#265 | HBINH4
Running: #78/#265 | HBINH5
Running: #79/#265 | HBINH6
Running: #80/#265 | HBINH7
Running: #81/#265 | HBINH8
Running: #82/#265 | HBINH9
Running: #83/#265 | HBNOR
Running: #84/#265 | HBSER1
Running: #85/#265 | HBSER2
Running: #86/#265 | HBSER3
Running: #87/#265 | HBSER4
Running: #88/#265 | HBSER5
Running: #89/#265 | HYPEN
Running: #90/#265 | HYPEP1
Running: #91/#265 | HYPEP2
Running: #92/#265 | HYPEP3
Running: #93/#265 | HYPEP4
Running: #94/#265 | HYPEP5
Running: #95/#265 | HYPEP6
Running: #96/#265 | HYPEP7
Running: #97/#265 | HYPEP8
Running: #98/#265 | MBCHO1
Running: #99/#265 | MBDOP1
Running: #100/#265 | MBDOP2
Running: #101/#265 | MEGLU1
Running: #102/#265 | MEGLU10
Running: #103/#265 | MEGLU11
Running: #104/#265 | MEGLU14
Running: #105/#265 | MEGLU2
Running: #106/#265 | MEGLU3
Running: #107/#265 | MEGLU4
Running: #108/#265 | MEGLU5
Running: #109/#265 | MEGLU6
Running: #110/#265 | MEGLU7
Running: #111/#265 | MEGLU8
Running: #112/#265 | MEGLU9
Running: #113/

Running: #113/#265 | MEINH1
Running: #114/#265 | MEINH10
Running: #115/#265 | MEINH11
Running: #116/#265 | MEINH12
Running: #117/#265 | MEINH13
Running: #118/#265 | MEINH14
Running: #119/#265 | MEINH2
Running: #120/#265 | MEINH3
Running: #121/#265 | MEINH4
Running: #122/#265 | MEINH5
Running: #123/#265 | MEINH6
Running: #124/#265 | MEINH7
Running: #125/#265 | MEINH8
Running: #126/#265 | MEINH9
Running: #127/#265 | MFOL1
Running: #128/#265 | MFOL2
Running: #129/#265 | MGL1
Running: #130/#265 | MGL2
Running: #131/#265 | MGL3
Running: #132/#265 | MOL1
Running: #133/#265 | MOL2
Running: #134/#265 | MOL3
Running: #135/#265 | MSN1
Running: #136/#265 | MSN2
Running: #137/#265 | MSN3
Running: #138/#265 | MSN4
Running: #139/#265 | MSN5
Running: #140/#265 | MSN6
Running: #141/#265 | NFOL1
Running: #142/#265 | NFOL2
Running: #143/#265 | OBDOP1
Running: #144/#265 | OBDOP2
Running: #145/#265 | OBINH1
Running: #146/#265 | OBINH2
Running: #147/#265 | OBINH3
Running: #148/#265 | OBINH4
Running: #149/#

Running: #149/#265 | OBINH5
Running: #150/#265 | OBNBL1
Running: #151/#265 | OBNBL2
Running: #152/#265 | OBNBL3
Running: #153/#265 | OBNBL4
Running: #154/#265 | OBNBL5
Running: #155/#265 | OEC
Running: #156/#265 | OPC
Running: #157/#265 | PER1
Running: #158/#265 | PER2
Running: #159/#265 | PER3
Running: #160/#265 | PSNF1
Running: #161/#265 | PSNF2
Running: #162/#265 | PSNF3
Running: #163/#265 | PSNP1
Running: #164/#265 | PSNP2
Running: #165/#265 | PSNP3
Running: #166/#265 | PSNP4
Running: #167/#265 | PSNP5
Running: #168/#265 | PSNP6
Running: #169/#265 | PSPEP1
Running: #170/#265 | PSPEP2
Running: #171/#265 | PSPEP3
Running: #172/#265 | PSPEP4
Running: #173/#265 | PSPEP5
Running: #174/#265 | PSPEP6
Running: #175/#265 | PSPEP7
Running: #176/#265 | PSPEP8
Running: #177/#265 | PVM1
Running: #178/#265 | PVM2
Running: #179/#265 | RGDG
Running: #180/#265 | RGSZ
Running: #181/#265 | SATG1
Running: #182/#265 | SATG2
Running: #183/#265 | SCGLU1
Running: #184/#265 | SCGLU10
Running: #185/#265 | S

Running: #185/#265 | SCGLU2
Running: #186/#265 | SCGLU3
Running: #187/#265 | SCGLU4
Running: #188/#265 | SCGLU5
Running: #189/#265 | SCGLU6
Running: #190/#265 | SCGLU7
Running: #191/#265 | SCGLU8
Running: #192/#265 | SCGLU9
Running: #193/#265 | SCHW
Running: #194/#265 | SCINH1
Running: #195/#265 | SCINH10
Running: #196/#265 | SCINH11
Running: #197/#265 | SCINH2
Running: #198/#265 | SCINH3
Running: #199/#265 | SCINH4
Running: #200/#265 | SCINH5
Running: #201/#265 | SCINH6
Running: #202/#265 | SCINH7
Running: #203/#265 | SCINH8
Running: #204/#265 | SCINH9
Running: #205/#265 | SEPNBL
Running: #206/#265 | SYCHO1
Running: #207/#265 | SYCHO2
Running: #208/#265 | SYNOR1
Running: #209/#265 | SYNOR2
Running: #210/#265 | SYNOR3
Running: #211/#265 | SYNOR4
Running: #212/#265 | SYNOR5
Running: #213/#265 | SZNBL
Running: #214/#265 | TECHO
Running: #215/#265 | TEGLU1
Running: #216/#265 | TEGLU10
Running: #217/#265 | TEGLU11
Running: #218/#265 | TEGLU12
Running: #219/#265 | TEGLU13
Running: #220/#265

Running: #220/#265 | TEGLU14
Running: #221/#265 | TEGLU15
Running: #222/#265 | TEGLU16
Running: #223/#265 | TEGLU17
Running: #224/#265 | TEGLU18
Running: #225/#265 | TEGLU19
Running: #226/#265 | TEGLU2
Running: #227/#265 | TEGLU20
Running: #228/#265 | TEGLU21
Running: #229/#265 | TEGLU22
Running: #230/#265 | TEGLU23
Running: #231/#265 | TEGLU24
Running: #232/#265 | TEGLU3
Running: #233/#265 | TEGLU4
Running: #234/#265 | TEGLU5
Running: #235/#265 | TEGLU6
Running: #236/#265 | TEGLU7
Running: #237/#265 | TEGLU8
Running: #238/#265 | TEGLU9
Running: #239/#265 | TEINH1
Running: #240/#265 | TEINH10
Running: #241/#265 | TEINH11
Running: #242/#265 | TEINH12
Running: #243/#265 | TEINH13
Running: #244/#265 | TEINH14
Running: #245/#265 | TEINH15
Running: #246/#265 | TEINH16
Running: #247/#265 | TEINH17
Running: #248/#265 | TEINH18
Running: #249/#265 | TEINH19
Running: #250/#265 | TEINH2
