In [3]:
import loompy
import numpy as np
import pandas as pd

from dfply import *

### from ggplot import *
# from plotnine import *

### Make sure you use the Python 3 (py3_anaconda3_PT180510) environment
# import sys
# print(sys.executable) # /tools/anaconda/3-4.4.0/envs/py3_anaconda3_PT180510/bin/python
# print(sys.version) # 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 18:10:19) [GCC 7.2.0]

In [4]:
# List the files in the notebook's current working directory (shell escape).
!ls

anova_results.mousebrain-BEFORE_CTC_LOG.csv
explore_and_export_mousebrain_loom_agg.ipynb
explore_mousebrain.R
export_mousebrain_ctc_log_avg_data.ipynb
export_mousebrain_loom.ipynb
import_mousebrain_to_seurat.R
mousebrain_anova.ipynb
mousebrain_beta.ipynb
mousebrain.celltype_expr.frac_expr_bgthres.csv.gz
mousebrain.celltype_expr.frac_expr_nonzero.csv.gz
mousebrain_ges.ipynb
mousebrain-L5.average.ensembl_musculus.csv
mousebrain-L5.ctc_log_average.ensembl_musculus.csv.gz
mousebrain-L5.log_average.ensembl_musculus.csv
mousebrain-SEM_models.R
mousebrain-SEM_models-skene_quick_and_dirty.R


### Connect to the loom file with loompy and build the data frame

In [5]:
file_loom = "/data/pub-others/zeisel-biorxiv-2018/data/l5_all.loom"
with loompy.connect(file_loom) as loom:
    ### Column labels: one (CellID, ClusterName) pair per cell
    cell_columns = pd.MultiIndex.from_arrays(
        [loom.ca.CellID, loom.ca.ClusterName],
        names=["CellID", "ClusterName"],
    )
    ### Make data frame (genes x cells), read fully into memory as integers.
    # Rows are labelled by accession (loom.ra.Accession) rather than by gene name,
    # because the gene-name attribute (ra.Gene) contains 66 duplicate gene names.
    df = pd.DataFrame(
        loom[:, :].astype(int),
        index=loom.ra.Accession,
        columns=cell_columns,
    )

### common transcript count (ctc) normalization and log-transformation

In [6]:
### NOTE (inefficient): the normalization below builds a new data frame, i.e. a
### full COPY of the data, before the raw counts are released.
# Total counts per cell (column sums).
counts_per_cell = df.sum(axis=0)
# Scale every cell to 1e4 total counts (Seurat default scale.factor is '10000'),
# then take log(1 + x).
df_ctc_log = np.log(1 + (df / counts_per_cell) * 1e4)
# Clean up: drop the raw count matrix to free memory.
del df

### Gene enrichment score: pre-calculation of f and mu

In [7]:
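# NOTE: the gene enrichment scores are calculated on the log-transformed, ctc-normalized data
# (each cell scaled to 10,000 total counts, so strictly speaking not CPM).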

In [9]:
### Per-cluster ("self") summary statistics for every gene:
###   df_frac_self : fraction of cells in the cluster with expression > 0
###   df_mu_self   : mean expression over the cells in the cluster
###   df_var_self  : variance over the cells in the cluster (normalized by N-1)
###   df_n_self    : number of cells in each cluster (column "n")
###
### RUNTIME (historical measurements, kept for reference):
###   - 'smart way' (self only), implemented as a Python loop over every gene that
###     wrote one value at a time with .loc: 3h 8m 0s (~0.7 min per cell-type)
###   - {self + others} 'dum way': ~5-7 min per cell-type (if doing mean and frac)
###     --> ESTIMATED TOTAL RUNTIME = ~20 hours
### The per-gene loop is replaced below by row-wise reductions over the whole block
### of cells belonging to a cluster, and each result frame is built once at the end
### instead of being enlarged one element at a time.
### Statistics for the cells *outside* each cluster ("others") are not computed here.

cell_types = df_ctc_log.columns.get_level_values(level="ClusterName") # 1 x NCell
# Clusters in order of first appearance. A plain list is used so that the result
# columns do not inherit the "ClusterName" label from the column MultiIndex.
cluster_names = list(cell_types.unique())

frac_by_cluster = {}
mu_by_cluster = {}
var_by_cluster = {}
n_cells_by_cluster = []
for lvl in cluster_names:
    print(lvl)  # progress: one line per cluster
    df_tmp_cells_in_cluster = df_ctc_log.loc[:, cell_types == lvl]
    n_cells_in_cluster = df_tmp_cells_in_cluster.shape[1]
    n_cells_by_cluster.append(n_cells_in_cluster)
    # All three reductions run along axis=1 (across the cells of this cluster),
    # giving one value per gene. `.values` drops the row labels so the final frames
    # are assembled positionally and never need to be re-aligned on the gene index.
    frac_by_cluster[lvl] = ((df_tmp_cells_in_cluster > 0).sum(axis=1) / float(n_cells_in_cluster)).values
    mu_by_cluster[lvl] = df_tmp_cells_in_cluster.mean(axis=1).values
    var_by_cluster[lvl] = df_tmp_cells_in_cluster.var(axis=1).values # Normalized by N-1 by default

# Build each genes x clusters frame in one go; `columns=` pins the cluster order.
df_frac_self = pd.DataFrame(frac_by_cluster, index=df_ctc_log.index, columns=cluster_names)
df_mu_self = pd.DataFrame(mu_by_cluster, index=df_ctc_log.index, columns=cluster_names)
df_var_self = pd.DataFrame(var_by_cluster, index=df_ctc_log.index, columns=cluster_names)
# NOTE: different index here (one row per cluster, not per gene). The counts are
# stored as float, which is presumably what the earlier element-wise .loc
# assignment into an empty frame produced -- TODO confirm if integers are preferred.
df_n_self = pd.DataFrame({"n": n_cells_by_cluster}, index=cell_types.unique(), dtype=float)

df_frac_self.index.name="gene"
df_mu_self.index.name="gene"
df_var_self.index.name="gene"
df_n_self.index.name="annotation"

ENT9
ENT8
ENT6
ENT5
ENT4
ENT7
ENT3
ENT2
ENT1
ENTG1
ENTG4
ENTG2
ENTG3
ENTG6
ENTG5
ENTG7
ENMFB
MOL3
MOL2
MFOL1
MOL1
MFOL2
NFOL1
NFOL2
COP2
COP1
HBSER2
HBSER1
HBSER3
HBSER4
HBSER5
TEGLU15
TEGLU16
TEGLU5
TEGLU4
TEGLU22
TEGLU23
TEGLU24
TEGLU17
TEGLU18
TEGLU19
TEGLU2
TEGLU3
TEGLU14
TEGLU13
TEGLU21
TEGLU10
TEGLU11
TEGLU20
TEGLU12
TEGLU9
TEGLU8
TEGLU7
TEINH10
TEINH11
TEINH12
TEINH9
OBINH5
TEINH5
TEINH6
TEINH7
TEINH8
TEINH4
TEINH20
TEINH13
TEINH14
TEINH16
TEINH15
TEINH19
TEINH21
HYPEP7
HYPEP6
DEINH8
MBDOP1
MBDOP2
MEGLU14
MEGLU11
MEGLU10
MEGLU9
MEGLU8
MEGLU7
MEGLU6
DEGLU5
MEGLU2
MEGLU3
MEGLU1
MEGLU5
MEGLU4
DEGLU4
DEGLU3
OBNBL1
MBCHO1
HBGLU9
SCGLU1
HBGLU10
SCGLU10
HBGLU2
HBGLU3
HBNOR
HBGLU1
DEGLU2
DEGLU1
MSN1
MSN2
MSN3
MSN5
MSN4
MSN6
OEC
DETPH
RGDG
RGSZ
OPC
SZNBL
EPMB
HYPEN
OBDOP2
OBINH2
OBINH3
OBINH1
OBINH4
OBNBL2
TEGLU6
TEGLU1
DGGRC2
DGGRC1
DGNBL1
DGNBL2
OBNBL3
HYPEP3
HYPEP2
HYPEP1
CBNBL2
CBGRC
CR
CBNBL1
SEPNBL
OBNBL5
OBNBL4
DECHO2
HYPEP5
DEINH7
HYPEP4
HBGLU8
HBGLU6
HBGLU7
HBGLU5
HBGLU4
HBCHO2


In [10]:
# Preview the first rows of the fraction, mean and variance tables (in that order).
for stats_frame in (df_frac_self, df_mu_self, df_var_self):
    print(stats_frame.head())

                        ENT9      ENT8      ENT6      ENT5      ENT4  \
gene                                                                   
ENSMUSG00000024647  0.917160  0.216495  0.118644  0.057915  0.032258   
ENSMUSG00000041544  0.313609  0.000000  0.000000  0.000000  0.000000   
ENSMUSG00000029503  0.976331  0.628866  0.118644  0.397683  0.309677   
ENSMUSG00000039942  0.455621  0.082474  0.177966  0.108108  0.116129   
ENSMUSG00000059187  0.988166  0.969072  0.915254  0.729730  0.606452   

                        ENT7      ENT3      ENT2      ENT1     ENTG1  \
gene                                                                   
ENSMUSG00000024647  0.000000  0.006452  0.018692  0.071429  0.012270   
ENSMUSG00000041544  0.032258  0.000000  0.009346  0.000000  0.000000   
ENSMUSG00000029503  0.612903  0.625806  0.626168  0.714286  0.006135   
ENSMUSG00000039942  0.129032  0.064516  0.130841  0.142857  0.000000   
ENSMUSG00000059187  0.709677  0.070968  0.056075  0.857143  0.0

### Write 'pre-calculation' files

In [11]:
# df_frac_self.to_csv("mousebrain.pre_calc.frac_expr.csv.gz", compression="gzip")
# df_mu_self.to_csv("mousebrain.pre_calc.mean.csv.gz", compression="gzip")
# df_var_self.to_csv("mousebrain.pre_calc.var.csv.gz", compression="gzip")
# df_n_self.to_csv("mousebrain.pre_calc.ncells.csv.gz", compression="gzip")

### Calculate GES

### Write GES file

In [None]:
# df_ges.to_csv("mousebrain.celltype_expr.beta_bgthres.csv.gz", compression="gzip")
# df_beta_nonzero.to_csv("mousebrain.celltype_expr.beta_nonzero.csv.gz", compression="gzip")