In [1]:
import loompy
import numpy as np
import pandas as pd

from dfply import *

### from ggplot import *
# from plotnine import *

### Make sure you use the Python 3 (py3_anaconda3_PT180510) environment
# import sys
# print(sys.executable) # /tools/anaconda/3-4.4.0/envs/py3_anaconda3_PT180510/bin/python
# print(sys.version) # 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 18:10:19) [GCC 7.2.0]

In [2]:
# List the files in the notebook's working directory (IPython shell escape).
!ls

anova_results.mousebrain-BEFORE_CTC_LOG.csv
explore_and_export_mousebrain_loom_agg.ipynb
explore_mousebrain.R
export_mousebrain_ctc_log_avg_data.ipynb
export_mousebrain_loom.ipynb
import_mousebrain_to_seurat.R
mousebrain_anova.ipynb
mousebrain_beta.ipynb
mousebrain.celltype_expr.frac_expr_bgthres.csv.gz
mousebrain.celltype_expr.frac_expr_nonzero.csv.gz
mousebrain_ges.ipynb
mousebrain-L5.average.ensembl_musculus.csv
mousebrain-L5.ctc_log_average.ensembl_musculus.csv.gz
mousebrain-L5.log_average.ensembl_musculus.csv
mousebrain-SEM_models.R
mousebrain-SEM_models-skene_quick_and_dirty.R


### Connect to the loom file with loompy and build a data frame

In [None]:
file_loom = "/data/pub-others/zeisel-biorxiv-2018/data/l5_all.loom"
with loompy.connect(file_loom) as ds:
    # Build the expression data frame in a single step.
    #   rows    -> ds.ra.Accession; the gene names in ds.ra.Gene are not used
    #              because they contain 66 duplicates.
    #   columns -> two-level labels (CellID, ClusterName), so that cells can
    #              later be grouped by their cluster.
    # The whole matrix is read into memory and cast to int.
    df = pd.DataFrame(
        data=ds[:, :].astype(int),
        index=ds.ra.Accession,
        columns=pd.MultiIndex.from_arrays(
            [ds.ca.CellID, ds.ca.ClusterName],
            names=["CellID", "ClusterName"],
        ),
    )

### common transcript count (ctc) normalization and log-transformation

In [13]:
# Common-transcript-count (ctc) normalisation followed by a log transform:
# every column is divided by its own column sum, rescaled by 1e4 (Seurat's
# default scale.factor is 10000) and mapped through log(1 + x).
# NOTE: memory-hungry -- the result is a full new data frame, not a view of df.
df_ctc_log = np.log(1 + df.div(df.sum(axis=0), axis="columns") * 1e4)

### Average ctc-log data

In [14]:
### Average for each cell type
# Group the cell columns by the "ClusterName" level of the column MultiIndex
# and take the mean within each group, giving one column per cluster.
# `DataFrame.mean(axis=1, level=1)` was deprecated in pandas 1.3 and removed in
# pandas 2.0, so the grouping is spelled out explicitly. The frame is
# transposed so the grouping runs over rows (column-wise groupby is itself
# deprecated in recent pandas) and transposed back afterwards.
# sort=False keeps the clusters in order of first appearance, which is what
# mean(level=...) did.
df_ctc_log_avg = df_ctc_log.T.groupby(level="ClusterName", sort=False).mean().T
df_ctc_log_avg.head()
# Optional clean-up once the average exists: del df_ctc_log

ClusterName,ENT9,ENT8,ENT6,ENT5,ENT4,ENT7,ENT3,ENT2,ENT1,ENTG1,...,PSPEP8,PSNF2,PSNF3,PSNF1,PSNP1,PSNP3,PSNP2,PSNP5,PSNP4,PSNP6
ENSMUSG00000024647,2.029741,0.176376,0.108033,0.052293,0.028326,0.0,0.006695,0.01106,0.051132,0.016737,...,0.0,0.872146,0.135545,1.283194,0.172524,0.0,0.0,0.008103,0.0,0.00181
ENSMUSG00000041544,0.238551,0.0,0.0,0.0,0.0,0.03435,0.0,0.006238,0.0,0.0,...,0.075393,0.06291,0.039704,0.059869,0.0,0.001054,0.045985,0.007529,0.0,0.006804
ENSMUSG00000029503,1.855591,0.531197,0.08796,0.417713,0.315584,0.671047,0.724751,0.594759,0.842018,0.006533,...,0.017316,0.0,0.0,0.0,0.006656,0.001902,0.0,0.130491,0.013577,0.574446
ENSMUSG00000039942,0.381635,0.047328,0.14716,0.118842,0.114504,0.128225,0.055323,0.106273,0.133145,0.0,...,0.0,0.077727,0.116941,0.046735,0.027128,0.018362,0.031399,0.04562,0.070678,0.071835
ENSMUSG00000059187,2.000372,1.971189,1.505637,1.075453,0.745603,0.932725,0.052921,0.047126,1.021252,0.0,...,0.0,0.145884,0.304648,0.044459,0.00511,0.00487,0.0,0.005308,0.0,0.003099


### Write file

In [15]:
### Write df_ctc_log_avg (per-cluster mean of the ctc-normalised, log-transformed data)
# Name the row index so that the first CSV column gets the header "gene".
df_ctc_log_avg.index.name = "gene"
# Written gzip-compressed; the ".csv.gz" extension matches the compression.
df_ctc_log_avg.to_csv("mousebrain-L5.ctc_log_average.ensembl_musculus.csv.gz", compression="gzip")

### Alternative: average data

In [16]:
### Calculate the per-cluster average of the raw (non-normalised) counts
# `DataFrame.mean(axis=1, level=1)` was deprecated in pandas 1.3 and removed in
# pandas 2.0, so group the cell columns by the "ClusterName" level explicitly.
# sort=False keeps the clusters in order of first appearance, which is what
# mean(level=...) did.
df_avg = df.T.groupby(level="ClusterName", sort=False).mean().T
# Name the row index so that the first CSV column gets the header "gene".
df_avg.index.name = "gene"
# NOTE(review): the output is gzip-compressed although the file name ends in
# ".csv" rather than ".csv.gz"; readers that infer compression from the
# extension will not decompress it. Name and format are left unchanged here
# because other scripts may rely on them -- confirm before renaming.
df_avg.to_csv("mousebrain-L5.average.ensembl_musculus.csv", compression="gzip")

### Alternative: log-transform, then average data (that is, not ctc normalization)

In [17]:
# First log-transform the raw counts with log(1 + x), then average per cluster.
# `DataFrame.mean(axis=1, level=1)` was deprecated in pandas 1.3 and removed in
# pandas 2.0, so group the cell columns by the "ClusterName" level explicitly.
# sort=False keeps the clusters in order of first appearance, which is what
# mean(level=...) did.
df_avg_log = np.log(1+df).T.groupby(level="ClusterName", sort=False).mean().T
# Name the row index so that the first CSV column gets the header "gene".
df_avg_log.index.name = "gene"
# NOTE(review): the output is gzip-compressed although the file name ends in
# ".csv" rather than ".csv.gz"; readers that infer compression from the
# extension will not decompress it. Name and format are left unchanged here
# because other scripts may rely on them -- confirm before renaming.
df_avg_log.to_csv("mousebrain-L5.log_average.ensembl_musculus.csv", compression="gzip")

### Alternative: average, then log transform - BUT WE DON'T CARE ABOUT THIS!

### Compare to the Linnarsson .agg file

In [56]:
# CONCLUSION: df_avg and loom.agg data is EXACTLY the same

In [53]:
### Load the .agg loom file
# NOTE(review): the connection is opened without a `with` block, so the file
# stays open until ds_agg.close() is called explicitly.
file_loom_agg = "/data/pub-others/zeisel-biorxiv-2018/data/L5_All.agg.loom"
ds_agg = loompy.connect(file_loom_agg)

In [55]:
# Read the complete .agg matrix into a data frame:
# rows are labelled by ds_agg.ra.Accession, columns by ds_agg.ca.ClusterName.
df_agg = pd.DataFrame(
    ds_agg[:, :],
    index=ds_agg.ra.Accession,
    columns=ds_agg.ca.ClusterName,
)
# Show the first rows for comparison with df_avg.
df_agg.head()

Unnamed: 0,ENT9,ENT8,ENT6,ENT5,ENT4,ENT7,ENT3,ENT2,ENT1,ENTG1,...,PSPEP8,PSNF2,PSNF3,PSNF1,PSNP1,PSNP3,PSNP2,PSNP5,PSNP4,PSNP6
ENSMUSG00000024647,13.976331,0.42268,0.127119,0.084942,0.03871,0.0,0.006452,0.018692,0.071429,0.01227,...,0.0,2.245614,0.193548,3.026316,0.27305,0.0,0.0,0.011364,0.0,0.007576
ENSMUSG00000041544,0.390533,0.0,0.0,0.0,0.0,0.032258,0.0,0.009346,0.0,0.0,...,0.071429,0.105263,0.096774,0.105263,0.0,0.003704,0.097222,0.011364,0.0,0.007576
ENSMUSG00000029503,7.609467,1.381443,0.186441,0.517375,0.412903,1.322581,1.219355,1.373832,1.714286,0.006135,...,0.035714,0.0,0.0,0.0,0.010638,0.003704,0.0,0.238636,0.022222,1.590909
ENSMUSG00000039942,0.928994,0.092784,0.194915,0.127413,0.148387,0.193548,0.070968,0.214953,0.142857,0.0,...,0.0,0.122807,0.16129,0.052632,0.042553,0.040741,0.055556,0.079545,0.155556,0.159091
ENSMUSG00000059187,8.739645,9.0,3.601695,1.752896,1.103226,1.870968,0.090323,0.065421,2.214286,0.0,...,0.0,0.192982,0.548387,0.078947,0.007092,0.011111,0.0,0.011364,0.0,0.007576


In [57]:
# Show the first rows of df_avg so they can be compared with the df_agg preview.
df_avg.head()

ClusterName,ENT9,ENT8,ENT6,ENT5,ENT4,ENT7,ENT3,ENT2,ENT1,ENTG1,...,PSPEP8,PSNF2,PSNF3,PSNF1,PSNP1,PSNP3,PSNP2,PSNP5,PSNP4,PSNP6
ENSMUSG00000024647,13.976331,0.42268,0.127119,0.084942,0.03871,0.0,0.006452,0.018692,0.071429,0.01227,...,0.0,2.245614,0.193548,3.026316,0.27305,0.0,0.0,0.011364,0.0,0.007576
ENSMUSG00000041544,0.390533,0.0,0.0,0.0,0.0,0.032258,0.0,0.009346,0.0,0.0,...,0.071429,0.105263,0.096774,0.105263,0.0,0.003704,0.097222,0.011364,0.0,0.007576
ENSMUSG00000029503,7.609467,1.381443,0.186441,0.517375,0.412903,1.322581,1.219355,1.373832,1.714286,0.006135,...,0.035714,0.0,0.0,0.0,0.010638,0.003704,0.0,0.238636,0.022222,1.590909
ENSMUSG00000039942,0.928994,0.092784,0.194915,0.127413,0.148387,0.193548,0.070968,0.214953,0.142857,0.0,...,0.0,0.122807,0.16129,0.052632,0.042553,0.040741,0.055556,0.079545,0.155556,0.159091
ENSMUSG00000059187,8.739645,9.0,3.601695,1.752896,1.103226,1.870968,0.090323,0.065421,2.214286,0.0,...,0.0,0.192982,0.548387,0.078947,0.007092,0.011111,0.0,0.011364,0.0,0.007576


### TESTING ON SMALLER DATA FRAME

In [8]:
# Cell types (cluster names) to pull out for quick experiments on a small frame.
cell_types_extract = ["MEINH7","HBINH9","DEGLU4","MEINH2","MEGLU6","DEINH3","HBINH3","HBGLU10","HBINH4","HBGLU3"]
# Re-open the loom file here. The connection created earlier inside a `with`
# block is already closed when the notebook is run top to bottom, so reusing
# that `ds` would fail.
with loompy.connect(file_loom) as ds:
    # Boolean mask over cells: True where the cell's cluster is a requested type.
    bool_cells_extract = np.isin(ds.ca.ClusterName, cell_types_extract)
    # NOTE: rows are labelled with ds.ra.Gene (gene names, which contain
    # duplicates), not with ds.ra.Accession as in `df`.
    # Both `ds_extract` and `df_extract` are bound to the same DataFrame, as in
    # the original cell, so existing references to either name keep working.
    ds_extract = df_extract = pd.DataFrame(
        ds[:, bool_cells_extract].astype(int),
        index=ds.ra.Gene,
        columns=ds.ca.CellID[bool_cells_extract],
    )

1584