In [1]:
import os
import sys

import numpy as np
import pandas as pd


In [2]:
# Make the project's helper library importable by putting its directory on sys.path.
# Alternative kept for reference (resolves the parent of the working directory instead of a fixed path):
# lib_path = os.path.abspath(os.path.pardir) # same as os.path.abspath("..")
lib_path = "/raid5/projects/timshel/sc-genetics/sc-genetics/src/lib"
# Inserted at position 1 (not 0), so the first sys.path entry keeps its precedence.
sys.path.insert(1, lib_path)
# NOTE(review): the star import hides which names come from sem_pre_calculation. The helpers called
# later in this notebook (ctc_log_normalize, calculate_anova_sporadically_expressed_genes,
# calculate_per_anno_summary_stats) are presumably provided by it -- confirm, then import explicitly.
from sem_pre_calculation import *

In [3]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Constants

### Read data

In [4]:
### Metadata
# Per-cell metadata table. Its "cell_id" column is later compared against the columns of the
# expression matrix, and its cell_type_all_lvl1 / cell_type_all_lvl2 columns are used as annotations.
file_metadata = "/scratch/data-for_fast_access/pub-others/campbell2017/campbell.cell_metadata.csv"
# index_col=False: do not take the index from a column of the file.
df_metadata = pd.read_csv(file_metadata, index_col=False)
# Preview the first rows as the cell output.
df_metadata.head()

Unnamed: 0,nGene,nUMI,orig.ident,cell_id,treatment,diet,cell_type_all_lvl1,cell_type_all_lvl2,age,sex,batch,FvF,taxonomy_lvl1,taxonomy_lvl2
0,7876,50341,arc1,arc1_TACTAACAGTAN,arc1,Chow,a18.Neurons6,n34.unassigned(2),adult (4-12 weeks old),M,b1,Fed,Neuron,Neuron
1,6427,27357,arc1,arc1_CCGCGAGCTCTT,arc1,Chow,a19.ParsTuber1,s10.Pars_Tuber1C,adult (4-12 weeks old),M,b1,Fed,Endocrine,Pars tuberalis
2,5684,26335,arc1,arc1_GTTGCACGGATA,arc1,Chow,a13.Neurons1,n06.Oxt,adult (4-12 weeks old),M,b1,Fed,Neuron,Neuron
3,5237,22311,arc1,arc1_CTGGCATTTTAT,arc1,Chow,a18.Neurons6,n13.Agrp/Gm8773,adult (4-12 weeks old),M,b1,Fed,Neuron,Neuron
4,5253,20902,arc1,arc1_TGCAACGACTAT,arc1,Chow,a18.Neurons6,n13.Agrp/Gm8773,adult (4-12 weeks old),M,b1,Fed,Neuron,Neuron


In [5]:
### Data
# Expression matrix with a "gene" column plus one column per cell (see the head() preview in the
# next cell). The gene column is turned into the index in that next cell, not here.
file_data = "/scratch/data-for_fast_access/pub-others/campbell2017/campbell.umi.csv.gz"
df_data = pd.read_csv(file_data, index_col=False) # this takes 4 min for Campbell! (Pandas is slow!)

In [7]:
# Use the gene symbols as the row index so that the columns are exactly the cell ids
# (the column/metadata consistency check further down relies on this).
# Guarded to keep the cell re-runnable: once "gene" is the index it is no longer a column,
# and calling set_index("gene") a second time would raise KeyError.
if df_data.index.name != "gene":
    df_data.set_index("gene", inplace=True) # set index
df_data.head()

Unnamed: 0_level_0,arc1_TACTAACAGTAN,arc1_CCGCGAGCTCTT,arc1_GTTGCACGGATA,arc1_CTGGCATTTTAT,arc1_TGCAACGACTAT,arc1_CCGTAATACTTN,arc1_CAATCCGCTGGN,arc1_ACAAGTCATGAT,arc1_ACGAGCCCTCCA,arc1_GAATTAGGGGTC,...,MaleFed_AGTGTAGGCGGN,MaleFed_GAGACTAGTGCN,MaleFed_ACCACCGAGTCN,MaleFed_AGGCAGCCCTTA,MaleFed_GGTAGTGTTGGN,MaleFed_AAGCAGCGCAAC,MaleFed_CGACAATGTCGN,MaleFed_GCGTTCAGCCTN,MaleFed_TGACGCGTTCTT,MaleFed_GGGGCTTATTGN
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610005C13Rik,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,1,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
0610009B22Rik,6,5,3,2,1,1,0,1,2,0,...,0,0,1,0,1,0,0,0,0,1
0610009E02Rik,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009L18Rik,0,2,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
### [*IMPORTANT*] Check that all metadata cell_ids are identical to data column names.
### We need to ensure this before we can use the metadata cell-types as annotations.
# np.array_equal gives a plain False when the two arrays differ in length, whereas an elementwise
# `==` on arrays of different length may warn or raise instead of answering the question.
cell_ids_match = np.array_equal(df_data.columns.values, df_metadata["cell_id"].values)
# Fail loudly rather than only displaying the result: a silent False would let "Run All" continue
# and pair cells with the wrong annotations.
assert cell_ids_match, "df_data columns and df_metadata['cell_id'] differ in content or order"
cell_ids_match # ---> True

True

### Common transcript count (CTC) normalization and log-transformation

In [9]:
# Common transcript count (ctc) normalization followed by log-transformation of the expression
# matrix, per the message the helper prints. ctc_log_normalize is not defined in this notebook;
# presumably it comes from the sem_pre_calculation star import -- confirm.
df_ctc_log = ctc_log_normalize(df_data)

Performning common transcript count (ctc) normalization and log-transformation on input data


### Run pre-calculations (ANOVA and per-annotation summary statistics) for cell_type_all_lvl1 and cell_type_all_lvl2

In [11]:
# Maps each output file prefix to the df_metadata column whose values are used as annotations.
dict_run = {
    "campbell_lvl1": "cell_type_all_lvl1",
    "campbell_lvl2": "cell_type_all_lvl2",
}

In [14]:
# For every annotation level in dict_run: run the ANOVA for sporadically expressed genes and save
# its table, then compute per-annotation summary statistics for the real and permuted annotations.
# NOTE(review): the summary-stat frames are overwritten on each iteration and never used here;
# presumably the helpers persist their results using out_prefix -- confirm.
for out_prefix, anno_column in dict_run.items():
    annotations = df_metadata[anno_column].values # get annotations
    print(annotations[:5])

    df_anova = calculate_anova_sporadically_expressed_genes(df_ctc_log, annotations, out_prefix)
    file_anova = "{}.pre_calc.sporadically_expressed_genes.anova.csv.gz".format(out_prefix)
    df_anova.to_csv(file_anova, compression="gzip")

    # Real annotations.
    df_frac, df_mu, df_var, df_n = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=False
    )
    # Same call with permute_annotations=True; results are kept under the *_null names.
    df_frac_null, df_mu_null, df_var_null, df_n_null = calculate_per_anno_summary_stats(
        df_ctc_log, annotations, out_prefix, permute_annotations=True
    )

['a18.Neurons6' 'a19.ParsTuber1' 'a13.Neurons1' 'a18.Neurons6'
 'a18.Neurons6']
Splitting data frame into annotation groups
Splitting annotation #1/#20 into group
Splitting annotation #2/#20 into group
Splitting annotation #3/#20 into group
Splitting annotation #4/#20 into group
Splitting annotation #5/#20 into group
Splitting annotation #6/#20 into group
Splitting annotation #7/#20 into group
Splitting annotation #8/#20 into group
Splitting annotation #9/#20 into group
Splitting annotation #10/#20 into group
Splitting annotation #11/#20 into group
Splitting annotation #12/#20 into group
Splitting annotation #13/#20 into group
Splitting annotation #14/#20 into group
Splitting annotation #15/#20 into group
Splitting annotation #16/#20 into group
Splitting annotation #17/#20 into group
Splitting annotation #18/#20 into group
Splitting annotation #19/#20 into group
Splitting annotation #20/#20 into group
Running ANOVA
gene 0 out of 26774
gene 100 out of 26774


  f = msb / msw


gene 200 out of 26774
gene 300 out of 26774
gene 400 out of 26774
gene 500 out of 26774
gene 600 out of 26774
gene 700 out of 26774
gene 800 out of 26774
gene 900 out of 26774
gene 1000 out of 26774
gene 1100 out of 26774
gene 1200 out of 26774
gene 1300 out of 26774
gene 1400 out of 26774
gene 1500 out of 26774
gene 1600 out of 26774
gene 1700 out of 26774
gene 1800 out of 26774
gene 1900 out of 26774
gene 2000 out of 26774
gene 2100 out of 26774
gene 2200 out of 26774
gene 2300 out of 26774
gene 2400 out of 26774
gene 2500 out of 26774
gene 2600 out of 26774
gene 2700 out of 26774
gene 2800 out of 26774
gene 2900 out of 26774
gene 3000 out of 26774
gene 3100 out of 26774
gene 3200 out of 26774
gene 3300 out of 26774
gene 3400 out of 26774
gene 3500 out of 26774
gene 3600 out of 26774
gene 3700 out of 26774
gene 3800 out of 26774
gene 3900 out of 26774
gene 4000 out of 26774
gene 4100 out of 26774
gene 4200 out of 26774
gene 4300 out of 26774
gene 4400 out of 26774
gene 4500 out of 26

Splitting annotation #10/#64 into group
Splitting annotation #11/#64 into group
Splitting annotation #12/#64 into group
Splitting annotation #13/#64 into group
Splitting annotation #14/#64 into group
Splitting annotation #15/#64 into group
Splitting annotation #16/#64 into group
Splitting annotation #17/#64 into group
Splitting annotation #18/#64 into group
Splitting annotation #19/#64 into group
Splitting annotation #20/#64 into group
Splitting annotation #21/#64 into group
Splitting annotation #22/#64 into group
Splitting annotation #23/#64 into group
Splitting annotation #24/#64 into group
Splitting annotation #25/#64 into group
Splitting annotation #26/#64 into group
Splitting annotation #27/#64 into group
Splitting annotation #28/#64 into group
Splitting annotation #29/#64 into group
Splitting annotation #30/#64 into group
Splitting annotation #31/#64 into group
Splitting annotation #32/#64 into group
Splitting annotation #33/#64 into group
Splitting annotation #34/#64 into group


gene 25400 out of 26774
gene 25500 out of 26774
gene 25600 out of 26774
gene 25700 out of 26774
gene 25800 out of 26774
gene 25900 out of 26774
gene 26000 out of 26774
gene 26100 out of 26774
gene 26200 out of 26774
gene 26300 out of 26774
gene 26400 out of 26774
gene 26500 out of 26774
gene 26600 out of 26774
gene 26700 out of 26774
Number of genes sporadically expressed (pvalue > 0.00001, Skene cut-off): 10129
Running: #1/#64 | n01.Hdc
Running: #2/#64 | n02.Gm8773/Tac1
Running: #3/#64 | n03
Running: #4/#64 | n04.Sst/Nts
Running: #5/#64 | n05.Nfix/Htr2c
Running: #6/#64 | n06.Oxt
Running: #7/#64 | n07
Running: #8/#64 | n08
Running: #9/#64 | n09.Th/Slc6a3
Running: #10/#64 | n10.Ghrh
Running: #11/#64 | n11.Th/Cxcl12
Running: #12/#64 | n12.Agrp/Sst
Running: #13/#64 | n13.Agrp/Gm8773
Running: #14/#64 | n14.Pomc/Ttr
Running: #15/#64 | n15.Pomc/Anxa2
Running: #16/#64 | n16.Rgs16/Vip
Running: #17/#64 | n17.Rgs16/Dlx1
Running: #18/#64 | n18.Rgs16/Slc17a6
Running: #19/#64 | n19.Gpr50
Running: #