In [1]:
import os
import sys

import numpy as np
import pandas as pd


In [2]:
# lib_path = os.path.abspath(os.path.pardir) # same as os.path.abspath("..")
# Make the project's helper library importable. The path below is a hard-coded
# absolute location, so this cell only works on a machine where that directory
# exists; the commented-out line above is a relative alternative (the parent of
# the current working directory).
lib_path = "/projects/timshel/sc-genetics/sc-genetics/src/lib"
# Insert at position 1 so whatever sits at position 0 of sys.path keeps priority.
sys.path.insert(1, lib_path)
# NOTE(review): the star import hides which names come from this module. The
# cells below call ctc_log_normalize, calculate_anova_sporadically_expressed_genes
# and calculate_per_anno_summary_stats, which presumably originate here --
# confirm, and consider importing them explicitly.
from sem_pre_calculation import *

In [3]:
# Load IPython's autoreload extension and select mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Constants

### Read data

In [4]:
### Metadata
# Annotation table with one row per cell (it carries a cell_id column).
file_metadata = "/projects/timshel/sc-scheele_lab_adipose_fluidigm_c1/data-preadipocytes_developing/preadipocyte_developing_1808.metadata.csv"
# index_col=False: do not let pandas promote the first column to the row index.
df_metadata = pd.read_csv(filepath_or_buffer=file_metadata, index_col=False)
df_metadata.head(n=5)

Unnamed: 0,cell_id,branch_low_res,branch_high_res,nGene,nUMI,orig.ident,timepoint,time_combined,percent.mito,res.0.5,...,Pseudotime,pc2.groups,pc1,pc2_supra_peri,pc2_subq_visce,pc2,pc3,pc4,pc5,State.old.labels
0,AAACCTGAGTCCTCCT-1,preadipocyte,preadipocyte_top100,4614,23660,SeuratProject,T1,1,0.026627,7,...,1.073875,middle,-6.187852,other,other,3.331293,-3.359013,-1.964445,11.317299,preadipocyte
1,AAACCTGCACAGGTTT-1,preadipocyte,preadipocyte_top100,3584,15599,SeuratProject,T1,1,0.023078,7,...,1.370154,middle,-7.70277,other,other,3.330828,-2.502859,-1.644394,9.496364,preadipocyte
2,AAACGGGAGGCTCTTA-1,preadipocyte,preadipocyte_top40,2979,13379,SeuratProject,T1,1,0.042679,2,...,11.151146,middle,-3.81959,other,other,1.676458,-1.378066,-2.576129,-1.121668,preadipocyte
3,AAAGATGAGCTGGAAC-1,preadipocyte,preadipocyte_top90,3051,11689,SeuratProject,T1,1,0.023783,7,...,2.181324,middle,-7.007155,other,other,1.384685,-0.724455,-3.301535,7.278057,preadipocyte
4,AAAGCAACACTTACGA-1,preadipocyte,preadipocyte_top100,4084,18077,SeuratProject,T1,1,0.028047,7,...,0.97737,top_10%_pc2,-7.175428,other,other,3.984249,-1.570916,-0.757836,12.493582,preadipocyte


In [5]:
### Data
file_data = "/projects/timshel/sc-scheele_lab_adipose_fluidigm_c1/data-preadipocytes_developing/preadipocyte_developing_1808.umi.csv.gz"
# Slow step: the original note left the load time as a placeholder ("XXX min").
# index_col=False keeps every column (including "gene") as a regular column.
df_data = pd.read_csv(filepath_or_buffer=file_data, index_col=False)

In [6]:
# Use the "gene" column as the row index.
# Rebinding (instead of inplace=True) together with the membership guard makes
# this cell safe to re-run: once "gene" has become the index it is no longer a
# column, and calling set_index("gene") again would raise a KeyError.
if "gene" in df_data.columns:
    df_data = df_data.set_index("gene")
df_data.head()

Unnamed: 0_level_0,AAACCTGAGTCCTCCT-1,AAACCTGCACAGGTTT-1,AAACGGGAGGCTCTTA-1,AAAGATGAGCTGGAAC-1,AAAGCAACACTTACGA-1,AAAGCAACATTACGAC-1,AAAGCAAGTCGCATAT-1,AAAGTAGCAAGGACTG-1,AAAGTAGGTCCTCTTG-1,AAAGTAGGTTCGTCTC-1,...,TATGCCCGTAGAGCTG-5,TATTACCTCTATCGCC-5,TCAACGACACCATCCT-5,TCACGAACACCTCGGA-5,TGAGCATGTTTGACTG-5,TGATTTCGTTCAGTAC-5,TGGCCAGCATAACCTG-5,TGGCTGGGTAAGTGTA-5,TGTATTCGTAGCACGA-5,TTCGGTCAGTACGATA-5
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000228463,2,2,0,0,0,0,1,3,0,3,...,0,0,0,0,0,0,0,0,0,1
ENSG00000237094,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000230021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
### [*IMPORTANT*] Check that all metadata cell_ids are identical to the data column names.
### We need to ensure this before we can use the metadata cell-types as annotations.
# np.array_equal compares shapes as well as values, so a length mismatch gives
# False rather than a broadcasting warning/error (or a silent broadcast).
cell_ids_match = np.array_equal(df_data.columns.values, df_metadata["cell_id"].values)
# Enforce the check: stop here instead of letting misaligned annotations flow
# into the calculations further down.
assert cell_ids_match, "metadata cell_id values do not match the data column names (same order required)"
cell_ids_match # ---> True

True

### CTC (common transcript count) log-normalization

In [8]:
# Normalize the gene-by-cell table loaded above. Per the function's own log
# message this is a common transcript count (ctc) normalization followed by a
# log-transformation; the result is the input to all pre-calculations below.
df_ctc_log = ctc_log_normalize(df_data)

Performning common transcript count (ctc) normalization and log-transformation on input data


### Run pre-calc for multiple annotation levels

In [9]:
# <OUTPREFIX>:<ANNOTATION_LVL_COLUMN_NAME>
# Each pair is (output file prefix, metadata column holding the annotation labels).
# Insertion order is the order in which the runs are executed.
dict_run = dict([
    ("preadipocyte_developing_1808_branch", "branch_low_res"),
    ("preadipocyte_developing_1808_branch_pc2_quantile", "branch_high_res"),
])

In [10]:
# Run the pre-calculations once per (output prefix, annotation column) pair.
for out_prefix, anno_column in dict_run.items():
    annotations = df_metadata[anno_column].values # one label per cell
    print(annotations[:5])

    # ANOVA results are written to a gzipped CSV named after the prefix.
    df_anova = calculate_anova_sporadically_expressed_genes(df_ctc_log, annotations, out_prefix)
    anova_path = "{}.pre_calc.sporadically_expressed_genes.anova.csv.gz".format(out_prefix)
    df_anova.to_csv(anova_path, compression="gzip")

    # Per-annotation summary statistics: first with the real labels, then with
    # permute_annotations=True.
    # NOTE(review): these frames are overwritten on every iteration, so only the
    # last prefix's values survive the loop; presumably the helper persists its
    # results itself using out_prefix -- confirm.
    # NOTE(review): no random seed is set here; if the permutation is random the
    # "null" results will differ between runs -- confirm.
    observed_stats = calculate_per_anno_summary_stats(df_ctc_log, annotations, out_prefix, permute_annotations=False)
    (df_frac, df_mu, df_var, df_n) = observed_stats
    null_stats = calculate_per_anno_summary_stats(df_ctc_log, annotations, out_prefix, permute_annotations=True)
    (df_frac_null, df_mu_null, df_var_null, df_n_null) = null_stats

['preadipocyte' 'preadipocyte' 'preadipocyte' 'preadipocyte'
 'preadipocyte']
Splitting data frame into annotation groups
Splitting annotation #1/#3 into group
Splitting annotation #2/#3 into group
Splitting annotation #3/#3 into group
Running ANOVA
gene 0 out of 22979
gene 100 out of 22979
gene 200 out of 22979


  f = msb / msw


gene 300 out of 22979
gene 400 out of 22979
gene 500 out of 22979
gene 600 out of 22979
gene 700 out of 22979
gene 800 out of 22979
gene 900 out of 22979
gene 1000 out of 22979
gene 1100 out of 22979
gene 1200 out of 22979
gene 1300 out of 22979
gene 1400 out of 22979
gene 1500 out of 22979
gene 1600 out of 22979
gene 1700 out of 22979
gene 1800 out of 22979
gene 1900 out of 22979
gene 2000 out of 22979
gene 2100 out of 22979
gene 2200 out of 22979
gene 2300 out of 22979
gene 2400 out of 22979
gene 2500 out of 22979
gene 2600 out of 22979
gene 2700 out of 22979
gene 2800 out of 22979
gene 2900 out of 22979
gene 3000 out of 22979
gene 3100 out of 22979
gene 3200 out of 22979
gene 3300 out of 22979
gene 3400 out of 22979
gene 3500 out of 22979
gene 3600 out of 22979
gene 3700 out of 22979
gene 3800 out of 22979
gene 3900 out of 22979
gene 4000 out of 22979
gene 4100 out of 22979
gene 4200 out of 22979
gene 4300 out of 22979
gene 4400 out of 22979
gene 4500 out of 22979
gene 4600 out of 2

gene 5200 out of 22979
gene 5300 out of 22979
gene 5400 out of 22979
gene 5500 out of 22979
gene 5600 out of 22979
gene 5700 out of 22979
gene 5800 out of 22979
gene 5900 out of 22979
gene 6000 out of 22979
gene 6100 out of 22979
gene 6200 out of 22979
gene 6300 out of 22979
gene 6400 out of 22979
gene 6500 out of 22979
gene 6600 out of 22979
gene 6700 out of 22979
gene 6800 out of 22979
gene 6900 out of 22979
gene 7000 out of 22979
gene 7100 out of 22979
gene 7200 out of 22979
gene 7300 out of 22979
gene 7400 out of 22979
gene 7500 out of 22979
gene 7600 out of 22979
gene 7700 out of 22979
gene 7800 out of 22979
gene 7900 out of 22979
gene 8000 out of 22979
gene 8100 out of 22979
gene 8200 out of 22979
gene 8300 out of 22979
gene 8400 out of 22979
gene 8500 out of 22979
gene 8600 out of 22979
gene 8700 out of 22979
gene 8800 out of 22979
gene 8900 out of 22979
gene 9000 out of 22979
gene 9100 out of 22979
gene 9200 out of 22979
gene 9300 out of 22979
gene 9400 out of 22979
gene 9500 o