In [1]:
import scanpy as sc
from anndata import read_h5ad
import pandas as pd
import numpy as np
import scipy as sp
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

# scDRS tools
import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md

# autoreload
%load_ext autoreload
%autoreload 2

### Check scores

In [6]:
# Root directory holding every input/output file referenced in this notebook.
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data'

# Homolog table, plus gene-symbol lookups in both directions
# (human -> mouse and mouse -> human). For repeated symbols the last row wins.
DF_HOM = pd.read_csv(DATA_PATH + '/gene_annotation/mouse_human_homologs.txt', sep='\t')
DIC_H2M = dict(zip(DF_HOM['HUMAN_GENE_SYM'], DF_HOM['MOUSE_GENE_SYM']))
DIC_M2H = dict(zip(DF_HOM['MOUSE_GENE_SYM'], DF_HOM['HUMAN_GENE_SYM']))

# Gene-set file (indexed by its first column) and trait metadata
# (re-indexed by the 'Trait_Identifier' column, which is also kept as a column).
DF_GS = pd.read_csv(
    DATA_PATH + '/gs_file/magma_10kb_1000.74_traits.gs',
    sep='\t',
    index_col=0,
)
DF_TRAIT_INFO = pd.read_csv(DATA_PATH + '/supp_table/trait_info.tsv', sep='\t')
DF_TRAIT_INFO.index = DF_TRAIT_INFO['Trait_Identifier']

# Score directories (to check).
# Every score set has a main directory plus a reference directory stored under
# the key '<key>.ref', whose path is the main path with '.500_ctrl' appended.
# Entries are emitted as main, ref, main, ref, ... in the order listed below.
DIC_SCORE_PATH = {
    key + key_suffix: DATA_PATH + rel_dir + dir_suffix
    for key, rel_dir in [
        ('tms_facs', '/score_file/score.tms_facs_with_cov.magma_10kb_1000'),
        ('tms_facs.tcell_sig', '/score_file/score.tms_facs_with_cov.tcell_sig'),
        ('ts_facs', '/score_file/score.ts_facs_with_cov.magma_10kb_1000'),
        ('tms_droplet', '/score_file/score.tms_droplet_with_cov.magma_10kb_1000'),
        ('canogamez', '/score_file/score.canogamez_with_cov.magma_10kb_1000'),
        ('nathan_b0', '/score_file/score.nathan_ni_2021_b0_with_cov.magma_10kb_1000'),
        ('nathan_b1', '/score_file/score.nathan_ni_2021_b1_with_cov.magma_10kb_1000'),
        ('nathan_b2', '/score_file/score.nathan_ni_2021_b2_with_cov.magma_10kb_1000'),
        ('nathan_b3', '/score_file/score.nathan_ni_2021_b3_with_cov.magma_10kb_1000'),
    ]
    for key_suffix, dir_suffix in [('', ''), ('.ref', '.500_ctrl')]
}
# Traits expected for each (non-reference) score set.
# Every score set uses the full gene-set index (each gets its own list copy),
# except the T-cell signature run, which is overridden below with its six signatures.
DIC_TRAIT_LIST = {
    score_key: list(DF_GS.index)
    for score_key in [
        'tms_facs',
        'tms_facs.tcell_sig',
        'ts_facs',
        'tms_droplet',
        'canogamez',
        'nathan_b0',
        'nathan_b1',
        'nathan_b2',
        'nathan_b3',
    ]
}
# Re-assigning an existing key keeps its position in the dict.
DIC_TRAIT_LIST['tms_facs.tcell_sig'] = [
    'naive_cd4', 'memory_cd4', 'effector_cd4',
    'naive_cd8', 'memory_cd8', 'effector_cd8',
]

In [7]:
# Check that the per-trait score files are there.
# For each non-reference score set: print its name, report every trait whose
# '<trait>.full_score.gz' is absent, and dump the gene sets of those traits to
# 'gs_file/unfinished_gs/<score>.gs'.
for score in [name for name in DIC_SCORE_PATH if not name.endswith('.ref')]:
    print(score)
    missing_list = []
    for i_trait, trait in enumerate(DIC_TRAIT_LIST[score]):
        if os.path.exists(DIC_SCORE_PATH[score] + '/%s.full_score.gz' % trait):
            continue
        # The batch number groups traits three at a time, in list order.
        print('Missing', score, i_trait, 'batch=%d' % (i_trait // 3), trait)
        missing_list.append(trait)
    if missing_list:
        DF_GS.loc[missing_list].to_csv(
            DATA_PATH + '/gs_file/unfinished_gs/%s.gs' % score, sep='\t'
        )

tms_facs
tms_facs.tcell_sig
ts_facs
tms_droplet
canogamez
nathan_b0
nathan_b1
nathan_b2
nathan_b3


In [24]:
# Check consistency with ref files.
#
# For every trait of the selected score sets, compare the per-cell 'nlog10_pval'
# of the main run against the reference ('<score>.ref') run, restricted to the
# cells present in both files. A trait is flagged with a scatter plot when the
# Pearson correlation is below PCC_THRESHOLD (or undefined), or when the main
# score file does not contain N_CTRL_EXPECTED 'ctrl_norm_score*' columns;
# otherwise a one-line summary is printed.
#
# Only a subset of score sets is checked here; to check all of them, iterate over
# the keys of DIC_SCORE_PATH that do not end with '.ref'.
PCC_THRESHOLD = 0.99      # minimum acceptable correlation between main and ref runs
N_CTRL_EXPECTED = 1000    # expected number of control-score columns in the main file

for score in ['tms_facs.tcell_sig', 'tms_facs', 'ts_facs']:
    score_ref = score + '.ref'
    # enumerate() so the 'Missing' message reports this loop's own trait position
    # instead of depending on an `i_trait` left over from another cell.
    for i_trait, trait in enumerate(DIC_TRAIT_LIST[score]):
        score_file = DIC_SCORE_PATH[score] + '/%s.full_score.gz' % trait
        ref_file = DIC_SCORE_PATH[score_ref] + '/%s.score.gz' % trait

        # Guard on exactly the two files read below (the reference run is read from
        # '<trait>.score.gz', so that is the file whose existence is checked).
        if not (os.path.exists(score_file) and os.path.exists(ref_file)):
            print('Missing', score, i_trait, 'batch=%d' % (i_trait // 3), trait)
            continue

        df_score = pd.read_csv(score_file, sep='\t', index_col=0)
        df_score_ref = pd.read_csv(ref_file, sep='\t', index_col=0)

        # Compare on the cells shared by both files, in a fixed (sorted) order.
        cell_list = sorted(set(df_score.index) & set(df_score_ref.index))
        v1 = df_score_ref.loc[cell_list, 'nlog10_pval']
        v2 = df_score.loc[cell_list, 'nlog10_pval']
        pcc = np.corrcoef(v1, v2)[0, 1]
        n_ctrl = len([x for x in df_score.columns if x.startswith('ctrl_norm_score')])
        trait_code = DF_TRAIT_INFO.loc[trait, 'Code'] if trait in DF_TRAIT_INFO.index else trait
        summary = '%s %s PCC=%0.3f n_ctrl=%d' % (score, trait_code, pcc, n_ctrl)

        # A NaN correlation compares False against any threshold, so test for it
        # explicitly; otherwise an undefined PCC would be reported as consistent.
        if np.isnan(pcc) or (pcc < PCC_THRESHOLD) or (n_ctrl != N_CTRL_EXPECTED):
            plt.figure(figsize=[4, 4])
            plt.scatter(v1, v2)
            plt.xlabel('nlog10_pval (%s)' % score_ref)
            plt.ylabel('nlog10_pval (%s)' % score)
            plt.title(summary)
            plt.show()
        else:
            print(summary)

tms_facs.tcell_sig naive_cd4 PCC=0.997 n_ctrl=1000
tms_facs.tcell_sig memory_cd4 PCC=0.997 n_ctrl=1000
tms_facs.tcell_sig effector_cd4 PCC=0.998 n_ctrl=1000
tms_facs.tcell_sig naive_cd8 PCC=0.998 n_ctrl=1000
tms_facs.tcell_sig memory_cd8 PCC=0.998 n_ctrl=1000
tms_facs.tcell_sig effector_cd8 PCC=0.998 n_ctrl=1000
tms_facs ADHD PCC=0.997 n_ctrl=1000
tms_facs AD PCC=0.997 n_ctrl=1000
tms_facs AF PCC=0.997 n_ctrl=1000
tms_facs BP PCC=0.997 n_ctrl=1000
tms_facs CD PCC=0.997 n_ctrl=1000
tms_facs Celiac PCC=0.997 n_ctrl=1000
tms_facs CAD PCC=0.997 n_ctrl=1000
tms_facs Drink per week PCC=0.997 n_ctrl=1000
tms_facs FG PCC=0.997 n_ctrl=1000
tms_facs GRT PCC=0.997 n_ctrl=1000
tms_facs IBD PCC=0.997 n_ctrl=1000
tms_facs Insomnia PCC=0.997 n_ctrl=1000
tms_facs Intel PCC=0.998 n_ctrl=1000
tms_facs SLE PCC=0.997 n_ctrl=1000
tms_facs MDD PCC=0.998 n_ctrl=1000
tms_facs MS PCC=0.997 n_ctrl=1000
tms_facs PBC PCC=0.997 n_ctrl=1000
tms_facs Reaction time PCC=0.997 n_ctrl=1000
tms_facs RA PCC=0.997 n_ctrl=1

In [23]:
# Scratch check of the trait-code lookup: evaluates to the 'Code' entry of
# DF_TRAIT_INFO for `trait` when `trait` is in its index, otherwise to `trait` itself.
# NOTE: `trait` is not assigned in this cell, so it must already exist in the
# kernel namespace (e.g. left behind by a loop in another cell) for this to run.
DF_TRAIT_INFO.loc[trait, 'Code'] if trait in DF_TRAIT_INFO.index else trait

'effector_cd8'