In [1]:
import scanpy as sc
from anndata import read_h5ad
import pandas as pd
import numpy as np
import scipy as sp
from statsmodels.stats.multitest import multipletests
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from os.path import join
import gseapy as gp
import statsmodels.api as sm
from adjustText import adjust_text

plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# scDRS tools
import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md

# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Set file paths
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data'
DF_HOM = pd.read_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gene_annotation/'
                     'mouse_human_homologs.txt', sep='\t')
DIC_MAP_H2M = {x:y for x,y in zip(DF_HOM['HUMAN_GENE_SYM'], DF_HOM['MOUSE_GENE_SYM'])}
DIC_MAP_M2H = {x:y for x,y in zip(DF_HOM['MOUSE_GENE_SYM'], DF_HOM['HUMAN_GENE_SYM'])}
DF_GS = pd.read_csv(DATA_PATH+'/gs_file/magma_10kb_top1000_zscore.74_traits.rv1.gs', sep='\t', index_col=0)
DF_TRAIT_INFO = pd.read_csv(DATA_PATH+'/supp_table.rv1/trait_info.tsv', sep='\t')
DF_TRAIT_INFO.index = DF_TRAIT_INFO['Trait_Identifier']
FIG_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/results/fig_tcell.rv1'

# Set data info
DS_LIST = ['canogamez', 'nathan']
DIC_INFO = {}
DIC_INFO['tms_facs'] = {'species': 'mmusculus', 'dname': 'TMS FACS'}
DIC_INFO['tms_droplet'] = {'species': 'mmusculus', 'dname': 'TMS droplet'}
DIC_INFO['ts_facs'] = {'species': 'hsapiens', 'dname': 'TS FACS'}
DIC_INFO['canogamez'] = {'species': 'hsapiens', 'dname': 'Cano-Gamez & Soskic et al.'}
DIC_INFO['nathan'] = {'species': 'hsapiens', 'dname': 'Nathan et al.'}

# Set score file
DIC_SCORE_PATH = {
    'canogamez' : DATA_PATH+'/score_file/score.canogamez_with_cov.magma_10kb_top1000_zscore',
    'nathan' : DATA_PATH+'/score_file/score.nathan_ni_2021_with_cov.magma_10kb_top1000_zscore',
                 }

# Set trait list
DIC_TRAIT_LIST = {}
DIC_TRAIT_LIST['canogamez'] = [
    'PASS_IBD_deLange2017', 
    'PASS_CD_deLange2017',
    'PASS_UC_deLange2017', 
    'PASS_Rheumatoid_Arthritis', 
    'PASS_Multiple_sclerosis', 
    'UKB_460K.disease_AID_ALL', 
    'UKB_460K.disease_HYPOTHYROIDISM_SELF_REP',
    'UKB_460K.disease_ALLERGY_ECZEMA_DIAGNOSED',
    'UKB_460K.disease_ASTHMA_DIAGNOSED', 
    'UKB_460K.disease_RESPIRATORY_ENT',
    'UKB_460K.body_HEIGHTz',
]
DIC_TRAIT_LIST['nathan'] = DIC_TRAIT_LIST['canogamez'].copy()

### Load data 

In [3]:
# Load single-cell data 
dic_data_raw = {}
dic_data_raw['canogamez'] = dl.load_canogamez(DATA_PATH)
dic_data_raw['canogamez'].obs['cell_ontology_class'] = dic_data_raw['canogamez'].obs['cluster.id'].copy()
dic_data_raw['nathan'] = dl.load_nathan(DATA_PATH)
dic_data_raw['nathan'].obs['cell_ontology_class'] = dic_data_raw['nathan'].obs['cluster_name'].copy()

In [4]:
# Load score 
dic_score = {x:pd.DataFrame() for x in DIC_SCORE_PATH}
dic_cell_batch = {x:{} for x in DIC_SCORE_PATH}
for score in DIC_SCORE_PATH:
    for trait in DIC_TRAIT_LIST[score]:
        file_path = DIC_SCORE_PATH[score]+'/%s.score.gz'%trait
        if os.path.exists(file_path):
            temp_df = pd.read_csv(file_path, sep='\t', index_col=0)
            temp_df.columns = ['%s.%s'%(trait,x) for x in temp_df.columns]
            temp_df['%s.fdr'%trait] = multipletests(temp_df['%s.pval'%trait], method='fdr_bh')[1]
            dic_score[score] = pd.concat([dic_score[score], temp_df], axis=1)
        else:
            print('# missing: %s'%file_path)        

In [5]:
# Load control score 
dic_ctrl_score = {}
for score in DIC_SCORE_PATH:
    if score.endswith('.ref'):
        print('Skip %s'%score)
        continue
    dic_ctrl_score[score] = {}
    for trait in DIC_TRAIT_LIST[score]:
        file_path = DIC_SCORE_PATH[score]+'/%s.full_score.gz'%trait
        if os.path.exists(file_path):
            print('# Loading %s %s'%(score, trait))
            dic_ctrl_score[score][trait] = pd.read_csv(file_path, sep='\t', index_col=0)
        else:
            print('# missing: %s'%file_path)

# Loading canogamez PASS_IBD_deLange2017
# Loading canogamez PASS_CD_deLange2017
# Loading canogamez PASS_UC_deLange2017
# Loading canogamez PASS_Rheumatoid_Arthritis
# Loading canogamez PASS_Multiple_sclerosis
# Loading canogamez UKB_460K.disease_AID_ALL
# Loading canogamez UKB_460K.disease_HYPOTHYROIDISM_SELF_REP
# Loading canogamez UKB_460K.disease_ALLERGY_ECZEMA_DIAGNOSED
# Loading canogamez UKB_460K.disease_ASTHMA_DIAGNOSED
# Loading canogamez UKB_460K.disease_RESPIRATORY_ENT
# Loading canogamez UKB_460K.body_HEIGHTz
# Loading nathan PASS_IBD_deLange2017
# Loading nathan PASS_CD_deLange2017
# Loading nathan PASS_UC_deLange2017
# Loading nathan PASS_Rheumatoid_Arthritis
# Loading nathan PASS_Multiple_sclerosis
# Loading nathan UKB_460K.disease_AID_ALL
# Loading nathan UKB_460K.disease_HYPOTHYROIDISM_SELF_REP
# Loading nathan UKB_460K.disease_ALLERGY_ECZEMA_DIAGNOSED
# Loading nathan UKB_460K.disease_ASTHMA_DIAGNOSED
# Loading nathan UKB_460K.disease_RESPIRATORY_ENT
# Loading nathan

### Cell-type trait association 

In [6]:
# Compute association statistics 
dic_stats = {}
for ds in DS_LIST:
    print(ds)
    score = ds
    celltype_list = sorted(set(dic_data_raw[ds].obs['cell_ontology_class']))
    trait_list = DIC_TRAIT_LIST[ds]
    dic_stats[ds] = pd.DataFrame(index=celltype_list, columns=trait_list, dtype=float)
    for trait in trait_list:
        for ct in celltype_list:
            cell_list = dic_data_raw[ds].obs_names[dic_data_raw[ds].obs['cell_ontology_class']==ct]
            score_q95 = np.quantile(dic_score[score].loc[cell_list, '%s.norm_score'%trait], 0.95)
            temp_df = dic_ctrl_score[score][trait].loc[cell_list].copy()
            temp_df = temp_df[[x for x in temp_df.columns if x.startswith('ctrl_norm_score')]]
            v_ctrl_score_q95 = np.quantile(temp_df, 0.95, axis=0)
            dic_stats[ds].loc[ct,trait] = ((v_ctrl_score_q95>=score_q95).sum()+1) / (v_ctrl_score_q95.shape[0]+1)

canogamez
nathan


In [38]:
fdr_thres = 0.2
def assign_str(x, fdr_thres=fdr_thres):
    if x<0.01:
        return '$<$.01'
    elif x<=fdr_thres:
        return ('%0.3f'%x)[1:]
    else:
        return 'ns'

In [53]:
temp_df = dic_data_raw['canogamez'].obs.groupby(
    ['cluster.id', 'cell.type', 'cytokine.condition']).agg({'nUMI':len})
temp_df = temp_df.loc[temp_df['nUMI']>10]

In [55]:
for ct in ['TCM1 (Th17/iTreg)', 'TEM (Th17/iTreg)', 'TN (Th17/iTreg)', 'nTreg (Th0)']:
    print(ct)
    display(temp_df.loc[ct])

TCM1 (Th17/iTreg)


Unnamed: 0_level_0,Unnamed: 1_level_0,nUMI
cell.type,cytokine.condition,Unnamed: 2_level_1
Memory,Th0,77.0
Memory,Th2,27.0
Memory,Th17,414.0
Memory,iTreg,423.0
Naive,Th0,20.0
Naive,Th17,64.0
Naive,iTreg,14.0


TEM (Th17/iTreg)


Unnamed: 0_level_0,Unnamed: 1_level_0,nUMI
cell.type,cytokine.condition,Unnamed: 2_level_1
Memory,Th0,58.0
Memory,Th2,20.0
Memory,Th17,1168.0
Memory,iTreg,1640.0
Naive,Th17,71.0
Naive,iTreg,37.0


TN (Th17/iTreg)


Unnamed: 0_level_0,Unnamed: 1_level_0,nUMI
cell.type,cytokine.condition,Unnamed: 2_level_1
Memory,Th0,79.0
Memory,Th2,21.0
Memory,Th17,112.0
Memory,iTreg,191.0
Naive,Th0,136.0
Naive,Th2,148.0
Naive,Th17,826.0
Naive,iTreg,1105.0


nTreg (Th0)


Unnamed: 0_level_0,Unnamed: 1_level_0,nUMI
cell.type,cytokine.condition,Unnamed: 2_level_1
Memory,Th0,254.0
Memory,Th2,150.0
Memory,Th17,301.0
Memory,iTreg,238.0


In [39]:
# Results in Cano-Gamez & Soskic et al.
ds = 'canogamez'
col = dic_data_raw[ds].obs.columns[0]
temp_df = dic_data_raw[ds].obs.groupby(['cell_ontology_class']).agg({col:len})
display(temp_df)
celltype_list = sorted(temp_df.index)
print('n_celltype=%d'%len(celltype_list), ', '.join(celltype_list))

# Get p-values
df_stats = dic_stats[ds].loc[celltype_list].copy()

# Multiple testing: applied to each trait across cell types
df_stats_fdr = df_stats.copy()
for col in df_stats_fdr:
    df_stats_fdr[col] = multipletests(df_stats_fdr[col], method='fdr_bh')[1]

# Make table 
df_res = df_stats_fdr.loc[(df_stats_fdr<fdr_thres).sum(axis=1)>0].copy()
df_res.columns = [DF_TRAIT_INFO.loc[x, 'Code'] for x in df_res.columns]
for col in df_res:
    df_res[col] = [assign_str(x) for x in df_res[col]]
df_res.index.name = DIC_INFO[ds]['dname']
print(df_res.to_latex(index_names=True, escape=False))

Unnamed: 0_level_0,cell.type
cell_ontology_class,Unnamed: 1_level_1
HSP.high,1174
IFN.high,1318
Mitotic,1650
TCM1 (Th0),1909
TCM1 (Th17/iTreg),1041
TCM2 (Th0),2596
TCM2 (Th17/iTreg),3860
TCM (resting),1915
TEM (Th0),1297
TEM (Th17/iTreg),2998


n_celltype=22 HSP.high, IFN.high, Mitotic, TCM (resting), TCM1 (Th0), TCM1 (Th17/iTreg), TCM2 (Th0), TCM2 (Th17/iTreg), TEM (Th0), TEM (Th17/iTreg), TEM (resting), TEMRA (Th0), TEMRA (Th17/iTreg), TEMRA (resting), TN (Th0), TN (Th17), TN (Th17/iTreg), TN (Th2), TN (iTreg), TN (resting), nTreg (Th0), nTreg (resting)
\begin{tabular}{llllllllllll}
\toprule
{} &   IBD &    CD &    UC &    RA &    MS &   AIT &    HT & Eczema &   ASM & RR-ENT & Height \\
Cano-Gamez & Soskic et al. &       &       &       &       &       &       &       &        &       &        &        \\
\midrule
TCM1 (Th17/iTreg)          &  .099 &  .081 &    ns &    ns &  .110 &  .127 &  .179 &     ns &    ns &     ns &     ns \\
TCM2 (Th0)                 &    ns &  .081 &    ns &    ns &    ns &    ns &    ns &     ns &    ns &     ns &     ns \\
TEM (Th0)                  &    ns &    ns &    ns &    ns &    ns &    ns &  .092 &     ns &    ns &     ns &     ns \\
TEM (Th17/iTreg)           &  .015 &  .088 &    ns &  

In [60]:
# Results in Nathan et al. (Combining 4 batches)
ds,score = 'nathan','nathan'
col = dic_data_raw[ds].obs.columns[0]
temp_df = dic_data_raw[ds].obs.groupby(['cell_ontology_class']).agg({col:len})
display(temp_df)
celltype_list = list(temp_df.index[temp_df.index!='nan'])
print('n_celltype=%d'%len(celltype_list), ', '.join(celltype_list))
    
# Get p-values
df_stats = dic_stats[ds].loc[celltype_list].copy()

# Multiple testing: applied to each trait across cell types
df_stats_fdr = df_stats.copy()

for col in df_stats_fdr:
    df_stats_fdr[col] = multipletests(df_stats_fdr[col], method='fdr_bh')[1]

# Make table 
df_res = df_stats_fdr.loc[(df_stats_fdr<0.05).sum(axis=1)>0].copy()
df_res.columns = [DF_TRAIT_INFO.loc[x, 'Code'] for x in df_res.columns]
for col in df_res:
    df_res[col] = [assign_str(x) for x in df_res[col]]
df_res.index.name = DIC_INFO[ds]['dname']
df_res.index = [x.replace('+','$^+$') for x in df_res.index]
df_res.index = [x.replace('RORC','\textit{RORC}') for x in df_res.index]
print(df_res.to_latex(index_names=True, escape=False))

Unnamed: 0_level_0,nUMI
cell_ontology_class,Unnamed: 1_level_1
CD4+ CCR4+,17321
CD4+ CCR4+ central,28951
CD4+ CCR4+ICOS+ central,27770
CD4+ CCR5+ cytotoxic,4241
CD4+ CD27+,40431
CD4+ CD27+CD161+,45519
CD4+ CD38+ICOS+ central,4891
CD4+ CD161+ Th1,17731
CD4+ CD161+ Th2,13662
CD4+ CD161+ cytotoxic,9320


n_celltype=29 CD4+ CCR4+, CD4+ CCR4+ central, CD4+ CCR4+ICOS+ central, CD4+ CCR5+ cytotoxic, CD4+ CD27+, CD4+ CD27+CD161+, CD4+ CD38+ICOS+ central, CD4+ CD161+ Th1, CD4+ CD161+ Th2, CD4+ CD161+ cytotoxic, CD4+ HLA-DR+, CD4+ RORC+ Treg, CD4+ Th1, CD4+ Th2, CD4+ Th17, CD4+ Th17/1, CD4+ Treg, CD4+ activated, CD4+ central, CD4+ cytotoxic, CD4+ lncRNA, CD4/8+ PD-1+TIGIT+, CD8+ CXCR3+, CD8+ GZMB+, CD8+ GZMK+, CD8+ activated, CD8+ central, Vd1, Vd2
\begin{tabular}{llllllllllll}
\toprule
{} &   IBD &    CD &    UC &    RA &  MS &   AIT &    HT & Eczema &   ASM & RR-ENT & Height \\
\midrule
CD4$^+$ CD161$^+$ Th2          &  .167 &    ns &    ns &    ns &  ns &    ns &    ns &   .035 &  .122 &   .058 &     ns \\
CD4$^+$ \textit{RORC}$^+$ Treg &  .167 &    ns &  .087 &  .014 &  ns &  .014 &  .014 &   .022 &  .029 &   .029 &     ns \\
CD4$^+$ Th2                    &    ns &    ns &    ns &    ns &  ns &    ns &    ns &   .022 &  .140 &   .124 &     ns \\
CD4$^+$ Th17                   &    ns &  