In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
# Read msigdb immune signatures  
DATA_PATH='/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data'

def parse_term(term):
    study_name = term.split('_')[0]
    reg_dir = term.split('_')[-1]
    gs_name = term.replace('%s_'%study_name,'').replace('_%s'%reg_dir,'') 
    
    if ('_VS_' in gs_name):
        cond1,cond2 = gs_name.split('_VS_')
        cond1 = '_'+cond1+'_'
        cond2 = '_'+cond2+'_'
    else:
        cond1,cond2,cond1_ct,cond2_ct = 'na','na','na','na'
    
    return study_name,cond1,cond2,reg_dir
    
def read_gmt(fname):
    dic_gs = {} 
    with open(fname,'r') as f:
        for line in f:
            line = line.strip().split('\t')
            dic_gs[line[0]] = line
            
    df_gs = pd.DataFrame(index=sorted(dic_gs.keys()),
                         columns=['DNAME','COND1','COND2','DIR','TERM','GENESET','LINK'])
    df_gs['TERM'] = df_gs.index
    df_gs['LINK'] = [dic_gs[x][1] for x in df_gs.index]
    df_gs['GENESET'] = [','.join(dic_gs[x][2:]) for x in df_gs.index]
    for term in df_gs.index:
        df_gs.loc[term,['DNAME','COND1','COND2','DIR']] = parse_term(term)
        
    return df_gs

df_msd=read_gmt(DATA_PATH+'/gene_annotation/msigdb_v7.1_files_to_download_locally/'
                'msigdb_v7.1_GMTs/c7.all.v7.1.symbols.gmt')

In [3]:
# Get T cell gene sets 
def check_tcell(term):
    BLOCK_LIST = ['NK', 'BCELL', 'MAST', 'KO', 'LCMV', 'TREATED', 'IFNA_STIM_90MIN', 'TSST_ACT',
                  'ANTI_CD3CD28_STIM', 'SECONDARY', 'TERTIARY', 'QUATERNARY']
    ALLOW_LIST = ['TCELL', '_T_CELL', 'TREG', '_TH1_', '_TH2_', '_TH17_', 'TCONV', '_TH0_']
    
    return (np.array([x in term for x in ALLOW_LIST]).sum()>0) & (np.array([x in term for x in BLOCK_LIST]).sum()==0)

def get_state(cname):
    if ('NAIVE' in cname) | ('_TH0_' in cname):
        return 'naive'
    elif ('EFF' in cname) | ('ACT' in cname) | ('_STIM' in cname) | ('MEM' in cname):
        str_ = ''
        if ('EFF' in cname) | ('ACT' in cname) | ('_STIM' in cname):
            str_ += 'effector'
        if 'MEM' in cname:
            str_ += 'memory' if str_=='' else '_memory'
        return str_
    elif ('EXHAUSTED' in cname):
        return 'exhausted'
    else:
        return 'other'
    
def get_subtype(cname):
    if ('NAIVE' in cname) | ('_TH0_' in cname):
        return 'naive'
    elif ('TREG' in cname):
        return 'Treg'
    elif ('_TH1_' in cname):
        return 'Th1'
    elif ('_TH2_' in cname):
        return 'Th2'
    elif ('_TH17_' in cname):
        return 'Th17'
    else:
        return 'other'
    
df_msd_t = df_msd.copy()
df_msd_t['TCELL'] = [check_tcell(x) for x in df_msd_t.index]
df_msd_t = df_msd_t.loc[df_msd_t['TCELL']].copy()
df_msd_t['COND1_STATE'] = [get_state(x) for x in df_msd_t['COND1']]
df_msd_t['COND2_STATE'] = [get_state(x) for x in df_msd_t['COND2']]
df_msd_t['COND1_SUBTYPE'] = [get_subtype(x) for x in df_msd_t['COND1']]
df_msd_t['COND2_SUBTYPE'] = [get_subtype(x) for x in df_msd_t['COND2']]


In [4]:
# for ct in ['naive', 'reg', 'th1', 'th2', 'th17']:
#     print(ct, ','.join(set(df_msd_t.loc[df_msd_t['COND1_CT']==ct, 'COND1']) | 
#                        set(df_msd_t.loc[df_msd_t['COND2_CT']==ct, 'COND2'])))
#     print('')
    
# for term in set(df_msd_t.index[~df_msd_t['TCELL']]):
#     if ('BCELL' not in term) & ('NKCELL' not in term) & ('MONOCYTE' not in term) & ('EOSINOPHIL' not in term) \
#         & ('MACROPHAGE' not in term):
#         print(term)

### Generate .gs file 

In [7]:
# Get candidate list 
dic_term_candiate_list = {}

# naive_cd8, memory_cd8, effector_cd8_singer from Singer_Cell_2016
df_singer = pd.read_excel(DATA_PATH+'/gene_annotation/Singer_cell_2016_tcell_mSigDB_tsig.xlsx')
term = 'MSigDB Signatures used as basis for Naive Signature'
dic_term_candiate_list['naive_cd8_singer'] = list(df_singer.loc[~df_singer[term].isna(), term]) #>=10
term = 'MSigDB Signatures used as basis for Memory Signature'
dic_term_candiate_list['memory_cd8_singer'] = list(df_singer.loc[~df_singer[term].isna(), term]) #>=6
dic_term_candiate_list['effector_cd8_singer'] = ['GSE10239_NAIVE_VS_DAY4.5_EFF_CD8_TCELL_DN',
                                                 'GSE10239_MEMORY_VS_DAY4.5_EFF_CD8_TCELL_DN']

# Tcell state signatures 
for term in ['naive_cd4', 'effector_cd4', 'memory_cd4', 'naive_cd8', 'effector_cd8', 'memory_cd8']:
    celltype = term.split('_')[0]
    subtype = 'CD4' if 'cd4' in term else 'CD8'
    ind_select_up = (df_msd_t['COND1_STATE']==celltype) & (df_msd_t['COND2_STATE']!=celltype) &\
                    (df_msd_t['COND2_STATE']!='other') &\
                    (df_msd_t['DIR']=='UP') & np.array([subtype in x for x in df_msd_t.index])
    ind_select_dn = (df_msd_t['COND2_STATE']==celltype) & (df_msd_t['COND1_STATE']!=celltype) &\
                    (df_msd_t['COND1_STATE']!='other') &\
                    (df_msd_t['DIR']=='DN') & np.array([subtype in x for x in df_msd_t.index])
    dic_term_candiate_list[term]= list(df_msd_t.index[ind_select_up | ind_select_dn])

# Subtype signatures 
for term in ['SUBTYPE_Treg', 'SUBTYPE_Th1', 'SUBTYPE_Th2', 'SUBTYPE_Th17']:
    subtype = term.split('_')[1]
    ind_select_up = (df_msd_t['COND1_SUBTYPE']==subtype) & (df_msd_t['COND2_SUBTYPE']!=subtype) & \
                    (df_msd_t['COND2_SUBTYPE']!='other') & (df_msd_t['DIR']=='UP')
    ind_select_dn = (df_msd_t['COND2_SUBTYPE']==subtype) & (df_msd_t['COND1_SUBTYPE']!=subtype) & \
                    (df_msd_t['COND1_SUBTYPE']!='other') & (df_msd_t['DIR']=='DN')
    dic_term_candiate_list[term]= list(df_msd_t.index[ind_select_up | ind_select_dn])

In [35]:
# MSigDB term list
dic_term_list = {}

# T cell state signatures
dic_term_list['naive_cd4'] = dic_term_candiate_list['naive_cd4'].copy()
dic_term_list['memory_cd4'] = dic_term_candiate_list['memory_cd4'].copy()
BLOCK_LIST = ['GSE11057_EFF_MEM_VS_CENT_MEM_CD4_TCELL_DN', 
              'GSE26928_EFF_MEM_VS_CENTR_MEM_CD4_TCELL_DN',
              'GSE3982_EFF_MEMORY_VS_CENT_MEMORY_CD4_TCELL_DN']
dic_term_list['memory_cd4'] = [x for x in dic_term_list['memory_cd4'] if x not in BLOCK_LIST]
dic_term_list['effector_cd4'] = dic_term_candiate_list['effector_cd4'].copy()
dic_term_list['naive_cd8'] = dic_term_candiate_list['naive_cd8_singer'].copy()
dic_term_list['memory_cd8'] = dic_term_candiate_list['memory_cd8_singer'].copy()
dic_term_list['effector_cd8'] = dic_term_candiate_list['effector_cd8'].copy()

# T cell subtype signatures 
dic_term_list['SUBTYPE_Treg'] = dic_term_candiate_list['SUBTYPE_Treg'].copy()
dic_term_list['SUBTYPE_Th1'] = dic_term_candiate_list['SUBTYPE_Th1'].copy()
dic_term_list['SUBTYPE_Th2'] = dic_term_candiate_list['SUBTYPE_Th2'].copy()
dic_term_list['SUBTYPE_Th17'] = dic_term_candiate_list['SUBTYPE_Th17'].copy()

# Write table 
temp_dic = dic_term_list.copy()
n_max = 0 
for term in temp_dic:
    n_max = max(n_max, len(temp_dic[term]))
for term in temp_dic:
    temp_dic[term] = temp_dic[term] + ['']*(n_max - len(temp_dic[term]))
temp_df = pd.DataFrame(data=temp_dic)
temp_df.to_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/supp_table/tcell_msigdb.tsv',
               sep='\t', index=False)

In [36]:
# Get GENESET
dic_sig_df = {}
df_sig = pd.DataFrame(columns=['TRAIT', 'GENESET'])
for term in dic_term_list.keys():
    
    temp_df_msd = df_msd.loc[dic_term_list[term], ['DNAME', 'GENESET']].copy()
    temp_df_msd = temp_df_msd.groupby('DNAME').agg({'GENESET':','.join})

    temp_ct = Counter()
    for gs in temp_df_msd['GENESET']:
        temp_ct.update(set(gs.split(',')))
    
    temp_df = pd.DataFrame(columns=['GENE', 'CT'], data = temp_ct.most_common()) 
    temp_df = temp_df.loc[temp_df['CT']>1].copy()
    
    gene_list = []
    for ct in sorted(set(temp_df['CT']))[::-1]:
        gene_list += list(temp_df.loc[temp_df['CT']==ct, 'GENE'])
        if len(gene_list)>100:
            break
    df_sig.loc[term] = [term, ','.join(gene_list)]
    
for term in df_sig.index:
    print('%-10s n_gene=%d'%(term, len(df_sig.loc[term, 'GENESET'].split(','))))
        
df_sig.loc[[x for x in df_sig.index if (x.startswith('SUBTYPE_') is False)]].\
    to_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gs_file/tcell_signature.gs',
           sep='\t', index=False)
df_sig.to_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gs_file/tcell_signature_full.gs',
              sep='\t', index=False)

naive_cd4  n_gene=303
memory_cd4 n_gene=62
effector_cd4 n_gene=259
naive_cd8  n_gene=185
memory_cd8 n_gene=170
effector_cd8 n_gene=111
SUBTYPE_Treg n_gene=205
SUBTYPE_Th1 n_gene=332
SUBTYPE_Th2 n_gene=102
SUBTYPE_Th17 n_gene=513


In [28]:
# Compare to reference 
df_sig_ref = pd.read_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gs_file/tcell_signature.gs.073121',
                         sep='\t', index_col=0)
for gs_name in set(df_sig_ref.index)&set(df_sig.index):
    gs = set(df_sig.loc[gs_name, 'GENESET'].split(','))
    gs_ref = set(df_sig_ref.loc[gs_name, 'GENESET'].split(','))
    print('%s, gs=%d, gs_ref=%d, overlap=%d'%(gs_name, len(gs), len(gs_ref), len(gs&gs_ref)))
    
df_sig_ref = pd.read_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gs_file/tcell_subtype.gs',
                         sep='\t', index_col=0)
for gs_name in set(df_sig_ref.index)&set(df_sig.index):
    gs = set(df_sig.loc[gs_name, 'GENESET'].split(','))
    gs_ref = set(df_sig_ref.loc[gs_name, 'GENESET'].split(','))
    print('%s, gs=%d, gs_ref=%d, overlap=%d'%(gs_name, len(gs), len(gs_ref), len(gs&gs_ref)))

naive_cd4, gs=303, gs_ref=100, overlap=73
effector_cd8, gs=111, gs_ref=100, overlap=91
naive_cd8, gs=185, gs_ref=100, overlap=94
effector_cd4, gs=259, gs_ref=64, overlap=64
memory_cd4, gs=62, gs_ref=100, overlap=55
memory_cd8, gs=170, gs_ref=100, overlap=96


In [19]:
df_sig_ref

Unnamed: 0_level_0,GENESET
TRAIT,Unnamed: 1_level_1
Treg,"KLHL7,CYB5A,TARDBP,AGAP3,AMFR,UBXN1,SNRK,NTSR2..."
