In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy as sp
from statsmodels.stats.multitest import multipletests
import os
from os.path import join
import time

from anndata import read_h5ad

# scTRS tools
import scTRS.util as util
import scTRS.data_loader as dl
import scTRS.method as md

# autoreload
%load_ext autoreload
%autoreload 2

### Make GS file 

In [2]:
# GENE_SCORE_PATH='/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/trs_gene_scores/gwas_max_abs_z'

# # Top 500 GWAS genes 
# df_gs = pd.DataFrame(columns=['TRAIT', 'GENESET'])
# for file in os.listdir(GENE_SCORE_PATH):
#     trait=file.replace('.csv','')
#     temp_df = pd.read_csv(join(GENE_SCORE_PATH, file), sep=',')
#     temp_df = temp_df.loc[~temp_df['MAX_ABS_Z'].isna()]
#     temp_df = temp_df.sort_values(by='MAX_ABS_Z', ascending=False)
#     df_gs.loc[trait] = [trait,','.join(temp_df['GENE'][0:500])]
# df_gs.to_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/gs_file/gwas_max_abs_z.top500.gs',
#              sep='\t', index=False)

# BATCH_SIZE=5
# for i_batch in range(np.ceil(df_gs.shape[0]/BATCH_SIZE).astype(int)):
#     df_gs.iloc[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE].to_csv(
#         '/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/gs_file/gwas_max_abs_z.top500.batch%d.gs'%i_batch,
#                 sep='\t', index=False)

### Load data 

In [2]:
H5AD_FILE='/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/tabula_muris_senis/'\
            'tabula-muris-senis-facs-official-raw-obj.h5ad'
GS_FILE='/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/gs_file/gwas_max_abs_z.top500.gs'
GS_FILE='/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/gs_file_null/null.all_size500.batch0.gs'
FLAG_FILTER=True
FLAG_RAW_COUNT=True
FLAG_MOUSE_DATA=True
FLAG_RETURN_CTRL_RAW_SCORE=False
FLAG_RETURN_CTRL_NORM_SCORE=False
# OUT_FOLDER='/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/score_file/score.gwas_max_abs_z.top500'

In [3]:
# Load .h5ad file 
adata = read_h5ad(H5AD_FILE)
if FLAG_FILTER:
    sc.pp.filter_cells(adata, min_genes=250)
    sc.pp.filter_genes(adata, min_cells=10)
if FLAG_RAW_COUNT:
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.log1p(adata)

In [4]:
# Load .gs file 
df_gs = pd.read_csv(GS_FILE, sep='\t')
df_gs.index = df_gs['TRAIT']

# Covert to mouse data 
if FLAG_MOUSE_DATA:
    # Load homolog file 
    df_hom = pd.read_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/gene_annotation/'
                         'mouse_human_homologs.txt', sep='\t')
    dic_map = {x:y for x,y in zip(df_hom['HUMAN_GENE_SYM'], df_hom['MOUSE_GENE_SYM'])}
    for trait in df_gs.index:
        human_gene_list = df_gs.loc[trait, 'GENESET'].split(',')
        mouse_gene_list = [dic_map[x] for x in set(human_gene_list)&set(dic_map.keys())]
        df_gs.loc[trait, 'GENESET'] = ','.join(mouse_gene_list)

In [5]:
df_gs

Unnamed: 0_level_0,TRAIT,GENESET
TRAIT,Unnamed: 1_level_1,Unnamed: 2_level_1
all_size500_rep0,all_size500_rep0,F5
all_size500_rep1,all_size500_rep1,
all_size500_rep2,all_size500_rep2,
all_size500_rep3,all_size500_rep3,F9
all_size500_rep4,all_size500_rep4,
all_size500_rep5,all_size500_rep5,
all_size500_rep6,all_size500_rep6,F8
all_size500_rep7,all_size500_rep7,F3
all_size500_rep8,all_size500_rep8,
all_size500_rep9,all_size500_rep9,


In [17]:
# Compute score 
for trait in df_gs.index:
    gene_list = df_gs.loc[trait,'GENESET'].split(',')
    df_res = md.score_cell(adata, gene_list, n_ctrl=5, 
                           return_ctrl_raw_score=FLAG_RETURN_CTRL_RAW_SCORE, 
                           return_ctrl_norm_score=FLAG_RETURN_CTRL_NORM_SCORE,
                           verbose=False)
    df_res.to_csv(join(OUT_FOLDER, '%s.score.gz'%trait), sep='\t', index=True, compression='gzip')
    break

In [22]:
df_res.to_csv(join(OUT_FOLDER, '%s.score.gz'%trait), sep='\t', index=True, compression='gzip')

In [36]:
# Setup file paths
DATA_PATH='/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data'
GENE_SCORE_PATH=join(DATA_PATH, 'trs_gene_scores/gwas_max_abs_z')
GENE_ANNOT_PATH=join(DATA_PATH, 'gene_annotation')


In [32]:
# Count data (used for scoring genes)
score_dataset = dl.load_tms_ct(DATA_PATH, data_name='facs')
print('# TMS facs count data: n_cell=%d, n_gene=%d'
      %(score_dataset.shape[0], score_dataset.shape[1]))

Trying to set attribute `.obs` of view, copying.


# TMS facs count data: n_cell=110096, n_gene=22966


In [37]:
# Trait gene sets 

df_gs = pd.DataFrame(columns=['TRAIT', 'GENESET'])
for file in os.listdir(GENE_SCORE_PATH):
    df = pd.read_csv(join(GENE_SCORE_PATH, file), sep=',')
    dic_gs[]

In [38]:
df

Unnamed: 0,GENE,MAX_ABS_Z
0,OR4F5,
1,LOC100996442,
2,OR4F29,
3,OR4F16,
4,SAMD11,
...,...,...
18570,ZMYND19,1.142
18571,ARRDC1,1.459
18572,EHMT1,2.236
18573,CACNA1B,1.958


In [23]:
(temp_df1.loc[gene_list]['hsapiens'].values!=temp_df.loc[gene_list]['HUMAN_GENE_SYM'].values).sum()

  """Entry point for launching an IPython kernel.


AttributeError: 'bool' object has no attribute 'sum'

In [26]:
temp_df1.loc[gene_list]['hsapiens'].values.shape

(15158,)

In [27]:
temp_df.loc[gene_list]['HUMAN_GENE_SYM'].values.shape

(14975,)

In [28]:
len(gene_list)

14975