-
Notifications
You must be signed in to change notification settings - Fork 11
/
scdrs
740 lines (666 loc) · 28.7 KB
/
scdrs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
#!/usr/bin/env python
import fire
import pandas as pd
import numpy as np
from statsmodels.stats.multitest import multipletests
import scdrs
from typing import Dict, List
import scanpy as sc
import os
import time
def get_cli_head():
MASTHEAD = "******************************************************************************\n"
MASTHEAD += "* Single-cell disease relevance score (scDRS)\n"
MASTHEAD += "* Version %s\n" % scdrs.__version__
MASTHEAD += "* Martin Jinye Zhang and Kangcheng Hou\n"
MASTHEAD += "* HSPH / Broad Institute / UCLA\n"
MASTHEAD += "* MIT License\n"
MASTHEAD += "******************************************************************************\n"
return MASTHEAD
def compute_score(
h5ad_file: str,
h5ad_species: str,
gs_file: str,
gs_species: str,
out_folder: str,
cov_file: str = None,
ctrl_match_opt: str = "mean_var",
weight_opt: str = "vs",
adj_prop: str = None,
flag_filter_data: bool = True,
flag_raw_count: bool = True,
n_ctrl: int = 1000,
flag_return_ctrl_raw_score: bool = False,
flag_return_ctrl_norm_score: bool = True,
):
"""
Compute scDRS scores. Generate `.score.gz` and `.full_score.gz` files for each trait.
Parameters
----------
h5ad_file : str
Single-cell .h5ad file.
h5ad_species : str
Species of h5ad_file. One of "hsapiens", "human", "mmusculus", or "mouse".
gs_file : str
scDRS gene set .gs file.
gs_species : str
Species of the gs file. One of "hsapiens", "human", "mmusculus", or "mouse".
out_folder : str
Output folder. Save scDRS score files as `<out_folder>/<trait>.score.gz` and
scDRS full score files as `<out_folder>/<trait>.full_score.gz`, where trait
identifier <trait> is from gs_file file.
cov_file : str, optional
scDRS covariate .cov file. Default is None.
ctrl_match_opt : str, optional
Control matching option. One of "mean" and "mean_var". Default is "mean_var".
weight_opt : str, optional
Weighting option. One of "uniform", "vs", "inv_std", and "od". Default is "vs".
adj_prop : str, optional
Cell group annotation (e.g., cell type) in `adata.obs.columns` used for adjusting
for cell group proportions. Cells are inversely weighted by the corresponding
group size. Default is None.
flag_filter_data : bool, optional
If to apply minimal cell and gene filtering to h5ad_file. Default is True.
flag_raw_count : bool, optional
If to apply size-factor normalization and log1p-transformation to h5ad_file.
Default is True.
n_ctrl : int, optional
Number of control gene sets. Default is 1000.
flag_return_ctrl_raw_score : bool, optional
If to return raw control scores. Default is False.
flag_return_ctrl_norm_score : bool, optional
If to return normalized control scores. Default is True.
Examples
--------
scdrs compute-score \
--h5ad-file <h5ad_file>\
--h5ad-species mouse\
--gs-file <gs_file>\
--gs-species human\
--out-folder <out_folder>\
--cov-file <cov_file>\
--flag-filter-data True\
--flag-raw-count True\
--n-ctrl 1000\
--flag-return-ctrl-raw-score False\
--flag-return-ctrl-norm-score True
"""
sys_start_time = time.time()
###########################################################################################
###### Parse Options ######
###########################################################################################
H5AD_FILE = h5ad_file
H5AD_SPECIES = h5ad_species
COV_FILE = cov_file
GS_FILE = gs_file
GS_SPECIES = gs_species
CTRL_MATCH_OPT = ctrl_match_opt
WEIGHT_OPT = weight_opt
ADJ_PROP = adj_prop
FLAG_FILTER_DATA = flag_filter_data
FLAG_RAW_COUNT = flag_raw_count
N_CTRL = n_ctrl
FLAG_RETURN_CTRL_RAW_SCORE = flag_return_ctrl_raw_score
FLAG_RETURN_CTRL_NORM_SCORE = flag_return_ctrl_norm_score
OUT_FOLDER = out_folder
if H5AD_SPECIES != GS_SPECIES:
H5AD_SPECIES = scdrs.util.convert_species_name(H5AD_SPECIES)
GS_SPECIES = scdrs.util.convert_species_name(GS_SPECIES)
header = get_cli_head()
header += "Call: scdrs compute-score \\\n"
header += "--h5ad-file %s \\\n" % H5AD_FILE
header += "--h5ad-species %s \\\n" % H5AD_SPECIES
header += "--cov-file %s \\\n" % COV_FILE
header += "--gs-file %s \\\n" % GS_FILE
header += "--gs-species %s \\\n" % GS_SPECIES
header += "--ctrl-match-opt %s \\\n" % CTRL_MATCH_OPT
header += "--weight-opt %s \\\n" % WEIGHT_OPT
header += "--adj-prop %s \\\n" % ADJ_PROP
header += "--flag-filter-data %s \\\n" % FLAG_FILTER_DATA
header += "--flag-raw-count %s \\\n" % FLAG_RAW_COUNT
header += "--n-ctrl %d \\\n" % N_CTRL
header += "--flag-return-ctrl-raw-score %s \\\n" % FLAG_RETURN_CTRL_RAW_SCORE
header += "--flag-return-ctrl-norm-score %s \\\n" % FLAG_RETURN_CTRL_NORM_SCORE
header += "--out-folder %s\n" % OUT_FOLDER
print(header)
# Check options
if H5AD_SPECIES != GS_SPECIES:
if H5AD_SPECIES not in ["mmusculus", "hsapiens"]:
raise ValueError(
"--h5ad-species needs to be one of [mmusculus, hsapiens] "
"unless --h5ad-species==--gs-species"
)
if GS_SPECIES not in ["mmusculus", "hsapiens"]:
raise ValueError(
"--gs-species needs to be one of [mmusculus, hsapiens] "
"unless --h5ad-species==--gs-species"
)
if CTRL_MATCH_OPT not in ["mean", "mean_var"]:
raise ValueError("--ctrl-match-opt needs to be one of [mean, mean_var]")
if WEIGHT_OPT not in ["uniform", "vs", "inv_std", "od"]:
raise ValueError("--weight-opt needs to be one of [uniform, vs, inv_std, od]")
###########################################################################################
###### Load data ######
###########################################################################################
print("Loading data:")
# Load .h5ad file
adata = scdrs.util.load_h5ad(
H5AD_FILE, flag_filter_data=FLAG_FILTER_DATA, flag_raw_count=FLAG_RAW_COUNT
)
print(
"--h5ad-file loaded: n_cell=%d, n_gene=%d (sys_time=%0.1fs)"
% (adata.shape[0], adata.shape[1], time.time() - sys_start_time)
)
print("First 3 cells: %s" % (str(list(adata.obs_names[:3]))))
print("First 5 genes: %s" % (str(list(adata.var_names[:5]))))
# Load .cov file
if COV_FILE is not None:
df_cov = pd.read_csv(COV_FILE, sep="\t", index_col=0)
print(
"--cov-file loaded: covariates=%s (sys_time=%0.1fs)"
% (str(list(df_cov.columns)), time.time() - sys_start_time)
)
for col in df_cov.columns:
print(
"First 5 values for '%s': %s" % (col, str(list(df_cov[col].values[:5])))
)
else:
df_cov = None
# Load .gs file
dict_gs = scdrs.util.load_gs(
GS_FILE,
src_species=GS_SPECIES,
dst_species=H5AD_SPECIES,
to_intersect=adata.var_names,
)
print(
"--gs-file loaded: n_trait=%d (sys_time=%0.1fs)"
% (len(dict_gs), time.time() - sys_start_time)
)
print("Print info for first 3 traits:")
for gs in list(dict_gs)[:3]:
print(
"First 3 elements for '%s': %s, %s"
% (gs, str(dict_gs[gs][0][:3]), str(dict_gs[gs][1][:3]))
)
print("")
###########################################################################################
###### Computation ######
###########################################################################################
# Preprocess
print("Preprocessing:")
scdrs.preprocess(
adata, cov=df_cov, adj_prop=ADJ_PROP, n_mean_bin=20, n_var_bin=20, copy=False
)
print("")
# Compute score
print("Computing scDRS score:")
for trait in dict_gs:
gene_list, gene_weights = dict_gs[trait]
if len(gene_list) < 10:
print(
"trait=%s: skipped due to small size (n_gene=%d, sys_time=%0.1fs)"
% (trait, len(gene_list), time.time() - sys_start_time)
)
continue
df_res = scdrs.score_cell(
adata,
gene_list,
gene_weight=gene_weights,
ctrl_match_key=CTRL_MATCH_OPT,
n_ctrl=N_CTRL,
weight_opt=WEIGHT_OPT,
return_ctrl_raw_score=FLAG_RETURN_CTRL_RAW_SCORE,
return_ctrl_norm_score=FLAG_RETURN_CTRL_NORM_SCORE,
verbose=False,
)
df_res.iloc[:, 0:6].to_csv(
os.path.join(OUT_FOLDER, "%s.score.gz" % trait),
sep="\t",
index=True,
compression="gzip",
)
if FLAG_RETURN_CTRL_RAW_SCORE | FLAG_RETURN_CTRL_NORM_SCORE:
df_res.to_csv(
os.path.join(OUT_FOLDER, "%s.full_score.gz" % trait),
sep="\t",
index=True,
compression="gzip",
)
v_fdr = multipletests(df_res["pval"].values, method="fdr_bh")[1]
n_rej_01 = (v_fdr < 0.1).sum()
n_rej_02 = (v_fdr < 0.2).sum()
print(
"Trait=%s, n_gene=%d: %d/%d FDR<0.1 cells, %d/%d FDR<0.2 cells (sys_time=%0.1fs)"
% (
trait,
len(gene_list),
n_rej_01,
df_res.shape[0],
n_rej_02,
df_res.shape[0],
time.time() - sys_start_time,
)
)
return
def munge_gs(
out_file: str,
pval_file: str = None,
zscore_file: str = None,
weight: str = "zscore",
fdr: float = None,
fwer: float = None,
n_min: int = 100,
n_max: int = 1000,
):
"""
Convert a .tsv GWAS gene statistics file to an scDRS .gs file.
Input file (`pval_file` or `zscore_file`) format:
- .tsv file
- First column corresponds to gene names, preferably with header 'GENE'.
- Each of other columns corresponds to a disease with header being the disease name
and the values being either the gene-level p-values (`pval_file`) or
gene-level one-sided z-scores (`zscore_file`).
For example, `pval_file` looks like
GENE BMI HEIGHT
OR4F5 0.001 0.01
DAZ3 0.01 0.001
...
Main steps.
1. Read result from a p-value or a z-score file
2. Select a subset of disease genes for each disease:
- If both `fdr` and `fwer` are None, select the top `n_max` genes.
- If `fdr` is not None, select genes based on FDR and cap between `n_min` and `n_max`.
- If `fwer` is not None, select genes based on FWER and cap between `n_min` and `n_max`.
3. Assign gene weights based on `--weight`.
4. Write the .gs file to `out_file`.
Parameters
----------
out_file : str
Path to the output .gs file.
pval_file : str, optional
P-value file. A .tsv file with first column corresponding to genes and other columns
corresponding to p-values of traits (one trait per column). One of `pval_file` and
`zscore_file` is expected. Default is None.
zscore_file : str, optional
Z-score file. A .tsv file with first column corresponding to genes and other columns
corresponding to z-scores of traits (one trait per column). One of `pval_file` and
`zscore_file` is expected. Default is None.
weight : str, optional
Gene weight options. One of "zscore" or "uniform". Default is "zscore".
fdr : float, optional
FDR threshold. Default is None. E.g., `--fdr 0.05`
fwer : float, optional
FWER threshold. Default is None. E.g., `--fwer 0.05`
n_min : int, optional
Minimum number of genes for each gene set. Default is 100. E.g., `--n-min 100`
n_max : int, optional
Maximum number of genes for each gene set. Default is 1000. E.g., `--n-min 1000`
Examples
--------
scdrs munge-gs \
--out_file <out> \
--pval-file <pval_file> \
--weight <weight> \
--n-max <n_max>
"""
sys_start_time = time.time()
###########################################################################################
###### Parse Options ######
###########################################################################################
header = get_cli_head()
header += "Call: scdrs munge-gs \\\n"
assert (
sum([(pval_file is not None), (zscore_file is not None)]) == 1
), "One of --pval-file and --zscore-file is expected."
if pval_file is not None:
header += "--pval-file %s \\\n" % pval_file
if zscore_file is not None:
header += "--zscore-file %s \\\n" % zscore_file
assert weight in [
"zscore",
"uniform",
], "--weight needs to be one of 'zscore', 'uniform'"
header += "--weight %s \\\n" % weight
assert (fdr is None) or (
fwer is None
), "At most one of --fdr and --fwer is allowed."
if fdr is not None:
header += "--fdr %s \\\n" % fdr
if fwer is not None:
header += "--fwer %s \\\n" % fdr
n_min = min(n_min, n_max)
header += "--n-min %s \\\n" % n_min
header += "--n-max %s \\\n" % n_max
assert out_file is not None, "--out-file is expected."
header += "--out-file %s\n" % out_file
print(header)
###########################################################################################
###### Load data ######
###########################################################################################
if pval_file is not None:
df_pval = pd.read_csv(pval_file, sep="\t", index_col=0)
print(
"--pval-file loaded: n_gene=%d, n_trait=%d (sys_time=%0.1fs)"
% (df_pval.shape[0], df_pval.shape[1], time.time() - sys_start_time)
)
print("Print info for the first 3 traits and first 10 genes")
print("%-20s" % "Traits", str(list(df_pval.columns[:3])))
for gene in df_pval.index[:10]:
print("%-20s" % gene, str(list(df_pval.loc[gene, df_pval.columns[:3]])))
# Check index is unique
if not df_pval.index.is_unique:
raise ValueError(
"Gene names in the provided p-value files are not unique: ",
",".join(df_pval.index[df_pval.index.duplicated()]),
". Please make sure the gene names are unique beforehand.",
)
# Check range
max_ = df_pval.max(skipna=True).max()
min_ = df_pval.min(skipna=True).min()
if (min_ > 0) and (max_ < 1):
print("--pval-file values are all between 0 and 1. Seems fine.")
else:
print("Warning: pval-file are not all between 0 and 1.")
print("")
if zscore_file is not None:
df_zscore = pd.read_csv(zscore_file, sep="\t", index_col=0)
print(
"--zscore-file loaded: n_gene=%d, n_trait=%d (sys_time=%0.1fs)"
% (df_zscore.shape[0], df_zscore.shape[1], time.time() - sys_start_time)
)
print("Print info for the first 3 traits and first 10 genes")
print("%-20s" % "Traits", str(list(df_zscore.columns[:3])))
for gene in df_zscore.index[:10]:
print("%-20s" % gene, str(list(df_zscore.loc[gene, df_zscore.columns[:3]])))
# Check index is unique
if not df_zscore.index.is_unique:
raise ValueError(
"Gene names in the provided z-score files are not unique: ",
",".join(df_zscore.index[df_zscore.index.duplicated()]),
". Please make sure the gene names are unique beforehand.",
)
# Check range
max_ = df_zscore.max(skipna=True).max()
min_ = df_zscore.min(skipna=True).min()
if (min_ < 0) and (max_ > 1):
print("--zscore-file have values above 1 or below 0. Seems fine.")
else:
print("Warning: zscore-file values are all between 0 and 1.")
df_pval = df_zscore.copy()
for col in df_pval:
df_pval[col] = scdrs.util.zsc2pval(df_zscore[col])
print("")
trait_list = sorted(df_pval.columns)
###########################################################################################
###### Computation ######
###########################################################################################
dict_gene_weights = {"TRAIT": [], "GENESET": []}
for trait in trait_list:
# Drop missing p-values
df_trait_pval = df_pval[[trait]].copy()
df_trait_pval.dropna(axis=0, inplace=True)
# Determine number of disease genes
if fdr is not None:
n_gene = multipletests(
df_trait_pval[trait].values, alpha=fdr, method="fdr_bh"
)[0].sum()
elif fwer is not None:
n_gene = multipletests(
df_trait_pval[trait].values, alpha=fwer, method="bonferroni"
)[0].sum()
else:
# If both are None, select top n_max genes
n_gene = n_max
# Restrict `n_gene` to be between `n_min` and `n_max`
n_gene = min(n_gene, n_max)
n_gene = max(n_gene, n_min)
# Select `n_gene` genes with the smallest p-values
df_trait_pval.sort_values(trait, ascending=True, inplace=True)
df_trait_pval = df_trait_pval.iloc[:n_gene]
gene_list = df_trait_pval.index
gene_pvals = df_trait_pval[trait].values.clip(min=1e-100)
if weight == "zscore":
gene_weights = scdrs.util.pval2zsc(gene_pvals).clip(max=10)
if weight == "uniform":
gene_weights = np.ones(len(gene_list))
dict_gene_weights["TRAIT"].append(trait)
dict_gene_weights["GENESET"].append(
",".join([f"{g}:{w:.5g}" for g, w in zip(gene_list, gene_weights)])
)
df_gs = pd.DataFrame(dict_gene_weights)
df_gs.to_csv(out_file, sep="\t", index=False)
print(
"Finish munging %d gene sets (sys_time=%0.1fs)"
% (df_gs.shape[0], time.time() - sys_start_time)
)
return
def perform_downstream(
h5ad_file: str,
score_file: str,
out_folder: str,
group_analysis: str = None,
corr_analysis: str = None,
gene_analysis: str = None,
flag_filter_data: bool = True,
flag_raw_count: bool = True,
knn_n_neighbors: int = 15,
knn_n_pcs: int = 20,
):
"""
Perform scDRS downstream analyses based on precomputed scDRS `.full_score.gz` files.
--group-analysis
For a given cell-group-level annotation (e.g., tissue or cell type), assess cell
group-disease association (control-score-based MC tests using 5% quantile) and within-cell
group disease-association heterogeneity (control-score-based MC tests using Geary's C).
--corr-analysis
For a given individual cell-level variable (e.g., T cell effectorness gradient), assess
association between disease and the individual cell-level variable (control-score-based
MC tests using Pearson's correlation).
--gene-analysis:
Compute Pearson's correlation between expression of each gene and the scDRS disease score.
Parameters
----------
h5ad_file : str
Single-cell .h5ad file.
score_file : str
scDRS .full_score.gz file. Use “@” to specify multiple file names, e.g.,
<score_folder>/@.full_score.gz. However, <score_folder> should not contain “@”.
out_folder : str
Output folder.
group_analysis : str, optional
Comma-seperated column names for cell group annotations in adata.obs.columns,
e.g., cell types or tissues. Results are saved as <out_folder>/<trait>.scdrs_group.<annot>,
one file per annotation. Default is None.
corr_analysis : str, optional
Comma-seperated column names for continuous annotations in adata.obs.columns,
e.g., T cell effectorness gradient. Results are saved as
<out_folder>/<trait>.scdrs_cell_corr for all variables. Default is None.
gene_analysis : str, optional
Flag to perform the gene prioritization by correlating gene expression with scDRS scores.
Specifying --gene-analysis without any arguments. Results are saved as
<out_folder>/<trait>.scdrs_gene for all genes. Default is None.
flag_filter_data : bool, optional
If to apply minimal cell and gene filtering to h5ad_file. Default is True.
flag_raw_count : bool, optional
If to apply size-factor normalization and log1p-transformation to h5ad_file.
Default is True.
knn_n_neighbors : int, optional
`n_neighbors` for computing KNN graph using `sc.pp.neighbors`.
Default is 15 (consistent with the TMS pipeline).
knn_n_pcs : int, optional
`n_pcs` for computing KNN graph using `sc.pp.neighbors`.
Default is 20 (consistent with the TMS pipeline).
Examples
--------
scdrs perform-downstream \
--h5ad-file <h5ad_file> \
--score-file <full_score_file> \
--out-folder <out_folder> \
--group-analysis cell_type \
--corr-analysis causal_variable,non_causal_variable,covariate \
--gene-analysis \
--flag-filter-data True \
--flag-raw-count True
"""
sys_start_time = time.time()
###########################################################################################
###### Parse Options ######
###########################################################################################
header = get_cli_head()
header += "Call: scdrs perform-downstream \\\n"
assert h5ad_file is not None, "--h5ad-file is expected."
header += "--h5ad-file %s \\\n" % h5ad_file
assert score_file is not None, "--score-file is expected."
header += "--score-file %s \\\n" % score_file
assert out_folder is not None, "--out-folder is expected."
header += "--out-folder %s \\\n" % out_folder
assert (
sum(
[
group_analysis is not None,
corr_analysis is not None,
gene_analysis is not None,
]
)
> 0
), "Expect at least one of `--group_analysis`, `--corr_analysis`, `--gene_analyis`"
if group_analysis is not None:
# Determine if string or list_like using duck-typing
input_type = scdrs.util.str_or_list_like(group_analysis)
if input_type == "str":
group_analysis = group_analysis.split(",")
elif input_type == "list_like":
group_analysis = list(group_analysis)
else:
raise ValueError("Expect --group_analysis to be a comma-separated string.")
# group_analysis header
header += "--group-analysis %s\\\n" % ",".join(group_analysis)
if corr_analysis is not None:
# Determine if string or list_like using duck-typing
input_type = scdrs.util.str_or_list_like(corr_analysis)
if input_type == "str":
corr_analysis = corr_analysis.split(",")
elif input_type == "list_like":
corr_analysis = list(corr_analysis)
else:
raise ValueError("Expect --corr_analysis to be a comma-separated string.")
# corr_analysis header
header += "--corr-analysis %s \\\n" % ",".join(corr_analysis)
if gene_analysis is not None:
header += "--gene-analysis %s \\\n" % gene_analysis
header += "--flag-filter-data %s \\\n" % flag_filter_data
header += "--flag-raw-count %s \\\n" % flag_raw_count
header += "--knn-n-neighbors %s \\\n" % knn_n_neighbors
header += "--knn-n-pcs %s\n" % knn_n_pcs
print(header)
###########################################################################################
###### Load data ######
###########################################################################################
adata = scdrs.util.load_h5ad(
h5ad_file=h5ad_file,
flag_filter_data=flag_filter_data,
flag_raw_count=flag_raw_count,
)
print(
"--h5ad-file loaded: n_cell=%d, n_gene=%d (sys_time=%0.1fs)"
% (adata.shape[0], adata.shape[1], time.time() - sys_start_time)
)
print("First 3 cells: %s" % (str(list(adata.obs_names[:3]))))
print("First 5 genes: %s" % (str(list(adata.var_names[:5]))))
dict_score = scdrs.util.load_scdrs_score(score_file)
print(
"--score-file loaded: n_trait=%d (sys_time=%0.1fs)"
% (len(dict_score), time.time() - sys_start_time)
)
assert len(dict_score) > 0, "Expect at least 1 score file."
print("Print info for the first trait, first 3 cells, and first 8 columns:")
trait = list(dict_score)[0]
print("%-20s" % "Trait", trait)
print("%-20s" % "Cells", str(list(dict_score[trait].index[:3])))
for col in dict_score[trait].columns[:8]:
print("%-20s" % col, str(dict_score[trait][col].values[:3]))
# Check cols in `group_analysis` are also in `adata.obs`
if group_analysis is not None:
assert (
len(set(group_analysis) - set(adata.obs)) == 0
), "Missing --group-analysis variables from `adata.obs.columns`."
# Check cols in `corr_analysis` are also in `adata.obs`
if corr_analysis is not None:
assert (
len(set(corr_analysis) - set(adata.obs)) == 0
), "Missing --corr-analysis variables from `adata.obs.columns`."
print("")
###########################################################################################
###### Computation ######
###########################################################################################
if group_analysis is not None:
print("Performing scDRS group-analysis")
# Compute KNN if not present in `adata`
if "connectivities" not in adata.obsp:
if knn_n_pcs > (min(adata.shape) - 1):
print(
"`knn_n_pcs`=%d > `min(adata.shape)-1`=%d; set `knn_n_pcs`=%d"
% (knn_n_pcs, min(adata.shape) - 1, min(adata.shape) - 1)
)
knn_n_pcs = min(adata.shape) - 1
sc.pp.pca(adata, n_comps=knn_n_pcs)
sc.pp.neighbors(adata, n_neighbors=knn_n_neighbors, n_pcs=knn_n_pcs)
print(
"`connectivities` not found in `adata.obsp`; run `sc.pp.neighbors` first"
)
# scDRS group-level analysis
for trait in dict_score:
dict_df_res = scdrs.method.downstream_group_analysis(
adata=adata,
df_full_score=dict_score[trait],
group_cols=group_analysis,
)
for group_col in group_analysis:
dict_df_res[group_col].to_csv(
os.path.join(
out_folder,
f"{trait}.scdrs_group.{group_col.replace(' ', '_').replace(',', '_')}",
),
sep="\t",
index=True,
)
print(
"Finish group-analysis for %s (sys_time=%0.1fs)"
% (trait, time.time() - sys_start_time)
)
print("")
if corr_analysis is not None:
print("Performing scDRS corr-analysis")
for trait in dict_score:
df_res = scdrs.method.downstream_corr_analysis(
adata=adata, df_full_score=dict_score[trait], var_cols=corr_analysis
)
df_res.to_csv(
os.path.join(out_folder, f"{trait}.scdrs_cell_corr"),
sep="\t",
index=True,
)
print(
"Finish corr-analysis for %s (sys_time=%0.1fs)"
% (trait, time.time() - sys_start_time)
)
print("")
if gene_analysis is not None:
print("Performing scDRS gene-analysis")
for trait in dict_score:
df_res = scdrs.method.downstream_gene_analysis(
adata=adata, df_full_score=dict_score[trait]
)
# save results
df_res.to_csv(
os.path.join(out_folder, f"{trait}.scdrs_gene"), sep="\t", index=True
)
print(
"Finish gene-analysis for %s (sys_time=%0.1fs)"
% (trait, time.time() - sys_start_time)
)
print("")
return
if __name__ == "__main__":
fire.Fire()