"""Simple Preprocessing Functions
Compositions of these functions are found in sc.preprocess.recipes.
"""
from __future__ import annotations
import warnings
from functools import singledispatch
from typing import TYPE_CHECKING, Literal
import numba
import numpy as np
import scipy as sp
from anndata import AnnData
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix, issparse, isspmatrix_csr, spmatrix
from sklearn.utils import check_array, sparsefuncs
from .. import logging as logg
from .._compat import old_positionals
from .._settings import settings as sett
from .._utils import (
AnyRandom,
_check_array_function_arguments,
renamed_arg,
sanitize_anndata,
view_to_actual,
)
from ..get import _check_mask, _get_obs_rep, _set_obs_rep
from ._distributed import materialize_as_ndarray
from ._utils import _get_mean_var
# import dask if available
try:
import dask.array as da
except ImportError:
da = None
# backwards compat
from ._deprecated.highly_variable_genes import filter_genes_dispersion # noqa: F401
if TYPE_CHECKING:
from collections.abc import Collection, Iterable, Sequence
from numbers import Number
from numpy.typing import NDArray
@old_positionals(
"min_counts", "min_genes", "max_counts", "max_genes", "inplace", "copy"
)
def filter_cells(
data: AnnData | spmatrix | np.ndarray,
*,
min_counts: int | None = None,
min_genes: int | None = None,
max_counts: int | None = None,
max_genes: int | None = None,
inplace: bool = True,
copy: bool = False,
) -> AnnData | tuple[np.ndarray, np.ndarray] | None:
"""\
Filter cell outliers based on counts and numbers of genes expressed.
For instance, only keep cells with at least `min_counts` counts or
`min_genes` genes expressed. This is to filter measurement outliers,
i.e. “unreliable” observations.
Only provide one of the optional parameters `min_counts`, `min_genes`,
`max_counts`, `max_genes` per call.
Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`.
Rows correspond to cells and columns to genes.
min_counts
Minimum number of counts required for a cell to pass filtering.
    min_genes
        Minimum number of expressed genes required for a cell to pass filtering.
    max_counts
        Maximum number of counts allowed for a cell to pass filtering.
    max_genes
        Maximum number of expressed genes allowed for a cell to pass filtering.
inplace
Perform computation inplace or return result.
Returns
-------
Depending on `inplace`, returns the following arrays or directly subsets
and annotates the data matrix:
cells_subset
Boolean index mask that does filtering. `True` means that the
cell is kept. `False` means the cell is removed.
number_per_cell
Depending on what was thresholded (`counts` or `genes`),
        the array stores `n_counts` or `n_genes` per cell.
Examples
--------
>>> import scanpy as sc
>>> adata = sc.datasets.krumsiek11()
UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
utils.warn_names_duplicates("obs")
>>> adata.obs_names_make_unique()
>>> adata.n_obs
640
>>> adata.var_names.tolist() # doctest: +NORMALIZE_WHITESPACE
['Gata2', 'Gata1', 'Fog1', 'EKLF', 'Fli1', 'SCL',
'Cebpa', 'Pu.1', 'cJun', 'EgrNab', 'Gfi1']
>>> # add some true zeros
>>> adata.X[adata.X < 0.3] = 0
>>> # simply compute the number of genes per cell
>>> sc.pp.filter_cells(adata, min_genes=0)
>>> adata.n_obs
640
>>> adata.obs['n_genes'].min()
1
>>> # filter manually
>>> adata_copy = adata[adata.obs['n_genes'] >= 3]
>>> adata_copy.n_obs
554
>>> adata_copy.obs['n_genes'].min()
3
>>> # actually do some filtering
>>> sc.pp.filter_cells(adata, min_genes=3)
>>> adata.n_obs
554
>>> adata.obs['n_genes'].min()
3
"""
if copy:
logg.warning("`copy` is deprecated, use `inplace` instead.")
n_given_options = sum(
option is not None for option in [min_genes, min_counts, max_genes, max_counts]
)
if n_given_options != 1:
raise ValueError(
"Only provide one of the optional parameters `min_counts`, "
"`min_genes`, `max_counts`, `max_genes` per call."
)
if isinstance(data, AnnData):
adata = data.copy() if copy else data
cell_subset, number = materialize_as_ndarray(
filter_cells(
adata.X,
min_counts=min_counts,
min_genes=min_genes,
max_counts=max_counts,
max_genes=max_genes,
),
)
if not inplace:
return cell_subset, number
if min_genes is None and max_genes is None:
adata.obs["n_counts"] = number
else:
adata.obs["n_genes"] = number
adata._inplace_subset_obs(cell_subset)
return adata if copy else None
X = data # proceed with processing the data matrix
min_number = min_counts if min_genes is None else min_genes
max_number = max_counts if max_genes is None else max_genes
number_per_cell = np.sum(
X if min_genes is None and max_genes is None else X > 0, axis=1
)
if issparse(X):
number_per_cell = number_per_cell.A1
if min_number is not None:
cell_subset = number_per_cell >= min_number
if max_number is not None:
cell_subset = number_per_cell <= max_number
s = materialize_as_ndarray(np.sum(~cell_subset))
if s > 0:
msg = f"filtered out {s} cells that have "
if min_genes is not None or min_counts is not None:
msg += "less than "
msg += (
f"{min_genes} genes expressed"
if min_counts is None
else f"{min_counts} counts"
)
if max_genes is not None or max_counts is not None:
msg += "more than "
msg += (
f"{max_genes} genes expressed"
if max_counts is None
else f"{max_counts} counts"
)
logg.info(msg)
return cell_subset, number_per_cell
@old_positionals(
"min_counts", "min_cells", "max_counts", "max_cells", "inplace", "copy"
)
def filter_genes(
data: AnnData | spmatrix | np.ndarray,
*,
min_counts: int | None = None,
min_cells: int | None = None,
max_counts: int | None = None,
max_cells: int | None = None,
inplace: bool = True,
copy: bool = False,
) -> AnnData | tuple[np.ndarray, np.ndarray] | None:
"""\
Filter genes based on number of cells or counts.
Keep genes that have at least `min_counts` counts or are expressed in at
least `min_cells` cells or have at most `max_counts` counts or are expressed
in at most `max_cells` cells.
Only provide one of the optional parameters `min_counts`, `min_cells`,
`max_counts`, `max_cells` per call.
Parameters
----------
data
An annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
to cells and columns to genes.
min_counts
Minimum number of counts required for a gene to pass filtering.
    min_cells
        Minimum number of cells in which a gene must be detected for it to pass filtering.
    max_counts
        Maximum number of counts allowed for a gene to pass filtering.
    max_cells
        Maximum number of cells in which a gene may be detected for it to pass filtering.
inplace
Perform computation inplace or return result.
Returns
-------
Depending on `inplace`, returns the following arrays or directly subsets
    and annotates the data matrix:
gene_subset
Boolean index mask that does filtering. `True` means that the
gene is kept. `False` means the gene is removed.
number_per_gene
Depending on what was thresholded (`counts` or `cells`), the array stores
`n_counts` or `n_cells` per gene.
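    Examples
    --------
    A minimal sketch on a small dense matrix; the middle gene is detected in
    no cell, so it is filtered out (the values follow directly from the input):

    >>> import numpy as np
    >>> import scanpy as sc
    >>> X = np.array([[1, 0, 3], [2, 0, 0]], dtype=np.float32)
    >>> gene_subset, n_cells = sc.pp.filter_genes(X, min_cells=1)
    >>> gene_subset
    array([ True, False,  True])
    >>> n_cells
    array([2, 0, 1])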
"""
if copy:
logg.warning("`copy` is deprecated, use `inplace` instead.")
n_given_options = sum(
option is not None for option in [min_cells, min_counts, max_cells, max_counts]
)
if n_given_options != 1:
raise ValueError(
"Only provide one of the optional parameters `min_counts`, "
"`min_cells`, `max_counts`, `max_cells` per call."
)
if isinstance(data, AnnData):
adata = data.copy() if copy else data
gene_subset, number = materialize_as_ndarray(
filter_genes(
adata.X,
min_cells=min_cells,
min_counts=min_counts,
max_cells=max_cells,
max_counts=max_counts,
)
)
if not inplace:
return gene_subset, number
if min_cells is None and max_cells is None:
adata.var["n_counts"] = number
else:
adata.var["n_cells"] = number
adata._inplace_subset_var(gene_subset)
return adata if copy else None
X = data # proceed with processing the data matrix
min_number = min_counts if min_cells is None else min_cells
max_number = max_counts if max_cells is None else max_cells
number_per_gene = np.sum(
X if min_cells is None and max_cells is None else X > 0, axis=0
)
if issparse(X):
number_per_gene = number_per_gene.A1
if min_number is not None:
gene_subset = number_per_gene >= min_number
if max_number is not None:
gene_subset = number_per_gene <= max_number
s = np.sum(~gene_subset)
if s > 0:
msg = f"filtered out {s} genes that are detected "
if min_cells is not None or min_counts is not None:
msg += "in less than "
msg += (
f"{min_cells} cells" if min_counts is None else f"{min_counts} counts"
)
if max_cells is not None or max_counts is not None:
msg += "in more than "
msg += (
f"{max_cells} cells" if max_counts is None else f"{max_counts} counts"
)
logg.info(msg)
return gene_subset, number_per_gene
@renamed_arg("X", "data", pos_0=True)
@singledispatch
def log1p(
data: AnnData | np.ndarray | spmatrix,
*,
base: Number | None = None,
copy: bool = False,
chunked: bool | None = None,
chunk_size: int | None = None,
layer: str | None = None,
obsm: str | None = None,
) -> AnnData | np.ndarray | spmatrix | None:
"""\
Logarithmize the data matrix.
    Computes :math:`X = \\log(X + 1)`,
    where :math:`\\log` denotes the natural logarithm unless a different base is given.
Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`.
Rows correspond to cells and columns to genes.
base
Base of the logarithm. Natural logarithm is used by default.
copy
If an :class:`~anndata.AnnData` is passed, determines whether a copy
is returned.
chunked
Process the data matrix in chunks, which will save memory.
Applies only to :class:`~anndata.AnnData`.
chunk_size
`n_obs` of the chunks to process the data in.
layer
Entry of layers to transform.
obsm
Entry of obsm to transform.
Returns
-------
Returns or updates `data`, depending on `copy`.
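    Examples
    --------
    A small sketch on a dense array; the values are plain ``np.log1p`` results,
    rescaled when `base` is given:

    >>> import numpy as np
    >>> import scanpy as sc
    >>> sc.pp.log1p(np.array([[0.0, 1.0], [3.0, 0.0]]), copy=True)
    array([[0.        , 0.69314718],
           [1.38629436, 0.        ]])
    >>> sc.pp.log1p(np.array([[3.0]]), base=2, copy=True)
    array([[2.]])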
"""
_check_array_function_arguments(
chunked=chunked, chunk_size=chunk_size, layer=layer, obsm=obsm
)
return log1p_array(data, copy=copy, base=base)
@log1p.register(spmatrix)
def log1p_sparse(X: spmatrix, *, base: Number | None = None, copy: bool = False):
X = check_array(
X, accept_sparse=("csr", "csc"), dtype=(np.float64, np.float32), copy=copy
)
X.data = log1p(X.data, copy=False, base=base)
return X
@log1p.register(np.ndarray)
def log1p_array(X: np.ndarray, *, base: Number | None = None, copy: bool = False):
    # We could force X to be an np.ndarray here, but it's useful not to:
    # X = check_array(X, dtype=(np.float64, np.float32), ensure_2d=False, copy=copy)
if copy:
if not np.issubdtype(X.dtype, np.floating):
X = X.astype(float)
else:
X = X.copy()
elif not (np.issubdtype(X.dtype, np.floating) or np.issubdtype(X.dtype, complex)):
X = X.astype(float)
np.log1p(X, out=X)
if base is not None:
np.divide(X, np.log(base), out=X)
return X
@log1p.register(AnnData)
def log1p_anndata(
adata: AnnData,
*,
base: Number | None = None,
copy: bool = False,
chunked: bool = False,
chunk_size: int | None = None,
layer: str | None = None,
obsm: str | None = None,
) -> AnnData | None:
if "log1p" in adata.uns_keys():
logg.warning("adata.X seems to be already log-transformed.")
adata = adata.copy() if copy else adata
view_to_actual(adata)
if chunked:
if (layer is not None) or (obsm is not None):
raise NotImplementedError(
"Currently cannot perform chunked operations on arrays not stored in X."
)
for chunk, start, end in adata.chunked_X(chunk_size):
adata.X[start:end] = log1p(chunk, base=base, copy=False)
else:
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
X = log1p(X, copy=False, base=base)
_set_obs_rep(adata, X, layer=layer, obsm=obsm)
adata.uns["log1p"] = {"base": base}
if copy:
return adata
@old_positionals("copy", "chunked", "chunk_size")
def sqrt(
data: AnnData | spmatrix | np.ndarray,
*,
copy: bool = False,
chunked: bool = False,
chunk_size: int | None = None,
) -> AnnData | spmatrix | np.ndarray | None:
"""\
Square root the data matrix.
    Computes :math:`X = \\sqrt{X}`.
Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`.
Rows correspond to cells and columns to genes.
copy
If an :class:`~anndata.AnnData` object is passed,
determines whether a copy is returned.
chunked
Process the data matrix in chunks, which will save memory.
Applies only to :class:`~anndata.AnnData`.
chunk_size
`n_obs` of the chunks to process the data in.
Returns
-------
Returns or updates `data`, depending on `copy`.
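    Examples
    --------
    A minimal sketch on a dense array (element-wise square root):

    >>> import numpy as np
    >>> import scanpy as sc
    >>> sc.pp.sqrt(np.array([[0.0, 4.0], [9.0, 16.0]]))
    array([[0., 2.],
           [3., 4.]])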
"""
if isinstance(data, AnnData):
adata = data.copy() if copy else data
if chunked:
for chunk, start, end in adata.chunked_X(chunk_size):
adata.X[start:end] = sqrt(chunk)
else:
adata.X = sqrt(data.X)
return adata if copy else None
X = data # proceed with data matrix
if not issparse(X):
return np.sqrt(X)
else:
return X.sqrt()
def normalize_per_cell( # noqa: PLR0917
data: AnnData | np.ndarray | spmatrix,
counts_per_cell_after: float | None = None,
counts_per_cell: np.ndarray | None = None,
key_n_counts: str = "n_counts",
copy: bool = False,
layers: Literal["all"] | Iterable[str] = (),
use_rep: Literal["after", "X"] | None = None,
min_counts: int = 1,
) -> AnnData | np.ndarray | spmatrix | None:
"""\
Normalize total counts per cell.
.. warning::
.. deprecated:: 1.3.7
Use :func:`~scanpy.pp.normalize_total` instead.
The new function is equivalent to the present
function, except that
* the new function doesn't filter cells based on `min_counts`,
use :func:`~scanpy.pp.filter_cells` if filtering is needed.
* some arguments were renamed
* `copy` is replaced by `inplace`
Normalize each cell by total counts over all genes, so that every cell has
the same total count after normalization.
Similar functions are used, for example, by Seurat [Satija15]_, Cell Ranger
[Zheng17]_ or SPRING [Weinreb17]_.
Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
to cells and columns to genes.
counts_per_cell_after
If `None`, after normalization, each cell has a total count equal
to the median of the *counts_per_cell* before normalization.
counts_per_cell
Precomputed counts per cell.
key_n_counts
Name of the field in `adata.obs` where the total counts per cell are
stored.
copy
If an :class:`~anndata.AnnData` is passed, determines whether a copy
is returned.
min_counts
Cells with counts less than `min_counts` are filtered out during
normalization.
Returns
-------
Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields:
`adata.X` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
Normalized count data matrix.
Examples
--------
    >>> import numpy as np
    >>> import scanpy as sc
    >>> from anndata import AnnData
    >>> adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]], dtype=np.float32))
>>> print(adata.X.sum(axis=1))
[ 1. 3. 11.]
>>> sc.pp.normalize_per_cell(adata)
>>> print(adata.obs)
n_counts
0 1.0
1 3.0
2 11.0
>>> print(adata.X.sum(axis=1))
[3. 3. 3.]
>>> sc.pp.normalize_per_cell(
... adata, counts_per_cell_after=1,
... key_n_counts='n_counts2',
... )
>>> print(adata.obs)
n_counts n_counts2
0 1.0 3.0
1 3.0 3.0
2 11.0 3.0
>>> print(adata.X.sum(axis=1))
[1. 1. 1.]
"""
if isinstance(data, AnnData):
start = logg.info("normalizing by total count per cell")
adata = data.copy() if copy else data
if counts_per_cell is None:
cell_subset, counts_per_cell = materialize_as_ndarray(
filter_cells(adata.X, min_counts=min_counts)
)
adata.obs[key_n_counts] = counts_per_cell
adata._inplace_subset_obs(cell_subset)
counts_per_cell = counts_per_cell[cell_subset]
normalize_per_cell(adata.X, counts_per_cell_after, counts_per_cell)
layers = adata.layers.keys() if layers == "all" else layers
if use_rep == "after":
after = counts_per_cell_after
elif use_rep == "X":
after = np.median(counts_per_cell[cell_subset])
elif use_rep is None:
after = None
else:
raise ValueError('use_rep should be "after", "X" or None')
for layer in layers:
_subset, counts = filter_cells(adata.layers[layer], min_counts=min_counts)
temp = normalize_per_cell(adata.layers[layer], after, counts, copy=True)
adata.layers[layer] = temp
logg.info(
" finished ({time_passed}): normalized adata.X and added"
f" {key_n_counts!r}, counts per cell before normalization (adata.obs)",
time=start,
)
return adata if copy else None
# proceed with data matrix
X = data.copy() if copy else data
if counts_per_cell is None:
if not copy:
raise ValueError("Can only be run with copy=True")
cell_subset, counts_per_cell = filter_cells(X, min_counts=min_counts)
X = X[cell_subset]
counts_per_cell = counts_per_cell[cell_subset]
if counts_per_cell_after is None:
counts_per_cell_after = np.median(counts_per_cell)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
counts_per_cell += counts_per_cell == 0
counts_per_cell /= counts_per_cell_after
if not issparse(X):
X /= counts_per_cell[:, np.newaxis]
else:
sparsefuncs.inplace_row_scale(X, 1 / counts_per_cell)
return X if copy else None
@old_positionals("layer", "n_jobs", "copy")
def regress_out(
adata: AnnData,
keys: str | Sequence[str],
*,
layer: str | None = None,
n_jobs: int | None = None,
copy: bool = False,
) -> AnnData | None:
"""\
Regress out (mostly) unwanted sources of variation.
Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15]_. Note that this function tends to overcorrect
in certain circumstances as described in :issue:`526`.
Parameters
----------
adata
The annotated data matrix.
keys
Keys for observation annotation on which to regress on.
layer
If provided, which element of layers to regress on.
n_jobs
Number of jobs for parallel computation.
`None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`.
copy
Determines whether a copy of `adata` is returned.
Returns
-------
Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields:
`adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
Corrected count data matrix.
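    Examples
    --------
    A hypothetical sketch; the dataset and `obs` column names are assumptions,
    so the calls are not executed here:

    >>> import scanpy as sc
    >>> adata = sc.datasets.pbmc68k_reduced()  # doctest: +SKIP
    >>> sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])  # doctest: +SKIP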
"""
start = logg.info(f"regressing out {keys}")
adata = adata.copy() if copy else adata
sanitize_anndata(adata)
view_to_actual(adata)
if isinstance(keys, str):
keys = [keys]
X = _get_obs_rep(adata, layer=layer)
if issparse(X):
        logg.info(" sparse input is densified and may lead to high memory use")
X = X.toarray()
n_jobs = sett.n_jobs if n_jobs is None else n_jobs
# regress on a single categorical variable
variable_is_categorical = False
if keys[0] in adata.obs_keys() and isinstance(
adata.obs[keys[0]].dtype, CategoricalDtype
):
if len(keys) > 1:
raise ValueError(
"If providing categorical variable, "
"only a single one is allowed. For this one "
"we regress on the mean for each category."
)
logg.debug("... regressing on per-gene means within categories")
regressors = np.zeros(X.shape, dtype="float32")
for category in adata.obs[keys[0]].cat.categories:
mask = (category == adata.obs[keys[0]]).values
for ix, x in enumerate(X.T):
regressors[mask, ix] = x[mask].mean()
variable_is_categorical = True
# regress on one or several ordinal variables
else:
# create data frame with selected keys (if given)
if keys:
regressors = adata.obs[keys]
else:
regressors = adata.obs.copy()
# add column of ones at index 0 (first column)
regressors.insert(0, "ones", 1.0)
len_chunk = np.ceil(min(1000, X.shape[1]) / n_jobs).astype(int)
n_chunks = np.ceil(X.shape[1] / len_chunk).astype(int)
tasks = []
    # split the adata.X matrix by columns into chunks of size len_chunk
    # (the last chunk could be of smaller size than the others)
chunk_list = np.array_split(X, n_chunks, axis=1)
if variable_is_categorical:
regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
for idx, data_chunk in enumerate(chunk_list):
        # each task is a tuple of a data_chunk, e.g. adata.X[:, 0:100],
        # and the regressors. This data will be passed to each of the jobs.
if variable_is_categorical:
regres = regressors_chunk[idx]
else:
regres = regressors
        tasks.append((data_chunk, regres, variable_is_categorical))
from joblib import Parallel, delayed
# TODO: figure out how to test that this doesn't oversubscribe resources
res = Parallel(n_jobs=n_jobs)(delayed(_regress_out_chunk)(task) for task in tasks)
# res is a list of vectors (each corresponding to a regressed gene column).
# The transpose is needed to get the matrix in the shape needed
_set_obs_rep(adata, np.vstack(res).T, layer=layer)
logg.info(" finished", time=start)
return adata if copy else None
def _regress_out_chunk(data):
    # data is a tuple containing the selected columns from adata.X
    # and the regressors DataFrame
data_chunk = data[0]
regressors = data[1]
variable_is_categorical = data[2]
responses_chunk_list = []
import statsmodels.api as sm
from statsmodels.tools.sm_exceptions import PerfectSeparationError
for col_index in range(data_chunk.shape[1]):
        # if all values are identical, statsmodels.api.GLM throws an error;
        # but then no regression is necessary anyway...
if not (data_chunk[:, col_index] != data_chunk[0, col_index]).any():
responses_chunk_list.append(data_chunk[:, col_index])
continue
if variable_is_categorical:
regres = np.c_[np.ones(regressors.shape[0]), regressors[:, col_index]]
else:
regres = regressors
try:
result = sm.GLM(
data_chunk[:, col_index], regres, family=sm.families.Gaussian()
).fit()
new_column = result.resid_response
except PerfectSeparationError: # this emulates R's behavior
logg.warning("Encountered PerfectSeparationError, setting to 0 as in R.")
new_column = np.zeros(data_chunk.shape[0])
responses_chunk_list.append(new_column)
return np.vstack(responses_chunk_list)
@renamed_arg("X", "data", pos_0=True)
@old_positionals("zero_center", "max_value", "copy", "layer", "obsm")
@singledispatch
def scale(
data: AnnData | spmatrix | np.ndarray,
*,
zero_center: bool = True,
max_value: float | None = None,
copy: bool = False,
layer: str | None = None,
obsm: str | None = None,
mask_obs: NDArray[np.bool_] | str | None = None,
) -> AnnData | spmatrix | np.ndarray | None:
"""\
Scale data to unit variance and zero mean.
.. note::
Variables (genes) that do not display any variation (are constant across
all observations) are retained and (for zero_center==True) set to 0
during this operation. In the future, they might be set to NaNs.
Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`.
Rows correspond to cells and columns to genes.
zero_center
        If `False`, omit zero-centering variables, which allows handling
        sparse input efficiently.
max_value
Clip (truncate) to this value after scaling. If `None`, do not clip.
copy
Whether this function should be performed inplace. If an AnnData object
is passed, this also determines if a copy is returned.
layer
If provided, which element of layers to scale.
obsm
If provided, which element of obsm to scale.
mask_obs
Restrict both the derivation of scaling parameters and the scaling itself
to a certain set of observations. The mask is specified as a boolean array
or a string referring to an array in :attr:`~anndata.AnnData.obs`.
Returns
-------
Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields:
`adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
Scaled count data matrix.
`adata.var['mean']` : :class:`pandas.Series` (dtype `float`)
Means per gene before scaling.
`adata.var['std']` : :class:`pandas.Series` (dtype `float`)
Standard deviations per gene before scaling.
`adata.var['var']` : :class:`pandas.Series` (dtype `float`)
Variances per gene before scaling.
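    Examples
    --------
    A small sketch on a dense array; after scaling, every column has zero mean
    (and unit variance wherever the input varies):

    >>> import numpy as np
    >>> import scanpy as sc
    >>> X = np.array([[1.0, 2.0], [3.0, 6.0]], dtype=np.float32)
    >>> sc.pp.scale(X, copy=True).mean(axis=0)
    array([0., 0.], dtype=float32)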
"""
_check_array_function_arguments(layer=layer, obsm=obsm)
if layer is not None:
raise ValueError(
f"`layer` argument inappropriate for value of type {type(data)}"
)
if obsm is not None:
raise ValueError(
f"`obsm` argument inappropriate for value of type {type(data)}"
)
return scale_array(
data, zero_center=zero_center, max_value=max_value, copy=copy, mask_obs=mask_obs
)
@scale.register(np.ndarray)
def scale_array(
X: np.ndarray,
*,
zero_center: bool = True,
max_value: float | None = None,
copy: bool = False,
return_mean_std: bool = False,
mask_obs: NDArray[np.bool_] | None = None,
) -> np.ndarray | tuple[np.ndarray, NDArray[np.float64], NDArray[np.float64]]:
if copy:
X = X.copy()
if mask_obs is not None:
mask_obs = _check_mask(X, mask_obs, "obs")
scale_rv = scale_array(
X[mask_obs, :],
zero_center=zero_center,
max_value=max_value,
copy=False,
return_mean_std=return_mean_std,
mask_obs=None,
)
if return_mean_std:
X[mask_obs, :], mean, std = scale_rv
return X, mean, std
else:
X[mask_obs, :] = scale_rv
return X
    if not zero_center and max_value is not None:
        logg.info(
            "... be careful when using `max_value` without `zero_center`: "
            "only values above `max_value` are clipped."
        )
if np.issubdtype(X.dtype, np.integer):
logg.info(
"... as scaling leads to float results, integer "
"input is cast to float, returning copy."
)
X = X.astype(float)
mean, var = _get_mean_var(X)
std = np.sqrt(var)
std[std == 0] = 1
if issparse(X):
if zero_center:
raise ValueError("Cannot zero-center sparse matrix.")
sparsefuncs.inplace_column_scale(X, 1 / std)
else:
if zero_center:
X -= mean
X /= std
# do the clipping
if max_value is not None:
logg.debug(f"... clipping at max_value {max_value}")
if zero_center:
X = np.clip(X, a_min=-max_value, a_max=max_value)
else:
X[X > max_value] = max_value
if return_mean_std:
return X, mean, std
else:
return X
@scale.register(spmatrix)
def scale_sparse(
X: spmatrix,
*,
zero_center: bool = True,
max_value: float | None = None,
copy: bool = False,
return_mean_std: bool = False,
mask_obs: NDArray[np.bool_] | None = None,
) -> np.ndarray | tuple[np.ndarray, NDArray[np.float64], NDArray[np.float64]]:
# need to add the following here to make inplace logic work
if zero_center:
logg.info(
"... as `zero_center=True`, sparse input is "
"densified and may lead to large memory consumption"
)
X = X.toarray()
copy = False # Since the data has been copied
return scale_array(
X,
zero_center=zero_center,
copy=copy,
max_value=max_value,
return_mean_std=return_mean_std,
mask_obs=mask_obs,
)
@scale.register(AnnData)
def scale_anndata(
adata: AnnData,
*,
zero_center: bool = True,
max_value: float | None = None,
copy: bool = False,
layer: str | None = None,
obsm: str | None = None,
mask_obs: NDArray[np.bool_] | str | None = None,
) -> AnnData | None:
adata = adata.copy() if copy else adata
str_mean_std = ("mean", "std")
if mask_obs is not None:
if isinstance(mask_obs, str):
str_mean_std = (f"mean of {mask_obs}", f"std of {mask_obs}")
else:
str_mean_std = ("mean with mask", "std with mask")
mask_obs = _check_mask(adata, mask_obs, "obs")
view_to_actual(adata)
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale(
X,
zero_center=zero_center,
max_value=max_value,
        copy=False,  # if a copy was requested, it was already made above
return_mean_std=True,
mask_obs=mask_obs,
)
_set_obs_rep(adata, X, layer=layer, obsm=obsm)
return adata if copy else None
@old_positionals("n_obs", "random_state", "copy")
def subsample(
data: AnnData | np.ndarray | spmatrix,
fraction: float | None = None,
*,
n_obs: int | None = None,
random_state: AnyRandom = 0,
copy: bool = False,
) -> AnnData | tuple[np.ndarray | spmatrix, NDArray[np.int64]] | None:
"""\
Subsample to a fraction of the number of observations.
Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`.
Rows correspond to cells and columns to genes.
fraction
Subsample to this `fraction` of the number of observations.
n_obs
Subsample to this number of observations.
random_state
        Random seed controlling which observations are drawn.
copy
If an :class:`~anndata.AnnData` is passed,
determines whether a copy is returned.
Returns
-------
Returns `X[obs_indices], obs_indices` if data is array-like, otherwise
subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or
returns a subsampled copy of it (`copy == True`).
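    Examples
    --------
    A minimal sketch on a plain array; per the return contract above, the
    subsampled matrix and the chosen row indices come back as a tuple:

    >>> import numpy as np
    >>> import scanpy as sc
    >>> X = np.random.rand(200, 10)
    >>> X_sub, idx = sc.pp.subsample(X, fraction=0.1)
    >>> X_sub.shape
    (20, 10)
    >>> idx.shape
    (20,)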
"""
np.random.seed(random_state)
old_n_obs = data.n_obs if isinstance(data, AnnData) else data.shape[0]
if n_obs is not None:
new_n_obs = n_obs
elif fraction is not None:
if fraction > 1 or fraction < 0:
raise ValueError(f"`fraction` needs to be within [0, 1], not {fraction}")
new_n_obs = int(fraction * old_n_obs)
logg.debug(f"... subsampled to {new_n_obs} data points")
else:
raise ValueError("Either pass `n_obs` or `fraction`.")
obs_indices = np.random.choice(old_n_obs, size=new_n_obs, replace=False)
if isinstance(data, AnnData):
if data.isbacked:
if copy:
return data[obs_indices].to_memory()
else:
raise NotImplementedError(
"Inplace subsampling is not implemented for backed objects."
)
else:
if copy:
return data[obs_indices].copy()