/
gget_cellxgene.py
226 lines (201 loc) · 10.1 KB
/
gget_cellxgene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import logging
# Configure logging at import time: every message is rendered as
# "<timestamp> <LEVEL> <message>", the threshold is INFO, and the timestamp
# uses the "%c" strftime directive (the locale's date and time representation).
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
    datefmt="%c",
)
# Raise the "numexpr" logger's threshold to WARNING so that its INFO-level
# messages (its threads info) are not emitted.
logging.getLogger("numexpr").setLevel(logging.WARNING)
def convert_to_list(lst):
    """
    Wrap every string element of `lst` in a single-item list.

    Elements that are not strings (for example lists or None) are passed
    through unchanged. A new list is returned; `lst` itself is not modified.
    """
    return [[item] if isinstance(item, str) else item for item in lst]
def cellxgene(
    species="homo_sapiens",
    gene=None,
    ensembl=False,
    # NOTE: list default kept for interface compatibility; it is never mutated here.
    column_names=[
        "dataset_id",
        "assay",
        "suspension_type",
        "sex",
        "tissue_general",
        "tissue",
        "cell_type",
    ],
    meta_only=False,
    tissue=None,
    cell_type=None,
    development_stage=None,
    disease=None,
    sex=None,
    is_primary_data=True,
    dataset_id=None,
    tissue_general_ontology_term_id=None,
    tissue_general=None,
    assay_ontology_term_id=None,
    assay=None,
    cell_type_ontology_term_id=None,
    development_stage_ontology_term_id=None,
    disease_ontology_term_id=None,
    donor_id=None,
    self_reported_ethnicity_ontology_term_id=None,
    self_reported_ethnicity=None,
    sex_ontology_term_id=None,
    suspension_type=None,
    tissue_ontology_term_id=None,
    census_version="stable",
    verbose=True,
    out=None,
):
    """
    Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/) using the
    CZ CELLxGENE Discover Census (https://github.com/chanzuckerberg/cellxgene-census).

    NOTE: Querying large datasets requires a large amount of RAM. Use the cell metadata attributes
    to define the (sub)dataset of interest.
    The CZ CELLxGENE Discover Census recommends >16 GB of memory and a >5 Mbps internet connection.

    General args:
    - species           Choice of 'homo_sapiens' or 'mus_musculus'. Default: 'homo_sapiens'.
    - gene              Str or list of gene name(s) or Ensembl ID(s), e.g. ['ACE2', 'SLC5A1'] or
                        ['ENSG00000130234', 'ENSG00000100170']. Default: None.
                        A single string is treated as a one-element list.
                        NOTE: Set ensembl=True when providing Ensembl ID(s) instead of gene name(s).
                        See https://cellxgene.cziscience.com/gene-expression for examples of available genes.
    - ensembl           True/False (default: False). Set to True when genes are provided as Ensembl IDs.
    - column_names      List of metadata columns to return (stored in AnnData.obs when meta_only=False).
                        Default: ["dataset_id", "assay", "suspension_type", "sex", "tissue_general", "tissue", "cell_type"]
                        For more options see: https://api.cellxgene.cziscience.com/curation/ui/#/ -> Schemas -> dataset
    - meta_only         True/False (default: False). If True, returns only metadata dataframe (corresponds to AnnData.obs).
    - census_version    Str defining version of Census, e.g. "2023-05-15" or "latest" or "stable". Default: "stable".
    - verbose           True/False whether to print progress information. Default True.
    - out               If provided, saves the generated AnnData h5ad (or csv when meta_only=True) file
                        with the specified path. Default: None.

    Cell metadata attributes (each one is a str or a list of str; default: None):
    - tissue                              Tissue(s), e.g. ['lung', 'blood'].
                                          See https://cellxgene.cziscience.com/gene-expression for examples of available tissues.
    - cell_type                           Celltype(s), e.g. ['mucus secreting cell', 'neuroendocrine cell'].
                                          See https://cellxgene.cziscience.com/gene-expression and select a tissue
                                          to see examples of available celltypes.
    - development_stage                   Development stage(s).
    - disease                             Disease(s).
    - sex                                 Sex(es), e.g. 'female'.
    - is_primary_data                     True/False (default: True). If True, returns only the canonical instance
                                          of the cellular observation. This is commonly set to False for
                                          meta-analyses reusing data or for secondary views of data.
    - dataset_id                          CELLxGENE dataset ID(s).
    - tissue_general_ontology_term_id     High-level tissue UBERON ID(s).
                                          Also see: https://github.com/chanzuckerberg/single-cell-data-portal/blob/9b94ccb0a2e0a8f6182b213aa4852c491f6f6aff/backend/wmg/data/tissue_mapper.py
    - tissue_general                      High-level tissue label(s).
                                          Also see: https://github.com/chanzuckerberg/single-cell-data-portal/blob/9b94ccb0a2e0a8f6182b213aa4852c491f6f6aff/backend/wmg/data/tissue_mapper.py
    - tissue_ontology_term_id             Tissue ontology term ID(s) as defined in the CELLxGENE dataset schema.
    - assay_ontology_term_id              Assay ontology term ID(s) as defined in the CELLxGENE dataset schema.
    - assay                               Assay(s) as defined in the CELLxGENE dataset schema.
    - cell_type_ontology_term_id          Celltype ontology term ID(s) as defined in the CELLxGENE dataset schema.
    - development_stage_ontology_term_id  Development stage ontology term ID(s) as defined in the CELLxGENE dataset schema.
    - disease_ontology_term_id            Disease ontology term ID(s) as defined in the CELLxGENE dataset schema.
    - donor_id                            Donor ID(s) as defined in the CELLxGENE dataset schema.
    - self_reported_ethnicity_ontology_term_id
                                          Self reported ethnicity ontology ID(s) as defined in the CELLxGENE dataset schema.
    - self_reported_ethnicity             Self reported ethnicity as defined in the CELLxGENE dataset schema.
    - sex_ontology_term_id                Sex ontology ID(s) as defined in the CELLxGENE dataset schema.
    - suspension_type                     Suspension type(s) as defined in the CELLxGENE dataset schema.

    Returns AnnData object (when meta_only=False) or dataframe (when meta_only=True).
    Returns None (after logging an error) if the cellxgene_census package is not installed.
    """
    # Check if cellxgene_census is installed
    try:
        import cellxgene_census
    except ImportError:
        logging.error(
            "\n"
            "Some third-party dependencies are missing. Please run the following command:\n"
            ">>> gget.setup('cellxgene') or $ gget setup cellxgene\n"
            "Alternative: Install the cellxgene-census package using pip (https://pypi.org/project/cellxgene-census).\n"
        )
        return

    # Metadata filters keyed by their obs column name. The names are spelled out
    # explicitly instead of being recovered by comparing values against locals():
    # that lookup returns the first local with an equal value, so two filters
    # sharing a value (e.g. tissue="lung" and tissue_general="lung") would both
    # be attributed to the same column.
    metadata_filters = {
        "dataset_id": dataset_id,
        "tissue_general_ontology_term_id": tissue_general_ontology_term_id,
        "tissue_general": tissue_general,
        "assay_ontology_term_id": assay_ontology_term_id,
        "assay": assay,
        "cell_type_ontology_term_id": cell_type_ontology_term_id,
        "cell_type": cell_type,
        "development_stage_ontology_term_id": development_stage_ontology_term_id,
        "development_stage": development_stage,
        "disease_ontology_term_id": disease_ontology_term_id,
        "disease": disease,
        "donor_id": donor_id,
        "self_reported_ethnicity_ontology_term_id": self_reported_ethnicity_ontology_term_id,
        "self_reported_ethnicity": self_reported_ethnicity,
        "sex_ontology_term_id": sex_ontology_term_id,
        "sex": sex,
        "suspension_type": suspension_type,
        "tissue_ontology_term_id": tissue_ontology_term_id,
        "tissue": tissue,
    }

    if all(value is None for value in metadata_filters.values()):
        logging.warning(
            "\n"
            "You are attempting to query the entire Census dataset which requires a large amount of RAM (100's of GBs) and high network bandwidth.\n"
            "Use the cell metadata arguments (e.g. 'tissue', 'cell_type', 'disease', etc...) to define the (sub)dataset of interest.\n"
        )

    # Wrap single string values in a list so every filter renders as "<column> in [...]"
    arg_names = list(metadata_filters)
    args = convert_to_list(list(metadata_filters.values()))

    # Define metadata filter: all conditions are combined with "and";
    # unset (None) or empty filters are skipped.
    conditions = ["is_primary_data == True"] if is_primary_data else []
    conditions.extend(
        f"{arg_name} in {str(arg)}" for arg_name, arg in zip(arg_names, args) if arg
    )
    obs_value_filter = " and ".join(conditions) if conditions else None

    # Fetch metadata only
    if meta_only:
        if verbose:
            logging.info("Fetching metadata from CZ CELLxGENE Discover...")

        with cellxgene_census.open_soma(census_version=census_version) as census:
            # Reads SOMADataFrame as a slice
            cell_metadata = census["census_data"][species].obs.read(
                value_filter=obs_value_filter, column_names=column_names
            )
            # Concatenates results to pyarrow.Table
            cell_metadata = cell_metadata.concat()
            # Converts to pandas.DataFrame
            cell_metadata = cell_metadata.to_pandas()

        if out:
            cell_metadata.to_csv(out, index=False)

        return cell_metadata

    # Fetch AnnData object
    if verbose:
        logging.info(
            "Fetching AnnData object from CZ CELLxGENE Discover. This might take a few minutes..."
        )

    if gene:
        # A single gene passed as a plain string must be wrapped in a list,
        # otherwise the filter would read "feature_name in ACE2" (unquoted, not a list).
        genes = [gene] if isinstance(gene, str) else gene
        var_value_filter = f"{'feature_id' if ensembl else 'feature_name'} in {genes}"
    else:
        var_value_filter = None

    with cellxgene_census.open_soma(census_version=census_version) as census:
        adata = cellxgene_census.get_anndata(
            census=census,
            organism=species,
            var_value_filter=var_value_filter,
            obs_value_filter=obs_value_filter,
            column_names={"obs": column_names},
        )

    if out:
        adata.write(out)

    return adata