diff --git a/CHANGELOG.md b/CHANGELOG.md index fab4c694497..c125724f07b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,7 +38,11 @@ * `reference/cellranger_mkgtf`: Rename `reference/mkgtf` to `reference/cellranger_mkgtf` (PR #846). -* `interactive/run_cellxgene` and `interactive/run_cirrocumulus` were removed (PR #796). +* `labels_transfer/xgboost`: Align interface with new annotation workflow + - Store label probabilities instead of uncertainties + - Take `.h5mu` format as an input instead of `.h5ad` + +* `labels_transfer/knn`: delete outdated component due to its functionality now implemented in `labels_transfer/pynndescent_knn` * `reference/build_cellranger_arc_reference`: a default value of "output" is now specified for the argument `--genome`, inline with `reference/build_cellranger_reference` component. Additionally, providing a value for `--organism` is no longer required and its default value of `Homo Sapiens` has been removed (PR #864). @@ -83,6 +87,8 @@ * `dataflow/split_h5mu` component: Added a component to split a single h5mu file into multiple h5mu files based on the values of an .obs column (PR #824). +* `labels_transfer/pynndescent_knn`: component: Added a component for KNN classification based on a PyNNDescent neighborhood graph (PR #830). + * `workflows/test_workflows/ingestion` components & `workflows/ingestion`: Added standalone components for integration testing of ingestion workflows (PR #801). * `workflows/ingestion/make_reference`: Add additional arguments passed through to the STAR and BD Rhapsody reference components (PR #846). diff --git a/src/labels_transfer/api/common_arguments.yaml b/src/labels_transfer/api/common_arguments.yaml index 567aa0ec3b4..5f9514ca099 100644 --- a/src/labels_transfer/api/common_arguments.yaml +++ b/src/labels_transfer/api/common_arguments.yaml @@ -1,4 +1,3 @@ - argument_groups: - name: Input dataset (query) arguments arguments: @@ -25,7 +24,7 @@ obsm: - type: "double" name: "features" - example: X_integrated_scanvi + example: X_scvi required: false description: | The embedding to use for the classifier's inference. Override using the `--input_obsm_features` argument. If not provided, the `.X` slot will be used instead. @@ -41,52 +40,59 @@ description: | The `.obsm` key of the embedding to use for the classifier's inference. If not provided, the `.X` slot will be used instead. Make sure that embedding was obtained in the same way as the reference embedding (e.g. by the same model or preprocessing). - example: X_integrated_scanvi + example: X_scvi + - name: Reference dataset arguments arguments: - name: "--reference" type: "file" required: false description: "The reference data to train classifiers on." - example: https://zenodo.org/record/6337966/files/HLCA_emb_and_metadata.h5ad + example: reference.h5mu info: label: "Reference" file_format: - type: h5ad - X: - type: double - name: features - required: false - description: | - The expression data to use for the classifier's training, if `--input_obsm_features` argument is not provided. - obsm: - - type: "double" - name: "features" - example: X_integrated_scanvi - description: | - The embedding to use for the classifier's training. Override using the `--reference_obsm_features` argument. - Make sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing). - required: true - obs: - - type: "string" - name: "targets" - multiple: true - example: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ] - description: "The target labels to transfer. Override using the `--reference_obs_targets` argument." + type: h5mu + mod: + rna: + description: "Modality in AnnData format containing RNA data." required: true + slots: + X: + type: double + name: features + required: false + description: | + The expression data to use for the classifier's training, if `--input_obsm_features` argument is not provided. + obsm: + - type: "double" + name: "features" + example: X_scvi + description: | + The embedding to use for the classifier's training. Override using the `--reference_obsm_features` argument. + Make sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing). + required: true + obs: + - type: "string" + name: "targets" + multiple: true + example: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ] + description: "The target labels to transfer. Override using the `--reference_obs_targets` argument." + required: true - name: "--reference_obsm_features" type: string - required: true + required: false description: | - The `.obsm` key of the embedding to use for the classifier's training. + The `.obsm` key of the embedding to use for the classifier's training. If not provided, the `.X` slot will be used instead. Make sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing). - default: X_integrated_scanvi + example: X_scvi - name: "--reference_obs_targets" type: string default: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ] required: false multiple: true - description: The `.obs` key of the target labels to tranfer. + description: The `.obs` key(s) of the target labels to tranfer. + - name: Outputs arguments: - name: "--output" @@ -108,22 +114,16 @@ description: "The predicted labels. Override using the `--output_obs_predictions` argument." required: true - type: "double" - name: "uncertainty" - description: "The uncertainty of the predicted labels. Override using the `--output_obs_uncertainty` argument." + name: "probability" + description: "The probability of the predicted labels. Override using the `--output_obs_probability` argument." required: false obsm: - type: "double" - name: "X_integrated_scanvi" + name: "X_scvi" description: The embedding used for the classifier's inference. Could have any name, specified by `input_obsm_features` argument." required: false - uns: - - type: "string" - name: "parameters" - example: labels_tranfer - description: Additional information about the parameters used for the label transfer. - required: true - name: "--output_obs_predictions" type: string example: @@ -133,17 +133,19 @@ In which `.obs` slots to store the predicted information. If provided, must have the same length as `--reference_obs_targets`. If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix. - - name: "--output_obs_uncertainty" + - name: "--output_obs_probability" type: string example: required: false multiple: true description: | - In which `.obs` slots to store the uncertainty of the predictions. + In which `.obs` slots to store the probability of the predictions. If provided, must have the same length as `--reference_obs_targets`. - If empty, will default to the `reference_obs_targets` combined with the `"_uncertainty"` suffix. - - name: "--output_uns_parameters" + If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix. + - name: "--output_compression" type: string - default: "labels_transfer" + description: | + The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] required: false - description: "The `.uns` key to store additional information about the parameters used for the label transfer." \ No newline at end of file + example: "gzip" diff --git a/src/labels_transfer/knn/README.md b/src/labels_transfer/knn/README.md deleted file mode 100644 index 44d9209d2c0..00000000000 --- a/src/labels_transfer/knn/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# KNN Labels Transfer - -This component transfers labels from a reference dataset to an query using a weighted K-Nearest Neighbors classification approach based on aproximate neighbors search of [PyNNDescent](https://pynndescent.readthedocs.io/en/latest/how_to_use_pynndescent.html). - -# Input - -- `.h5mu` files for reference and query as specified in [config](./config.vsh.yaml) -- List of targets (labels) to predict from `.obs` slot of the reference. Common choice for HLCA would be `["ann_level_1", "ann_level_2", "ann_level_3", "ann_level_4", "ann_level_5", "ann_finest_level"]` -- Number of nearest neighbors to use for prediction. We recommend using 30-50 neighbors for HLCA size dataset - -For the description of other input arguments, refer to the [config](./config.vsh.yaml). - -# Output - -`.h5mu` file with the query dataset that contains the transferred annotations -- The annotations for each of the `targets` will be stored as new columns in the `.obs` data frame with the suffix `_pred` by default -- The uncertainty of the prediction for each target will be stored as another column in `.obs` with the suffix `_uncertainty` -- `labels_transfer` entry is added to the `.uns` dictionary of the output file, which contains information about the method used and its parameters. Keys of this dictionary are labels from `targets` list. Values are dictionaries with the following keys: - -| Key | Description | -|-------------|------------------------| -| method | Always the string "KNN_pynndescent" | -| n_neighbors | Integer indicating the number of neighbors used for the prediction | -| reference | File or a link used as `reference` input argument | \ No newline at end of file diff --git a/src/labels_transfer/knn/config.vsh.yaml b/src/labels_transfer/knn/config.vsh.yaml deleted file mode 100644 index b6191067186..00000000000 --- a/src/labels_transfer/knn/config.vsh.yaml +++ /dev/null @@ -1,58 +0,0 @@ -name: knn -namespace: "labels_transfer" -description: "Performs label transfer from reference to query using KNN classifier" -info: - method_id: KNN_pynndescent -authors: - - __merge__: /src/authors/vladimir_shitov.yaml - roles: [ author ] -__merge__: ../api/common_arguments.yaml -argument_groups: - - name: "Learning parameters" - arguments: - - name: "--n_neighbors" - alternatives: ["-k"] - type: integer - description: "Number of nearest neighbors to use for classification" - required: true -resources: - - type: python_script - path: script.py - - path: ../utils/helper.py - - path: ../../utils/setup_logger.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/annotation_test_data/ - - path: /resources_test/pbmc_1k_protein_v3/ - -engines: -- type: docker - image: python:3.10-slim - setup: - - type: apt - packages: - - procps - - git - - type: python - __merge__: [/src/base/requirements/anndata_mudata.yaml, .] - - type: apt - packages: - - libopenblas-dev - - liblapack-dev - - gfortran - - type: python - __merge__: [/src/base/requirements/scanpy.yaml, .] - packages: - - pynndescent~=0.5.8 - - numba~=0.56.4 - - numpy~=1.23.5 - test_setup: - - type: python - __merge__: [ /src/base/requirements/viashpy.yaml, .] - -runners: -- type: executable -- type: nextflow - directives: - label: [highmem, highcpu] diff --git a/src/labels_transfer/knn/script.py b/src/labels_transfer/knn/script.py deleted file mode 100644 index ae4eceba726..00000000000 --- a/src/labels_transfer/knn/script.py +++ /dev/null @@ -1,135 +0,0 @@ -import sys -import warnings - -import mudata -import numpy as np -import scanpy as sc -from scipy.sparse import issparse -import pynndescent -import numba - - -## VIASH START -par = { - "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", - "modality": "rna", - "input_obsm_features": "X_integrated_scanvi", - "reference": "https://zenodo.org/record/6337966/files/HLCA_emb_and_metadata.h5ad", - "reference_obsm_features": "X_integrated_scanvi", - "reference_obs_targets": ["ann_level_1", "ann_level_2", "ann_level_3", "ann_level_4", "ann_level_5", "ann_finest_level"], - "output": "foo.h5mu", - "output_obs_predictions": None, - "output_obs_uncertainty": None, - "output_uns_parameters": "labels_transfer", - "n_neighbors": 1 -} -meta = { - "resources_dir": "src/labels_transfer/utils" -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from helper import check_arguments, get_reference_features, get_query_features -# START TEMPORARY WORKAROUND setup_logger -# reason: resources aren't available when using Nextflow fusion -# from setup_logger import setup_logger -def setup_logger(): - import logging - from sys import stdout - - logger = logging.getLogger() - logger.setLevel(logging.INFO) - console_handler = logging.StreamHandler(stdout) - logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") - console_handler.setFormatter(logFormatter) - logger.addHandler(console_handler) - - return logger -# END TEMPORARY WORKAROUND setup_logger - -@numba.njit -def weighted_prediction(weights, ref_cats): - """Get highest weight category.""" - N = len(weights) - predictions = np.zeros((N,), dtype=ref_cats.dtype) - uncertainty = np.zeros((N,)) - for i in range(N): - obs_weights = weights[i] - obs_cats = ref_cats[i] - best_prob = 0 - for c in np.unique(obs_cats): - cand_prob = np.sum(obs_weights[obs_cats == c]) - if cand_prob > best_prob: - best_prob = cand_prob - predictions[i] = c - uncertainty[i] = max(1 - best_prob, 0) - - return predictions, uncertainty - -def distances_to_affinities(distances): - stds = np.std(distances, axis=1) - stds = (2.0 / stds) ** 2 - stds = stds.reshape(-1, 1) - distances_tilda = np.exp(-np.true_divide(distances, stds)) - - return distances_tilda / np.sum(distances_tilda, axis=1, keepdims=True) - -def main(par): - logger = setup_logger() - - logger.info("Checking arguments") - par = check_arguments(par) - - logger.info("Reading input (query) data") - mdata = mudata.read(par["input"]) - adata = mdata.mod[par["modality"]] - - logger.info("Reading reference data") - adata_reference = sc.read(par["reference"], backup_url=par["reference"]) - - # fetch feature data - train_data = get_reference_features(adata_reference, par, logger) - query_data = get_query_features(adata, par, logger) - - # pynndescent does not support sparse matrices - if issparse(train_data): - warnings.warn("Converting sparse matrix to dense. This may consume a lot of memory.") - train_data = train_data.toarray() - - logger.debug(f"Shape of train data: {train_data.shape}") - - logger.info("Building NN index") - ref_nn_index = pynndescent.NNDescent(train_data, n_neighbors=par["n_neighbors"]) - ref_nn_index.prepare() - - ref_neighbors, ref_distances = ref_nn_index.query(query_data, k=par["n_neighbors"]) - - weights = distances_to_affinities(ref_distances) - - output_uns_parameters = adata.uns.get(par["output_uns_parameters"], {}) - - # for each annotation level, get prediction and uncertainty - - for obs_tar, obs_pred, obs_unc in zip(par["reference_obs_targets"], par["output_obs_predictions"], par["output_obs_uncertainty"]): - logger.info(f"Predicting labels for {obs_tar}") - ref_cats = adata_reference.obs[obs_tar].cat.codes.to_numpy()[ref_neighbors] - prediction, uncertainty = weighted_prediction(weights, ref_cats) - prediction = np.asarray(adata_reference.obs[obs_tar].cat.categories)[prediction] - - adata.obs[obs_pred], adata.obs[obs_unc] = prediction, uncertainty - - # Write information about labels transfer to uns - output_uns_parameters[obs_tar] = { - "method": "KNN_pynndescent", - "n_neighbors": par["n_neighbors"], - "reference": par["reference"] - } - - adata.uns[par["output_uns_parameters"]] = output_uns_parameters - - mdata.mod[par['modality']] = adata - mdata.update() - mdata.write_h5mu(par['output'].strip()) - -if __name__ == "__main__": - main(par) diff --git a/src/labels_transfer/knn/test.py b/src/labels_transfer/knn/test.py deleted file mode 100644 index d986631dba3..00000000000 --- a/src/labels_transfer/knn/test.py +++ /dev/null @@ -1,80 +0,0 @@ -import pytest -from pathlib import Path -import anndata -import mudata -import numpy as np - -## VIASH START -meta = { - 'executable': './target/executable/labels_transfer/knn/knn', - 'resources_dir': './resources_test/' -} -## VIASH END - -reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" -input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" - -@pytest.fixture -def test_args(tmp_path, request): - obsm_features, obs_targets, output_uns_parameters = request.param - - # read reference - tempfile_reference_file = tmp_path / "reference.h5ad" - reference_adata = anndata.read_h5ad(reference_file) - - # generate reference obs targets - for i, target in enumerate(obs_targets): - class_names = [str(idx) for idx in range(i + 2)] # e.g. ["0", "1", "2"], the higher the level, the more the classes - reference_adata.obs[target] = np.random.choice(class_names, size=reference_adata.n_obs) - - # read input query - tempfile_input_file = tmp_path / "input.h5mu" - input_data = mudata.read_h5mu(input_file) - adata = input_data.mod["rna"] - - # generate features - reference_adata.obsm[obsm_features] = np.random.normal(size=(reference_adata.n_obs, 30)) - adata.obsm[obsm_features] = np.random.normal(size=(adata.n_obs, 30)) - - # write files - reference_adata.write(tempfile_reference_file.name) - input_data.write(tempfile_input_file.name) - - return tempfile_reference_file, reference_adata, tempfile_input_file, adata, obsm_features, obs_targets, output_uns_parameters - -@pytest.mark.parametrize("test_args", [("X_integrated_scvi", ["celltype"], None), ("X_int", ["ann_level_1", "ann_level_2", "ann_level_3"], "lab_tran")], indirect=True) -def test_label_transfer(run_component, test_args): - tempfile_reference_file, reference_adata, tempfile_input_file, query_adata, obsm_features, obs_targets, output_uns_parameters = test_args - - args = [ - "--input", tempfile_input_file.name, - "--modality", "rna", - "--input_obsm_features", obsm_features, - "--reference", tempfile_reference_file.name, - "--reference_obsm_features", obsm_features, - "--reference_obs_targets", ";".join(obs_targets), - "--output", "output.h5mu", - "--n_neighbors", "5" - ] - - if output_uns_parameters is not None: - args.extend(["--output_uns_parameters", output_uns_parameters]) - - run_component(args) - - assert Path("output.h5mu").is_file() - - output_data = mudata.read_h5mu("output.h5mu") - - exp_uns = "labels_transfer" if output_uns_parameters is None else output_uns_parameters - - for target in obs_targets: - assert f"{target}_pred" in output_data.mod["rna"].obs, f"Predictions are missing from output\noutput: {output_data.mod['rna'].obs}" - assert f"{target}_uncertainty" in output_data.mod["rna"].obs, f"Uncertainties are missing from output\noutput: {output_data.mod['rna'].obs}" - assert exp_uns in output_data.mod["rna"].uns, f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" - assert target in output_data.mod["rna"].uns[exp_uns], f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" - assert output_data.mod["rna"].uns[exp_uns][target].get("method") == "KNN_pynndescent", f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}" - assert output_data.mod["rna"].uns[exp_uns][target].get("n_neighbors") == 5, f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" - -if __name__ == '__main__': - exit(pytest.main([__file__])) \ No newline at end of file diff --git a/src/labels_transfer/pynndescent_knn/config.vsh.yaml b/src/labels_transfer/pynndescent_knn/config.vsh.yaml new file mode 100644 index 00000000000..7f1651f8a54 --- /dev/null +++ b/src/labels_transfer/pynndescent_knn/config.vsh.yaml @@ -0,0 +1,67 @@ +name: pynndescent_knn +namespace: "labels_transfer" +description: | + This component generates a neighborhood graph based using the PyNNDescentTransformer, followed by classification using a k-nearest neighborhood vote. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ author ] + +__merge__: ../api/common_arguments.yaml + +argument_groups: + + - name: KNN label transfer arguments + arguments: + - name: "--weights" + type: string + choices: ["uniform", "distance", "gaussian"] + default: "uniform" + description: | + Weight function used in prediction. Possible values are: + - `uniform` - all points in each neighborhood are weighted equally + - `distance` - weight points by the inverse of their distance + - `gaussian` - weight points by the sum of their Gaussian kernel similarities to each sample + - name: "--n_neighbors" + type: integer + min: 5 + default: 15 + description: | + The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. + Larger values will result in more accurate search results at the cost of computation time. +resources: + - type: python_script + path: script.py + - path: ../utils/helper.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/ + - path: /resources_test/pbmc_1k_protein_v3/ + +engines: + - type: docker + image: python:3.12 + setup: + - type: apt + packages: + - procps + - pkg-config + - libhdf5-dev + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + - type: python + packages: + - pynndescent~=0.5.10 + - numpy<2 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml ] + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, highcpu] diff --git a/src/labels_transfer/pynndescent_knn/script.py b/src/labels_transfer/pynndescent_knn/script.py new file mode 100644 index 00000000000..18773fc582a --- /dev/null +++ b/src/labels_transfer/pynndescent_knn/script.py @@ -0,0 +1,119 @@ +import mudata as mu +import numpy as np +import sys +from pynndescent import PyNNDescentTransformer +from sklearn.neighbors import KNeighborsClassifier + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_obsm_features": None, + "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "reference_obsm_features": None, + "reference_obs_targets": ["cell_type"], + "output": "foo_distance.h5mu", + "output_obs_predictions": None, + "output_obs_probability": None, + "output_uns_parameters": "labels_transfer", + "output_compression": None, + "weights": "distance", + "n_neighbors": 15 +} +meta = { + "resources_dir": "src/labels_transfer/utils" +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import check_arguments, get_reference_features, get_query_features + + +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger +# END TEMPORARY WORKAROUND setup_logger + + +def distances_to_affinities(distances): + # Apply Gaussian kernel to distances + stds = np.std(distances, axis=1) + stds = (2.0 / stds) ** 2 + stds = stds.reshape(-1, 1) + distances_tilda = np.exp(-np.true_divide(distances, stds)) + + # normalize the distances_tilda + # if the sum of a row of the distances tilda equals 0, + # set normalized distances for that row to 1 + # else divide the row values by the value of the sum of the row + distances_tilda_normalized = np.where( + np.sum(distances_tilda, axis=1, keepdims=True) == 0, + 1, + distances_tilda / np.sum(distances_tilda, axis=1, keepdims=True) + ) + return distances_tilda_normalized + + +logger = setup_logger() + +# Reading in data +logger.info(f"Reading in query dataset {par['input']} and reference datasets {par['reference']}") +q_mdata = mu.read_h5mu(par["input"]) +q_adata = q_mdata.mod[par["modality"]] + +r_mdata = mu.read_h5mu(par["reference"]) +r_adata = r_mdata.mod[par["modality"]] + +# check arguments +logger.info("Checking arguments") +par = check_arguments(par) + +# Generating training and inference data +logger.info("Generating training and inference data") +train_X = get_reference_features(r_adata, par, logger) +inference_X = get_query_features(q_adata, par, logger) + +neighbors_transformer = PyNNDescentTransformer( + n_neighbors=par["n_neighbors"], + parallel_batch_queries=True, +) +neighbors_transformer.fit(train_X) + +# Square sparse matrix with distances to n neighbors in reference data +reference_neighbors = neighbors_transformer.transform(inference_X) +query_neighbors = neighbors_transformer.transform(train_X) + +# For each target, train a classifier and predict labels +for obs_tar, obs_pred, obs_proba in zip(par["reference_obs_targets"], par["output_obs_predictions"], par["output_obs_probability"]): + logger.info(f"Predicting labels for {obs_tar}") + + weights_dict = { + "uniform": "uniform", + "distance": "distance", + "gaussian": distances_to_affinities + } + + logger.info(f"Using KNN classifier with {par['weights']} weights") + train_y = r_adata.obs[obs_tar].to_numpy() + classifier = KNeighborsClassifier(n_neighbors=par["n_neighbors"], metric="precomputed", weights=weights_dict[par["weights"]]) + classifier.fit(X=query_neighbors, y=train_y) + predicted_labels = classifier.predict(reference_neighbors) + probabilities = classifier.predict_proba(reference_neighbors).max(axis=1) + + # save_results + logger.info(f"Saving predictions to {obs_pred} and probabilities to {obs_proba} in obs") + q_adata.obs[obs_pred] = predicted_labels + q_adata.obs[obs_proba] = probabilities + +logger.info(f"Saving output data to {par['output']}") +q_mdata.mod[par['modality']] = q_adata +q_mdata.write_h5mu(par['output'], compression=par['output_compression']) diff --git a/src/labels_transfer/pynndescent_knn/test.py b/src/labels_transfer/pynndescent_knn/test.py new file mode 100644 index 00000000000..cabc57ad6e3 --- /dev/null +++ b/src/labels_transfer/pynndescent_knn/test.py @@ -0,0 +1,70 @@ +import pytest +from pathlib import Path +import anndata as ad +import mudata as mu + +## VIASH START +meta = { + 'resources_dir': './resources_test/' +} +## VIASH END + +reference_h5ad_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" +# convert reference to h5mu +reference_adata = ad.read_h5ad(reference_h5ad_file) +reference_mdata = mu.MuData({"rna": reference_adata}) +reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu" +reference_mdata.write_h5mu(reference_file) +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +def test_label_transfer(run_component): + + args = [ + "--input", input_file, + "--modality", "rna", + "--reference", reference_file, + "--reference_obs_targets", "cell_type", + "--output", "output.h5mu", + "--n_neighbors", "5" + ] + + run_component(args) + + assert Path("output.h5mu").is_file() + + output_data = mu.read_h5mu("output.h5mu") + + assert "cell_type_pred" in output_data.mod["rna"].obs, f"Predictions cell_type_pred is missing from output\noutput: {output_data.mod['rna'].obs}" + assert "cell_type_probability" in output_data.mod["rna"].obs, f"Uncertainties cell_type_probability is missing from output\noutput: {output_data.mod['rna'].obs}" + + +@pytest.mark.parametrize("weights", ["uniform", "distance", "gaussian"]) +def test_label_transfer_prediction_columns(run_component, weights): + + output = f"output_{weights}.h5mu" + + args = [ + "--input", input_file, + "--modality", "rna", + "--reference", reference_file, + "--reference_obs_targets", "cell_type", + "--weights", weights, + "--output", output, + "--output_obs_probability", "test_probability", + "--output_obs_predictions", "test_prediction", + "--n_neighbors", "5" + ] + + run_component(args) + + assert Path(output).is_file() + + output_data = mu.read_h5mu(output) + + assert "test_prediction" in output_data.mod["rna"].obs, f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}" + assert "test_probability" in output_data.mod["rna"].obs, f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}" + + +if __name__ == '__main__': + exit(pytest.main([__file__])) diff --git a/src/labels_transfer/utils/helper.py b/src/labels_transfer/utils/helper.py index a90bf59efdb..be879425afe 100644 --- a/src/labels_transfer/utils/helper.py +++ b/src/labels_transfer/utils/helper.py @@ -1,3 +1,5 @@ +from scipy.sparse import issparse + def check_arguments(par): # check output .obs predictions if not par["output_obs_predictions"]: @@ -5,9 +7,9 @@ def check_arguments(par): assert len(par["output_obs_predictions"]) == len(par["reference_obs_targets"]), f"Number of output_obs_predictions must match number of reference_obs_targets\npar: {par}" # check output .obs uncertainty - if not par["output_obs_uncertainty"]: - par["output_obs_uncertainty"] = [ t + "_uncertainty" for t in par["reference_obs_targets"]] - assert len(par["output_obs_uncertainty"]) == len(par["reference_obs_targets"]), f"Number of output_obs_uncertainty must match number of reference_obs_targets\npar: {par}" + if not par["output_obs_probability"]: + par["output_obs_probability"] = [ t + "_probability" for t in par["reference_obs_targets"]] + assert len(par["output_obs_probability"]) == len(par["reference_obs_targets"]), f"Number of output_obs_probability must match number of reference_obs_targets\npar: {par}" return par @@ -29,4 +31,4 @@ def get_query_features(adata, par, logger): logger.info(f"Using .obsm[{par['input_obsm_features']}] of query data") query_data = adata.obsm[par["input_obsm_features"]] - return query_data \ No newline at end of file + return query_data diff --git a/src/labels_transfer/xgboost/config.vsh.yaml b/src/labels_transfer/xgboost/config.vsh.yaml index 34b4fabe3ee..4baf79f07d7 100644 --- a/src/labels_transfer/xgboost/config.vsh.yaml +++ b/src/labels_transfer/xgboost/config.vsh.yaml @@ -30,6 +30,11 @@ argument_groups: description: Output directory for model direction: output required: false + - name: "--output_uns_parameters" + type: string + default: "xgboost_parameters" + description: The key in `uns` slot of the output AnnData object to store the parameters of the XGBoost classifier. + required: false - name: "Learning parameters" arguments: - name: "--learning_rate" diff --git a/src/labels_transfer/xgboost/script.py b/src/labels_transfer/xgboost/script.py index 6cc11aa22d5..7306975c566 100644 --- a/src/labels_transfer/xgboost/script.py +++ b/src/labels_transfer/xgboost/script.py @@ -25,7 +25,7 @@ "reference_obs_targets": ["ann_level_1", "ann_level_2", "ann_level_3", "ann_level_4", "ann_level_5", "ann_finest_level"], "output": "foo.h5mu", "output_obs_predictions": None, - "output_obs_uncertainty": None, + "output_obs_probability": None, "output_uns_parameters": "labels_transfer", "force_retrain": False, "use_gpu": True, @@ -256,27 +256,27 @@ def project_labels( query_dataset, cell_type_classifier_model: xgb.XGBClassifier, annotation_column_name='label_pred', - uncertainty_column_name='label_uncertainty', - uncertainty_thresh=None # Note: currently not passed to predict function + probability_column_name='label_probability', + probability_thresh=None # Note: currently not passed to predict function ): """ - A function that projects predicted labels onto the query dataset, along with uncertainty scores. + A function that projects predicted labels onto the query dataset, along with probability estimations. Performs in-place update of the adata object, adding columns to the `obs` DataFrame. Input: * `query_dataset`: The query `AnnData` object * `model_file`: Path to the classification model file * `prediction_key`: Column name in `adata.obs` where to store the predicted labels - * `uncertainty_key`: Column name in `adata.obs` where to store the uncertainty scores - * `uncertainty_thresh`: The uncertainty threshold above which we call a cell 'Unknown' + * `probability_key`: Column name in `adata.obs` where to store the labels probabilities + * `probability_thresh`: The probability threshold below which we call a cell 'Unknown' Output: Nothing is output, the passed anndata is modified inplace """ - if (uncertainty_thresh is not None) and (uncertainty_thresh < 0 or uncertainty_thresh > 1): - raise ValueError(f'`uncertainty_thresh` must be `None` or between 0 and 1.') + if (probability_thresh is not None) and (probability_thresh < 0 or probability_thresh > 1): + raise ValueError(f'`probability_thresh` must be `None` or between 0 and 1.') query_data = get_query_features(query_dataset, par, logger) @@ -288,14 +288,14 @@ def project_labels( # Format probabilities df_probs = pd.DataFrame(probs, columns=cell_type_classifier_model.classes_, index=query_dataset.obs_names) - query_dataset.obs[uncertainty_column_name] = 1 - df_probs.max(1) + query_dataset.obs[probability_column_name] = df_probs.max(1) # Note: this is here in case we want to propose a set of values for the user to accept to seed the # manual curation of predicted labels - if uncertainty_thresh is not None: + if probability_thresh is not None: logger.info("Marking uncertain predictions") query_dataset.obs[annotation_column_name + "_filtered"] = [ - val if query_dataset.obs[uncertainty_column_name][i] < uncertainty_thresh + val if query_dataset.obs[probability_column_name][i] >= probability_thresh else "Unknown" for i, val in enumerate(query_dataset.obs[annotation_column_name])] return query_dataset @@ -306,7 +306,7 @@ def predict( cell_type_classifier_model_path, annotation_column_name: str, prediction_column_name: str, - uncertainty_column_name: str, + probability_column_name: str, models_info, use_gpu: bool = False ) -> pd.DataFrame: @@ -328,7 +328,7 @@ def predict( project_labels(query_dataset, cell_type_classifier_model, annotation_column_name=prediction_column_name, - uncertainty_column_name=uncertainty_column_name) + probability_column_name=probability_column_name) logger.info("Converting labels from numbers to classes") labels_encoder = LabelEncoder() @@ -342,10 +342,11 @@ def main(par): logger.info("Checking arguments") par = check_arguments(par) - mdata = mudata.read(par["input"].strip()) - adata = mdata.mod[par["modality"]] + mdata_query = mudata.read(par["input"].strip()) + adata_query = mdata_query.mod[par["modality"]] - adata_reference = sc.read(par["reference"], backup_url=par["reference"]) + mdata_reference = mudata.read(par["reference"]) + adata_reference = mdata_reference.mod[par["modality"]] # If classifiers for targets are in the model_output directory, simply open them and run (unless `retrain` != True) # If some classifiers are missing, train and save them first @@ -363,19 +364,19 @@ def main(par): build_ref_classifiers(adata_reference, targets_to_train, model_path=par["model_output"], gpu=par["use_gpu"], eval_verbosity=par["verbosity"]) - output_uns_parameters = adata.uns.get(par["output_uns_parameters"], {}) + output_uns_parameters = adata_query.uns.get(par["output_uns_parameters"], {}) with open(par["model_output"] + "/model_info.json", "r") as f: models_info = json.loads(f.read()) - for obs_target, obs_pred, obs_unc in zip(par["reference_obs_targets"], par["output_obs_predictions"], par["output_obs_uncertainty"]): + for obs_target, obs_pred, obs_unc in zip(par["reference_obs_targets"], par["output_obs_predictions"], par["output_obs_probability"]): logger.info(f"Predicting {obs_target}") - adata = predict(query_dataset=adata, + adata_query = predict(query_dataset=adata_query, cell_type_classifier_model_path=os.path.join(par["model_output"], "classifier_" + obs_target + ".xgb"), annotation_column_name=obs_target, prediction_column_name=obs_pred, - uncertainty_column_name=obs_unc, + probability_column_name=obs_unc, models_info=models_info, use_gpu=par["use_gpu"]) @@ -386,14 +387,14 @@ def main(par): **training_params } - adata.uns[par["output_uns_parameters"]] = output_uns_parameters + adata_query.uns[par["output_uns_parameters"]] = output_uns_parameters logger.info("Updating mdata") - mdata.mod[par['modality']] = adata - mdata.update() + mdata_query.mod[par['modality']] = adata_query + mdata_query.update() logger.info("Writing output") - mdata.write_h5mu(par['output'].strip()) + mdata_query.write_h5mu(par['output'].strip()) if __name__ == "__main__": main(par) diff --git a/src/labels_transfer/xgboost/test.py b/src/labels_transfer/xgboost/test.py index 3a81bbfe4e9..3dcfdfef778 100644 --- a/src/labels_transfer/xgboost/test.py +++ b/src/labels_transfer/xgboost/test.py @@ -12,7 +12,7 @@ } ## VIASH END -reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" +reference_h5ad_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" @@ -20,11 +20,11 @@ def test_args(tmp_path, request): obsm_features, obs_targets, output_uns_parameters = request.param - tempfile_reference_file = tmp_path / "reference.h5ad" + tempfile_reference_file = tmp_path / "reference.h5mu" tempfile_input_file = tmp_path / "input.h5mu" # read reference - reference_adata = anndata.read_h5ad(reference_file) + reference_adata = anndata.read_h5ad(reference_h5ad_file) # generate reference obs targets for i, target in enumerate(obs_targets): @@ -39,8 +39,9 @@ def test_args(tmp_path, request): reference_adata.obsm[obsm_features] = np.random.normal(size=(reference_adata.n_obs, 30)) input_rna_adata.obsm[obsm_features] = np.random.normal(size=(input_rna_adata.n_obs, 30)) + reference_mdata = mudata.MuData({"rna": reference_adata}) # write files - reference_adata.write_h5ad(str(tempfile_reference_file)) + reference_mdata.write_h5mu(str(tempfile_reference_file)) input_mudata.write_h5mu(str(tempfile_input_file)) return tempfile_reference_file, reference_adata, tempfile_input_file, input_rna_adata, obsm_features, obs_targets, output_uns_parameters @@ -72,11 +73,11 @@ def test_label_transfer(run_component, test_args): output_data = mudata.read_h5mu("output.h5mu") - exp_uns = "labels_transfer" if output_uns_parameters is None else output_uns_parameters + exp_uns = "xgboost_parameters" if output_uns_parameters is None else output_uns_parameters for target in obs_targets: assert f"{target}_pred" in output_data.mod["rna"].obs, f"Predictions are missing from output\noutput: {output_data.mod['rna'].obs}" - assert f"{target}_uncertainty" in output_data.mod["rna"].obs, f"Uncertainties are missing from output\noutput: {output_data.mod['rna'].obs}" + assert f"{target}_probability" in output_data.mod["rna"].obs, f"Probabilities are missing from output\noutput: {output_data.mod['rna'].obs}" assert exp_uns in output_data.mod["rna"].uns, f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" assert target in output_data.mod["rna"].uns[exp_uns], f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" assert output_data.mod["rna"].uns[exp_uns][target].get("method") == "XGBClassifier", f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}" @@ -126,7 +127,7 @@ def test_retraining(run_component, test_args, tmp_path): for target in obs_targets: assert f"{target}_pred" in output_data.mod["rna"].obs, f"Predictions are missing from output\noutput: {output_data.mod['rna'].obs}" - assert f"{target}_uncertainty" in output_data.mod["rna"].obs, f"Uncertainties are missing from output\noutput: {output_data.mod['rna'].obs}" + assert f"{target}_probability" in output_data.mod["rna"].obs, f"Probabilities are missing from output\noutput: {output_data.mod['rna'].obs}" assert output_uns_parameters in output_data.mod["rna"].uns, f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" assert target in output_data.mod["rna"].uns[output_uns_parameters], f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" assert output_data.mod["rna"].uns[output_uns_parameters][target].get("method") == "XGBClassifier", f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}"