7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,10 @@
+# openpipelines 1.0.0-rc6
+
+## BUG FIXES
+
+* `dataflow/concatenate_h5mu`: fix regression bug where observations are no longer linked to the correct metadata
+  after concatenation (PR #807)
+
 # openpipelines 1.0.0-rc5
 
 ## BUG FIXES
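In short, the regression meant merged `.obs` metadata could come back in a different row order than the concatenated object's index; the fix (see `script.py` below) reindexes every merged annotation frame, `.obs` as well as `.var`, to the output index. A minimal pandas sketch of the mechanism, with invented frames and cell names:

```python
import pandas as pd

# Metadata from two hypothetical samples; pd.concat(join="outer") emits rows
# in encounter order, which need not match the concatenated object's index.
obs_a = pd.DataFrame({"batch": ["x", "x"]}, index=["cell1", "cell2"])
obs_b = pd.DataFrame({"batch": ["y"]}, index=["cell0"])
merged = pd.concat([obs_a, obs_b], join="outer", sort=False)
print(merged.index.tolist())   # ['cell1', 'cell2', 'cell0']

# Reindexing ties each metadata row back to its observation by label,
# which is what the unconditional .reindex(align_to) calls below do.
output_index = pd.Index(["cell0", "cell1", "cell2"])
aligned = merged.reindex(output_index, copy=False)
print(aligned.index.tolist())  # ['cell0', 'cell1', 'cell2']
```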
7 changes: 4 additions & 3 deletions src/base/openpipelinetestutils/utils.py
@@ -47,9 +47,10 @@ def _get_columns_in_all_modalities(annotation_object, axis_string: str):
                              if column_name not in global_columns]
     extra_cols_to_remove += [column_name for column_name in column_names
                              if column_name in global_columns]
-    axis_setter(annotation_object, axis_getter(annotation_object).drop(extra_cols_to_remove,
-                                                                       axis="columns",
-                                                                       inplace=False))
+    if modality_name:
+        axis_setter(annotation_object, axis_getter(annotation_object).drop(extra_cols_to_remove,
+                                                                           axis="columns",
+                                                                           inplace=False))
 
     for mod_name in modality_names:
         modality = annotation_object.mod[mod_name]
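The test-utility hunk above only wraps the existing column drop in an `if modality_name:` guard, so the helper no longer strips columns when `modality_name` is unset (its origin lies outside the visible hunk). As a reminder of the pattern used there, `drop(..., inplace=False)` returns a pruned copy, which is why the result has to be written back through the axis setter:

```python
import pandas as pd

df = pd.DataFrame({"keep": [1, 2], "extra": [3, 4]})
pruned = df.drop(["extra"], axis="columns", inplace=False)
print(list(pruned.columns))  # ['keep']
print(list(df.columns))      # ['keep', 'extra'] -- the original is untouched
```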
4 changes: 2 additions & 2 deletions src/dataflow/concat/config.vsh.yaml
@@ -63,7 +63,7 @@ functionality:
     - path: /resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
 platforms:
   - type: docker
-    image: python:3.10-slim
+    image: python:3.11-slim
     setup:
       - type: apt
         packages:
@@ -72,9 +72,9 @@
         __merge__: [/src/base/requirements/anndata_mudata.yaml, .]
         packages:
           - pandas~=2.1.1
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
     test_setup:
       - type: python
-        __merge__: [ /src/base/requirements/viashpy.yaml, .]
         packages:
           - muon
   - type: native
4 changes: 2 additions & 2 deletions src/dataflow/concatenate_h5mu/config.vsh.yaml
@@ -62,7 +62,7 @@ functionality:
     - path: /resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
 platforms:
   - type: docker
-    image: python:3.10-slim
+    image: python:3.11-slim
     setup:
       - type: apt
         packages:
@@ -71,9 +71,9 @@
         __merge__: [/src/base/requirements/anndata_mudata.yaml, .]
         packages:
           - pandas~=2.1.1
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
     test_setup:
       - type: python
-        __merge__: [ /src/base/requirements/viashpy.yaml, .]
         packages:
           - muon
   - type: native
17 changes: 7 additions & 10 deletions src/dataflow/concatenate_h5mu/script.py
@@ -121,7 +121,7 @@ def any_row_contains_duplicate_values(n_processes: int, frame: pd.DataFrame) ->
         is_duplicated = pool.map(nunique, iter(numpy_array))
     return any(is_duplicated)
 
-def concatenate_matrices(n_processes: int, matrices: dict[str, pd.DataFrame], align_to: pd.Index | None) \
+def concatenate_matrices(n_processes: int, matrices: dict[str, pd.DataFrame], align_to: pd.Index) \
         -> tuple[dict[str, pd.DataFrame], pd.DataFrame | None, dict[str, pd.core.dtypes.dtypes.Dtype]]:
     """
     Merge matrices by combining columns that have the same name.
@@ -152,7 +152,7 @@ def get_first_non_na_value_vector(df):
 def split_conflicts_and_concatenated_columns(n_processes: int,
                                              matrices: dict[str, pd.DataFrame],
                                              column_names: Iterable[str],
-                                             align_to: pd.Index | None = None) -> \
+                                             align_to: pd.Index) -> \
         tuple[dict[str, pd.DataFrame], pd.DataFrame]:
     """
     Retrieve columns with the same name from a list of dataframes which are
@@ -172,8 +172,7 @@ def split_conflicts_and_concatenated_columns(n_processes: int,
                                           join="outer", sort=False)
         if any_row_contains_duplicate_values(n_processes, concatenated_columns):
             concatenated_columns.columns = columns.keys() # Use the sample id as column name
-            if align_to is not None:
-                concatenated_columns = concatenated_columns.reindex(align_to, copy=False)
+            concatenated_columns = concatenated_columns.reindex(align_to, copy=False)
             conflicts[f'conflict_{column_name}'] = concatenated_columns
         else:
             unique_values = get_first_non_na_value_vector(concatenated_columns)
@@ -182,8 +181,7 @@ def split_conflicts_and_concatenated_columns(n_processes: int,
         return conflicts, pd.DataFrame(index=align_to)
     concatenated_matrix = pd.concat(concatenated_matrix, join="outer",
                                     axis=1, sort=False)
-    if align_to is not None:
-        concatenated_matrix = concatenated_matrix.reindex(align_to, copy=False)
+    concatenated_matrix = concatenated_matrix.reindex(align_to, copy=False)
     return conflicts, concatenated_matrix
 
 def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame:
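The two hunks above make both the conflict path and the non-conflict path align to `align_to` unconditionally. For context, a single-process sketch (with invented sample names and values) of how a same-named column becomes a `conflict_<name>` matrix; the component runs the row check in parallel via `any_row_contains_duplicate_values`:

```python
import pandas as pd

# The same-named annotation column from two hypothetical samples, put side
# by side the way split_conflicts_and_concatenated_columns does.
col_a = pd.Series(["Gene", "Gene"], index=["g1", "g2"], name="feature_type")
col_b = pd.Series(["Gene", "Peak"], index=["g1", "g2"], name="feature_type")
concatenated = pd.concat({"sample_a": col_a, "sample_b": col_b},
                         axis=1, join="outer", sort=False)

# g2 carries two distinct non-NA values across samples, so the column is a
# conflict: it is kept as a matrix with one column per sample id, reindexed
# to align_to, and stored under 'conflict_feature_type'.
print((concatenated.nunique(axis=1) > 1).any())  # True
```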
@@ -220,8 +218,7 @@ def split_conflicts_modalities(n_processes: int, samples: dict[str, anndata.AnnD
     for matrix_name in matrices_to_parse:
         matrices = {sample_id: getattr(sample, matrix_name) for sample_id, sample in samples.items()}
         output_index = getattr(output, matrix_name).index
-        align_to = output_index if matrix_name == "var" else None
-        conflicts, concatenated_matrix = concatenate_matrices(n_processes, matrices, align_to)
+        conflicts, concatenated_matrix = concatenate_matrices(n_processes, matrices, output_index)
         if concatenated_matrix.empty:
             concatenated_matrix.index = output_index
         # Write the conflicts to the output
@@ -238,7 +235,7 @@ def concatenate_modality(n_processes: int, mod: str, input_files: Iterable[str |
                          other_axis_mode: str, input_ids: tuple[str]) -> anndata.AnnData:
 
     concat_modes = {
-        "move": None,
+        "move": "unique",
     }
     other_axis_mode_to_apply = concat_modes.get(other_axis_mode, other_axis_mode)
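Mapping `"move"` to `"unique"` rather than `None` means metadata on the other axis is no longer dropped wholesale during concatenation. Assuming, as the surrounding code suggests, that the mapped value ends up in `anndata.concat`'s `merge=` argument, the two strategies differ like this (sample data is invented for illustration):

```python
import anndata
import numpy as np
import pandas as pd

def make_var(src: str) -> pd.DataFrame:
    # 'gene_symbol' agrees across samples; 'source' differs per sample.
    return pd.DataFrame({"gene_symbol": ["A", "B"], "source": [src, src]},
                        index=["g1", "g2"])

a = anndata.AnnData(X=np.zeros((1, 2)), var=make_var("v1"),
                    obs=pd.DataFrame(index=["c1"]))
b = anndata.AnnData(X=np.zeros((1, 2)), var=make_var("v2"),
                    obs=pd.DataFrame(index=["c2"]))

# merge=None drops all .var columns; merge="unique" keeps the columns whose
# value agrees across samples and drops only the conflicting ones.
print(anndata.concat([a, b], merge=None).var.columns.tolist())      # []
print(anndata.concat([a, b], merge="unique").var.columns.tolist())  # ['gene_symbol']
```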

@@ -247,7 +244,7 @@
         try:
             mod_data[input_id] = mu.read_h5ad(input_file, mod=mod)
         except KeyError as e: # Modality does not exist for this sample, skip it
-            if f"Unable to open object '{mod}' doesn't exist" not in str(e):
+            if f"Unable to synchronously open object (object '{mod}' doesn't exist)" not in str(e):
                 raise e
             pass
     check_observations_unique(mod_data.values())
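The updated string matches the error wording of newer HDF5 builds (the 1.14 series prefixes object-open failures with "Unable to synchronously ..."), which is presumably what the h5py wheels in the bumped python:3.11-slim test images produce. A small sketch of where that KeyError text originates (file and group names are hypothetical):

```python
import h5py

with h5py.File("example.h5mu", "w") as f:  # hypothetical file
    f.create_group("mod/rna")

with h5py.File("example.h5mu", "r") as f:
    try:
        f["mod/atac"]  # modality missing from this sample
    except KeyError as e:
        # newer builds: "Unable to synchronously open object (object 'atac' doesn't exist)"
        # older builds: "Unable to open object (object 'atac' doesn't exist)"
        print(str(e))
```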