# Patch summary (review notes; this text sits ahead of the first `diff --git`
# header and is not part of any hunk).
#
# anndata/_io/specs/methods.py -- write_dataframe
#   * Raises ValueError naming the repeated labels when `df.columns` is not
#     unique.
#   * When `df.index.name` is set and is also a column name, raises ValueError
#     unless `pd.Series(df.index, index=df.index).equals(df[df.index.name])`.
#   NOTE(review): both checks sit after `group = f.require_group(key)`, so the
#   group has already been requested from the store when either error is
#   raised -- confirm that leaving it behind is acceptable.
#   NOTE(review): `df.columns.duplicated()` is called with pandas' default
#   keep="first", so a label occurring three times is listed twice in the
#   error message.
#
# anndata/tests/test_io_elementwise.py -- test_dataframe_column_uniqueness
#   Covers the repeated-column error, the index-name/column mismatch error,
#   and a write/read round trip when the shared name carries identical values.
#
# docs/release-notes/0.10.6.md
#   Adds the release-note entry for the repeated-column error.
#
# Line breaks and indentation were reconstructed from a whitespace-collapsed
# copy of this patch; context-line indentation is a best-effort
# reconstruction -- TODO confirm against the target files before applying.
diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py
index fde9c0f34..70bd36945 100644
--- a/anndata/_io/specs/methods.py
+++ b/anndata/_io/specs/methods.py
@@ -663,10 +663,23 @@ def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})):
         if reserved in df.columns:
             raise ValueError(f"{reserved!r} is a reserved name for dataframe columns.")
     group = f.require_group(key)
+    if not df.columns.is_unique:
+        duplicates = list(df.columns[df.columns.duplicated()])
+        raise ValueError(
+            f"Found repeated column names: {duplicates}. Column names must be unique."
+        )
     col_names = [check_key(c) for c in df.columns]
     group.attrs["column-order"] = col_names
 
     if df.index.name is not None:
+        if df.index.name in col_names and not pd.Series(
+            df.index, index=df.index
+        ).equals(df[df.index.name]):
+            raise ValueError(
+                f"DataFrame.index.name ({df.index.name!r}) is also used by a column "
+                "whose values are different. This is not supported. Please make sure "
+                "the values are the same, or use a different name."
+            )
         index_name = df.index.name
     else:
         index_name = "_index"
diff --git a/anndata/tests/test_io_elementwise.py b/anndata/tests/test_io_elementwise.py
index 08853b6c4..7f7dac4dd 100644
--- a/anndata/tests/test_io_elementwise.py
+++ b/anndata/tests/test_io_elementwise.py
@@ -300,3 +300,32 @@ def test_read_zarr_from_group(tmp_path, consolidated):
     with read_func(pth) as z:
         expected = ad.read_zarr(z["table/table"])
     assert_equal(adata, expected)
+
+
+def test_dataframe_column_uniqueness(store):
+    repeated_cols = pd.DataFrame(np.ones((3, 2)), columns=["a", "a"])
+
+    with pytest_8_raises(
+        ValueError,
+        match=r"Found repeated column names: \['a'\]\. Column names must be unique\.",
+    ):
+        write_elem(store, "repeated_cols", repeated_cols)
+
+    index_shares_col_name = pd.DataFrame(
+        {"col_name": [1, 2, 3]}, index=pd.Index([1, 3, 2], name="col_name")
+    )
+
+    with pytest_8_raises(
+        ValueError,
+        match=r"DataFrame\.index\.name \('col_name'\) is also used by a column whose values are different\.",
+    ):
+        write_elem(store, "index_shares_col_name", index_shares_col_name)
+
+    index_shared_okay = pd.DataFrame(
+        {"col_name": [1, 2, 3]}, index=pd.Index([1, 2, 3], name="col_name")
+    )
+
+    write_elem(store, "index_shared_okay", index_shared_okay)
+    result = read_elem(store["index_shared_okay"])
+
+    assert_equal(result, index_shared_okay)
diff --git a/docs/release-notes/0.10.6.md b/docs/release-notes/0.10.6.md
index 4a978f4dd..052523979 100644
--- a/docs/release-notes/0.10.6.md
+++ b/docs/release-notes/0.10.6.md
@@ -4,6 +4,7 @@
 ```
 
 * Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold`
+* Writing a dataframe with non-unique column names now throws an error, instead of silently overwriting {pr}`1335` {user}`ivirshup`
 
 ```{rubric} Documentation
 ```