From 7c205aa4936373aab571c2399c5362cf146b7758 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 26 Mar 2026 16:24:49 +0100 Subject: [PATCH 1/7] Update tests to reduce amount of times py_api fixture is used --- tests/routers/openml/dataset_tag_test.py | 106 ++--- .../openml/datasets_list_datasets_test.py | 426 +++++++++--------- tests/routers/openml/datasets_test.py | 359 ++++++++------- 3 files changed, 462 insertions(+), 429 deletions(-) diff --git a/tests/routers/openml/dataset_tag_test.py b/tests/routers/openml/dataset_tag_test.py index a9444c88..41746f83 100644 --- a/tests/routers/openml/dataset_tag_test.py +++ b/tests/routers/openml/dataset_tag_test.py @@ -4,10 +4,12 @@ import pytest from sqlalchemy.ext.asyncio import AsyncConnection -from core.errors import AuthenticationFailedError, TagAlreadyExistsError +from core.errors import TagAlreadyExistsError from database.datasets import get_tags_for +from database.users import User +from routers.openml.datasets import tag_dataset from tests import constants -from tests.users import ApiKey +from tests.users import ADMIN_USER, OWNER_USER, SOME_USER, ApiKey @pytest.mark.parametrize( @@ -22,73 +24,71 @@ async def test_dataset_tag_rejects_unauthorized(key: ApiKey, py_api: httpx.Async json={"data_id": next(iter(constants.PRIVATE_DATASET_ID)), "tag": "test"}, ) assert response.status_code == HTTPStatus.UNAUTHORIZED - assert response.headers["content-type"] == "application/problem+json" - error = response.json() - assert error["type"] == AuthenticationFailedError.uri - assert error["code"] == "103" + + +@pytest.mark.parametrize( + "tag", + ["", "h@", " a", "a" * 65], + ids=["too short", "@", "space", "too long"], +) +async def test_dataset_tag_invalid_tag_is_rejected( + # Constraints for the tag are handled by FastAPI + tag: str, + py_api: httpx.AsyncClient, +) -> None: + new = await py_api.post( + f"/datasets/tag?api_key={ApiKey.ADMIN}", + json={"data_id": 1, "tag": tag}, + ) + + assert new.status_code == HTTPStatus.UNPROCESSABLE_ENTITY + assert new.json()["detail"][0]["loc"] == ["body", "tag"] + + +# ── Direct call tests: tag_dataset ── @pytest.mark.mut @pytest.mark.parametrize( - "key", - [ApiKey.ADMIN, ApiKey.SOME_USER, ApiKey.OWNER_USER], + "user", + [ADMIN_USER, SOME_USER, OWNER_USER], ids=["administrator", "non-owner", "owner"], ) -async def test_dataset_tag( - key: ApiKey, expdb_test: AsyncConnection, py_api: httpx.AsyncClient -) -> None: +async def test_dataset_tag(user: User, expdb_test: AsyncConnection) -> None: dataset_id, tag = next(iter(constants.PRIVATE_DATASET_ID)), "test" - response = await py_api.post( - f"/datasets/tag?api_key={key}", - json={"data_id": dataset_id, "tag": tag}, + result = await tag_dataset( + data_id=dataset_id, + tag=tag, + user=user, + expdb_db=expdb_test, ) - assert response.status_code == HTTPStatus.OK - assert response.json() == {"data_tag": {"id": str(dataset_id), "tag": [tag]}} + assert result == {"data_tag": {"id": str(dataset_id), "tag": [tag]}} tags = await get_tags_for(id_=dataset_id, connection=expdb_test) assert tag in tags @pytest.mark.mut -async def test_dataset_tag_returns_existing_tags(py_api: httpx.AsyncClient) -> None: - dataset_id, tag = 1, "test" - response = await py_api.post( - f"/datasets/tag?api_key={ApiKey.ADMIN}", - json={"data_id": dataset_id, "tag": tag}, +async def test_dataset_tag_returns_existing_tags(expdb_test: AsyncConnection) -> None: + dataset_id, tag = 1, "test" # Dataset 1 already is tagged with 'study_14' + result = await tag_dataset( + data_id=dataset_id, + tag=tag, + user=ADMIN_USER, + expdb_db=expdb_test, ) - assert response.status_code == HTTPStatus.OK - assert response.json() == {"data_tag": {"id": str(dataset_id), "tag": ["study_14", tag]}} + assert result == {"data_tag": {"id": str(dataset_id), "tag": ["study_14", tag]}} @pytest.mark.mut -async def test_dataset_tag_fails_if_tag_exists(py_api: httpx.AsyncClient) -> None: +async def test_dataset_tag_fails_if_tag_exists(expdb_test: AsyncConnection) -> None: dataset_id, tag = 1, "study_14" # Dataset 1 already is tagged with 'study_14' - response = await py_api.post( - f"/datasets/tag?api_key={ApiKey.ADMIN}", - json={"data_id": dataset_id, "tag": tag}, - ) - assert response.status_code == HTTPStatus.CONFLICT - assert response.headers["content-type"] == "application/problem+json" - error = response.json() - assert error["type"] == TagAlreadyExistsError.uri - assert error["code"] == "473" - assert str(dataset_id) in error["detail"] - assert tag in error["detail"] - - -@pytest.mark.parametrize( - "tag", - ["", "h@", " a", "a" * 65], - ids=["too short", "@", "space", "too long"], -) -async def test_dataset_tag_invalid_tag_is_rejected( - tag: str, - py_api: httpx.AsyncClient, -) -> None: - new = await py_api.post( - f"/datasets/tag?api_key={ApiKey.ADMIN}", - json={"data_id": 1, "tag": tag}, - ) - - assert new.status_code == HTTPStatus.UNPROCESSABLE_ENTITY - assert new.json()["detail"][0]["loc"] == ["body", "tag"] + with pytest.raises(TagAlreadyExistsError) as e: + await tag_dataset( + data_id=dataset_id, + tag=tag, + user=ADMIN_USER, + expdb_db=expdb_test, + ) + assert str(dataset_id) in e.value.detail + assert tag in e.value.detail diff --git a/tests/routers/openml/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py index e619c468..ded608fc 100644 --- a/tests/routers/openml/datasets_list_datasets_test.py +++ b/tests/routers/openml/datasets_list_datasets_test.py @@ -7,99 +7,193 @@ import pytest from hypothesis import given from hypothesis import strategies as st +from sqlalchemy.ext.asyncio import AsyncConnection from core.errors import NoResultsError +from database.users import User +from routers.dependencies import Pagination +from routers.openml.datasets import DatasetStatusFilter, list_datasets from tests import constants -from tests.users import ApiKey +from tests.users import ADMIN_USER, DATASET_130_OWNER, OWNER_USER, SOME_USER, ApiKey -def _assert_empty_result( - response: httpx.Response, -) -> None: - assert response.status_code == HTTPStatus.NOT_FOUND - assert response.headers["content-type"] == "application/problem+json" - error = response.json() - assert error["type"] == NoResultsError.uri - assert error["code"] == "372" - - -async def test_list(py_api: httpx.AsyncClient) -> None: +async def test_list_route(py_api: httpx.AsyncClient) -> None: response = await py_api.get("/datasets/list/") assert response.status_code == HTTPStatus.OK assert len(response.json()) >= 1 +@pytest.mark.slow +@hypothesis.settings( # type: ignore[untyped-decorator] # 108 + max_examples=500, # This number needs to be better motivated + suppress_health_check=[hypothesis.HealthCheck.function_scoped_fixture], + deadline=None, +) +@given( # type: ignore[untyped-decorator] # 108 + number_missing_values=st.sampled_from([None, "2", "2..10000"]), + number_features=st.sampled_from([None, "5", "2..100"]), + number_classes=st.sampled_from([None, "5", "2..100"]), + number_instances=st.sampled_from([None, "150", "2..100"]), + limit=st.sampled_from([None, 1, 100, 1000]), + offset=st.sampled_from([None, 1, 100, 1000]), + status=st.sampled_from([None, "active", "deactivated", "in_preparation"]), + data_id=st.sampled_from([None, [61], [61, 130]]), + data_name=st.sampled_from([None, "abalone", "iris", "NotPresentInTheDatabase"]), + data_version=st.sampled_from([None, 2, 4]), + tag=st.sampled_from([None, "study_14", "study_not_in_db"]), + # We don't test ADMIN user, as we fixed a bug which treated them as a regular user + api_key=st.sampled_from([None, ApiKey.SOME_USER, ApiKey.OWNER_USER]), +) +async def test_list_data_identical( + py_api: httpx.AsyncClient, + php_api: httpx.AsyncClient, + **kwargs: dict[str, Any], +) -> Any: # noqa: ANN401 + limit, offset = kwargs["limit"], kwargs["offset"] + if (limit and not offset) or (offset and not limit): + # Behavior change: in new API these may be used independently, not in old. + return hypothesis.reject() + + api_key = kwargs.pop("api_key") + api_key_query = f"?api_key={api_key}" if api_key else "" + + # Pagination parameters are nested in the new query style + # The old style has no `limit` by default, so we mimic this with a high default + new_style = kwargs | {"pagination": {"limit": limit or 1_000_000}} + if offset is not None: + new_style["pagination"]["offset"] = offset + + # old style `/data/filter` encodes all filters as a path + query = [ + [filter_, value if not isinstance(value, list) else ",".join(str(v) for v in value)] + for filter_, value in kwargs.items() + if value is not None + ] + uri = "/data/list" + if query: + uri += f"/{'/'.join([str(v) for q in query for v in q])}" + uri += api_key_query + + new, original = await asyncio.gather( + py_api.post(f"/datasets/list{api_key_query}", json=new_style), + php_api.get(uri), + ) + + # Note: RFC 9457 changed some status codes (PRECONDITION_FAILED -> NOT_FOUND for no results) + # and the error response format, so we can't compare error responses directly. + php_is_error = original.status_code == HTTPStatus.PRECONDITION_FAILED + py_is_error = new.status_code == HTTPStatus.NOT_FOUND + + if php_is_error or py_is_error: + # Both should be errors in the same cases + assert php_is_error == py_is_error, ( + f"PHP status={original.status_code}, Python status={new.status_code}" + ) + # Verify Python API returns RFC 9457 format + assert new.headers["content-type"] == "application/problem+json" + error = new.json() + assert error["type"] == NoResultsError.uri + assert error["code"] == "372" + assert original.json()["error"]["message"] == "No results" + assert error["detail"] == "No datasets match the search criteria." + return None + new_json = new.json() + # Qualities in new response are typed + for dataset in new_json: + for quality in dataset["quality"]: + quality["value"] = str(quality["value"]) + + # PHP API has a double nested dictionary that never has other entries + php_json = original.json()["data"]["dataset"] + assert len(php_json) == len(new_json) + assert php_json == new_json + return None + + +# ── Direct call tests: list_datasets ── + + @pytest.mark.parametrize( ("status", "amount"), [ - ("active", constants.NUMBER_OF_PUBLIC_ACTIVE_DATASETS), - ("deactivated", constants.NUMBER_OF_DEACTIVATED_DATASETS), - ("in_preparation", constants.NUMBER_OF_DATASETS_IN_PREPARATION), - ("all", constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS), + (DatasetStatusFilter.ACTIVE, constants.NUMBER_OF_PUBLIC_ACTIVE_DATASETS), + (DatasetStatusFilter.DEACTIVATED, constants.NUMBER_OF_DEACTIVATED_DATASETS), + (DatasetStatusFilter.IN_PREPARATION, constants.NUMBER_OF_DATASETS_IN_PREPARATION), + ( + DatasetStatusFilter.ALL, + constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS, + ), ], ) -async def test_list_filter_active(status: str, amount: int, py_api: httpx.AsyncClient) -> None: - response = await py_api.post( - "/datasets/list", - json={"status": status, "pagination": {"limit": constants.NUMBER_OF_DATASETS}}, +async def test_list_filter_active( + status: DatasetStatusFilter, amount: int, expdb_test: AsyncConnection +) -> None: + result = await list_datasets( + pagination=Pagination(limit=constants.NUMBER_OF_DATASETS), + status=status, + user=None, + expdb_db=expdb_test, ) - assert response.status_code == HTTPStatus.OK, response.json() - assert len(response.json()) == amount + assert len(result) == amount @pytest.mark.parametrize( - ("api_key", "amount"), + ("user", "amount"), [ - (ApiKey.ADMIN, constants.NUMBER_OF_DATASETS), - (ApiKey.DATASET_130_OWNER, constants.NUMBER_OF_DATASETS), - (ApiKey.SOME_USER, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS), + (ADMIN_USER, constants.NUMBER_OF_DATASETS), + (DATASET_130_OWNER, constants.NUMBER_OF_DATASETS), + (SOME_USER, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS), (None, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS), ], ) async def test_list_accounts_privacy( - api_key: ApiKey | None, amount: int, py_api: httpx.AsyncClient + user: User | None, amount: int, expdb_test: AsyncConnection ) -> None: - key = f"?api_key={api_key}" if api_key else "" - response = await py_api.post( - f"/datasets/list{key}", - json={"status": "all", "pagination": {"limit": 1000}}, + result = await list_datasets( + pagination=Pagination(limit=1000), + status=DatasetStatusFilter.ALL, + user=user, + expdb_db=expdb_test, ) - assert response.status_code == HTTPStatus.OK, response.json() - assert len(response.json()) == amount + assert len(result) == amount @pytest.mark.parametrize( ("name", "count"), [("abalone", 1), ("iris", 2)], ) -async def test_list_data_name_present(name: str, count: int, py_api: httpx.AsyncClient) -> None: - # The second iris dataset is private, so we need to authenticate. - response = await py_api.post( - f"/datasets/list?api_key={ApiKey.ADMIN}", - json={"status": "all", "data_name": name}, +async def test_list_data_name_present(name: str, count: int, expdb_test: AsyncConnection) -> None: + # The second iris dataset is private, so we need an admin user. + result = await list_datasets( + pagination=Pagination(), + status=DatasetStatusFilter.ALL, + data_name=name, + user=ADMIN_USER, + expdb_db=expdb_test, ) - assert response.status_code == HTTPStatus.OK - datasets = response.json() - assert len(datasets) == count - assert all(dataset["name"] == name for dataset in datasets) + assert len(result) == count + assert all(dataset["name"] == name for dataset in result) @pytest.mark.parametrize( "name", ["ir", "long_name_without_overlap"], ) -async def test_list_data_name_absent(name: str, py_api: httpx.AsyncClient) -> None: - response = await py_api.post( - f"/datasets/list?api_key={ApiKey.ADMIN}", - json={"status": "all", "data_name": name}, - ) - _assert_empty_result(response) +async def test_list_data_name_absent(name: str, expdb_test: AsyncConnection) -> None: + with pytest.raises(NoResultsError): + await list_datasets( + pagination=Pagination(), + status=DatasetStatusFilter.ALL, + data_name=name, + user=ADMIN_USER, + expdb_db=expdb_test, + ) @pytest.mark.parametrize("limit", [None, 5, 10, 200]) -@pytest.mark.parametrize("offset", [None, 0, 5, 129, 140, 200]) +@pytest.mark.parametrize("offset", [None, 0, 5, 129, 140]) async def test_list_pagination( - limit: int | None, offset: int | None, py_api: httpx.AsyncClient + limit: int | None, offset: int | None, expdb_test: AsyncConnection ) -> None: # dataset ids are contiguous until 131, then there are 161, 162, and 163. extra_datasets = [161, 162, 163] @@ -113,17 +207,19 @@ async def test_list_pagination( end = start + (100 if limit is None else limit) expected_ids = all_ids[start:end] - offset_body = {} if offset is None else {"offset": offset} - limit_body = {} if limit is None else {"limit": limit} - filters = {"status": "all", "pagination": offset_body | limit_body} - response = await py_api.post("/datasets/list", json=filters) + pagination = Pagination(offset=offset or 0, limit=limit or 100) - if offset in [140, 200]: - _assert_empty_result(response) + try: + result = await list_datasets( + pagination=pagination, + status=DatasetStatusFilter.ALL, + user=None, + expdb_db=expdb_test, + ) + except NoResultsError: + assert offset == 140, "Result was expected but NoResultsError was raised." return - - assert response.status_code == HTTPStatus.OK - reported_ids = {dataset["did"] for dataset in response.json()} + reported_ids = {dataset["did"] for dataset in result} assert reported_ids == set(expected_ids) @@ -131,85 +227,96 @@ async def test_list_pagination( ("version", "count"), [(1, 100), (2, 7), (5, 1)], ) -async def test_list_data_version(version: int, count: int, py_api: httpx.AsyncClient) -> None: - response = await py_api.post( - f"/datasets/list?api_key={ApiKey.ADMIN}", - json={"status": "all", "data_version": version}, +async def test_list_data_version(version: int, count: int, expdb_test: AsyncConnection) -> None: + result = await list_datasets( + pagination=Pagination(), + status=DatasetStatusFilter.ALL, + data_version=version, + user=ADMIN_USER, + expdb_db=expdb_test, ) - assert response.status_code == HTTPStatus.OK - datasets = response.json() - assert len(datasets) == count - assert {dataset["version"] for dataset in datasets} == {version} + assert len(result) == count + assert {dataset["version"] for dataset in result} == {version} -async def test_list_data_version_no_result(py_api: httpx.AsyncClient) -> None: +async def test_list_data_version_no_result(expdb_test: AsyncConnection) -> None: version_with_no_datasets = 42 - response = await py_api.post( - f"/datasets/list?api_key={ApiKey.ADMIN}", - json={"status": "all", "data_version": version_with_no_datasets}, - ) - _assert_empty_result(response) + with pytest.raises(NoResultsError): + await list_datasets( + pagination=Pagination(), + status=DatasetStatusFilter.ALL, + data_version=version_with_no_datasets, + user=ADMIN_USER, + expdb_db=expdb_test, + ) -@pytest.mark.parametrize( - "key", - [ApiKey.SOME_USER, ApiKey.DATASET_130_OWNER, ApiKey.ADMIN], -) +@pytest.mark.parametrize("user", [SOME_USER, DATASET_130_OWNER, ADMIN_USER]) @pytest.mark.parametrize( ("user_id", "count"), [(1, 59), (2, 34), (16, 1)], ) -async def test_list_uploader(user_id: int, count: int, key: str, py_api: httpx.AsyncClient) -> None: - response = await py_api.post( - f"/datasets/list?api_key={key}", - json={"status": "all", "uploader": user_id}, - ) +async def test_list_uploader( + user_id: int, count: int, user: User, expdb_test: AsyncConnection +) -> None: # The dataset of user 16 is private, so can not be retrieved by other users. owner_user_id = 16 - if key == ApiKey.SOME_USER and user_id == owner_user_id: - _assert_empty_result(response) - return - - assert response.status_code == HTTPStatus.OK - assert len(response.json()) == count + try: + result = await list_datasets( + pagination=Pagination(), + status=DatasetStatusFilter.ALL, + uploader=user_id, + user=user, + expdb_db=expdb_test, + ) + assert len(result) == count + except NoResultsError: + assert user is SOME_USER, "Admin and Owner should always see a result" + assert user_id == owner_user_id, "Only empty result should be for owner_user filter" @pytest.mark.parametrize( "data_id", [[1], [1, 2, 3], [1, 2, 3, 3000], [1, 2, 3, 130]], ) -async def test_list_data_id(data_id: list[int], py_api: httpx.AsyncClient) -> None: - response = await py_api.post( - "/datasets/list", - json={"status": "all", "data_id": data_id}, +async def test_list_data_id(data_id: list[int], expdb_test: AsyncConnection) -> None: + result = await list_datasets( + pagination=Pagination(), + status=DatasetStatusFilter.ALL, + data_id=data_id, + user=None, + expdb_db=expdb_test, ) - - assert response.status_code == HTTPStatus.OK private_or_not_exist = {130, 3000} - assert len(response.json()) == len(set(data_id) - private_or_not_exist) + expected = set(data_id) - private_or_not_exist + returned = {dataset["did"] for dataset in result} + assert returned == expected @pytest.mark.parametrize( ("tag", "count"), [("study_14", 100), ("study_15", 1)], ) -async def test_list_data_tag(tag: str, count: int, py_api: httpx.AsyncClient) -> None: - response = await py_api.post( - "/datasets/list", - # study_14 has 100 datasets, we overwrite the default `limit` because otherwise - # we don't know if the results are limited by filtering on the tag. - json={"status": "all", "tag": tag, "pagination": {"limit": 101}}, +async def test_list_data_tag(tag: str, count: int, expdb_test: AsyncConnection) -> None: + result = await list_datasets( + pagination=Pagination(limit=101), + status=DatasetStatusFilter.ALL, + tag=tag, + user=None, + expdb_db=expdb_test, ) - assert response.status_code == HTTPStatus.OK - assert len(response.json()) == count + assert len(result) == count -async def test_list_data_tag_empty(py_api: httpx.AsyncClient) -> None: - response = await py_api.post( - "/datasets/list", - json={"status": "all", "tag": "not-a-tag"}, - ) - _assert_empty_result(response) +async def test_list_data_tag_empty(expdb_test: AsyncConnection) -> None: + with pytest.raises(NoResultsError): + await list_datasets( + pagination=Pagination(), + status=DatasetStatusFilter.ALL, + tag="not-a-tag", + user=None, + expdb_db=expdb_test, + ) @pytest.mark.parametrize( @@ -226,98 +333,13 @@ async def test_list_data_tag_empty(py_api: httpx.AsyncClient) -> None: ], ) async def test_list_data_quality( - quality: str, range_: str, count: int, py_api: httpx.AsyncClient + quality: str, range_: str, count: int, expdb_test: AsyncConnection ) -> None: - response = await py_api.post( - "/datasets/list", - json={"status": "all", quality: range_}, + result = await list_datasets( + pagination=Pagination(), + status=DatasetStatusFilter.ALL, + user=None, + expdb_db=expdb_test, + **{quality: range_}, ) - assert response.status_code == HTTPStatus.OK, response.json() - assert len(response.json()) == count - - -@pytest.mark.slow -@hypothesis.settings( # type: ignore[untyped-decorator] # 108 - max_examples=500, # This number needs to be better motivated - suppress_health_check=[hypothesis.HealthCheck.function_scoped_fixture], - deadline=None, -) -@given( # type: ignore[untyped-decorator] # 108 - number_missing_values=st.sampled_from([None, "2", "2..10000"]), - number_features=st.sampled_from([None, "5", "2..100"]), - number_classes=st.sampled_from([None, "5", "2..100"]), - number_instances=st.sampled_from([None, "150", "2..100"]), - limit=st.sampled_from([None, 1, 100, 1000]), - offset=st.sampled_from([None, 1, 100, 1000]), - status=st.sampled_from([None, "active", "deactivated", "in_preparation"]), - data_id=st.sampled_from([None, [61], [61, 130]]), - data_name=st.sampled_from([None, "abalone", "iris", "NotPresentInTheDatabase"]), - data_version=st.sampled_from([None, 2, 4]), - tag=st.sampled_from([None, "study_14", "study_not_in_db"]), - # We don't test ADMIN user, as we fixed a bug which treated them as a regular user - api_key=st.sampled_from([None, ApiKey.SOME_USER, ApiKey.OWNER_USER]), -) -async def test_list_data_identical( - py_api: httpx.AsyncClient, - php_api: httpx.AsyncClient, - **kwargs: dict[str, Any], -) -> Any: # noqa: ANN401 - limit, offset = kwargs["limit"], kwargs["offset"] - if (limit and not offset) or (offset and not limit): - # Behavior change: in new API these may be used independently, not in old. - return hypothesis.reject() - - api_key = kwargs.pop("api_key") - api_key_query = f"?api_key={api_key}" if api_key else "" - - # Pagination parameters are nested in the new query style - # The old style has no `limit` by default, so we mimic this with a high default - new_style = kwargs | {"pagination": {"limit": limit or 1_000_000}} - if offset is not None: - new_style["pagination"]["offset"] = offset - - # old style `/data/filter` encodes all filters as a path - query = [ - [filter_, value if not isinstance(value, list) else ",".join(str(v) for v in value)] - for filter_, value in kwargs.items() - if value is not None - ] - uri = "/data/list" - if query: - uri += f"/{'/'.join([str(v) for q in query for v in q])}" - uri += api_key_query - - new, original = await asyncio.gather( - py_api.post(f"/datasets/list{api_key_query}", json=new_style), - php_api.get(uri), - ) - - # Note: RFC 9457 changed some status codes (PRECONDITION_FAILED -> NOT_FOUND for no results) - # and the error response format, so we can't compare error responses directly. - php_is_error = original.status_code == HTTPStatus.PRECONDITION_FAILED - py_is_error = new.status_code == HTTPStatus.NOT_FOUND - - if php_is_error or py_is_error: - # Both should be errors in the same cases - assert php_is_error == py_is_error, ( - f"PHP status={original.status_code}, Python status={new.status_code}" - ) - # Verify Python API returns RFC 9457 format - assert new.headers["content-type"] == "application/problem+json" - error = new.json() - assert error["type"] == NoResultsError.uri - assert error["code"] == "372" - assert original.json()["error"]["message"] == "No results" - assert error["detail"] == "No datasets match the search criteria." - return None - new_json = new.json() - # Qualities in new response are typed - for dataset in new_json: - for quality in dataset["quality"]: - quality["value"] = str(quality["value"]) - - # PHP API has a double nested dictionary that never has other entries - php_json = original.json()["data"]["dataset"] - assert len(php_json) == len(new_json) - assert php_json == new_json - return None + assert len(result) == count diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py index 91ef5bee..f1e4a2fd 100644 --- a/tests/routers/openml/datasets_test.py +++ b/tests/routers/openml/datasets_test.py @@ -7,48 +7,27 @@ from sqlalchemy.ext.asyncio import AsyncConnection from core.errors import ( + DatasetAdminOnlyError, DatasetNoAccessError, DatasetNotFoundError, + DatasetNotOwnedError, DatasetProcessingError, ) from database.users import User -from routers.openml.datasets import get_dataset +from routers.openml.datasets import get_dataset, get_dataset_features, update_dataset_status from schemas.datasets.openml import DatasetMetadata, DatasetStatus from tests import constants from tests.users import ADMIN_USER, DATASET_130_OWNER, NO_USER, SOME_USER, ApiKey -@pytest.mark.parametrize( - ("dataset_id", "response_code"), - [ - (-1, HTTPStatus.NOT_FOUND), - (138, HTTPStatus.NOT_FOUND), - (100_000, HTTPStatus.NOT_FOUND), - ], -) -async def test_error_unknown_dataset( - dataset_id: int, - response_code: int, - py_api: httpx.AsyncClient, -) -> None: - response = await py_api.get(f"/datasets/{dataset_id}") - - assert response.status_code == response_code - assert response.headers["content-type"] == "application/problem+json" - error = response.json() - assert error["type"] == DatasetNotFoundError.uri - assert error["title"] == "Dataset Not Found" - assert error["status"] == HTTPStatus.NOT_FOUND - assert re.match(r"No dataset with id -?\d+ found.", error["detail"]) - assert error["code"] == "111" +# ── py_api: routing + serialization, RFC 9457 format, regression ── -async def test_get_dataset(py_api: httpx.AsyncClient) -> None: +async def test_get_dataset_via_api(py_api: httpx.AsyncClient) -> None: response = await py_api.get("/datasets/1") assert response.status_code == HTTPStatus.OK description = response.json() assert description.pop("description").startswith("**Author**:") - assert description == { "id": 1, "name": "anneal", @@ -81,48 +60,7 @@ async def test_get_dataset(py_api: httpx.AsyncClient) -> None: } -@pytest.mark.parametrize( - "user", - [ - NO_USER, - SOME_USER, - ], -) -async def test_private_dataset_no_access( - user: User | None, - expdb_test: AsyncConnection, - user_test: AsyncConnection, -) -> None: - with pytest.raises(DatasetNoAccessError) as e: - await get_dataset( - dataset_id=130, - user=user, - user_db=user_test, - expdb_db=expdb_test, - ) - assert e.value.status_code == HTTPStatus.FORBIDDEN - assert e.value.uri == DatasetNoAccessError.uri - no_access = 112 - assert e.value.code == no_access - - -@pytest.mark.parametrize( - "user", [DATASET_130_OWNER, ADMIN_USER, pytest.param(SOME_USER, marks=pytest.mark.xfail)] -) -async def test_private_dataset_access( - user: User, expdb_test: AsyncConnection, user_test: AsyncConnection -) -> None: - dataset = await get_dataset( - dataset_id=130, - user=user, - user_db=user_test, - expdb_db=expdb_test, - ) - assert isinstance(dataset, DatasetMetadata) - - -async def test_dataset_features(py_api: httpx.AsyncClient) -> None: - # Dataset 4 has both nominal and numerical features, so provides reasonable coverage +async def test_get_features_via_api(py_api: httpx.AsyncClient) -> None: response = await py_api.get("/datasets/features/4") assert response.status_code == HTTPStatus.OK assert response.json() == [ @@ -175,161 +113,234 @@ async def test_dataset_features(py_api: httpx.AsyncClient) -> None: ] -async def test_dataset_features_with_ontology(py_api: httpx.AsyncClient) -> None: - # Dataset 11 has ontology data for features 1, 2, and 3 - response = await py_api.get("/datasets/features/11") +async def test_update_status_via_api(py_api: httpx.AsyncClient) -> None: + response = await py_api.post( + "/datasets/status/update", + json={"dataset_id": 1, "status": "active"}, + ) + # Without authentication, we expect 401 — confirms the route is wired up. + assert response.status_code == HTTPStatus.UNAUTHORIZED + + +async def test_rfc9457_error_format(py_api: httpx.AsyncClient) -> None: + """Single test for the generic RFC 9457 exception handler — covers all error types.""" + response = await py_api.get("/datasets/100000") + assert response.status_code == HTTPStatus.NOT_FOUND + assert response.headers["content-type"] == "application/problem+json" + error = response.json() + assert error["type"] == DatasetNotFoundError.uri + assert error["title"] == "Dataset Not Found" + assert error["status"] == HTTPStatus.NOT_FOUND + assert re.match(r"No dataset with id \d+ found.", error["detail"]) + assert error["code"] == "111" + + +@pytest.mark.mut +async def test_dataset_no_500_with_multiple_processing_entries( + py_api: httpx.AsyncClient, + expdb_test: AsyncConnection, +) -> None: + """Regression test for issue #145: multiple processing entries caused 500.""" + await expdb_test.execute( + text("INSERT INTO evaluation_engine(id, name, description) VALUES (99, 'test_engine', '')"), + ) + await expdb_test.execute( + text( + "INSERT INTO data_processed(did, evaluation_engine_id, user_id, processing_date) " + "VALUES (1, 99, 2, '2020-01-01 00:00:00')", + ), + ) + response = await py_api.get("/datasets/1") assert response.status_code == HTTPStatus.OK - features = {f["index"]: f for f in response.json()} - assert features[1]["ontology"] == ["https://en.wikipedia.org/wiki/Service_(motor_vehicle)"] - assert features[2]["ontology"] == [ + + +# ── Direct call tests: get_dataset ── + + +@pytest.mark.parametrize( + "dataset_id", + [-1, 138, 100_000], +) +async def test_get_dataset_not_found( + dataset_id: int, + expdb_test: AsyncConnection, + user_test: AsyncConnection, +) -> None: + with pytest.raises(DatasetNotFoundError): + await get_dataset( + dataset_id=dataset_id, + user=None, + user_db=user_test, + expdb_db=expdb_test, + ) + + +@pytest.mark.parametrize( + "user", + [ + NO_USER, + SOME_USER, + ], +) +async def test_private_dataset_no_access( + user: User | None, + expdb_test: AsyncConnection, + user_test: AsyncConnection, +) -> None: + with pytest.raises(DatasetNoAccessError) as e: + await get_dataset( + dataset_id=130, + user=user, + user_db=user_test, + expdb_db=expdb_test, + ) + assert e.value.status_code == HTTPStatus.FORBIDDEN + assert e.value.uri == DatasetNoAccessError.uri + no_access = 112 + assert e.value.code == no_access + + +@pytest.mark.parametrize( + "user", [DATASET_130_OWNER, ADMIN_USER, pytest.param(SOME_USER, marks=pytest.mark.xfail)] +) +async def test_private_dataset_access( + user: User, expdb_test: AsyncConnection, user_test: AsyncConnection +) -> None: + dataset = await get_dataset( + dataset_id=130, + user=user, + user_db=user_test, + expdb_db=expdb_test, + ) + assert isinstance(dataset, DatasetMetadata) + + +# ── Direct call tests: get_dataset_features ── + + +async def test_dataset_features_with_ontology(expdb_test: AsyncConnection) -> None: + features = await get_dataset_features(dataset_id=11, user=None, expdb=expdb_test) + by_index = {f.index: f for f in features} + assert by_index[1].ontology == ["https://en.wikipedia.org/wiki/Service_(motor_vehicle)"] + assert by_index[2].ontology == [ "https://en.wikipedia.org/wiki/Car_door", "https://en.wikipedia.org/wiki/Door", ] - assert features[3]["ontology"] == [ + assert by_index[3].ontology == [ "https://en.wikipedia.org/wiki/Passenger_vehicles_in_the_United_States" ] - # Features without ontology should not include the field - assert "ontology" not in features[0] - assert "ontology" not in features[4] + assert by_index[0].ontology is None + assert by_index[4].ontology is None -async def test_dataset_features_no_access(py_api: httpx.AsyncClient) -> None: - response = await py_api.get("/datasets/features/130") - assert response.status_code == HTTPStatus.FORBIDDEN +async def test_dataset_features_no_access(expdb_test: AsyncConnection) -> None: + with pytest.raises(DatasetNoAccessError): + await get_dataset_features(dataset_id=130, user=None, expdb=expdb_test) -@pytest.mark.parametrize( - "api_key", - [ApiKey.ADMIN, ApiKey.DATASET_130_OWNER], -) +@pytest.mark.parametrize("user", [ADMIN_USER, DATASET_130_OWNER]) async def test_dataset_features_access_to_private( - api_key: ApiKey, py_api: httpx.AsyncClient + user: User, expdb_test: AsyncConnection ) -> None: - response = await py_api.get(f"/datasets/features/130?api_key={api_key}") - assert response.status_code == HTTPStatus.OK + features = await get_dataset_features(dataset_id=130, user=user, expdb=expdb_test) + assert isinstance(features, list) -async def test_dataset_features_with_processing_error(py_api: httpx.AsyncClient) -> None: - # When a dataset is processed to extract its feature metadata, errors may occur. - # In that case, no feature information will ever be available. +async def test_dataset_features_with_processing_error(expdb_test: AsyncConnection) -> None: dataset_id = 55 - response = await py_api.get(f"/datasets/features/{dataset_id}") - assert response.status_code == HTTPStatus.PRECONDITION_FAILED - assert response.headers["content-type"] == "application/problem+json" - error = response.json() - assert error["type"] == DatasetProcessingError.uri - assert error["code"] == "274" - assert "No features found" in error["detail"] - assert str(dataset_id) in error["detail"] + with pytest.raises(DatasetProcessingError) as e: + await get_dataset_features(dataset_id=dataset_id, user=None, expdb=expdb_test) + assert "No features found" in e.value.detail + assert str(dataset_id) in e.value.detail -async def test_dataset_features_dataset_does_not_exist(py_api: httpx.AsyncClient) -> None: - resource = await py_api.get("/datasets/features/1000") - assert resource.status_code == HTTPStatus.NOT_FOUND +async def test_dataset_features_dataset_does_not_exist(expdb_test: AsyncConnection) -> None: + with pytest.raises(DatasetNotFoundError): + await get_dataset_features(dataset_id=1000, user=None, expdb=expdb_test) -async def _assert_status_update_is_successful( - apikey: ApiKey, - dataset_id: int, - status: str, - py_api: httpx.AsyncClient, -) -> None: - response = await py_api.post( - f"/datasets/status/update?api_key={apikey}", - json={"dataset_id": dataset_id, "status": status}, - ) - assert response.status_code == HTTPStatus.OK - assert response.json() == { - "dataset_id": dataset_id, - "status": status, - } +# ── Direct call tests: update_dataset_status ── @pytest.mark.mut -@pytest.mark.parametrize( - "dataset_id", - [3, 4], -) +@pytest.mark.parametrize("dataset_id", [3, 4]) async def test_dataset_status_update_active_to_deactivated( - dataset_id: int, py_api: httpx.AsyncClient + dataset_id: int, expdb_test: AsyncConnection ) -> None: - await _assert_status_update_is_successful( - apikey=ApiKey.ADMIN, + result = await update_dataset_status( dataset_id=dataset_id, status=DatasetStatus.DEACTIVATED, - py_api=py_api, + user=ADMIN_USER, + expdb=expdb_test, ) + assert result == {"dataset_id": dataset_id, "status": DatasetStatus.DEACTIVATED} @pytest.mark.mut -async def test_dataset_status_update_in_preparation_to_active(py_api: httpx.AsyncClient) -> None: - await _assert_status_update_is_successful( - apikey=ApiKey.ADMIN, - dataset_id=next(iter(constants.IN_PREPARATION_ID)), +async def test_dataset_status_update_in_preparation_to_active( + expdb_test: AsyncConnection, +) -> None: + dataset_id = next(iter(constants.IN_PREPARATION_ID)) + result = await update_dataset_status( + dataset_id=dataset_id, status=DatasetStatus.ACTIVE, - py_api=py_api, + user=ADMIN_USER, + expdb=expdb_test, ) + assert result == {"dataset_id": dataset_id, "status": DatasetStatus.ACTIVE} @pytest.mark.mut async def test_dataset_status_update_in_preparation_to_deactivated( - py_api: httpx.AsyncClient, + expdb_test: AsyncConnection, ) -> None: - await _assert_status_update_is_successful( - apikey=ApiKey.ADMIN, - dataset_id=next(iter(constants.IN_PREPARATION_ID)), + dataset_id = next(iter(constants.IN_PREPARATION_ID)) + result = await update_dataset_status( + dataset_id=dataset_id, status=DatasetStatus.DEACTIVATED, - py_api=py_api, + user=ADMIN_USER, + expdb=expdb_test, ) + assert result == {"dataset_id": dataset_id, "status": DatasetStatus.DEACTIVATED} @pytest.mark.mut -async def test_dataset_status_update_deactivated_to_active(py_api: httpx.AsyncClient) -> None: - await _assert_status_update_is_successful( - apikey=ApiKey.ADMIN, - dataset_id=next(iter(constants.DEACTIVATED_DATASETS)), +async def test_dataset_status_update_deactivated_to_active( + expdb_test: AsyncConnection, +) -> None: + dataset_id = next(iter(constants.DEACTIVATED_DATASETS)) + result = await update_dataset_status( + dataset_id=dataset_id, status=DatasetStatus.ACTIVE, - py_api=py_api, + user=ADMIN_USER, + expdb=expdb_test, ) + assert result == {"dataset_id": dataset_id, "status": DatasetStatus.ACTIVE} -@pytest.mark.parametrize( - ("dataset_id", "api_key", "status"), - [ - (1, ApiKey.SOME_USER, DatasetStatus.ACTIVE), - (1, ApiKey.SOME_USER, DatasetStatus.DEACTIVATED), - (2, ApiKey.SOME_USER, DatasetStatus.DEACTIVATED), - (33, ApiKey.SOME_USER, DatasetStatus.ACTIVE), - (131, ApiKey.SOME_USER, DatasetStatus.ACTIVE), - ], -) -async def test_dataset_status_unauthorized( +@pytest.mark.parametrize("dataset_id", [1, 33, 131]) +async def test_dataset_status_non_admin_cannot_activate( dataset_id: int, - api_key: ApiKey, - status: str, - py_api: httpx.AsyncClient, + expdb_test: AsyncConnection, ) -> None: - response = await py_api.post( - f"/datasets/status/update?api_key={api_key}", - json={"dataset_id": dataset_id, "status": status}, - ) - assert response.status_code == HTTPStatus.FORBIDDEN + with pytest.raises(DatasetAdminOnlyError): + await update_dataset_status( + dataset_id=dataset_id, + status=DatasetStatus.ACTIVE, + user=SOME_USER, + expdb=expdb_test, + ) -@pytest.mark.mut -async def test_dataset_no_500_with_multiple_processing_entries( - py_api: httpx.AsyncClient, +@pytest.mark.parametrize("dataset_id", [1, 2]) +async def test_dataset_status_non_owner_cannot_deactivate( + dataset_id: int, expdb_test: AsyncConnection, ) -> None: - """Regression test for issue #145: multiple processing entries caused 500.""" - await expdb_test.execute( - text("INSERT INTO evaluation_engine(id, name, description) VALUES (99, 'test_engine', '')"), - ) - await expdb_test.execute( - text( - "INSERT INTO data_processed(did, evaluation_engine_id, user_id, processing_date) " - "VALUES (1, 99, 2, '2020-01-01 00:00:00')", - ), - ) - response = await py_api.get("/datasets/1") - assert response.status_code == HTTPStatus.OK + with pytest.raises(DatasetNotOwnedError): + await update_dataset_status( + dataset_id=dataset_id, + status=DatasetStatus.DEACTIVATED, + user=SOME_USER, + expdb=expdb_test, + ) From a4f5cdc76e64a064f1610a4517e55a97f970c81b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:29:15 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/routers/openml/datasets_list_datasets_test.py | 2 +- tests/routers/openml/datasets_test.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/routers/openml/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py index ded608fc..10822cc5 100644 --- a/tests/routers/openml/datasets_list_datasets_test.py +++ b/tests/routers/openml/datasets_list_datasets_test.py @@ -14,7 +14,7 @@ from routers.dependencies import Pagination from routers.openml.datasets import DatasetStatusFilter, list_datasets from tests import constants -from tests.users import ADMIN_USER, DATASET_130_OWNER, OWNER_USER, SOME_USER, ApiKey +from tests.users import ADMIN_USER, DATASET_130_OWNER, SOME_USER, ApiKey async def test_list_route(py_api: httpx.AsyncClient) -> None: diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py index f1e4a2fd..aaedded3 100644 --- a/tests/routers/openml/datasets_test.py +++ b/tests/routers/openml/datasets_test.py @@ -17,8 +17,7 @@ from routers.openml.datasets import get_dataset, get_dataset_features, update_dataset_status from schemas.datasets.openml import DatasetMetadata, DatasetStatus from tests import constants -from tests.users import ADMIN_USER, DATASET_130_OWNER, NO_USER, SOME_USER, ApiKey - +from tests.users import ADMIN_USER, DATASET_130_OWNER, NO_USER, SOME_USER # ── py_api: routing + serialization, RFC 9457 format, regression ── @@ -239,9 +238,7 @@ async def test_dataset_features_no_access(expdb_test: AsyncConnection) -> None: @pytest.mark.parametrize("user", [ADMIN_USER, DATASET_130_OWNER]) -async def test_dataset_features_access_to_private( - user: User, expdb_test: AsyncConnection -) -> None: +async def test_dataset_features_access_to_private(user: User, expdb_test: AsyncConnection) -> None: features = await get_dataset_features(dataset_id=130, user=user, expdb=expdb_test) assert isinstance(features, list) From e2c27b5d3a984bc34794fd735edb593b7e97ecff Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 26 Mar 2026 16:44:23 +0100 Subject: [PATCH 3/7] Separate out tests to one file per endpoint --- .../routers/openml/datasets_features_test.py | 106 +++++++++++++ tests/routers/openml/datasets_get_test.py | 142 ++++++++++++++++++ tests/routers/openml/datasets_status_test.py | 106 +++++++++++++ 3 files changed, 354 insertions(+) create mode 100644 tests/routers/openml/datasets_features_test.py create mode 100644 tests/routers/openml/datasets_get_test.py create mode 100644 tests/routers/openml/datasets_status_test.py diff --git a/tests/routers/openml/datasets_features_test.py b/tests/routers/openml/datasets_features_test.py new file mode 100644 index 00000000..aa3988b5 --- /dev/null +++ b/tests/routers/openml/datasets_features_test.py @@ -0,0 +1,106 @@ +"""Tests for the GET /datasets/features/{dataset_id} endpoint.""" + +from http import HTTPStatus + +import httpx +import pytest +from sqlalchemy.ext.asyncio import AsyncConnection + +from core.errors import DatasetNoAccessError, DatasetNotFoundError, DatasetProcessingError +from database.users import User +from routers.openml.datasets import get_dataset_features +from tests.users import ADMIN_USER, DATASET_130_OWNER + + +async def test_get_features_via_api(py_api: httpx.AsyncClient) -> None: + response = await py_api.get("/datasets/features/4") + assert response.status_code == HTTPStatus.OK + assert response.json() == [ + { + "index": 0, + "name": "left-weight", + "data_type": "numeric", + "is_target": False, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + { + "index": 1, + "name": "left-distance", + "data_type": "numeric", + "is_target": False, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + { + "index": 2, + "name": "right-weight", + "data_type": "numeric", + "is_target": False, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + { + "index": 3, + "name": "right-distance", + "data_type": "numeric", + "is_target": False, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + { + "index": 4, + "name": "class", + "data_type": "nominal", + "nominal_values": ["B", "L", "R"], + "is_target": True, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + ] + + +async def test_dataset_features_with_ontology(expdb_test: AsyncConnection) -> None: + features = await get_dataset_features(dataset_id=11, user=None, expdb=expdb_test) + by_index = {f.index: f for f in features} + assert by_index[1].ontology == ["https://en.wikipedia.org/wiki/Service_(motor_vehicle)"] + assert by_index[2].ontology == [ + "https://en.wikipedia.org/wiki/Car_door", + "https://en.wikipedia.org/wiki/Door", + ] + assert by_index[3].ontology == [ + "https://en.wikipedia.org/wiki/Passenger_vehicles_in_the_United_States" + ] + assert by_index[0].ontology is None + assert by_index[4].ontology is None + + +async def test_dataset_features_no_access(expdb_test: AsyncConnection) -> None: + with pytest.raises(DatasetNoAccessError): + await get_dataset_features(dataset_id=130, user=None, expdb=expdb_test) + + +@pytest.mark.parametrize("user", [ADMIN_USER, DATASET_130_OWNER]) +async def test_dataset_features_access_to_private( + user: User, expdb_test: AsyncConnection +) -> None: + features = await get_dataset_features(dataset_id=130, user=user, expdb=expdb_test) + assert isinstance(features, list) + + +async def test_dataset_features_with_processing_error(expdb_test: AsyncConnection) -> None: + dataset_id = 55 + with pytest.raises(DatasetProcessingError) as e: + await get_dataset_features(dataset_id=dataset_id, user=None, expdb=expdb_test) + assert "No features found" in e.value.detail + assert str(dataset_id) in e.value.detail + + +async def test_dataset_features_dataset_does_not_exist(expdb_test: AsyncConnection) -> None: + with pytest.raises(DatasetNotFoundError): + await get_dataset_features(dataset_id=1000, user=None, expdb=expdb_test) diff --git a/tests/routers/openml/datasets_get_test.py b/tests/routers/openml/datasets_get_test.py new file mode 100644 index 00000000..fe67abee --- /dev/null +++ b/tests/routers/openml/datasets_get_test.py @@ -0,0 +1,142 @@ +"""Tests for the GET /datasets/{dataset_id} endpoint.""" + +import re +from http import HTTPStatus + +import httpx +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncConnection + +from core.errors import DatasetNoAccessError, DatasetNotFoundError +from database.users import User +from routers.openml.datasets import get_dataset +from schemas.datasets.openml import DatasetMetadata +from tests.users import ADMIN_USER, DATASET_130_OWNER, NO_USER, SOME_USER + + +async def test_get_dataset_via_api(py_api: httpx.AsyncClient) -> None: + response = await py_api.get("/datasets/1") + assert response.status_code == HTTPStatus.OK + description = response.json() + assert description.pop("description").startswith("**Author**:") + assert description == { + "id": 1, + "name": "anneal", + "version": 1, + "format": "arff", + "description_version": 1, + "upload_date": "2014-04-06T23:19:24", + "licence": "Public", + "url": "http://php-api/data/v1/download/1/anneal.arff", + "parquet_url": "http://minio:9000/datasets/0000/0001/dataset_1.pq", + "file_id": 1, + "default_target_attribute": ["class"], + "version_label": "1", + "tag": ["study_14"], + "visibility": "public", + "status": "active", + "processing_date": "2024-01-04T10:13:59", + "md5_checksum": "4eaed8b6ec9d8211024b6c089b064761", + "row_id_attribute": [], + "ignore_attribute": [], + "language": "", + "error": None, + "warning": None, + "citation": "", + "collection_date": None, + "contributor": [], + "creator": [], + "paper_url": None, + "original_data_url": [], + } + + +async def test_rfc9457_error_format(py_api: httpx.AsyncClient) -> None: + """Single test for the generic RFC 9457 exception handler — covers all error types.""" + response = await py_api.get("/datasets/100000") + assert response.status_code == HTTPStatus.NOT_FOUND + assert response.headers["content-type"] == "application/problem+json" + error = response.json() + assert error["type"] == DatasetNotFoundError.uri + assert error["title"] == "Dataset Not Found" + assert error["status"] == HTTPStatus.NOT_FOUND + assert re.match(r"No dataset with id \d+ found.", error["detail"]) + assert error["code"] == "111" + + +@pytest.mark.mut +async def test_dataset_no_500_with_multiple_processing_entries( + py_api: httpx.AsyncClient, + expdb_test: AsyncConnection, +) -> None: + """Regression test for issue #145: multiple processing entries caused 500.""" + await expdb_test.execute( + text("INSERT INTO evaluation_engine(id, name, description) VALUES (99, 'test_engine', '')"), + ) + await expdb_test.execute( + text( + "INSERT INTO data_processed(did, evaluation_engine_id, user_id, processing_date) " + "VALUES (1, 99, 2, '2020-01-01 00:00:00')", + ), + ) + response = await py_api.get("/datasets/1") + assert response.status_code == HTTPStatus.OK + + +@pytest.mark.parametrize( + "dataset_id", + [-1, 138, 100_000], +) +async def test_get_dataset_not_found( + dataset_id: int, + expdb_test: AsyncConnection, + user_test: AsyncConnection, +) -> None: + with pytest.raises(DatasetNotFoundError): + await get_dataset( + dataset_id=dataset_id, + user=None, + user_db=user_test, + expdb_db=expdb_test, + ) + + +@pytest.mark.parametrize( + "user", + [ + NO_USER, + SOME_USER, + ], +) +async def test_private_dataset_no_access( + user: User | None, + expdb_test: AsyncConnection, + user_test: AsyncConnection, +) -> None: + with pytest.raises(DatasetNoAccessError) as e: + await get_dataset( + dataset_id=130, + user=user, + user_db=user_test, + expdb_db=expdb_test, + ) + assert e.value.status_code == HTTPStatus.FORBIDDEN + assert e.value.uri == DatasetNoAccessError.uri + no_access = 112 + assert e.value.code == no_access + + +@pytest.mark.parametrize( + "user", [DATASET_130_OWNER, ADMIN_USER, pytest.param(SOME_USER, marks=pytest.mark.xfail)] +) +async def test_private_dataset_access( + user: User, expdb_test: AsyncConnection, user_test: AsyncConnection +) -> None: + dataset = await get_dataset( + dataset_id=130, + user=user, + user_db=user_test, + expdb_db=expdb_test, + ) + assert isinstance(dataset, DatasetMetadata) diff --git a/tests/routers/openml/datasets_status_test.py b/tests/routers/openml/datasets_status_test.py new file mode 100644 index 00000000..1e2271fc --- /dev/null +++ b/tests/routers/openml/datasets_status_test.py @@ -0,0 +1,106 @@ +"""Tests for the POST /datasets/status/update endpoint.""" + +from http import HTTPStatus + +import httpx +import pytest +from sqlalchemy.ext.asyncio import AsyncConnection + +from core.errors import DatasetAdminOnlyError, DatasetNotOwnedError +from routers.openml.datasets import update_dataset_status +from schemas.datasets.openml import DatasetStatus +from tests import constants +from tests.users import ADMIN_USER, SOME_USER + + +async def test_update_status_via_api(py_api: httpx.AsyncClient) -> None: + response = await py_api.post( + "/datasets/status/update", + json={"dataset_id": 1, "status": "active"}, + ) + # Without authentication, we expect 401 — confirms the route is wired up. + assert response.status_code == HTTPStatus.UNAUTHORIZED + + +@pytest.mark.mut +@pytest.mark.parametrize("dataset_id", [3, 4]) +async def test_dataset_status_update_active_to_deactivated( + dataset_id: int, expdb_test: AsyncConnection +) -> None: + result = await update_dataset_status( + dataset_id=dataset_id, + status=DatasetStatus.DEACTIVATED, + user=ADMIN_USER, + expdb=expdb_test, + ) + assert result == {"dataset_id": dataset_id, "status": DatasetStatus.DEACTIVATED} + + +@pytest.mark.mut +async def test_dataset_status_update_in_preparation_to_active( + expdb_test: AsyncConnection, +) -> None: + dataset_id = next(iter(constants.IN_PREPARATION_ID)) + result = await update_dataset_status( + dataset_id=dataset_id, + status=DatasetStatus.ACTIVE, + user=ADMIN_USER, + expdb=expdb_test, + ) + assert result == {"dataset_id": dataset_id, "status": DatasetStatus.ACTIVE} + + +@pytest.mark.mut +async def test_dataset_status_update_in_preparation_to_deactivated( + expdb_test: AsyncConnection, +) -> None: + dataset_id = next(iter(constants.IN_PREPARATION_ID)) + result = await update_dataset_status( + dataset_id=dataset_id, + status=DatasetStatus.DEACTIVATED, + user=ADMIN_USER, + expdb=expdb_test, + ) + assert result == {"dataset_id": dataset_id, "status": DatasetStatus.DEACTIVATED} + + +@pytest.mark.mut +async def test_dataset_status_update_deactivated_to_active( + expdb_test: AsyncConnection, +) -> None: + dataset_id = next(iter(constants.DEACTIVATED_DATASETS)) + result = await update_dataset_status( + dataset_id=dataset_id, + status=DatasetStatus.ACTIVE, + user=ADMIN_USER, + expdb=expdb_test, + ) + assert result == {"dataset_id": dataset_id, "status": DatasetStatus.ACTIVE} + + +@pytest.mark.parametrize("dataset_id", [1, 33, 131]) +async def test_dataset_status_non_admin_cannot_activate( + dataset_id: int, + expdb_test: AsyncConnection, +) -> None: + with pytest.raises(DatasetAdminOnlyError): + await update_dataset_status( + dataset_id=dataset_id, + status=DatasetStatus.ACTIVE, + user=SOME_USER, + expdb=expdb_test, + ) + + +@pytest.mark.parametrize("dataset_id", [1, 2]) +async def test_dataset_status_non_owner_cannot_deactivate( + dataset_id: int, + expdb_test: AsyncConnection, +) -> None: + with pytest.raises(DatasetNotOwnedError): + await update_dataset_status( + dataset_id=dataset_id, + status=DatasetStatus.DEACTIVATED, + user=SOME_USER, + expdb=expdb_test, + ) From 6b3a0141f6a7deb7e5156d023cac41bab22f4b85 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 26 Mar 2026 16:44:50 +0100 Subject: [PATCH 4/7] Remove old test file --- tests/routers/openml/datasets_test.py | 343 -------------------------- 1 file changed, 343 deletions(-) delete mode 100644 tests/routers/openml/datasets_test.py diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py deleted file mode 100644 index aaedded3..00000000 --- a/tests/routers/openml/datasets_test.py +++ /dev/null @@ -1,343 +0,0 @@ -import re -from http import HTTPStatus - -import httpx -import pytest -from sqlalchemy import text -from sqlalchemy.ext.asyncio import AsyncConnection - -from core.errors import ( - DatasetAdminOnlyError, - DatasetNoAccessError, - DatasetNotFoundError, - DatasetNotOwnedError, - DatasetProcessingError, -) -from database.users import User -from routers.openml.datasets import get_dataset, get_dataset_features, update_dataset_status -from schemas.datasets.openml import DatasetMetadata, DatasetStatus -from tests import constants -from tests.users import ADMIN_USER, DATASET_130_OWNER, NO_USER, SOME_USER - -# ── py_api: routing + serialization, RFC 9457 format, regression ── - - -async def test_get_dataset_via_api(py_api: httpx.AsyncClient) -> None: - response = await py_api.get("/datasets/1") - assert response.status_code == HTTPStatus.OK - description = response.json() - assert description.pop("description").startswith("**Author**:") - assert description == { - "id": 1, - "name": "anneal", - "version": 1, - "format": "arff", - "description_version": 1, - "upload_date": "2014-04-06T23:19:24", - "licence": "Public", - "url": "http://php-api/data/v1/download/1/anneal.arff", - "parquet_url": "http://minio:9000/datasets/0000/0001/dataset_1.pq", - "file_id": 1, - "default_target_attribute": ["class"], - "version_label": "1", - "tag": ["study_14"], - "visibility": "public", - "status": "active", - "processing_date": "2024-01-04T10:13:59", - "md5_checksum": "4eaed8b6ec9d8211024b6c089b064761", - "row_id_attribute": [], - "ignore_attribute": [], - "language": "", - "error": None, - "warning": None, - "citation": "", - "collection_date": None, - "contributor": [], - "creator": [], - "paper_url": None, - "original_data_url": [], - } - - -async def test_get_features_via_api(py_api: httpx.AsyncClient) -> None: - response = await py_api.get("/datasets/features/4") - assert response.status_code == HTTPStatus.OK - assert response.json() == [ - { - "index": 0, - "name": "left-weight", - "data_type": "numeric", - "is_target": False, - "is_ignore": False, - "is_row_identifier": False, - "number_of_missing_values": 0, - }, - { - "index": 1, - "name": "left-distance", - "data_type": "numeric", - "is_target": False, - "is_ignore": False, - "is_row_identifier": False, - "number_of_missing_values": 0, - }, - { - "index": 2, - "name": "right-weight", - "data_type": "numeric", - "is_target": False, - "is_ignore": False, - "is_row_identifier": False, - "number_of_missing_values": 0, - }, - { - "index": 3, - "name": "right-distance", - "data_type": "numeric", - "is_target": False, - "is_ignore": False, - "is_row_identifier": False, - "number_of_missing_values": 0, - }, - { - "index": 4, - "name": "class", - "data_type": "nominal", - "nominal_values": ["B", "L", "R"], - "is_target": True, - "is_ignore": False, - "is_row_identifier": False, - "number_of_missing_values": 0, - }, - ] - - -async def test_update_status_via_api(py_api: httpx.AsyncClient) -> None: - response = await py_api.post( - "/datasets/status/update", - json={"dataset_id": 1, "status": "active"}, - ) - # Without authentication, we expect 401 — confirms the route is wired up. - assert response.status_code == HTTPStatus.UNAUTHORIZED - - -async def test_rfc9457_error_format(py_api: httpx.AsyncClient) -> None: - """Single test for the generic RFC 9457 exception handler — covers all error types.""" - response = await py_api.get("/datasets/100000") - assert response.status_code == HTTPStatus.NOT_FOUND - assert response.headers["content-type"] == "application/problem+json" - error = response.json() - assert error["type"] == DatasetNotFoundError.uri - assert error["title"] == "Dataset Not Found" - assert error["status"] == HTTPStatus.NOT_FOUND - assert re.match(r"No dataset with id \d+ found.", error["detail"]) - assert error["code"] == "111" - - -@pytest.mark.mut -async def test_dataset_no_500_with_multiple_processing_entries( - py_api: httpx.AsyncClient, - expdb_test: AsyncConnection, -) -> None: - """Regression test for issue #145: multiple processing entries caused 500.""" - await expdb_test.execute( - text("INSERT INTO evaluation_engine(id, name, description) VALUES (99, 'test_engine', '')"), - ) - await expdb_test.execute( - text( - "INSERT INTO data_processed(did, evaluation_engine_id, user_id, processing_date) " - "VALUES (1, 99, 2, '2020-01-01 00:00:00')", - ), - ) - response = await py_api.get("/datasets/1") - assert response.status_code == HTTPStatus.OK - - -# ── Direct call tests: get_dataset ── - - -@pytest.mark.parametrize( - "dataset_id", - [-1, 138, 100_000], -) -async def test_get_dataset_not_found( - dataset_id: int, - expdb_test: AsyncConnection, - user_test: AsyncConnection, -) -> None: - with pytest.raises(DatasetNotFoundError): - await get_dataset( - dataset_id=dataset_id, - user=None, - user_db=user_test, - expdb_db=expdb_test, - ) - - -@pytest.mark.parametrize( - "user", - [ - NO_USER, - SOME_USER, - ], -) -async def test_private_dataset_no_access( - user: User | None, - expdb_test: AsyncConnection, - user_test: AsyncConnection, -) -> None: - with pytest.raises(DatasetNoAccessError) as e: - await get_dataset( - dataset_id=130, - user=user, - user_db=user_test, - expdb_db=expdb_test, - ) - assert e.value.status_code == HTTPStatus.FORBIDDEN - assert e.value.uri == DatasetNoAccessError.uri - no_access = 112 - assert e.value.code == no_access - - -@pytest.mark.parametrize( - "user", [DATASET_130_OWNER, ADMIN_USER, pytest.param(SOME_USER, marks=pytest.mark.xfail)] -) -async def test_private_dataset_access( - user: User, expdb_test: AsyncConnection, user_test: AsyncConnection -) -> None: - dataset = await get_dataset( - dataset_id=130, - user=user, - user_db=user_test, - expdb_db=expdb_test, - ) - assert isinstance(dataset, DatasetMetadata) - - -# ── Direct call tests: get_dataset_features ── - - -async def test_dataset_features_with_ontology(expdb_test: AsyncConnection) -> None: - features = await get_dataset_features(dataset_id=11, user=None, expdb=expdb_test) - by_index = {f.index: f for f in features} - assert by_index[1].ontology == ["https://en.wikipedia.org/wiki/Service_(motor_vehicle)"] - assert by_index[2].ontology == [ - "https://en.wikipedia.org/wiki/Car_door", - "https://en.wikipedia.org/wiki/Door", - ] - assert by_index[3].ontology == [ - "https://en.wikipedia.org/wiki/Passenger_vehicles_in_the_United_States" - ] - assert by_index[0].ontology is None - assert by_index[4].ontology is None - - -async def test_dataset_features_no_access(expdb_test: AsyncConnection) -> None: - with pytest.raises(DatasetNoAccessError): - await get_dataset_features(dataset_id=130, user=None, expdb=expdb_test) - - -@pytest.mark.parametrize("user", [ADMIN_USER, DATASET_130_OWNER]) -async def test_dataset_features_access_to_private(user: User, expdb_test: AsyncConnection) -> None: - features = await get_dataset_features(dataset_id=130, user=user, expdb=expdb_test) - assert isinstance(features, list) - - -async def test_dataset_features_with_processing_error(expdb_test: AsyncConnection) -> None: - dataset_id = 55 - with pytest.raises(DatasetProcessingError) as e: - await get_dataset_features(dataset_id=dataset_id, user=None, expdb=expdb_test) - assert "No features found" in e.value.detail - assert str(dataset_id) in e.value.detail - - -async def test_dataset_features_dataset_does_not_exist(expdb_test: AsyncConnection) -> None: - with pytest.raises(DatasetNotFoundError): - await get_dataset_features(dataset_id=1000, user=None, expdb=expdb_test) - - -# ── Direct call tests: update_dataset_status ── - - -@pytest.mark.mut -@pytest.mark.parametrize("dataset_id", [3, 4]) -async def test_dataset_status_update_active_to_deactivated( - dataset_id: int, expdb_test: AsyncConnection -) -> None: - result = await update_dataset_status( - dataset_id=dataset_id, - status=DatasetStatus.DEACTIVATED, - user=ADMIN_USER, - expdb=expdb_test, - ) - assert result == {"dataset_id": dataset_id, "status": DatasetStatus.DEACTIVATED} - - -@pytest.mark.mut -async def test_dataset_status_update_in_preparation_to_active( - expdb_test: AsyncConnection, -) -> None: - dataset_id = next(iter(constants.IN_PREPARATION_ID)) - result = await update_dataset_status( - dataset_id=dataset_id, - status=DatasetStatus.ACTIVE, - user=ADMIN_USER, - expdb=expdb_test, - ) - assert result == {"dataset_id": dataset_id, "status": DatasetStatus.ACTIVE} - - -@pytest.mark.mut -async def test_dataset_status_update_in_preparation_to_deactivated( - expdb_test: AsyncConnection, -) -> None: - dataset_id = next(iter(constants.IN_PREPARATION_ID)) - result = await update_dataset_status( - dataset_id=dataset_id, - status=DatasetStatus.DEACTIVATED, - user=ADMIN_USER, - expdb=expdb_test, - ) - assert result == {"dataset_id": dataset_id, "status": DatasetStatus.DEACTIVATED} - - -@pytest.mark.mut -async def test_dataset_status_update_deactivated_to_active( - expdb_test: AsyncConnection, -) -> None: - dataset_id = next(iter(constants.DEACTIVATED_DATASETS)) - result = await update_dataset_status( - dataset_id=dataset_id, - status=DatasetStatus.ACTIVE, - user=ADMIN_USER, - expdb=expdb_test, - ) - assert result == {"dataset_id": dataset_id, "status": DatasetStatus.ACTIVE} - - -@pytest.mark.parametrize("dataset_id", [1, 33, 131]) -async def test_dataset_status_non_admin_cannot_activate( - dataset_id: int, - expdb_test: AsyncConnection, -) -> None: - with pytest.raises(DatasetAdminOnlyError): - await update_dataset_status( - dataset_id=dataset_id, - status=DatasetStatus.ACTIVE, - user=SOME_USER, - expdb=expdb_test, - ) - - -@pytest.mark.parametrize("dataset_id", [1, 2]) -async def test_dataset_status_non_owner_cannot_deactivate( - dataset_id: int, - expdb_test: AsyncConnection, -) -> None: - with pytest.raises(DatasetNotOwnedError): - await update_dataset_status( - dataset_id=dataset_id, - status=DatasetStatus.DEACTIVATED, - user=SOME_USER, - expdb=expdb_test, - ) From 7f588d9944bf091973f1e75b3e04ab60cd8b9893 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 26 Mar 2026 16:51:32 +0100 Subject: [PATCH 5/7] Fix pre-commit issues --- tests/routers/openml/datasets_features_test.py | 4 +--- tests/routers/openml/datasets_list_datasets_test.py | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/routers/openml/datasets_features_test.py b/tests/routers/openml/datasets_features_test.py index aa3988b5..193b0f31 100644 --- a/tests/routers/openml/datasets_features_test.py +++ b/tests/routers/openml/datasets_features_test.py @@ -86,9 +86,7 @@ async def test_dataset_features_no_access(expdb_test: AsyncConnection) -> None: @pytest.mark.parametrize("user", [ADMIN_USER, DATASET_130_OWNER]) -async def test_dataset_features_access_to_private( - user: User, expdb_test: AsyncConnection -) -> None: +async def test_dataset_features_access_to_private(user: User, expdb_test: AsyncConnection) -> None: features = await get_dataset_features(dataset_id=130, user=user, expdb=expdb_test) assert isinstance(features, list) diff --git a/tests/routers/openml/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py index 10822cc5..d8fb5735 100644 --- a/tests/routers/openml/datasets_list_datasets_test.py +++ b/tests/routers/openml/datasets_list_datasets_test.py @@ -217,7 +217,8 @@ async def test_list_pagination( expdb_db=expdb_test, ) except NoResultsError: - assert offset == 140, "Result was expected but NoResultsError was raised." + expect_empty_offset = 140 + assert offset == expect_empty_offset, "Result was expected but NoResultsError was raised." return reported_ids = {dataset["did"] for dataset in result} assert reported_ids == set(expected_ids) @@ -340,6 +341,6 @@ async def test_list_data_quality( status=DatasetStatusFilter.ALL, user=None, expdb_db=expdb_test, - **{quality: range_}, + **{quality: range_}, # type: ignore[arg-type] ) assert len(result) == count From 0b8f7b4a4a964c47259088479b6d6352666beae9 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 26 Mar 2026 17:20:47 +0100 Subject: [PATCH 6/7] trigger ci From e16b04ea8e0506bd2a633e654a45190803f786a7 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 26 Mar 2026 17:24:07 +0100 Subject: [PATCH 7/7] Kick off CI for changes to tests --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 07f64402..d0af8bca 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,6 +6,7 @@ on: pull_request: paths: - 'src/**' + - 'tests/**' - 'docker/**' - 'docker-compose.yaml' - 'pyproject.toml'