From 772183568dfedfba27fa571d4399b8f5a6b5aedd Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 2 Nov 2017 14:46:43 +0100 Subject: [PATCH 1/2] ADD URL to exception --- openml/_api_calls.py | 13 +++++++++---- openml/exceptions.py | 6 +++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/openml/_api_calls.py b/openml/_api_calls.py index b59d926bb..81a3d7756 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -95,7 +95,7 @@ def _read_url_files(url, data=None, file_dictionary=None, file_elements=None): # 'gzip,deflate' response = requests.post(url, data=data, files=file_elements) if response.status_code != 200: - raise _parse_server_exception(response) + raise _parse_server_exception(response, url=url) if 'Content-Encoding' not in response.headers or \ response.headers['Content-Encoding'] != 'gzip': warnings.warn('Received uncompressed content from OpenML for %s.' % url) @@ -117,14 +117,14 @@ def _read_url(url, data=None): response = requests.post(url, data=data) if response.status_code != 200: - raise _parse_server_exception(response) + raise _parse_server_exception(response, url=url) if 'Content-Encoding' not in response.headers or \ response.headers['Content-Encoding'] != 'gzip': warnings.warn('Received uncompressed content from OpenML for %s.' % url) return response.text -def _parse_server_exception(response): +def _parse_server_exception(response, url=None): # OpenML has a sopisticated error system # where information about failures is provided. 
try to parse this try: @@ -143,4 +143,9 @@ def _parse_server_exception(response): # 512 for runs, 370 for datasets (should be 372), 500 for flows # 482 for tasks return OpenMLServerNoResult(code, message, additional) - return OpenMLServerException(code, message, additional) + return OpenMLServerException( + code=code, + message=message, + additional=additional, + url=url + ) diff --git a/openml/exceptions.py b/openml/exceptions.py index eb5890a1c..386e25cdc 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -16,9 +16,13 @@ class OpenMLServerException(OpenMLServerError): """exception for when the result of the server was not 200 (e.g., listing call w/o results). """ - def __init__(self, code, message, additional=None): + # Code needs to be optional to allow the exception to be picklable: + # https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable + def __init__(self, message, code=None, additional=None, url=None): + self.message = message self.code = code self.additional = additional + self.url = url super(OpenMLServerException, self).__init__(message) From 34d5a96b61eb4e143db3142f8ebda787c2902fd0 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 2 Nov 2017 14:47:23 +0100 Subject: [PATCH 2/2] test md5 hash on dataset download --- openml/datasets/functions.py | 11 +++++++++++ tests/test_datasets/test_dataset_functions.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 9f7fa4e71..f6dea2cfb 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -1,4 +1,5 @@ from collections import OrderedDict +import hashlib import io import os import re @@ -365,6 +366,8 @@ def _get_dataset_arff(did_cache_dir, description): Location of arff file. 
""" output_file_path = os.path.join(did_cache_dir, "dataset.arff") + md5_checksum_fixture = description.get("oml:md5_checksum") + did = description.get("oml:id") # This means the file is still there; whether it is useful is up to # the user and not checked by the program. @@ -377,6 +380,14 @@ def _get_dataset_arff(did_cache_dir, description): url = description['oml:url'] arff_string = _read_url(url) + md5 = hashlib.md5() + md5.update(arff_string.encode('utf8')) + md5_checksum = md5.hexdigest() + if md5_checksum != md5_checksum_fixture: + raise ValueError( + 'Checksum %s of downloaded dataset %d is unequal to the checksum ' + '%s sent by the server.' % (md5_checksum, did, md5_checksum_fixture) + ) with io.open(output_file_path, "w", encoding='utf8') as fh: fh.write(arff_string) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 337efc55b..5a0520a46 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -200,6 +200,20 @@ def test__getarff_path_dataset_arff(self): self.assertIsInstance(arff_path, str) self.assertTrue(os.path.exists(arff_path)) + def test__getarff_md5_issue(self): + description = { + 'oml:id': 5, + 'oml:md5_checksum': 'abc', + 'oml:url': 'https://www.openml.org/data/download/61', + } + self.assertRaisesRegexp( + ValueError, + 'Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded dataset 5 ' + 'is unequal to the checksum abc sent by the server.', + _get_dataset_arff, + self.workdir, description, + ) + def test__get_dataset_features(self): features = _get_dataset_features(self.workdir, 2) self.assertIsInstance(features, dict)