Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
# 'gzip,deflate'
response = requests.post(url, data=data, files=file_elements)
if response.status_code != 200:
raise _parse_server_exception(response)
raise _parse_server_exception(response, url=url)
if 'Content-Encoding' not in response.headers or \
response.headers['Content-Encoding'] != 'gzip':
warnings.warn('Received uncompressed content from OpenML for %s.' % url)
Expand All @@ -117,14 +117,14 @@ def _read_url(url, data=None):
response = requests.post(url, data=data)

if response.status_code != 200:
raise _parse_server_exception(response)
raise _parse_server_exception(response, url=url)
if 'Content-Encoding' not in response.headers or \
response.headers['Content-Encoding'] != 'gzip':
warnings.warn('Received uncompressed content from OpenML for %s.' % url)
return response.text


def _parse_server_exception(response):
def _parse_server_exception(response, url=None):
# OpenML has a sophisticated error system
# where information about failures is provided. Try to parse this
try:
Expand All @@ -143,4 +143,9 @@ def _parse_server_exception(response):
# 512 for runs, 370 for datasets (should be 372), 500 for flows
# 482 for tasks
return OpenMLServerNoResult(code, message, additional)
return OpenMLServerException(code, message, additional)
return OpenMLServerException(
code=code,
message=message,
additional=additional,
url=url
)
11 changes: 11 additions & 0 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import OrderedDict
import hashlib
import io
import os
import re
Expand Down Expand Up @@ -365,6 +366,8 @@ def _get_dataset_arff(did_cache_dir, description):
Location of arff file.
"""
output_file_path = os.path.join(did_cache_dir, "dataset.arff")
md5_checksum_fixture = description.get("oml:md5_checksum")
did = description.get("oml:id")

# This means the file is still there; whether it is useful is up to
# the user and not checked by the program.
Expand All @@ -377,6 +380,14 @@ def _get_dataset_arff(did_cache_dir, description):

url = description['oml:url']
arff_string = _read_url(url)
md5 = hashlib.md5()
md5.update(arff_string.encode('utf8'))
md5_checksum = md5.hexdigest()
if md5_checksum != md5_checksum_fixture:
raise ValueError(
'Checksum %s of downloaded dataset %d is unequal to the checksum '
'%s sent by the server.' % (md5_checksum, did, md5_checksum_fixture)
)

with io.open(output_file_path, "w", encoding='utf8') as fh:
fh.write(arff_string)
Expand Down
6 changes: 5 additions & 1 deletion openml/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,13 @@ class OpenMLServerException(OpenMLServerError):
"""exception for when the result of the server was
not 200 (e.g., listing call w/o results). """

def __init__(self, code, message, additional=None):
# Code needs to be optional to allow the exception to be picklable:
# https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable
def __init__(self, message, code=None, additional=None, url=None):
# Store the error details on the exception instance.
#   message    -- kept as self.message and also forwarded to the base class
#   code       -- kept as self.code (None when not supplied)
#   additional -- kept as self.additional (None when not supplied)
#   url        -- kept as self.url (None when not supplied)
# NOTE(review): `message` is now the first positional parameter; a caller
# that still passes (code, message, additional) positionally would bind its
# code to `message` -- confirm every call site uses keywords or the new order.
self.message = message
self.code = code
self.additional = additional
self.url = url
# Only `message` is handed to the parent initialiser; the other fields are
# available solely as attributes set above.
super(OpenMLServerException, self).__init__(message)


Expand Down
14 changes: 14 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,20 @@ def test__getarff_path_dataset_arff(self):
self.assertIsInstance(arff_path, str)
self.assertTrue(os.path.exists(arff_path))

def test__getarff_md5_issue(self):
# Checks that _get_dataset_arff raises ValueError with the checksum-mismatch
# message when the description's 'oml:md5_checksum' does not match the MD5
# of the downloaded content.
# 'abc' can never equal a 32-character MD5 hex digest, so the mismatch is
# guaranteed regardless of what is downloaded.
# NOTE(review): presumably this needs network access to the URL below and
# no previously cached dataset.arff in self.workdir -- confirm.
description = {
'oml:id': 5,
'oml:md5_checksum': 'abc',
'oml:url': 'https://www.openml.org/data/download/61',
}
# NOTE(review): assertRaisesRegexp is the Python 2 spelling; on Python 3 it
# is a deprecated alias of assertRaisesRegex and was removed in 3.12.
# The expected text is used as a regular expression, so the trailing '.'
# matches any character rather than a literal full stop.
# The hard-coded digest is presumably the MD5 of the file currently served
# at 'oml:url'; the test will fail if that content changes.
self.assertRaisesRegexp(
ValueError,
'Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded dataset 5 '
'is unequal to the checksum abc sent by the server.',
_get_dataset_arff,
self.workdir, description,
)

def test__get_dataset_features(self):
features = _get_dataset_features(self.workdir, 2)
self.assertIsInstance(features, dict)
Expand Down