Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Single input task partial fix #541

Merged
merged 7 commits into from Dec 3, 2018
Empty file modified ci_scripts/flake8_diff.sh 100644 → 100755
Empty file.
8 changes: 4 additions & 4 deletions openml/config.py
Expand Up @@ -19,11 +19,11 @@
'apikey': None,
'server': "https://www.openml.org/api/v1/xml",
'verbosity': 0,
'cachedir': os.path.expanduser('~/.openml/cache'),
'cachedir': os.path.expanduser(os.path.join('~', '.openml', 'cache')),
mfeurer marked this conversation as resolved.
Show resolved Hide resolved
'avoid_duplicate_runs': 'True',
}

config_file = os.path.expanduser('~/.openml/config')
# Path of the user's OpenML configuration file (~/.openml/config).
# NOTE: os.path.join requires commas between components; the adjacent
# literals '.openml' 'config' would silently concatenate to '.openmlconfig'.
config_file = os.path.expanduser(os.path.join('~', '.openml', 'config'))

# Default values are actually added here in the _setup() function which is
# called at the end of this module
Expand All @@ -48,7 +48,7 @@ def _setup():
global avoid_duplicate_runs
# read config file, create cache directory
try:
os.mkdir(os.path.expanduser('~/.openml'))
os.mkdir(os.path.expanduser(os.path.join('~', '.openml')))
except (IOError, OSError):
# TODO add debug information
pass
Expand Down Expand Up @@ -96,7 +96,7 @@ def get_cache_directory():

"""
url_suffix = urlparse(server).netloc
reversed_url_suffix = '/'.join(url_suffix.split('.')[::-1])
reversed_url_suffix = os.sep.join(url_suffix.split('.')[::-1])
if not cache_directory:
_cachedir = _defaults(cache_directory)
else:
Expand Down
51 changes: 32 additions & 19 deletions openml/tasks/functions.py
Expand Up @@ -10,9 +10,10 @@
from ..datasets import get_dataset
from .task import (
OpenMLClassificationTask,
OpenMLRegressionTask,
OpenMLClusteringTask,
OpenMLLearningCurveTask,
OpenMLRegressionTask,
OpenMLSupervisedTask
)
import openml.utils
import openml._api_calls
Expand Down Expand Up @@ -292,9 +293,13 @@ def get_task(task_id):
try:
task = _get_task_description(task_id)
dataset = get_dataset(task.dataset_id)
class_labels = dataset.retrieve_class_labels(task.target_name)
task.class_labels = class_labels
task.download_split()
# Clustering tasks do not have class labels
# and do not offer download_split
if isinstance(task, OpenMLSupervisedTask):
task.download_split()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to move this into the task classes? If they have a split and class labels they retrieve them, otherwise, they don't. Also, how does this work for regression tasks (regarding class labels)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice point — it does not work for regression. What do you think of the following?
As for your first point, I do not really like it, because then we have to call get_dataset from the task.

if isinstance(task, OpenMLSupervisedTask):
    task.download_split()
    if isinstance(task, OpenMLClassificationTask): 
        task.class_labels = \
            dataset.retrieve_class_labels(task.target_name)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. Your proposed solution is fine for this.

if isinstance(task, OpenMLClassificationTask):
task.class_labels = \
dataset.retrieve_class_labels(task.target_name)
except Exception as e:
openml.utils._remove_cache_dir_for_id(
TASKS_CACHE_DIR_NAME,
Expand Down Expand Up @@ -323,6 +328,7 @@ def _get_task_description(task_id):
fh.write(task_xml)
return _create_task_from_xml(task_xml)


def _create_task_from_xml(xml):
"""Create a task given a xml string.

Expand All @@ -336,46 +342,53 @@ def _create_task_from_xml(xml):
OpenMLTask
"""
dic = xmltodict.parse(xml)["oml:task"]

estimation_parameters = dict()
inputs = dict()
# Due to the unordered structure we obtain, we first have to extract
# the possible keys of oml:input; dic["oml:input"] is a list of
# OrderedDicts
for input_ in dic["oml:input"]:
name = input_["@name"]
inputs[name] = input_

# Check if there is a list of inputs
if isinstance(dic["oml:input"], list):
for input_ in dic["oml:input"]:
name = input_["@name"]
inputs[name] = input_
# Single input case
elif isinstance(dic["oml:input"], dict):
name = dic["oml:input"]["@name"]
inputs[name] = dic["oml:input"]

evaluation_measures = None
if 'evaluation_measures' in inputs:
evaluation_measures = inputs["evaluation_measures"][
"oml:evaluation_measures"]["oml:evaluation_measure"]

# Convert some more parameters
for parameter in \
inputs["estimation_procedure"]["oml:estimation_procedure"][
"oml:parameter"]:
name = parameter["@name"]
text = parameter.get("#text", "")
estimation_parameters[name] = text

task_type = dic["oml:task_type"]
common_kwargs = {
'task_id': dic["oml:task_id"],
'task_type': task_type,
'task_type_id': dic["oml:task_type_id"],
'data_set_id': inputs["source_data"][
"oml:data_set"]["oml:data_set_id"],
'estimation_procedure_type': inputs["estimation_procedure"][
"oml:estimation_procedure"]["oml:type"],
'estimation_parameters': estimation_parameters,
'evaluation_measure': evaluation_measures,
}
if task_type in (
"Supervised Classification",
"Supervised Regression",
"Learning Curve"
):
# Convert some more parameters
for parameter in \
inputs["estimation_procedure"]["oml:estimation_procedure"][
"oml:parameter"]:
name = parameter["@name"]
text = parameter.get("#text", "")
estimation_parameters[name] = text

common_kwargs['estimation_procedure_type'] = inputs[
"estimation_procedure"][
"oml:estimation_procedure"]["oml:type"],
common_kwargs['estimation_parameters'] = estimation_parameters,
common_kwargs['target_name'] = inputs[
"source_data"]["oml:data_set"]["oml:target_feature"]
common_kwargs['data_splits_url'] = inputs["estimation_procedure"][
Expand Down
16 changes: 5 additions & 11 deletions openml/tasks/task.py
Expand Up @@ -9,16 +9,11 @@

class OpenMLTask(object):
def __init__(self, task_id, task_type_id, task_type, data_set_id,
estimation_procedure_type, estimation_parameters,
evaluation_measure):
self.task_id = int(task_id)
self.task_type_id = int(task_type_id)
self.task_type = task_type
self.dataset_id = int(data_set_id)
self.estimation_procedure = dict()
self.estimation_procedure["type"] = estimation_procedure_type
self.estimation_procedure["parameters"] = estimation_parameters
self.estimation_parameters = estimation_parameters
self.evaluation_measure = evaluation_measure

def get_dataset(self):
Expand Down Expand Up @@ -57,12 +52,14 @@ def __init__(self, task_id, task_type_id, task_type, data_set_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
evaluation_measure=evaluation_measure,
)
self.target_name = target_name
self.estimation_procedure = dict()
self.estimation_procedure["type"] = estimation_procedure_type
self.estimation_procedure["parameters"] = estimation_parameters
self.estimation_parameters = estimation_parameters
self.estimation_procedure["data_splits_url"] = data_splits_url
self.target_name = target_name
self.split = None

def get_X_and_y(self):
Expand Down Expand Up @@ -169,15 +166,12 @@ def __init__(self, task_id, task_type_id, task_type, data_set_id,

class OpenMLClusteringTask(OpenMLTask):
def __init__(self, task_id, task_type_id, task_type, data_set_id,
estimation_procedure_type, estimation_parameters,
evaluation_measure, number_of_clusters=None):
super(OpenMLClusteringTask, self).__init__(
task_id=task_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
evaluation_measure=evaluation_measure,
)
self.number_of_clusters = number_of_clusters
Expand Down
9 changes: 9 additions & 0 deletions tests/test_tasks/test_task_functions.py
Expand Up @@ -156,6 +156,15 @@ def test_get_task_with_cache(self):
task = openml.tasks.get_task(1)
self.assertIsInstance(task, OpenMLTask)

def test_get_task_different_types(self):
    """Smoke-test get_task for each non-classification task type.

    Runs against the production server. Covers a regression task, a
    learning-curve task and a clustering task; the clustering id
    reproduces issue #538 (get_task failing on clustering tasks).
    """
    openml.config.server = self.production_server
    # Task ids: 5001 = regression, 64 = learning curve,
    # 126033 = clustering (issue 538).
    for task_id in (5001, 64, 126033):
        openml.tasks.functions.get_task(task_id)

def test_download_split(self):
task = openml.tasks.get_task(1)
split = task.download_split()
Expand Down