Skip to content

Commit

Permalink
Merge pull request #213 from openml/smartimputer
Browse files Browse the repository at this point in the history
Smartimputer
  • Loading branch information
mfeurer committed Mar 27, 2017
2 parents 34efa1b + b11d5a5 commit b3262b6
Show file tree
Hide file tree
Showing 16 changed files with 20,346 additions and 90 deletions.
7 changes: 4 additions & 3 deletions openml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"""
from . import config

from .datasets import OpenMLDataset
from .datasets import OpenMLDataset, OpenMLDataFeature
from . import datasets
from . import runs
from . import flows
Expand All @@ -27,5 +27,6 @@

__version__ = "0.2.1"

__all__ = ['OpenMLDataset', 'OpenMLRun', 'OpenMLSplit', 'datasets',
'OpenMLTask', 'OpenMLFlow', 'config', 'runs', 'flows']
__all__ = ['OpenMLDataset', 'OpenMLDataFeature', 'OpenMLRun',
'OpenMLSplit', 'datasets', 'OpenMLTask', 'OpenMLFlow',
'config', 'runs', 'flows']
3 changes: 2 additions & 1 deletion openml/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .functions import (list_datasets, check_datasets_active,
get_datasets, get_dataset)
from .dataset import OpenMLDataset
from .data_feature import OpenMLDataFeature

__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
'OpenMLDataset', 'list_datasets']
'OpenMLDataset', 'OpenMLDataFeature', 'list_datasets']
36 changes: 36 additions & 0 deletions openml/datasets/data_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

class OpenMLDataFeature(object):
    """Data Feature (a.k.a. Attribute) object.

    Parameters
    ----------
    index : int
        The index of this feature
    name : str
        Name of the feature (stored as ``str(name)``)
    data_type : str
        One of ``LEGAL_DATA_TYPES``: nominal, numeric, string, date
        (corresponds to arff)
    nominal_values : list(str) or None
        list of the possible values, in case of nominal attribute
    number_missing_values : int
        Number of missing values recorded for this feature

    Raises
    ------
    ValueError
        If ``index`` or ``number_missing_values`` is not an integer, if
        ``data_type`` is not a legal data type, or if ``nominal_values`` is
        neither ``None`` nor a list.
    """
    LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']

    def __init__(self, index, name, data_type, nominal_values,
                 number_missing_values):
        # isinstance (rather than an exact type comparison) accepts int/list
        # subclasses; bool is an int subclass but is never a meaningful
        # index or count, so it is still rejected explicitly.
        if isinstance(index, bool) or not isinstance(index, int):
            raise ValueError('Index is of wrong datatype')
        if data_type not in self.LEGAL_DATA_TYPES:
            raise ValueError('data type should be in %s, found: %s' %
                             (str(self.LEGAL_DATA_TYPES), data_type))
        if nominal_values is not None and not isinstance(nominal_values, list):
            raise ValueError('Nominal_values is of wrong datatype')
        if (isinstance(number_missing_values, bool) or
                not isinstance(number_missing_values, int)):
            raise ValueError('number_missing_values is of wrong datatype')

        self.index = index
        self.name = str(name)
        self.data_type = str(data_type)
        self.nominal_values = nominal_values
        self.number_missing_values = number_missing_values

    def __str__(self):
        return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)

    def __repr__(self):
        # Unambiguous form for debugging, e.g. when printing a dict of features.
        return "%s(index=%r, name=%r, data_type=%r)" % (
            type(self).__name__, self.index, self.name, self.data_type)
91 changes: 83 additions & 8 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import io
import logging
import os
import six
import sys

import arff
Expand All @@ -10,6 +11,7 @@
import scipy.sparse
import xmltodict

from .data_feature import OpenMLDataFeature
from ..exceptions import PyOpenMLError

if sys.version_info[0] >= 3:
Expand Down Expand Up @@ -63,7 +65,15 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
self.url = url
self.default_target_attribute = default_target_attribute
self.row_id_attribute = row_id_attribute
self.ignore_attributes = ignore_attribute
self.ignore_attributes = None
if isinstance(ignore_attribute, six.string_types):
self.ignore_attributes = [ignore_attribute]
elif isinstance(ignore_attribute, list):
self.ignore_attributes = ignore_attribute
elif ignore_attribute is None:
pass
else:
raise ValueError('wrong data type for ignore_attribute. Should be list. ')
self.version_label = version_label
self.citation = citation
self.tag = tag
Expand All @@ -73,7 +83,20 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
self.update_comment = update_comment
self.md5_cheksum = md5_checksum
self.data_file = data_file
self.features = features
self.features = None

if features is not None:
self.features = {}
for idx, xmlfeature in enumerate(features['oml:feature']):
feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
xmlfeature['oml:name'],
xmlfeature['oml:data_type'],
None, #todo add nominal values (currently not in database)
int(xmlfeature['oml:number_of_missing_values']))
if idx != feature.index:
raise ValueError('Data features not provided in right order')
self.features[feature.index] = feature


if data_file is not None:
if self._data_features_supported():
Expand Down Expand Up @@ -205,10 +228,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
if not self.ignore_attributes:
pass
else:
if is_string(self.ignore_attributes):
to_exclude.append(self.ignore_attributes)
else:
to_exclude.extend(self.ignore_attributes)
to_exclude.extend(self.ignore_attributes)

if len(to_exclude) > 0:
logger.info("Going to remove the following attributes:"
Expand Down Expand Up @@ -298,6 +318,61 @@ def retrieve_class_labels(self, target_name='class'):
else:
return None


def get_features_by_type(self, data_type, exclude=None,
                         exclude_ignore_attributes=True,
                         exclude_row_id_attribute=True):
    """
    Returns indices of features of a given type, e.g., all nominal features.

    Features can additionally be excluded by name; the returned indices are
    shifted down as if every excluded feature had been removed from the
    dataset.

    Parameters
    ----------
    data_type : str
        The data type to return (e.g., nominal, numeric, date, string)
    exclude : list(str)
        Names of features to exclude (entries are compared against each
        feature's ``name``; the return values are adapted as if these
        features are not present)
    exclude_ignore_attributes : bool
        Whether to exclude the defined ignore attributes (and adapt the
        return values as if these features are not present)
    exclude_row_id_attribute : bool
        Whether to exclude the defined row id attribute (and adapt the
        return values as if this feature is not present)

    Returns
    -------
    result : list
        a list of (offset-adjusted) indices that have the specified data type
    """
    assert data_type in OpenMLDataFeature.LEGAL_DATA_TYPES, "Illegal feature type requested"
    if self.ignore_attributes is not None:
        assert isinstance(self.ignore_attributes, list), "ignore_attributes should be a list"
    if self.row_id_attribute is not None:
        assert isinstance(self.row_id_attribute, str), "row id attribute should be a str"
    if exclude is not None:
        assert isinstance(exclude, list), "Exclude should be a list"

    to_exclude = []
    if exclude is not None:
        to_exclude.extend(exclude)
    if exclude_ignore_attributes and self.ignore_attributes is not None:
        to_exclude.extend(self.ignore_attributes)
    if exclude_row_id_attribute and self.row_id_attribute is not None:
        to_exclude.append(self.row_id_attribute)

    result = []
    # Number of excluded features seen so far; every later index is shifted
    # down by this amount, since excluded features are assumed to be removed
    # from the dataset.
    offset = 0
    # The offset bookkeeping is only correct when features are visited in
    # ascending index order, which plain dict iteration does not guarantee
    # on every supported interpreter, hence the explicit sort.
    for idx in sorted(self.features):
        feature = self.features[idx]
        if feature.name in to_exclude:
            offset += 1
        elif feature.data_type == data_type:
            result.append(idx - offset)
    return result

def publish(self):
"""Publish the dataset on the OpenML server.
Expand Down Expand Up @@ -349,8 +424,8 @@ def _to_xml(self):

def _data_features_supported(self):
if self.features is not None:
for feature in self.features['oml:feature']:
if feature['oml:data_type'] not in ['numeric', 'nominal']:
for idx in self.features:
if self.features[idx].data_type not in ['numeric', 'nominal']:
return False
return True
return True
4 changes: 2 additions & 2 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ def list_flows(offset=None, size=None, tag=None):
if tag is not None:
api_call += "/tag/%s" % tag

return _list_datasets(api_call)
return _list_flows(api_call)


def _list_datasets(api_call):
def _list_flows(api_call):
# TODO add proper error handling here!
xml_string = _perform_api_call(api_call)
flows_dict = xmltodict.parse(xml_string)
Expand Down
64 changes: 52 additions & 12 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import io
import os
import xmltodict
import numpy as np
import warnings
from sklearn.model_selection._search import BaseSearchCV

from ..exceptions import PyOpenMLError
from .. import config
from ..flows import sklearn_to_flow
from ..exceptions import OpenMLCacheException
Expand Down Expand Up @@ -50,11 +53,7 @@ def run_task(task, model):

# execute the run
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)

try:
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
except AttributeError as message:
run.error_message = str(message)
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)

# now generate the flow
flow = sklearn_to_flow(model)
Expand All @@ -70,6 +69,46 @@ def run_task(task, model):
return run


def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
predicted_probabilities, class_labels, model_classes_mapping):
"""Util function that turns probability estimates of a classifier for a given
instance into the right arff format to upload to openml.
Parameters
----------
rep_no : int
fold_no : int
row_id : int
row id in the initial dataset
correct_label : str
original label of the instance
predicted_label : str
the label that was predicted
predicted_probabilities : array (size=num_classes)
probabilities per class
class_labels : array (size=num_classes)
model_classes_mapping : list
A list of classes the model produced.
Obtained by BaseEstimator.classes_
Returns
-------
arff_line : list
representation of the current prediction in OpenML format
"""
arff_line = [rep_no, fold_no, row_id]
for class_label_idx in range(len(class_labels)):
if class_label_idx in model_classes_mapping:
index = np.where(model_classes_mapping == class_label_idx)[0][0] # TODO: WHY IS THIS 2D???
arff_line.append(predicted_probabilities[index])
else:
arff_line.append(0.0)

arff_line.append(class_labels[predicted_label])
arff_line.append(correct_label)
return arff_line

# JvR: why is class labels a parameter? could be removed and taken from task object, right?
def _run_task_get_arffcontent(model, task, class_labels):
X, Y = task.get_X_and_y()
arff_datacontent = []
Expand All @@ -88,19 +127,20 @@ def _run_task_get_arffcontent(model, task, class_labels):
testY = Y[test_indices]

model.fit(trainX, trainY)

if isinstance(model, BaseSearchCV):
_add_results_to_arfftrace(arff_tracecontent, fold_no, model,
rep_no)
_add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no)
model_classes = model.best_estimator_.classes_
else:
model_classes = model.classes_

ProbaY = model.predict_proba(testX)
PredY = model.predict(testX)
if ProbaY.shape[1] != len(class_labels):
warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))

for i in range(0, len(test_indices)):
assert(len(ProbaY[i]) == len(class_labels)), 'Predicted probabilities and available classes do not match. (sklearn bug?) '
arff_line = [rep_no, fold_no, test_indices[i]]
arff_line.extend(ProbaY[i])
arff_line.append(class_labels[PredY[i]])
arff_line.append(class_labels[testY[i]])
arff_line = _prediction_to_row(rep_no, fold_no, test_indices[i], class_labels[testY[i]], PredY[i], ProbaY[i], class_labels, model_classes)
arff_datacontent.append(arff_line)

fold_no = fold_no + 1
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ liac-arff>=2.1.1dev
xmltodict
nose
requests
scikit-learn
scikit-learn>=0.18
nbformat

0 comments on commit b3262b6

Please sign in to comment.