Merge pull request #213 from openml/smartimputer

Smartimputer
openml · Mar 27, 2017 · b3262b6 · b3262b6
2 parents 34efa1b + b11d5a5
commit b3262b6
Show file tree

Hide file tree

Showing 16 changed files with 20,346 additions and 90 deletions.
diff --git a/openml/__init__.py b/openml/__init__.py
@@ -16,7 +16,7 @@
 """
 from . import config
 
-from .datasets import OpenMLDataset
+from .datasets import OpenMLDataset, OpenMLDataFeature
 from . import datasets
 from . import runs
 from . import flows
@@ -27,5 +27,6 @@
 
 __version__ = "0.2.1"
 
-__all__ = ['OpenMLDataset', 'OpenMLRun', 'OpenMLSplit', 'datasets',
-           'OpenMLTask', 'OpenMLFlow', 'config', 'runs', 'flows']
+__all__ = ['OpenMLDataset', 'OpenMLDataFeature', 'OpenMLRun',
+           'OpenMLSplit', 'datasets', 'OpenMLTask', 'OpenMLFlow',
+           'config', 'runs', 'flows']
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
@@ -1,6 +1,7 @@
 from .functions import (list_datasets, check_datasets_active,
                         get_datasets, get_dataset)
 from .dataset import OpenMLDataset
+from .data_feature import OpenMLDataFeature
 
 __all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
-           'OpenMLDataset', 'list_datasets']
+           'OpenMLDataset', 'OpenMLDataFeature', 'list_datasets']
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
@@ -0,0 +1,36 @@
+
+class OpenMLDataFeature(object):
+    """Data feature (a.k.a. attribute) of a dataset.
+
+    Parameters
+    ----------
+    index : int
+        The index of this feature.
+    name : str
+        Name of the feature; stored as ``str(name)``.
+    data_type : str
+        One of ``LEGAL_DATA_TYPES``: nominal, numeric, string or date
+        (corresponds to the ARFF attribute types).
+    nominal_values : list(str) or None
+        List of the possible values, in case of a nominal attribute.
+    number_missing_values : int
+        Stored as given; presumably the number of missing values of this
+        feature (TODO confirm against the server response).
+
+    Raises
+    ------
+    ValueError
+        If ``index``, ``nominal_values`` or ``number_missing_values`` has
+        the wrong type, or if ``data_type`` is not a legal data type.
+    """
+    LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']
+
+    def __init__(self, index, name, data_type, nominal_values,
+                 number_missing_values):
+        # isinstance (instead of an exact type comparison) also admits
+        # subclasses of int and list.
+        if not isinstance(index, int):
+            raise ValueError('Index is of wrong datatype')
+        if data_type not in self.LEGAL_DATA_TYPES:
+            raise ValueError('data type should be in %s, found: %s' %
+                             (str(self.LEGAL_DATA_TYPES), data_type))
+        if nominal_values is not None and not isinstance(nominal_values, list):
+            raise ValueError('Nominal_values is of wrong datatype')
+        if not isinstance(number_missing_values, int):
+            raise ValueError('number_missing_values is of wrong datatype')
+
+        self.index = index
+        self.name = str(name)
+        self.data_type = str(data_type)
+        self.nominal_values = nominal_values
+        self.number_missing_values = number_missing_values
+
+    def __str__(self):
+        return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -2,6 +2,7 @@
 import io
 import logging
 import os
+import six
 import sys
 
 import arff
@@ -10,6 +11,7 @@
 import scipy.sparse
 import xmltodict
 
+from .data_feature import OpenMLDataFeature
 from ..exceptions import PyOpenMLError
 
 if sys.version_info[0] >= 3:
@@ -63,7 +65,15 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
         self.url = url
         self.default_target_attribute = default_target_attribute
         self.row_id_attribute = row_id_attribute
-        self.ignore_attributes = ignore_attribute
+        self.ignore_attributes = None
+        if isinstance(ignore_attribute, six.string_types):
+            self.ignore_attributes = [ignore_attribute]
+        elif isinstance(ignore_attribute, list):
+            self.ignore_attributes = ignore_attribute
+        elif ignore_attribute is None:
+            pass
+        else:
+            raise ValueError('wrong data type for ignore_attribute. Should be list. ')
         self.version_label = version_label
         self.citation = citation
         self.tag = tag
@@ -73,7 +83,20 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
         self.update_comment = update_comment
         self.md5_cheksum = md5_checksum
         self.data_file = data_file
-        self.features = features
+        self.features = None
+
+        if features is not None:
+            self.features = {}
+            for idx, xmlfeature in enumerate(features['oml:feature']):
+                feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
+                                            xmlfeature['oml:name'],
+                                            xmlfeature['oml:data_type'],
+                                            None, #todo add nominal values (currently not in database)
+                                            int(xmlfeature['oml:number_of_missing_values']))
+                if idx != feature.index:
+                    raise ValueError('Data features not provided in right order')
+                self.features[feature.index] = feature
+
 
         if data_file is not None:
             if self._data_features_supported():
@@ -205,10 +228,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
             if not self.ignore_attributes:
                 pass
             else:
-                if is_string(self.ignore_attributes):
-                    to_exclude.append(self.ignore_attributes)
-                else:
-                    to_exclude.extend(self.ignore_attributes)
+                to_exclude.extend(self.ignore_attributes)
 
         if len(to_exclude) > 0:
             logger.info("Going to remove the following attributes:"
@@ -298,6 +318,61 @@ def retrieve_class_labels(self, target_name='class'):
         else:
             return None
 
+
+    def get_features_by_type(self, data_type, exclude=None,
+                             exclude_ignore_attributes=True,
+                             exclude_row_id_attribute=True):
+        """Return the indices of all features of a given data type.
+
+        Excluded features are treated as if they were removed from the
+        dataset: they are never returned, and the index of every feature
+        that follows an excluded one is shifted down accordingly.
+
+        Parameters
+        ----------
+        data_type : str
+            The data type to return (e.g., nominal, numeric, date, string)
+        exclude : list(str)
+            Names of features to exclude; they are matched against the
+            feature names, not against the feature indices.
+        exclude_ignore_attributes : bool
+            Whether to exclude the defined ignore attributes (and adapt the
+            return values as if these features are not present)
+        exclude_row_id_attribute : bool
+            Whether to exclude the defined row id attribute (and adapt the
+            return values as if this feature is not present)
+
+        Returns
+        -------
+        result : list
+            a list of (shifted) indices of the features that have the
+            specified data type
+        """
+        assert data_type in OpenMLDataFeature.LEGAL_DATA_TYPES, "Illegal feature type requested"
+        if self.ignore_attributes is not None:
+            assert type(self.ignore_attributes) is list, "ignore_attributes should be a list"
+        if self.row_id_attribute is not None:
+            # six.string_types accepts both str and Python 2 unicode strings.
+            assert isinstance(self.row_id_attribute, six.string_types), "row id attribute should be a str"
+        if exclude is not None:
+            assert type(exclude) is list, "Exclude should be a list"
+
+        to_exclude = []
+        if exclude is not None:
+            to_exclude.extend(exclude)
+        if exclude_ignore_attributes and self.ignore_attributes is not None:
+            to_exclude.extend(self.ignore_attributes)
+        if exclude_row_id_attribute and self.row_id_attribute is not None:
+            to_exclude.append(self.row_id_attribute)
+
+        result = []
+        # Number of excluded features seen so far; subtracted from every
+        # later index because those features count as removed.
+        offset = 0
+        # Visit the features in ascending index order: the offset
+        # bookkeeping depends on it and dict iteration order is not
+        # guaranteed on every supported interpreter.
+        for idx in sorted(self.features):
+            feature = self.features[idx]
+            if feature.name in to_exclude:
+                offset += 1
+            elif feature.data_type == data_type:
+                result.append(idx - offset)
+        return result
+
     def publish(self):
         """Publish the dataset on the OpenML server.
 
@@ -349,8 +424,8 @@ def _to_xml(self):
 
     def _data_features_supported(self):
+        """Report whether every known feature has a supported data type.
+
+        Returns False as soon as a feature whose data_type is neither
+        'numeric' nor 'nominal' is found, and True otherwise (including
+        the case where no feature information is available).
+        """
         if self.features is not None:
-            for feature in self.features['oml:feature']:
-                if feature['oml:data_type'] not in ['numeric', 'nominal']:
+            for idx in self.features:
+                if self.features[idx].data_type not in ['numeric', 'nominal']:
                     return False
             return True
         return True
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -66,10 +66,10 @@ def list_flows(offset=None, size=None, tag=None):
     if tag is not None:
         api_call += "/tag/%s" % tag
 
-    return _list_datasets(api_call)
+    return _list_flows(api_call)
 
 
-def _list_datasets(api_call):
+def _list_flows(api_call):
     # TODO add proper error handling here!
     xml_string = _perform_api_call(api_call)
     flows_dict = xmltodict.parse(xml_string)

diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -2,8 +2,11 @@
 import io
 import os
 import xmltodict
+import numpy as np
+import warnings
 from sklearn.model_selection._search import BaseSearchCV
 
+from ..exceptions import PyOpenMLError
 from .. import config
 from ..flows import sklearn_to_flow
 from ..exceptions import OpenMLCacheException
@@ -50,11 +53,7 @@ def run_task(task, model):
 
     # execute the run
     run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
-
-    try:
-        run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
-    except AttributeError as message:
-        run.error_message = str(message)
+    run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
 
     # now generate the flow
     flow = sklearn_to_flow(model)
@@ -70,6 +69,46 @@ def run_task(task, model):
     return run
 
 
+def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
+                       predicted_probabilities, class_labels, model_classes_mapping):
+    """Util function that turns the prediction of a classifier for a given
+    instance into the right arff format to upload to openml.
+
+    The resulting row is laid out as
+    ``[rep_no, fold_no, row_id, p_0, ..., p_(k-1), predicted, correct]``
+    with one probability per entry of ``class_labels``.
+
+    Parameters
+    ----------
+    rep_no : int
+    fold_no : int
+    row_id : int
+        row id in the initial dataset
+    correct_label : str
+        original label of the instance; appended to the row unchanged
+    predicted_label : int
+        index into ``class_labels`` of the label that was predicted
+    predicted_probabilities : array (size=len(model_classes_mapping))
+        probabilities per class known to the model, in the order of
+        ``model_classes_mapping``
+    class_labels : array (size=num_classes)
+    model_classes_mapping : list or array
+        The classes the model produced, expressed as positions in
+        ``class_labels``. Obtained by BaseEstimator.classes_
+
+    Returns
+    -------
+    arff_line : list
+        representation of the current prediction in OpenML format
+    """
+    # Coerce to an ndarray so the elementwise comparison below also works
+    # when a plain list is passed (list == int is a single False).
+    model_classes = np.asarray(model_classes_mapping)
+
+    arff_line = [rep_no, fold_no, row_id]
+    for class_label_idx in range(len(class_labels)):
+        if class_label_idx in model_classes:
+            # np.where with only a condition returns a tuple holding one
+            # index array per dimension; the first [0] selects the index
+            # array, the second [0] its first match.
+            index = np.where(model_classes == class_label_idx)[0][0]
+            arff_line.append(predicted_probabilities[index])
+        else:
+            # The model never saw this class, so it has no probability
+            # column for it.
+            arff_line.append(0.0)
+
+    arff_line.append(class_labels[predicted_label])
+    arff_line.append(correct_label)
+    return arff_line
+
+# TODO(JvR): why is class_labels a parameter? It could be removed and taken from the task object, right?
 def _run_task_get_arffcontent(model, task, class_labels):
     X, Y = task.get_X_and_y()
     arff_datacontent = []
@@ -88,19 +127,20 @@ def _run_task_get_arffcontent(model, task, class_labels):
             testY = Y[test_indices]
 
             model.fit(trainX, trainY)
+
             if isinstance(model, BaseSearchCV):
-                _add_results_to_arfftrace(arff_tracecontent, fold_no, model,
-                                          rep_no)
+                _add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no)
+                model_classes = model.best_estimator_.classes_
+            else:
+                model_classes = model.classes_
 
             ProbaY = model.predict_proba(testX)
             PredY = model.predict(testX)
+            if ProbaY.shape[1] != len(class_labels):
+                warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
 
             for i in range(0, len(test_indices)):
-                assert(len(ProbaY[i]) == len(class_labels)), 'Predicted probabilities and available classes do not match. (sklearn bug?) '
-                arff_line = [rep_no, fold_no, test_indices[i]]
-                arff_line.extend(ProbaY[i])
-                arff_line.append(class_labels[PredY[i]])
-                arff_line.append(class_labels[testY[i]])
+                arff_line = _prediction_to_row(rep_no, fold_no, test_indices[i], class_labels[testY[i]], PredY[i], ProbaY[i], class_labels, model_classes)
                 arff_datacontent.append(arff_line)
 
             fold_no = fold_no + 1

diff --git a/requirements.txt b/requirements.txt
@@ -5,5 +5,5 @@ liac-arff>=2.1.1dev
 xmltodict
 nose
 requests
-scikit-learn
+scikit-learn>=0.18
 nbformat