Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
4c21ad8
First iteration of lazy loading. Does not yet take into account all p…
PGijsbers Mar 17, 2019
312650f
Factor functionality of loading ARFF to correct data format and pickl…
PGijsbers Mar 17, 2019
a1b8c93
Extracted a more general 'download_text_file' function that is now us…
PGijsbers Mar 17, 2019
1b14078
Download data when get_data is called and it had not yet been downloa…
PGijsbers Mar 17, 2019
4090c05
Update unit tests.
PGijsbers Mar 17, 2019
a01a029
Also check if download is required for retrieve class labels.
PGijsbers Mar 17, 2019
50a9c3f
add test to ensure all functionality works without retrieving data.
PGijsbers Mar 17, 2019
d13f0c4
update doc/hint.
PGijsbers Mar 17, 2019
dd6a064
Flake8, unused imports, spacing around =
PGijsbers Mar 17, 2019
9cd8176
Always return path to pickle file.
PGijsbers Mar 17, 2019
18eda4d
Add notice of lazy loading to dataset tutorial.
PGijsbers Mar 18, 2019
3d8deda
Simplified `retrieve_class_labels` using the already downloaded featu…
PGijsbers Mar 18, 2019
6ca05be
Fix a bug where nominal feature with a single unique value is treated…
PGijsbers Mar 18, 2019
5f2919f
Apply AppVeyor fix.
PGijsbers Mar 18, 2019
062e2e9
Update feature xml to most recent.
PGijsbers Mar 18, 2019
391f30a
Update test to reflect retrieve_class_labels is now available with la…
PGijsbers Mar 18, 2019
8603224
Unify loading of features between cached and downloaded.
PGijsbers Mar 18, 2019
76e5bb9
Flake8.
PGijsbers Mar 18, 2019
b812904
Add random element to tag to avoid race conditions in parallel tests.
PGijsbers Mar 18, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,8 @@ install:
- rmdir C:\\cygwin /s /q

# Update previous packages and install the build and runtime dependencies of the project.
# XXX: setuptools>23 is currently broken on Win+py3 with numpy
# (https://github.com/pypa/setuptools/issues/728)
- conda update --all --yes setuptools=23
- conda update conda --yes
- conda update --all --yes

# Install the build and runtime dependencies of the project.
- "cd C:\\projects\\openml-python"
Expand Down
9 changes: 9 additions & 0 deletions examples/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,15 @@
print(X.head())
print(X.info())

############################################################################
# Sometimes you only need access to a dataset's metadata.
# In those cases, you can download the dataset without downloading the
# data file. The dataset object can be used as normal.
# Whenever you use any functionality that requires the data,
# such as `get_data`, the data will be downloaded.
dataset = openml.datasets.get_dataset(68, download_data=False)


############################################################################
# Exercise 2
# **********
Expand Down
244 changes: 123 additions & 121 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def __init__(self, name, description, format=None,
feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
xmlfeature['oml:name'],
xmlfeature['oml:data_type'],
None,
xmlfeature.get('oml:nominal_value'),
int(nr_missing))
if idx != feature.index:
raise ValueError('Data features not provided '
Expand All @@ -167,96 +167,104 @@ def __init__(self, name, description, format=None,
self.qualities = _check_qualities(qualities)

if data_file is not None:
self.data_pickle_file = data_file.replace('.arff', '.pkl.py3')
self.data_pickle_file = self._data_arff_to_pickle(data_file)
else:
self.data_pickle_file = None

if os.path.exists(self.data_pickle_file):
logger.debug("Data pickle file already exists.")
else:
try:
data = self._get_arff(self.format)
except OSError as e:
logger.critical("Please check that the data file %s is "
"there and can be read.", self.data_file)
raise e

ARFF_DTYPES_TO_PD_DTYPE = {
'INTEGER': 'integer',
'REAL': 'floating',
'NUMERIC': 'floating',
'STRING': 'string'
}
attribute_dtype = {}
attribute_names = []
categories_names = {}
categorical = []
for name, type_ in data['attributes']:
# if the feature is nominal and a sparse matrix is
# requested, the categories need to be numeric
if (isinstance(type_, list)
and self.format.lower() == 'sparse_arff'):
try:
np.array(type_, dtype=np.float32)
except ValueError:
raise ValueError(
"Categorical data needs to be numeric when "
"using sparse ARFF."
)
# string can only be supported with pandas DataFrame
elif (type_ == 'STRING'
and self.format.lower() == 'sparse_arff'):
def _data_arff_to_pickle(self, data_file):
data_pickle_file = data_file.replace('.arff', '.pkl.py3')
if os.path.exists(data_pickle_file):
logger.debug("Data pickle file already exists.")
return data_pickle_file
else:
try:
data = self._get_arff(self.format)
except OSError as e:
logger.critical("Please check that the data file %s is "
"there and can be read.", data_file)
raise e

ARFF_DTYPES_TO_PD_DTYPE = {
'INTEGER': 'integer',
'REAL': 'floating',
'NUMERIC': 'floating',
'STRING': 'string'
}
attribute_dtype = {}
attribute_names = []
categories_names = {}
categorical = []
for name, type_ in data['attributes']:
# if the feature is nominal and a sparse matrix is
# requested, the categories need to be numeric
if (isinstance(type_, list)
and self.format.lower() == 'sparse_arff'):
try:
np.array(type_, dtype=np.float32)
except ValueError:
raise ValueError(
"Dataset containing strings is not supported "
"with sparse ARFF."
"Categorical data needs to be numeric when "
"using sparse ARFF."
)

# infer the dtype from the ARFF header
if isinstance(type_, list):
categorical.append(True)
categories_names[name] = type_
if len(type_) == 2:
type_norm = [cat.lower().capitalize()
for cat in type_]
if set(['True', 'False']) == set(type_norm):
categories_names[name] = [
True if cat == 'True' else False
for cat in type_norm
]
attribute_dtype[name] = 'boolean'
else:
attribute_dtype[name] = 'categorical'
# string can only be supported with pandas DataFrame
elif (type_ == 'STRING'
and self.format.lower() == 'sparse_arff'):
raise ValueError(
"Dataset containing strings is not supported "
"with sparse ARFF."
)

# infer the dtype from the ARFF header
if isinstance(type_, list):
categorical.append(True)
categories_names[name] = type_
if len(type_) == 2:
type_norm = [cat.lower().capitalize()
for cat in type_]
if set(['True', 'False']) == set(type_norm):
categories_names[name] = [
True if cat == 'True' else False
for cat in type_norm
]
attribute_dtype[name] = 'boolean'
else:
attribute_dtype[name] = 'categorical'
else:
categorical.append(False)
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
attribute_names.append(name)

if self.format.lower() == 'sparse_arff':
X = data['data']
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix(
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()

elif self.format.lower() == 'arff':
X = pd.DataFrame(data['data'], columns=attribute_names)

col = []
for column_name in X.columns:
if attribute_dtype[column_name] in ('categorical',
'boolean'):
col.append(self._unpack_categories(
X[column_name], categories_names[column_name]))
else:
col.append(X[column_name])
X = pd.concat(col, axis=1)

# Pickle the dataframe or the sparse matrix.
with open(self.data_pickle_file, "wb") as fh:
pickle.dump((X, categorical, attribute_names), fh, -1)
logger.debug("Saved dataset %d: %s to file %s" %
(int(self.dataset_id or -1), self.name,
self.data_pickle_file))
attribute_dtype[name] = 'categorical'
else:
categorical.append(False)
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
attribute_names.append(name)

if self.format.lower() == 'sparse_arff':
X = data['data']
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix(
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()

elif self.format.lower() == 'arff':
X = pd.DataFrame(data['data'], columns=attribute_names)

col = []
for column_name in X.columns:
if attribute_dtype[column_name] in ('categorical',
'boolean'):
col.append(self._unpack_categories(
X[column_name], categories_names[column_name]))
else:
col.append(X[column_name])
X = pd.concat(col, axis=1)

# Pickle the dataframe or the sparse matrix.
with open(data_pickle_file, "wb") as fh:
pickle.dump((X, categorical, attribute_names), fh, -1)
logger.debug("Saved dataset {did}: {name} to file {path}"
.format(did=int(self.dataset_id or -1),
name=self.name,
path=data_pickle_file)
)
return data_pickle_file

def push_tag(self, tag):
"""Annotates this data set with a tag on the server.
Expand Down Expand Up @@ -394,13 +402,19 @@ def _unpack_categories(series, categories):
return pd.Series(col, index=series.index, dtype='category',
name=series.name)

def get_data(self, target=None,
include_row_id=False,
include_ignore_attributes=False,
return_categorical_indicator=False,
return_attribute_names=False,
dataset_format=None):
"""Returns dataset content as dataframes or sparse matrices.
def _download_data(self) -> None:
    """ Download ARFF data file to standard cache directory. Set `self.data_file`. """
    # Imported inside the method body to avoid a circular import at module load time.
    from .functions import _get_dataset_arff
    self.data_file = _get_dataset_arff(self)

def get_data(self, target: str = None,
include_row_id: bool = False,
include_ignore_attributes: bool = False,
return_categorical_indicator: bool = False,
return_attribute_names: bool = False,
dataset_format: str = None):
""" Returns dataset content as dataframes or sparse matrices.

Parameters
----------
Expand All @@ -416,10 +430,10 @@ def get_data(self, target=None,
categorical.
return_attribute_names : boolean (default=False)
Whether to return attribute names.
dataset_format : string
The format of returned dataset. If ``array``, the returned dataset
will be a NumPy array or a SciPy sparse matrix. If ``dataframe``,
the returned dataset will be a Pandas DataFrame or SparseDataFrame.
dataset_format : string, optional
The format of returned dataset.
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.

Returns
-------
Expand All @@ -428,12 +442,11 @@ def get_data(self, target=None,
y : ndarray or series, shape (n_samples,)
Target column(s). Only returned if target is not None.
categorical_indicator : boolean ndarray
Mask that indicates categorical features. Only returned if
return_categorical_indicator is True.
Mask that indicates categorical features.
Only returned if return_categorical_indicator is True.
return_attribute_names : list of strings
List of attribute names. Returned only if return_attribute_names is
True.

List of attribute names.
Only returned if return_attribute_names is True.
"""
if dataset_format is None:
warn('The default of "dataset_format" will change from "array" to'
Expand All @@ -442,6 +455,11 @@ def get_data(self, target=None,

rval = []

if self.data_pickle_file is None:
if self.data_file is None:
self._download_data()
self.data_pickle_file = self._data_arff_to_pickle(self.data_file)

path = self.data_pickle_file
if not os.path.exists(path):
raise ValueError("Cannot find a pickle file for dataset %s at "
Expand Down Expand Up @@ -554,26 +572,10 @@ def retrieve_class_labels(self, target_name='class'):
-------
list
"""

# TODO improve performance, currently reads the whole file
# Should make a method that only reads the attributes
arffFileName = self.data_file

if self.format.lower() == 'arff':
return_type = arff.DENSE
elif self.format.lower() == 'sparse_arff':
return_type = arff.COO
else:
raise ValueError('Unknown data format %s' % self.format)

with io.open(arffFileName, encoding='utf8') as fh:
arffData = arff.ArffDecoder().decode(fh, return_type=return_type)

dataAttributes = dict(arffData['attributes'])
if target_name in dataAttributes:
return dataAttributes[target_name]
else:
return None
for feature in self.features.values():
if (feature.name == target_name) and (feature.data_type == 'nominal'):
return feature.nominal_values
return None

def get_features_by_type(self, data_type, exclude=None,
exclude_ignore_attributes=True,
Expand Down
Loading