Merge pull request #115 from ottogroup/develop
Release 1.2.2
alattner committed Aug 16, 2019
2 parents 11a6e02 + 0f12ac2 commit 52d0aed
Showing 23 changed files with 325 additions and 144 deletions.
10 changes: 7 additions & 3 deletions .travis.yml
@@ -10,16 +10,20 @@ matrix:
env:
- TRAVIS=yes
before_install:
- wget http://repo.continuum.io/miniconda/Miniconda3-4.3.31-Linux-x86_64.sh -O miniconda.sh
- pip install -U pip && pip --version
- wget https://repo.anaconda.com/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh -O miniconda.sh
- chmod +x miniconda.sh
- ./miniconda.sh -b
- export PATH=/home/travis/miniconda3/bin:$PATH
- conda config --set always_yes yes --set changeps1 no
- conda install -q --yes -c r rpy2 r-randomforest tzlocal
- conda install -q python=$TRAVIS_PYTHON_VERSION --file requirements.txt
- conda update -q conda
- pip install coveralls
- travis_retry python setup.py dev
install:
- conda install -q python=$TRAVIS_PYTHON_VERSION
- pip install -r requirements.txt -r requirements-dev.txt
- pip install pandas==0.23.4 # for rpy2 compatibility
- pip install -e .[docs,testing]
script:
- travis_wait py.test --runslow
deploy:
14 changes: 14 additions & 0 deletions CHANGES.txt
@@ -1,3 +1,17 @@
v1.2.2 - 2019-08-15
===================

- Added default Palladium config file locations (to be checked if
`PALLADIUM_CONFIG` is not set)

- Introduced `palladium.dataset.CSV` to replace `palladium.dataset.Table`

- `__copy__` configuration directive now accepts a `__default__`

- Updated requirements in order to use newer versions of dependencies
(also fixing potential security vulnerabilities in dependencies)


v1.2.1.1 - 2018-12-13
=====================

2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.2.1.1
1.2.2
2 changes: 2 additions & 0 deletions conda_recipe/meta.yaml
@@ -15,6 +15,7 @@ build:
entry_points:
- pld-admin = palladium.fit:admin_cmd
- pld-devserver = palladium.server:devserver_cmd
- pld-export = palladium.util:export_cmd
- pld-fit = palladium.fit:fit_cmd
- pld-grid-search = palladium.fit:grid_search_cmd
- pld-list = palladium.eval:list_cmd
@@ -71,6 +72,7 @@ test:
- py.test --pyargs palladium
- pld-admin --help
- pld-devserver --help
- pld-export --help
- pld-fit --help
- pld-grid-search --help
- pld-list --help
27 changes: 22 additions & 5 deletions docs/user/configuration.rst
@@ -15,7 +15,12 @@ Configuration files use Python syntax. For an introduction, please
visit the :ref:`tutorial`.

Palladium uses an environment variable called ``PALLADIUM_CONFIG`` to
look up the location of the configuration file.
look up the location of one or more configuration files. If
``PALLADIUM_CONFIG`` is not set, Palladium will try to find a
configuration file at these locations:

- ``palladium-config.py``
- ``etc/palladium-config.py``

Variables
=========
@@ -41,7 +46,7 @@ folder as the configuration:
.. code-block:: python
'dataset_loader_train': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': '{}/data.csv'.format(here),
}
@@ -75,15 +80,15 @@ file:
.. code-block:: python
'dataset_loader_train': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': '{}/train.csv'.format(here),
'many': '...',
'more': {'...'},
'entries': ['...'],
}
'dataset_loader_test': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': '{}/test.csv'.format(here),
'many': '...',
'more': {'...'},
@@ -95,7 +100,7 @@ With ``__copy__``, you can reduce this down to:
.. code-block:: python
'dataset_loader_train': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': '{}/train.csv'.format(here),
'many': '...',
'more': {'...'},
@@ -108,3 +113,15 @@ With ``__copy__``, you can reduce this down to:
}
Reducing duplication in your configuration can help avoid errors.

If the target of the ``__copy__`` directive does not exist, we can
fall back to a default using the ``__default__`` special keyword. An
example that defaults to an empty ``param_grid`` for cross
validation:

.. code-block:: python
'grid_search': {
'param_grid': {'__copy__': 'param_grid', '__default__': {}},
# ... some involved grid search configuration
}
30 changes: 30 additions & 0 deletions docs/user/faq.rst
@@ -164,6 +164,36 @@ passed at runtime.
'n_jobs': -1,
}
Can I use Bayesian optimization instead of grid search to tune my hyperparameters?
==================================================================================

The grid search configuration allows you to use a class other than
:class:`sklearn.grid_search.GridSearchCV` to do the hyperparameter
search. Here's an example configuration that uses `scikit-optimize
<https://scikit-optimize.github.io/>`_ to search for hyperparameters
using Bayesian optimization, assuming an :class:`sklearn.svm.SVC`
classifier:

.. code-block:: python
'grid_search': {
'__factory__': 'skopt.BayesSearchCV',
'estimator': {'__copy__': 'model'},
'n_iter': 16,
'search_spaces': {
'C': {
'__factory__': 'skopt.space.Real',
'low': 1e-6, 'high': 1e+1, 'prior': 'log-uniform',
},
'degree': {
'__factory__': 'skopt.space.Integer',
'low': 1, 'high': 20,
},
},
'return_train_score': True,
'refit': False,
'verbose': 4,
}
Can I use my cluster to run a hyperparameter search?
====================================================
71 changes: 41 additions & 30 deletions docs/user/tutorial.rst
@@ -144,22 +144,22 @@ our dataset loader that helps us load the training data from the CSV
file with the data, and define which rows should be used as data and
target values. The first entry inside ``dataset_loader_train``
defines the type of dataset loader we want to use. That is
:class:`palladium.dataset.Table`:
:class:`palladium.dataset.CSV`:

.. code-block:: python
'dataset_loader_train': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
The rest what is inside the ``dataset_loader_train`` are the keyword
arguments that are used to initialize the :class:`~palladium.dataset.Table`
component. The full definition of ``dataset_loader_train`` looks like
this:
The rest of what is inside the ``dataset_loader_train`` are the
keyword arguments that are used to initialize the
:class:`~palladium.dataset.CSV` class. The full definition of
``dataset_loader_train`` looks like this:
.. code-block:: python
'dataset_loader_train': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': 'iris.data',
'names': [
'sepal length',
@@ -169,20 +169,20 @@ this:
'species',
],
'target_column': 'species',
'sep': ',',
'nrows': 100,
}
You can now take a look at :class:`~palladium.dataset.Table`'s API to find
out what parameters a Table accepts and what they mean. But to
summarize: the ``path`` is the path to the CSV file. In our case,
this is the relative path to ``iris.data``. Because our CSV file
doesn't have the column names in the first line, we have to provide
the column names using the ``names`` parameter. The ``target_column``
defines which of the columns should be used as the value to be
predicted; this is the last column, which we named ``species``. The
``nrows`` parameter tells :class:`~palladium.dataset.Table` to return only
the first hundred samples from our CSV file.
You can take a look at :class:`~palladium.dataset.CSV`'s API to find
out what parameters the CSV dataset loader accepts and what they mean.
But to summarize: the ``path`` is the path to the CSV file. In our
case, this is the relative path to ``iris.data``. Because our CSV
file doesn't have the column names in the first line, we have to
provide the column names using the ``names`` parameter. The
``target_column`` defines which of the columns should be used as the
value to be predicted; this is the last column, which we named
``species``. The ``nrows`` parameter tells
:class:`~palladium.dataset.CSV` to return only the first hundred
samples from our CSV file.
If you take a look at the next section in the config file, which is
``dataset_loader_test``, you will notice that it is very similar to
@@ -197,21 +197,28 @@ the ``skiprows`` parameter and thus skips the first hundred examples
'skiprows': 100,
Under the hood, :class:`~palladium.dataset.Table` uses
:func:`pandas.io.parsers.read_table` to do the actual loading. Any
additional named parameters passed to :class:`~palladium.dataset.Table` are
passed on to :func:`~pandas.io.parsers.read_table`. That is the case
for the ``sep`` parameter in our example, but there are a lot of other
useful options, too, like ``usecols``, ``skiprows`` and so on.
.. note::
At this point you may be wondering if there's a way to not repeat
the entire ``dataset_loader_train`` section to define the test
dataset loader, just to change the ``skiprows`` argument, there is!
Check out the `configuration`_ docs for details on how to use the
``__copy__`` special keyword.
Under the hood, :class:`~palladium.dataset.CSV` uses
:func:`pandas.io.parsers.read_csv` to do the actual loading. Any
additional named parameters passed to :class:`~palladium.dataset.CSV`
are passed on to :func:`~pandas.io.parsers.read_csv`. In our example,
that is the case for the ``nrows`` and ``skiprows`` parameters.
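For illustration only (``na_values`` and ``dtype`` below are ordinary
:func:`pandas.read_csv` options, not settings required by this tutorial),
such extra keyword arguments can be added straight to the loader entry:

.. code-block:: python

    'dataset_loader_train': {
        '__factory__': 'palladium.dataset.CSV',
        'path': 'iris.data',
        'names': [
            'sepal length',
            'sepal width',
            'petal length',
            'petal width',
            'species',
        ],
        'target_column': 'species',
        'nrows': 100,
        # anything not consumed by CSV itself is handed on to pandas.read_csv:
        'na_values': ['?'],                # treat '?' cells as missing values
        'dtype': {'species': 'category'},
    }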
Palladium also includes a dataset loader for loading data from an SQL
database: :class:`palladium.dataset.SQL`.
But if you find yourself in need to write your own dataset loader,
then that is pretty easy to do: Take a look at Palladium's
:class:`~palladium.interfaces.DatasetLoader` interface that documents how a
:class:`~palladium.interfaces.DatasetLoader` like
:class:`~palladium.dataset.Table` needs to look like.
:class:`~palladium.interfaces.DatasetLoader` interface that documents
how a :class:`~palladium.interfaces.DatasetLoader` like
:class:`~palladium.dataset.CSV` needs to look like.
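As a rough sketch (assuming, as the interface docs describe, that a dataset
loader is a callable returning a ``(data, target)`` tuple; the class name and
file format here are made up for illustration), a custom loader could look
like this:

.. code-block:: python

    import pandas as pd

    from palladium.interfaces import DatasetLoader


    class JSONLoader(DatasetLoader):
        """Hypothetical loader that reads samples from a JSON file."""

        def __init__(self, path, target_column):
            self.path = path
            self.target_column = target_column

        def __call__(self):
            # load the full table, then split off the target column
            df = pd.read_json(self.path)
            target = df.pop(self.target_column)
            return df, target

Such a class can then be referenced from the configuration through
``__factory__``, just like the built-in loaders.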
Model
@@ -324,16 +331,20 @@ Let us take a look at the configuration of ``grid_search``:
'param_grid': {
'C': [0.1, 0.3, 1.0],
},
'return_train_score': True,
'verbose': 4,
}
What parameters should be checked can be specified in the entry
``param_grid``. If more than one parameter with sets of values to
check are provided, all possible combinations are explored by grid
search. ``verbose`` allows to set the level for grid search
messages. It is possible to set other parameters of grid search, e.g.,
how many jobs to be run in parallel can be specified in `n_jobs` (if
set to -1, all cores are used).
messages. With ``return_train_score`` set to ``True``, the result will
also include scores for the training data for each fold.
It is possible to set other parameters of grid search, e.g., how many
jobs to be run in parallel can be specified in `n_jobs` (if set to -1,
all cores are used).
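For instance (the values are purely illustrative), a grid over two
parameters is expanded into every combination, six candidate models in this
case:

.. code-block:: python

    'grid_search': {
        'param_grid': {
            'C': [0.1, 0.3, 1.0],            # 3 values
            'fit_intercept': [True, False],  # 2 values, 3 * 2 = 6 candidates
        },
        'return_train_score': True,
        'verbose': 4,
        'n_jobs': -1,  # use all available cores
    }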
Palladium uses :class:`sklearn.grid_search.GridSearchCV` to do the actual
work. Thus, you'll want to take a look at the `scikit-learn docs for
3 changes: 1 addition & 2 deletions examples/R/config-iris-dataset-from-python.py
@@ -4,7 +4,7 @@

{
'dataset_loader_train': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': 'iris.data',
'names': [
'sepal length',
@@ -14,7 +14,6 @@
'species',
],
'target_column': 'species',
'sep': ',',
'nrows': 100,
},

9 changes: 5 additions & 4 deletions examples/iris/config.py
@@ -5,7 +5,7 @@
},

'dataset_loader_train': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': 'iris.data',
'names': [
'sepal length',
@@ -15,12 +15,11 @@
'species',
],
'target_column': 'species',
'sep': ',',
'nrows': 100,
},

'dataset_loader_test': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': 'iris.data',
'names': [
'sepal length',
@@ -30,19 +29,21 @@
'species',
],
'target_column': 'species',
'sep': ',',
'skiprows': 100,
},

'model': {
'__factory__': 'sklearn.linear_model.LogisticRegression',
'C': 0.3,
'solver': 'lbfgs',
'multi_class': 'auto',
},

'grid_search': {
'param_grid': {
'C': [0.1, 0.3, 1.0],
},
'return_train_score': True,
'verbose': 4,
'n_jobs': -1,
},
6 changes: 2 additions & 4 deletions examples/julia/config.py
@@ -1,6 +1,6 @@
{
'dataset_loader_train': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': 'iris.data',
'names': [
'sepal length',
@@ -10,13 +10,12 @@
'species',
],
'target_column': 'species',
'sep': ',',
'nrows': 100,
'converters': {'species': lambda x: 1 if x == 'Iris-setosa' else -1},
},

'dataset_loader_test': {
'__factory__': 'palladium.dataset.Table',
'__factory__': 'palladium.dataset.CSV',
'path': 'iris.data',
'names': [
'sepal length',
@@ -26,7 +25,6 @@
'species',
],
'target_column': 'species',
'sep': ',',
'skiprows': 100,
'converters': {'species': lambda x: 1 if x == 'Iris-setosa' else -1},
},
