From 30c8df3058cd1f50b783dddc110610ef7d75de85 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 23 May 2019 14:01:05 +0200
Subject: [PATCH 01/19] Travis: Update to use latest pip and Miniconda
 versions

Also, improve the installation sequence: We now run "python setup.py
dev" after installing requirements.txt, which avoids double
installation of dependencies.
---
 .travis.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 40ad332..e8dd9c8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,8 @@ matrix:
       env:
         - TRAVIS=yes
 before_install:
-  - wget http://repo.continuum.io/miniconda/Miniconda3-4.3.31-Linux-x86_64.sh -O miniconda.sh
+  - pip install -U pip && pip --version
+  - wget https://repo.anaconda.com/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh -O miniconda.sh
   - chmod +x miniconda.sh
   - ./miniconda.sh -b
   - export PATH=/home/travis/miniconda3/bin:$PATH
@@ -19,7 +20,10 @@ before_install:
   - conda install -q python=$TRAVIS_PYTHON_VERSION --file requirements.txt
   - conda update -q conda
   - pip install coveralls
-  - travis_retry python setup.py dev
+install:
+  - pip install -r requirements.txt
+  - pip install -e .
+  - python setup.py dev
 script:
   - travis_wait py.test --runslow
 deploy:

From 5a2c65959a539442bb66611e454176b7d534fdb5 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 23 May 2019 15:32:23 +0200
Subject: [PATCH 02/19] Travis: Remove pip install of requirements; we use
 conda for this
---
 .travis.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index e8dd9c8..b8d825e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,13 +17,11 @@ before_install:
   - export PATH=/home/travis/miniconda3/bin:$PATH
   - conda config --set always_yes yes --set changeps1 no
   - conda install -q --yes -c r rpy2 r-randomforest tzlocal
-  - conda install -q python=$TRAVIS_PYTHON_VERSION --file requirements.txt
   - conda update -q conda
   - pip install coveralls
 install:
-  - pip install -r requirements.txt
-  - pip install -e .
-  - python setup.py dev
+  - conda install -q python=$TRAVIS_PYTHON_VERSION --file requirements.txt
+  - travis_retry python setup.py dev
 script:
   - travis_wait py.test --runslow
 deploy:

From 74da978473b71066351a3c0208fad1a7be130dec Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 25 Apr 2019 17:26:07 +0200
Subject: [PATCH 03/19] Add a few default locations for the Palladium config
 file

If ``PALLADIUM_CONFIG`` is not set, Palladium will try to find a
configuration file at these locations:

- ``palladium-config.py``
- ``etc/palladium-config.py``
---
 docs/user/configuration.rst         |  7 ++++++-
 palladium/config.py                 | 13 +++++++++++++
 palladium/tests/test_config.py      | 18 +++++++++++++++++-
 palladium/tests/test_persistence.py |  4 ++--
 4 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/docs/user/configuration.rst b/docs/user/configuration.rst
index e05197d..c3b9908 100644
--- a/docs/user/configuration.rst
+++ b/docs/user/configuration.rst
@@ -15,7 +15,12 @@ Configuration files use Python syntax.
 For an introduction, please visit the :ref:`tutorial`.
 
 Palladium uses an environment variable called ``PALLADIUM_CONFIG`` to
-look up the location of the configuration file.
+look up the location of one or more configuration files. If
+``PALLADIUM_CONFIG`` is not set, Palladium will try to find a
+configuration file at these locations:
+
+- ``palladium-config.py``
+- ``etc/palladium-config.py``
 
 Variables
 =========

diff --git a/palladium/config.py b/palladium/config.py
index f3cb278..a5a2146 100644
--- a/palladium/config.py
+++ b/palladium/config.py
@@ -12,6 +12,11 @@
 refer to the manual for more details.
 """
 
+DEFAULT_CONFIG_FILE_LOCATIONS = (
+    'palladium-config.py',
+    os.path.join('etc', 'palladium-config.py'),
+    )
+
 
 class Config(dict):
     """A dictionary that represents the app's configuration.
@@ -212,7 +217,15 @@ def _get_config(**extra):
     if not _config.initialized:
         _config.update(extra)
         _config.initialized = True
+
         fnames = os.environ.get('PALLADIUM_CONFIG')
+        if fnames is None:
+            for fname in DEFAULT_CONFIG_FILE_LOCATIONS:
+                if os.path.exists(fname):  # pragma: no cover
+                    fnames = fname
+                    print("Using configuration at {}".format(fname))
+                    break
+
         if fnames is not None:
             configs = []
             fnames = [fname.strip() for fname in fnames.split(',')]

diff --git a/palladium/tests/test_config.py b/palladium/tests/test_config.py
index a3ab706..02fec6b 100644
--- a/palladium/tests/test_config.py
+++ b/palladium/tests/test_config.py
@@ -1,3 +1,4 @@
+from contextlib import contextmanager
 from functools import reduce
 import operator
 import os
@@ -38,6 +39,14 @@ def __init__(self):
         self.cfg = get_config().copy()
 
 
+@contextmanager
+def cwd(path):
+    before = os.getcwd()
+    os.chdir(path)
+    yield
+    os.chdir(before)
+
+
 def test_config_class_keyerror():
     from palladium.config import Config
     with pytest.raises(KeyError) as e:
@@ -71,7 +80,7 @@ def get_config(self):
 
     @pytest.fixture
     def config1_fname(self, tmpdir):
-        path = tmpdir.join('config1.py')
+        path = tmpdir.join('palladium-config.py')
         path.write("""{
             'env': environ['ENV1'],
             'here': here,
@@ -100,6 +109,13 @@ def config3_fname(self, tmpdir):
     def test_extras(self, get_config):
         assert get_config(foo='bar')['foo'] == 'bar'
 
+    def test_default_config(self, get_config, config1_fname, monkeypatch):
+        here = os.path.dirname(config1_fname)
+        monkeypatch.setitem(os.environ, 'ENV1', 'one')
+        with cwd(here):
+            config = get_config()
+        assert config['here'] == here
+
     def test_variables(self, get_config, config1_fname, monkeypatch):
         monkeypatch.setitem(os.environ, 'PALLADIUM_CONFIG', config1_fname)
         monkeypatch.setitem(os.environ, 'ENV1', 'one')

diff --git a/palladium/tests/test_persistence.py b/palladium/tests/test_persistence.py
index ef18502..071ab25 100644
--- a/palladium/tests/test_persistence.py
+++ b/palladium/tests/test_persistence.py
@@ -76,7 +76,7 @@ def test_read(self, File):
              patch('palladium.persistence.pickle.load') as load:
             lm.return_value = [{'version': 99}]
             lp.return_value = {'active-model': '99'}
-            exists.return_value = True
+            exists.side_effect = lambda fn: fn == '/models/model-99.pkl.gz'
             open.return_value = MagicMock()
             result = File('/models/model-{version}').read()
             open.assert_called_with('/models/model-99.pkl.gz', 'rb')
@@ -90,7 +90,7 @@ def test_read_with_version(self, File):
              patch('palladium.persistence.gzip.open') as gzopen,\
              patch('palladium.persistence.pickle.load') as load:
             lm.return_value = [{'version': 99}]
-            exists.return_value = True
+            exists.side_effect = lambda fn: fn == '/models/model-432.pkl.gz'
             open.return_value = MagicMock()
             result = File('/models/model-{version}').read(432)
             open.assert_called_with('/models/model-432.pkl.gz', 'rb')

From 888afcc9b1283bbb51c3e42422a16a0c66456d4e Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 23 May 2019 13:06:07 +0200
Subject: [PATCH 04/19] In pld-fit, free training data memory before loading
 any test data
---
 palladium/fit.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/palladium/fit.py b/palladium/fit.py
index 8587d82..5bc8df3 100644
--- a/palladium/fit.py
+++ b/palladium/fit.py
@@ -1,6 +1,7 @@
 """Utilities for fitting models.
 """
 
+import gc
 from warnings import warn
 import sys
 
@@ -74,9 +75,13 @@ def scorer(model, X, y):
         annotate(model, {'score_train': score_train})
         logger.info("Train score: {}".format(score_train))
 
+    X, y = None, None
+    gc.collect()
+
     score_test = None
     if evaluate and dataset_loader_test is not None:
-        X_test, y_test = dataset_loader_test()
+        with timer(logger.info, "Loading test data"):
+            X_test, y_test = dataset_loader_test()
         with timer(logger.debug, "Evaluating model on test set"):
             score_test = scorer(model, X_test, y_test)
         annotate(model, {'score_test': score_test})

From 6ffd82c227f3516b915503d948b3bc2329e1b2ee Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Wed, 5 Jun 2019 17:52:37 +0200
Subject: [PATCH 05/19] GridSearchCV no longer sets return_train_score by
 default

Update the tutorial to include `return_train_score` in the config,
since the output includes it.
---
 docs/user/tutorial.rst  | 10 +++++++---
 examples/iris/config.py |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/docs/user/tutorial.rst b/docs/user/tutorial.rst
index de0f2f5..43da2e6 100644
--- a/docs/user/tutorial.rst
+++ b/docs/user/tutorial.rst
@@ -324,6 +324,7 @@ Let us take a look at the configuration of ``grid_search``:
         'param_grid': {
             'C': [0.1, 0.3, 1.0],
         },
+        'return_train_score': True,
         'verbose': 4,
     }
 
@@ -331,9 +332,12 @@ What parameters should be checked can be specified in the entry
 ``param_grid``. If more than one parameter with sets of values to
 check are provided, all possible combinations are explored by grid
 search. ``verbose`` allows to set the level for grid search
-messages. It is possible to set other parameters of grid search, e.g.,
-how many jobs to be run in parallel can be specified in `n_jobs` (if
-set to -1, all cores are used).
+messages. With ``return_train_score`` set to ``True``, the result will
+also include scores for the training data for each fold.
+
+It is possible to set other grid search parameters; e.g., the number
+of jobs to run in parallel can be specified in `n_jobs` (if set to -1,
+all cores are used).
 
 Palladium uses :class:`sklearn.grid_search.GridSearchCV` to do the
 actual work. Thus, you'll want to take a look at the `scikit-learn docs for

diff --git a/examples/iris/config.py b/examples/iris/config.py
index 93a6db7..72109b0 100644
--- a/examples/iris/config.py
+++ b/examples/iris/config.py
@@ -43,6 +43,7 @@
         'param_grid': {
             'C': [0.1, 0.3, 1.0],
         },
+        'return_train_score': True,
         'verbose': 4,
         'n_jobs': -1,
     },

From 12cc84f35031753866b805a8d10a6166ef84c3db Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Wed, 5 Jun 2019 17:54:43 +0200
Subject: [PATCH 06/19] sklearn.externals.joblib is deprecated; import from
 joblib package
---
 palladium/fit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/palladium/fit.py b/palladium/fit.py
index 5bc8df3..7771131 100644
--- a/palladium/fit.py
+++ b/palladium/fit.py
@@ -7,8 +7,8 @@
 from datetime import datetime
 
 from docopt import docopt
+from joblib import parallel_backend
 import pandas
-from sklearn.externals.joblib import parallel_backend
 from sklearn.metrics import get_scorer
 from sklearn.model_selection import GridSearchCV

From 7ff90ac4ca1334bf08ca1ec76e13356696a0cc82 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Wed, 5 Jun 2019 17:55:33 +0200
Subject: [PATCH 07/19] grid_search script: move 'double scoring' warning to
 after exception

It's a very small and cosmetic change, but the effect is that, if you
specify 'scoring' both in 'grid_search' and on a global level, you
will now see a ValueError without seeing a warning that tells you to
move 'scoring' to the top level.
---
 palladium/fit.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/palladium/fit.py b/palladium/fit.py
index 7771131..de0bd8d 100644
--- a/palladium/fit.py
+++ b/palladium/fit.py
@@ -216,12 +216,12 @@ def grid_search(dataset_loader_train, model, grid_search, scoring=None,
         search_kwargs['cv'] = apply_kwargs(cv, n=len(y), X=X, y=y)
 
     if 'scoring' in search_kwargs:
-        warn("Use of 'scoring' inside of 'grid_search' is deprecated. "
-             "To fix, move 'scoring' up to the top level of the configuration "
-             "dict.", DeprecationWarning)
         if scoring is not None:
             raise ValueError("You cannot define 'scoring' in 'grid_search' "
                              "and globally.")
+        warn("Use of 'scoring' inside of 'grid_search' is deprecated. "
+             "To fix, move 'scoring' up to the top level of the configuration "
+             "dict.", DeprecationWarning)
         scoring = search_kwargs['scoring']
     elif scoring is not None:
         search_kwargs['scoring'] = scoring

From bcf9920ed26bb43eae102141d1b57e5006053bde Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Wed, 5 Jun 2019 18:25:03 +0200
Subject: [PATCH 08/19] Introduce palladium.dataset.CSV to replace
 palladium.dataset.Table

Table relies on pandas.read_table, which is deprecated. Also, CSV as
a name is more intuitive.
---
 docs/user/configuration.rst                   |  8 +--
 docs/user/tutorial.rst                        | 61 +++++++++++--------
 examples/R/config-iris-dataset-from-python.py |  3 +-
 examples/iris/config.py                       |  6 +-
 examples/julia/config.py                      |  6 +-
 palladium/dataset.py                          | 14 ++++-
 palladium/tests/test_dataset.py               | 51 ++++++++--------
 7 files changed, 79 insertions(+), 70 deletions(-)

diff --git a/docs/user/configuration.rst b/docs/user/configuration.rst
index c3b9908..40a2323 100644
--- a/docs/user/configuration.rst
+++ b/docs/user/configuration.rst
@@ -46,7 +46,7 @@ folder as the configuration:
 .. code-block:: python
 
     'dataset_loader_train': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': '{}/data.csv'.format(here),
     }
@@ -80,7 +80,7 @@ file:
 
 .. code-block:: python
 
     'dataset_loader_train': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': '{}/train.csv'.format(here),
         'many': '...',
         'more': {'...'},
     }
 
     'dataset_loader_test': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': '{}/test.csv'.format(here),
         'many': '...',
         'more': {'...'},
     }
@@ -100,7 +100,7 @@ With ``__copy__``, you can reduce this down to:
 
 .. code-block:: python
 
     'dataset_loader_train': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': '{}/train.csv'.format(here),
         'many': '...',
         'more': {'...'},

diff --git a/docs/user/tutorial.rst b/docs/user/tutorial.rst
index 43da2e6..3d06de4 100644
--- a/docs/user/tutorial.rst
+++ b/docs/user/tutorial.rst
@@ -144,22 +144,22 @@ our dataset loader that helps us load the training data from the CSV
 file with the data, and define which rows should be used as data and
 target values. The first entry inside ``dataset_loader_train``
 defines the type of dataset loader we want to use. That is
-:class:`palladium.dataset.Table`:
+:class:`palladium.dataset.CSV`:
 
 .. code-block:: python
 
     'dataset_loader_train': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
 
-The rest what is inside the ``dataset_loader_train`` are the keyword
-arguments that are used to initialize the :class:`~palladium.dataset.Table`
-component. The full definition of ``dataset_loader_train`` looks like
-this:
+The rest of what is inside the ``dataset_loader_train`` are the
+keyword arguments that are used to initialize the
+:class:`~palladium.dataset.CSV` class. The full definition of
+``dataset_loader_train`` looks like this:
 
 .. code-block:: python
 
     'dataset_loader_train': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': 'iris.data',
         'names': [
             'sepal length',
@@ -169,20 +169,20 @@
             'species',
         ],
         'target_column': 'species',
-        'sep': ',',
         'nrows': 100,
     }
 
-You can now take a look at :class:`~palladium.dataset.Table`'s API to find
-out what parameters a Table accepts and what they mean. But to
-summarize: the ``path`` is the path to the CSV file. In our case,
-this is the relative path to ``iris.data``. Because our CSV file
-doesn't have the column names in the first line, we have to provide
-the column names using the ``names`` parameter. The ``target_column``
-defines which of the columns should be used as the value to be
-predicted; this is the last column, which we named ``species``. The
-``nrows`` parameter tells :class:`~palladium.dataset.Table` to return only
-the first hundred samples from our CSV file.
+You can take a look at :class:`~palladium.dataset.CSV`'s API to find
+out what parameters the CSV dataset loader accepts and what they mean.
+But to summarize: the ``path`` is the path to the CSV file. In our
+case, this is the relative path to ``iris.data``. Because our CSV
+file doesn't have the column names in the first line, we have to
+provide the column names using the ``names`` parameter. The
+``target_column`` defines which of the columns should be used as the
+value to be predicted; this is the last column, which we named
+``species``. The ``nrows`` parameter tells
+:class:`~palladium.dataset.CSV` to return only the first hundred
+samples from our CSV file.
 If you take a look at the next section in the config file, which is
 ``dataset_loader_test``, you will notice that it is very similar to
@@ -197,21 +197,28 @@ the ``skiprows`` parameter and thus skips the first hundred examples
         'skiprows': 100,
 
-Under the hood, :class:`~palladium.dataset.Table` uses
-:func:`pandas.io.parsers.read_table` to do the actual loading. Any
-additional named parameters passed to :class:`~palladium.dataset.Table` are
-passed on to :func:`~pandas.io.parsers.read_table`. That is the case
-for the ``sep`` parameter in our example, but there are a lot of other
-useful options, too, like ``usecols``, ``skiprows`` and so on.
+.. note::
+
+  At this point you may be wondering if there's a way to not repeat
+  the entire ``dataset_loader_train`` section to define the test
+  dataset loader, just to change the ``skiprows`` argument: there is!
+  Check out the `configuration`_ docs for details on how to use the
+  ``__copy__`` special keyword.
+
+Under the hood, :class:`~palladium.dataset.CSV` uses
+:func:`pandas.io.parsers.read_csv` to do the actual loading. Any
+additional named parameters passed to :class:`~palladium.dataset.CSV`
+are passed on to :func:`~pandas.io.parsers.read_csv`. In our example,
+that is the case for the ``nrows`` and ``skiprows`` parameters.
 
 Palladium also includes a dataset loader for loading data from an SQL
 database: :class:`palladium.dataset.SQL`. But if you find yourself in
-need to write your own dataset loader, then that is pretty easy to do:
-Take a look at Palladium's
-:class:`~palladium.interfaces.DatasetLoader` interface that documents how a
-:class:`~palladium.interfaces.DatasetLoader` like
-:class:`~palladium.dataset.Table` needs to look like.
+need to write your own dataset loader, then that is pretty easy to do:
+Take a look at Palladium's
+:class:`~palladium.interfaces.DatasetLoader` interface that documents
+what a :class:`~palladium.interfaces.DatasetLoader` like
+:class:`~palladium.dataset.CSV` needs to look like.
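For illustration: a dataset loader is essentially a callable returning a
``(data, target)`` tuple, as the ``CSV`` implementation in
``palladium/dataset.py`` further below shows. A minimal custom loader
might look like the following sketch; the ``JSONLines`` class and its
file format are hypothetical, for illustration only:

.. code-block:: python

    import json

    from palladium.interfaces import DatasetLoader


    class JSONLines(DatasetLoader):
        """Hypothetical loader for newline-delimited JSON records."""

        def __init__(self, path, target_column):
            self.path = path
            self.target_column = target_column

        def __call__(self):
            data, target = [], []
            with open(self.path) as f:
                for line in f:
                    record = json.loads(line)
                    # Split off the target value; the remaining values
                    # (in a stable key order) become the features:
                    target.append(record.pop(self.target_column))
                    data.append([record[key] for key in sorted(record)])
            return data, target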
 
 Model

diff --git a/examples/R/config-iris-dataset-from-python.py b/examples/R/config-iris-dataset-from-python.py
index b91a381..f4271a4 100644
--- a/examples/R/config-iris-dataset-from-python.py
+++ b/examples/R/config-iris-dataset-from-python.py
@@ -4,7 +4,7 @@
 {
     'dataset_loader_train': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': 'iris.data',
         'names': [
             'sepal length',
@@ -14,7 +14,6 @@
             'species',
         ],
         'target_column': 'species',
-        'sep': ',',
         'nrows': 100,
     },

diff --git a/examples/iris/config.py b/examples/iris/config.py
index 72109b0..98fcb8f 100644
--- a/examples/iris/config.py
+++ b/examples/iris/config.py
@@ -5,7 +5,7 @@
     },
 
     'dataset_loader_train': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': 'iris.data',
         'names': [
             'sepal length',
@@ -15,12 +15,11 @@
             'species',
         ],
         'target_column': 'species',
-        'sep': ',',
         'nrows': 100,
     },
 
     'dataset_loader_test': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': 'iris.data',
         'names': [
             'sepal length',
@@ -30,7 +29,6 @@
             'species',
         ],
         'target_column': 'species',
-        'sep': ',',
         'skiprows': 100,
     },

diff --git a/examples/julia/config.py b/examples/julia/config.py
index 974b2ff..f91f7aa 100644
--- a/examples/julia/config.py
+++ b/examples/julia/config.py
@@ -1,6 +1,6 @@
 {
     'dataset_loader_train': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': 'iris.data',
         'names': [
             'sepal length',
@@ -10,13 +10,12 @@
             'species',
         ],
         'target_column': 'species',
-        'sep': ',',
         'nrows': 100,
         'converters': {'species': lambda x: 1 if x == 'Iris-setosa' else -1},
     },
 
     'dataset_loader_test': {
-        '__factory__': 'palladium.dataset.Table',
+        '__factory__': 'palladium.dataset.CSV',
         'path': 'iris.data',
         'names': [
             'sepal length',
@@ -26,7 +25,6 @@
             'species',
         ],
         'target_column': 'species',
-        'sep': ',',
         'skiprows': 100,
         'converters': {'species': lambda x: 1 if x == 'Iris-setosa' else -1},
     },

diff --git a/palladium/dataset.py b/palladium/dataset.py
index cd6f462..8d799b2 100644
--- a/palladium/dataset.py
+++ b/palladium/dataset.py
@@ -13,12 +13,12 @@
 from .util import RruleThread
 
 
-class Table(DatasetLoader):
+class CSV(DatasetLoader):
     """A :class:`~palladium.interfaces.DatasetLoader` that uses
-    :func:`pandas.io.parsers.read_table` to load data from a file or
+    :func:`pandas.io.parsers.read_csv` to load data from a file or
     URL.
     """
-    pandas_read = staticmethod(pandas.io.parsers.read_table)
+    pandas_read = staticmethod(pandas.io.parsers.read_csv)
 
     def __init__(self, path, target_column=None, ndarray=True,
                  **kwargs):
@@ -63,6 +63,14 @@ def __call__(self):
         return data, target
 
 
+class Table(CSV):
+    """A :class:`~palladium.interfaces.DatasetLoader` that uses the
+    deprecated :func:`pandas.io.parsers.read_table` to load data from
+    a file or URL.
+    """
+    pandas_read = staticmethod(pandas.io.parsers.read_table)
+
+
 class SQL(DatasetLoader):
     """A :class:`~palladium.interfaces.DatasetLoader` that uses
     :func:`pandas.io.sql.read_sql` to load data from an SQL database.
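For comparison, a configuration entry using the ``SQL`` loader mentioned
in the docstring above might look like the sketch below. The ``url`` and
``sql`` parameter names are assumptions for illustration; they are not
shown in this patch:

.. code-block:: python

    'dataset_loader_train': {
        '__factory__': 'palladium.dataset.SQL',
        # 'url' and 'sql' are assumed parameter names (illustration only):
        'url': 'sqlite:///iris.db',
        'sql': 'SELECT * FROM iris',
        'target_column': 'species',
    }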
diff --git a/palladium/tests/test_dataset.py b/palladium/tests/test_dataset.py
index 37649a4..1c53c7c 100644
--- a/palladium/tests/test_dataset.py
+++ b/palladium/tests/test_dataset.py
@@ -3,7 +3,6 @@
 from unittest.mock import MagicMock
 from unittest.mock import patch
 
-import numpy as np
 from pandas import DataFrame
 import pytest
 import sklearn
@@ -15,54 +14,54 @@
     })
 
 
-class TestTable:
+class TestCSV:
     @pytest.fixture
-    def Table(self):
-        from palladium.dataset import Table
-        return Table
-
-    def test_it(self, Table):
-        with patch("palladium.dataset.Table.pandas_read") as read_table:
-            read_table.return_value = dummy_dataframe[3:5]  # simulate skiprows
-            dataset = Table('mypath', 'targetcol', some='keyword', skiprows=3)
+    def CSV(self):
+        from palladium.dataset import CSV
+        return CSV
+
+    def test_it(self, CSV):
+        with patch("palladium.dataset.CSV.pandas_read") as read_csv:
+            read_csv.return_value = dummy_dataframe[3:5]  # simulate skiprows
+            dataset = CSV('mypath', 'targetcol', some='keyword', skiprows=3)
             data, target = dataset()
-            read_table.assert_called_with('mypath', some='keyword', skiprows=3)
+            read_csv.assert_called_with('mypath', some='keyword', skiprows=3)
             assert len(data) == len(target) == 2
             assert data.tolist() == [[13, 23.0], [14, 24.0]]
             assert target.tolist() == [3, 4]
 
-    def test_ndarray_false(self, Table):
-        with patch("palladium.dataset.Table.pandas_read") as read_table:
-            read_table.return_value = dummy_dataframe[3:5]
-            dataset = Table('mypath', 'targetcol', some='keyword',
-                            skiprows=3, ndarray=False)  # simulate skiprows
+    def test_ndarray_false(self, CSV):
+        with patch("palladium.dataset.CSV.pandas_read") as read_csv:
+            read_csv.return_value = dummy_dataframe[3:5]
+            dataset = CSV('mypath', 'targetcol', some='keyword',
+                          skiprows=3, ndarray=False)  # simulate skiprows
             data, target = dataset()
             assert data['datacol1'].tolist() == [13, 14]
             assert data['datacol2'].tolist() == [23.0, 24.0]
             assert target.tolist() == [3, 4]
 
-    def test_no_slice(self, Table):
-        with patch("palladium.dataset.Table.pandas_read") as read_table:
-            read_table.return_value = dummy_dataframe
-            dataset = Table('mypath', 'targetcol', some='keyword')
+    def test_no_slice(self, CSV):
+        with patch("palladium.dataset.CSV.pandas_read") as read_csv:
+            read_csv.return_value = dummy_dataframe
+            dataset = CSV('mypath', 'targetcol', some='keyword')
             data, target = dataset()
-            read_table.assert_called_with('mypath', some='keyword')
+            read_csv.assert_called_with('mypath', some='keyword')
             assert len(data) == len(target) == len(dummy_dataframe)
             assert data.tolist() == [
                 [10, 20.0], [11, 21.0], [12, 22.0], [13, 23.0], [14, 24.0],
                 ]
             assert target.tolist() == [0, 1, 2, 3, 4]
 
-    def test_table_no_target(self, Table):
-        with patch("palladium.dataset.Table.pandas_read") as read_table:
-            read_table.return_value = dummy_dataframe
-            dataset = Table('mypath', some='keyword')
+    def test_no_target(self, CSV):
+        with patch("palladium.dataset.CSV.pandas_read") as read_csv:
+            read_csv.return_value = dummy_dataframe
+            dataset = CSV('mypath', some='keyword')
             data, target = dataset()
-            read_table.assert_called_with('mypath', some='keyword')
+            read_csv.assert_called_with('mypath', some='keyword')
             assert len(data) == len(dummy_dataframe)
             assert target is None

From cb798f66f57c3ec7d892d7b677d2d08544971fa3 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 21 Jun 2019 19:24:31 +0200
Subject: [PATCH 09/19] __copy__ configuration directive now accepts a
 __default__
---
 docs/user/configuration.rst    | 12 ++++++++++++
 palladium/config.py            | 14 ++++++++++++--
 palladium/tests/test_config.py |  6 ++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/docs/user/configuration.rst b/docs/user/configuration.rst
index 40a2323..5004919 100644
--- a/docs/user/configuration.rst
+++ b/docs/user/configuration.rst
@@ -113,3 +113,15 @@ With ``__copy__``, you can reduce this down to:
     }
 
 Reducing duplication in your configuration can help avoid errors.
+
+If the target of the ``__copy__`` directive does not exist, we can
+fall back to a default using the ``__default__`` special keyword. An
+example that defaults to an empty ``param_grid`` for cross
+validation:
+
+.. code-block:: python
+
+    'grid_search': {
+        'param_grid': {'__copy__': 'param_grid', '__default__': {}},
+        # ... some involved grid search configuration
+    }

diff --git a/palladium/config.py b/palladium/config.py
index a5a2146..295a538 100644
--- a/palladium/config.py
+++ b/palladium/config.py
@@ -94,10 +94,20 @@ def __call__(self, name, props):
         if self_reference:
             value = self._resolve(self.configs[:-1], dotted_path)
         else:
-            value = self._resolve(self.configs, dotted_path)
+            try:
+                value = self._resolve(self.configs, dotted_path)
+            except KeyError:
+                if '__default__' in props:
+                    return props['__default__']
+                else:
+                    raise
 
         value = deepcopy(value)
-        if len(props) > 1:
+        nonmagicprops = [
+            prop for prop in props
+            if not (prop.startswith('__') and prop.endswith('__'))
+            ]
+        if nonmagicprops:
             recursive_copy = self.key in value
             value.update(props)
             if not recursive_copy:

diff --git a/palladium/tests/test_config.py b/palladium/tests/test_config.py
index 02fec6b..678f7b3 100644
--- a/palladium/tests/test_config.py
+++ b/palladium/tests/test_config.py
@@ -235,6 +235,11 @@ def config2(self):
             },
         'mycopiedconstant': {
             '__copy__': 'mycopiedconstant',
+            '__default__': 42,
+            },
+        'mycopywithdefault': {
+            '__copy__': 'nonexistant',
+            '__default__': 42,
             },
         }
@@ -301,6 +306,7 @@
             config['mysupernewdict']['mycopiedcomponent'],
             MyDummyComponent)
         assert config['mycopiedconstant'] == 3
+        assert config['mycopywithdefault'] == 42
 
     def test_initialize_config_logging(self, process_config):
         with patch('palladium.config.dictConfig') as dictConfig:

From 37ad5c3ac4112b6b4b1dc467ecaec60e823b006e Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Wed, 5 Jun 2019 17:25:36 +0200
Subject: [PATCH 10/19] Update requirements.txt and requirements-dev.txt to
 latest versions

Used https://pypi.org/project/pur/
---
 requirements-dev.txt | 10 +++++-----
 requirements.txt     | 38 ++++++++++++++++++--------------------
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 8f03930..13b14ba 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,6 +1,6 @@
 cov-core==1.15.0
-coverage==4.5.2
-pluggy==0.8.0
-pytest==4.0.1
-pytest-cov==2.6.0
-requests-mock==1.5.2
+coverage==4.5.4
+pluggy==0.12.0
+pytest==5.0.1
+pytest-cov==2.7.1
+requests-mock==1.6.0

diff --git a/requirements.txt b/requirements.txt
index a2db78c..651e5c0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,26 +1,24 @@
-attrs==18.2.0
-certifi==2018.11.29
+certifi==2019.6.16
 chardet==3.0.4
-click==7.0
+Click==7.0
 docopt==0.6.2
-flask==1.0.2
-idna==2.7
+Flask==1.1.1
+idna==2.8
 itsdangerous==1.1.0
-Jinja2==2.10
-joblib==0.13.0
-MarkupSafe==1.1.0
-more-itertools==4.3.0
-numpy==1.15.4
-pandas==0.23.4
-psutil==5.4.8
-python-dateutil==2.7.5
-pytz==2018.7
-requests==2.20.1
-scikit-learn==0.20.1
-scipy==1.1.0
+Jinja2==2.10.1
+joblib==0.13.2
+MarkupSafe==1.1.1
+numpy==1.17.0
+pandas==0.24.2
+psutil==5.6.3
+python-dateutil==2.8.0
+pytz==2019.2
+requests==2.22.0
+scikit-learn==0.21.3
+scipy==1.3.0
 six==1.12.0
-SQLAlchemy==1.2.14
+SQLAlchemy==1.3.6
 ujson==1.35
-urllib3==1.23
-Werkzeug==0.14.1
+urllib3==1.25.3
+Werkzeug==0.15.5
 # julia==0.4.5

From d95cc18cf9ecbd7cd5de0c7812eda6bbd98cb6cf Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 2 Aug 2019 19:36:49 +0200
Subject: [PATCH 11/19] Deal with some smaller deprecation issues
---
 examples/iris/config.py     | 2 ++
 palladium/tests/test_fit.py | 1 +
 setup.cfg                   | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/examples/iris/config.py b/examples/iris/config.py
index 98fcb8f..e58ab22 100644
--- a/examples/iris/config.py
+++ b/examples/iris/config.py
@@ -35,6 +35,8 @@
     'model': {
         '__factory__': 'sklearn.linear_model.LogisticRegression',
         'C': 0.3,
+        'solver': 'lbfgs',
+        'multi_class': 'auto',
     },
 
     'grid_search': {

diff --git a/palladium/tests/test_fit.py b/palladium/tests/test_fit.py
index e7f0b43..5890e35 100644
--- a/palladium/tests/test_fit.py
+++ b/palladium/tests/test_fit.py
@@ -438,6 +438,7 @@ def estimator(self):
             LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.001, 0.01]},
             cv=3,
+            iid=False,
         )
 
     @pytest.mark.parametrize('backend', ['threading', 'sequential'])

diff --git a/setup.cfg b/setup.cfg
index 88bb005..37e4964 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -8,3 +8,5 @@
 addopts = --cov=palladium --cov-report=term-missing --cov-config .coveragerc
     palladium/ examples/
 python_files = test*py
+markers =
+    slow: run slow tests

From d79a67548e074552fc1addc959ce0772b68a963e Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 2 Aug 2019 19:37:55 +0200
Subject: [PATCH 12/19] Do not support writing in text mode with RestIO

This is a warning that was previously triggered:

    requests.exceptions.FileModeWarning: Requests has determined the
    content-length for this request using the binary size of the file:
    however, the file has been opened in text mode (i.e. without the
    'b' flag in the mode). This may lead to an incorrect
    content-length. In Requests 3.0, support will be removed for
    files in text mode.
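The diff below replaces ``json.dump`` into a text-mode file with an
explicit encode-and-write in binary mode. As a standalone sketch of
that pattern (not Palladium's actual ``RestIO`` code):

.. code-block:: python

    import json

    metadata = {'models': [], 'properties': {}}

    # Open in binary mode ('wb') and encode the JSON to UTF-8 bytes
    # explicitly, so callers like requests can size the payload from
    # the file's binary length:
    with open('model-metadata.json', 'wb') as f:
        f.write(json.dumps(metadata, indent=4).encode('utf-8'))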
---
 palladium/persistence.py            |  7 ++++---
 palladium/tests/test_persistence.py | 19 ++++++++++---------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/palladium/persistence.py b/palladium/persistence.py
index 7a44f78..edacdf5 100644
--- a/palladium/persistence.py
+++ b/palladium/persistence.py
@@ -135,7 +135,7 @@ def open(self, path, mode='r'):
             else:
                 reader = codecs.getreader(res.encoding or 'utf-8')
                 return reader(res.raw)
-        elif mode[0] == 'w':
+        elif mode == 'wb':
             return self._write(path, mode=mode)
         raise NotImplementedError("filemode: %s" % (mode,))
 
@@ -252,8 +252,9 @@ def _read_md(self):
     def _update_md(self, data):
         data2 = self._read_md()
         data2.update(data)
-        with self.io.open(self._md_filename, 'w') as f:
-            json.dump(data2, f, indent=4)
+        with self.io.open(self._md_filename, 'wb') as f:
+            bytes = json.dumps(data2, indent=4).encode('utf-8')
+            f.write(bytes)
 
     def upgrade(self, from_version=None, to_version=__version__):
         if from_version is None:

diff --git a/palladium/tests/test_persistence.py b/palladium/tests/test_persistence.py
index 071ab25..681cad4 100644
--- a/palladium/tests/test_persistence.py
+++ b/palladium/tests/test_persistence.py
@@ -214,7 +214,6 @@ def test_list_properties_with_metadata(self, File):
 
     def test_update_md(self, File):
         with patch('palladium.persistence.File._read_md') as read_md,\
-             patch('palladium.persistence.json.dump') as dump,\
             patch('builtins.open') as open:
             read_md.return_value = {
                 'hello': 'world',
                 'models': [1],
                 'properties': {},
                 }
             File('model-{version}')._update_md_orig({'models': [2]})
-            open.assert_called_with('model-metadata.json', 'w')
-            dump.assert_called_with(
-                {'hello': 'world', 'models': [2], 'properties': {}},
-                open.return_value.__enter__.return_value,
-                indent=4,
-            )
+            open.assert_called_with('model-metadata.json', 'wb')
+            fh = open.return_value.__enter__.return_value
+            json_written = json.loads(fh.write.call_args[0][0].decode('utf-8'))
+            assert json_written == {
+                'hello': 'world',
+                'models': [2],
+                'properties': {},
+                }
 
     def test_read_md(self, File):
         with patch('builtins.open') as open,\

     def handle_put_md(request, context):

         assert put_md.called
         assert pickle.loads(gzip.decompress(put_model_body)) == model
-        assert len(json.loads(put_md_body)['models']) == 1
+        assert len(json.loads(put_md_body.decode('utf-8'))['models']) == 1
         self.assert_auth_headers(mocked_requests)
 
     def test_download(self, mocked_requests, persister):

     def handle_put_md(request, context):

         persister.delete(1)
         assert put_md.called
         assert delete_model.called
-        assert len(json.loads(put_md_body)['models']) == 0
+        assert len(json.loads(put_md_body.decode('utf-8'))['models']) == 0
         self.assert_auth_headers(mocked_requests)

From 4ef3d5f9c32334f1f385d43ea7ba04d68894cc15 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 2 Aug 2019 19:40:04 +0200
Subject: [PATCH 13/19] Test bugfix: More conservative timing in threading
 tests
---
 palladium/tests/test_server.py |  6 +++---
 palladium/tests/test_util.py   | 39 ++++++++++++++++++++--------------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/palladium/tests/test_server.py b/palladium/tests/test_server.py
index 783ec79..2062b3c 100644
--- a/palladium/tests/test_server.py
+++ b/palladium/tests/test_server.py
@@ -665,7 +665,7 @@ def test_it(self, fit, config, jobs, flask_app):
         config['model_persister'] = model_persister
         with flask_app.test_request_context(method='POST'):
             resp = fit()
-        sleep(0.005)
+        sleep(0.02)
         resp_json = json.loads(resp.get_data(as_text=True))
         job = jobs[resp_json['job_id']]
         assert job['status'] == 'finished'
@@ -686,7 +686,7 @@ def test_pass_args(self, fit, flask_app, args, args_expected):
         fit_base.__name__ = 'mock'
         with flask_app.test_request_context(method='POST', data=args):
             fit()
-        sleep(0.005)
+        sleep(0.02)
         assert fit_base.call_args == call(**args_expected)
 
@@ -707,7 +707,7 @@ def test_success(self, update_model_cache, config, jobs, flask_app):
         config['model_persister'] = model_persister
         with flask_app.test_request_context(method='POST'):
             resp = update_model_cache()
-        sleep(0.005)
+        sleep(0.02)
         resp_json = json.loads(resp.get_data(as_text=True))
         job = jobs[resp_json['job_id']]
         assert job['status'] == 'finished'

diff --git a/palladium/tests/test_util.py b/palladium/tests/test_util.py
index e52ea11..a584c3f 100644
--- a/palladium/tests/test_util.py
+++ b/palladium/tests/test_util.py
@@ -1,3 +1,4 @@
+from collections import OrderedDict
 from datetime import datetime
 import threading
 from time import sleep
@@ -150,7 +151,7 @@ def test_last_execution(self, RruleThread):
             dtstart=datetime(2014, 10, 30, 13, 21, 18)))
         thread.last_execution = datetime(2014, 10, 30, 13, 21, 18)
         thread.start()
-        sleep(0.005)
+        sleep(0.02)
         assert func.call_count == 1
 
     def test_func_raises(self, RruleThread):

         with patch('palladium.util.logger') as logger:
             thread.start()
-            sleep(0.005)
+            sleep(0.02)
             assert func.call_count == 1
             assert logger.exception.call_count == 1

         rr.between.return_value = False
         thread = RruleThread(func, rr, sleep_between_checks=0.0010)
         thread.start()
-        sleep(0.005)
+        sleep(0.02)
         assert func.call_count == 0
         assert rr.between.call_count > 1
 
@@ -390,7 +391,7 @@ def myfunc(add):
         results = []
         for i in range(3):
            results.append(run_job(myfunc, add=i))
-        sleep(0.005)
+        sleep(0.02)
         assert result == 3
         assert len(jobs) == len(results) == 3
         assert set(jobs.keys()) == set(r[1] for r in results)
@@ -406,7 +407,7 @@ def myfunc(divisor):
         num_threads_before = len(threading.enumerate())
         for i in range(3):
             run_job(myfunc, divisor=i)
-        sleep(0.005)
+        sleep(0.02)
         num_threads_after = len(threading.enumerate())
         assert num_threads_before == num_threads_after
 
@@ -426,15 +427,21 @@ def myfunc(tts):
             run_job(myfunc, tts=i/100)
 
         job1, job2, job3 = sorted(jobs.values(), key=lambda x: x['started'])
-        assert job1['status'] == 'finished'
-        assert job2['status'] == job3['status'] == 'running'
-        assert len(threading.enumerate()) - num_threads_before == 2
-        sleep(0.015)
-        assert job2['status'] == 'finished'
-        assert job3['status'] == 'running'
-        assert len(threading.enumerate()) - num_threads_before == 1
-
-        sleep(0.015)
-        assert job3['status'] == 'finished'
-        assert len(threading.enumerate()) - num_threads_before == 0
+        samples = []
+        for i in range(10):
+            samples.append((
+                job1['status'],
+                job2['status'],
+                job3['status'],
+                len(threading.enumerate()),
+                ))
+            sleep(1/100)
+
+        got = list(OrderedDict.fromkeys(samples))
+        expected = [
+            ('finished', 'running', 'running', num_threads_before+2),
+            ('finished', 'finished', 'running', num_threads_before+1),
+            ('finished', 'finished', 'finished', num_threads_before+0),
+            ]
+        assert got == expected

From c66592a1cbf60b6fe652d80a38f2b80425174e4c Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 2 Aug 2019 19:51:02 +0200
Subject: [PATCH 14/19] In travis, use pip to install dependencies
---
 .travis.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index b8d825e..c878ed0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,8 +20,9 @@ before_install:
   - conda update -q conda
   - pip install coveralls
 install:
-  - conda install -q python=$TRAVIS_PYTHON_VERSION --file requirements.txt
-  - travis_retry python setup.py dev
+  - conda install -q python=$TRAVIS_PYTHON_VERSION
+  - pip install -r requirements.txt -r requirements-dev.txt
+  - pip install -e .[docs,testing]
 script:
   - travis_wait py.test --runslow
 deploy:

From cf2348ffe8a9b943deb1b78917eb4cc41b1b31b6 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 2 Aug 2019 20:33:27 +0200
Subject: [PATCH 15/19] Fix dependency issue between older rpy2 and newer
 pandas

A good combination is rpy2==2.9.4 and pandas==0.23.4.
---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index c878ed0..d36e201 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,6 +22,7 @@ before_install:
 install:
   - conda install -q python=$TRAVIS_PYTHON_VERSION
   - pip install -r requirements.txt -r requirements-dev.txt
+  - pip install pandas==0.23.4  # for rpy2 compatibility
   - pip install -e .[docs,testing]
 script:
   - travis_wait py.test --runslow

From 1f28e50f534359be37b0876af2afc386293d1e72 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 2 Aug 2019 19:45:58 +0200
Subject: [PATCH 16/19] More tests for config __default__ directive added in
 PR #109

Test coverage back to 100%
---
 palladium/tests/test_config.py | 38 ++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/palladium/tests/test_config.py b/palladium/tests/test_config.py
index 678f7b3..b8b1f25 100644
--- a/palladium/tests/test_config.py
+++ b/palladium/tests/test_config.py
@@ -1,4 +1,5 @@
 from contextlib import contextmanager
+from copy import deepcopy
 from functools import reduce
 import operator
 import os
@@ -308,6 +309,43 @@ def test_config1_and_2(self, process_config, config1, config2):
         assert config['mycopiedconstant'] == 3
         assert config['mycopywithdefault'] == 42
 
+    @pytest.fixture
+    def config3(self):
+        return {
+            'first': 5,
+            'second': {
+                '__copy__': 'first',
+                '__default__': 6,
+                },
+            }
+
+    def test_copy_source_exists_with_default(self, process_config, config3):
+        expected = deepcopy(config3)
+        expected['second'] = expected['first']
+        got = process_config(config3)
+        assert got == expected
+
+    def test_copy_source_exists_no_default(self, process_config, config3):
+        expected = deepcopy(config3)
+        expected['second'] = expected['first']
+        del config3['second']['__default__']
+        got = process_config(config3)
+        assert got == expected
+
+    def test_copy_source_missing_with_default(self, process_config, config3):
+        expected = deepcopy(config3)
+        expected['second'] = expected['second']['__default__']
+        del expected['first']
+        del config3['first']
+        got = process_config(config3)
+        assert got == expected
+
+    def test_copy_source_missing_no_default(self, process_config, config3):
+        del config3['first']
+        del config3['second']['__default__']
+        with pytest.raises(KeyError):
+            process_config(config3)
+
     def test_initialize_config_logging(self, process_config):
         with patch('palladium.config.dictConfig') as dictConfig:
             process_config({'logging': 'yes, please'})

From 46d76861fd924df397bef673331fbb144ec03a45 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 2 Aug 2019 21:13:34 +0200
Subject: [PATCH 17/19] Add configuration example that uses Bayesian
 hyperparam optimization
---
 docs/user/faq.rst | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
diff --git a/docs/user/faq.rst b/docs/user/faq.rst
index 0019adc..89655b3 100644
--- a/docs/user/faq.rst
+++ b/docs/user/faq.rst
@@ -164,6 +164,36 @@ passed at runtime.
         'n_jobs': -1,
     }
 
+Can I use Bayesian optimization instead of grid search to tune my hyperparameters?
+==================================================================================
+
+The grid search configuration allows you to use a class other than
+:class:`sklearn.model_selection.GridSearchCV` to do the hyperparameter
+search. Here's an example configuration that uses `scikit-optimize`_
+to search for hyperparameters using Bayesian optimization, assuming
+an :class:`sklearn.svm.SVC` classifier:
+
+.. code-block:: python
+
+    'grid_search': {
+        '__factory__': 'skopt.BayesSearchCV',
+        'estimator': {'__copy__': 'model'},
+        'n_iter': 16,
+        'search_spaces': {
+            'C': {
+                '__factory__': 'skopt.space.Real',
+                'low': 1e-6, 'high': 1e+1, 'prior': 'log-uniform',
+            },
+            'degree': {
+                '__factory__': 'skopt.space.Integer',
+                'low': 1, 'high': 20,
+            },
+        },
+        'return_train_score': True,
+        'refit': False,
+        'verbose': 4,
+    }
 
 Can I use my cluster to run a hyperparameter search?
 ====================================================

From b6153f4675daa4b76f6e534680b66b0b53d55c21 Mon Sep 17 00:00:00 2001
From: Andreas Lattner
Date: Thu, 15 Aug 2019 17:57:02 +0200
Subject: [PATCH 18/19] Added pld-export to conda recipe
---
 conda_recipe/meta.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/conda_recipe/meta.yaml b/conda_recipe/meta.yaml
index b5ca7d6..705343d 100644
--- a/conda_recipe/meta.yaml
+++ b/conda_recipe/meta.yaml
@@ -15,6 +15,7 @@ build:
   entry_points:
     - pld-admin = palladium.fit:admin_cmd
     - pld-devserver = palladium.server:devserver_cmd
+    - pld-export = palladium.util:export_cmd
     - pld-fit = palladium.fit:fit_cmd
     - pld-grid-search = palladium.fit:grid_search_cmd
     - pld-list = palladium.eval:list_cmd
@@ -71,6 +72,7 @@ test:
     - py.test --pyargs palladium
    - pld-admin --help
    - pld-devserver --help
+    - pld-export --help
    - pld-fit --help
    - pld-grid-search --help
    - pld-list --help

From 0f12ac26cd53dd1e1ab6f35cfc5ef87437d83190 Mon Sep 17 00:00:00 2001
From: Andreas Lattner
Date: Thu, 15 Aug 2019 17:59:13 +0200
Subject: [PATCH 19/19] Updated CHANGES.txt and bumped VERSION
---
 CHANGES.txt | 14 ++++++++++++++
 VERSION     |  2 +-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 3953e1a..a626264 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,17 @@
+v1.2.2 - 2019-08-15
+===================
+
+- Added default Palladium config file locations (to be checked if
+  `PALLADIUM_CONFIG` is not set)
+
+- Introduced `palladium.dataset.CSV` to replace `palladium.dataset.Table`
+
+- `__copy__` configuration directive now accepts a `__default__`
+
+- Updated requirements in order to use newer versions of dependencies
+  (also fixing potential security vulnerabilities in dependencies)
+
+
 v1.2.1.1 - 2018-12-13
 =====================

diff --git a/VERSION b/VERSION
index e563f37..23aa839 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.1.1
+1.2.2
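As a usage note for the default config file locations introduced in
patch 3 and summarized in the changelog above: with a
``palladium-config.py`` in the current working directory, setting
``PALLADIUM_CONFIG`` becomes optional. A minimal sketch, assuming
``palladium.config.get_config`` is the public accessor exercised by the
tests above:

.. code-block:: python

    import os

    # Simulate an unset PALLADIUM_CONFIG; Palladium then falls back to
    # ./palladium-config.py (or etc/palladium-config.py) and prints
    # "Using configuration at palladium-config.py":
    os.environ.pop('PALLADIUM_CONFIG', None)

    from palladium.config import get_config
    config = get_config()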