Skip to content

Commit

Permalink
Merge aa5f3d8 into 2bc0acf
Browse files Browse the repository at this point in the history
  • Loading branch information
dnouri committed Oct 25, 2018
2 parents 2bc0acf + aa5f3d8 commit 0ea7149
Show file tree
Hide file tree
Showing 7 changed files with 183 additions and 22 deletions.
37 changes: 22 additions & 15 deletions docs/user/R.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,34 @@ R support
.. contents::
:local:


Palladium has support for using :class:`~palladium.interfaces.DatasetLoader` and
:class:`~palladium.interfaces.Model` objects that are programmed in the R
programming language.
Palladium has support for using dataset loaders and models written
in R. There are wrapper classes for
:class:`~palladium.interfaces.DatasetLoader` and
:class:`~palladium.interfaces.Model` that can execute R code to do the
actual work.

To use Palladium's R support, you'll have to install R and the Python
`rpy2 <https://pypi.python.org/pypi/rpy2>`_ package and `tzlocal
<https://pypi.python.org/pypi/tzlocal>`_.

An example is available in the ``examples/R`` folder in the source
tree of Palladium (:download:`config.py <../../examples/R/config.py>`,
:download:`iris.R <../../examples/R/iris.R>`, :download:`iris.data
<../../examples/iris/iris.data>`). It contains an example of a very
simple dataset loader and model implemented in R:
Two examples are available in the ``examples/R`` folder in the source
tree of Palladium. The first example fits the ``iris`` dataset, loaded
in R, using the R ``randomForest`` package (:download:`config-iris.py
<../../examples/R/config-iris.py>`, :download:`iris.R
<../../examples/R/iris.R>`, :download:`iris.data
<../../examples/iris/iris.data>`).

In this example, function ``dataset`` is responsible for loading the
dataset, while ``train.randomForest`` does the fitting:

.. literalinclude:: ../../examples/R/iris.R
:language: R
:linenos:

When configuring a dataset loader that is programmed in R, use the
:class:`palladium.R.DatasetLoader`. An example:
:class:`palladium.R.DatasetLoader`. Note how this points to the
``dataset`` function inside the ``iris.R`` script that we defined
above:

.. code-block:: python
Expand All @@ -37,11 +44,9 @@ When configuring a dataset loader that is programmed in R, use the
'funcname': 'dataset',
},
The ``scriptname`` points to the R script that contains the function
``dataset``.

R models are configured very similarly, using
:class:`palladium.R.ClassificationModel`:
R classification models are configured very similarly, using
:class:`palladium.R.ClassificationModel`. This time, we point to the
``train.randomForest`` function that we defined in our R script.

.. code-block:: python
Expand All @@ -61,3 +66,5 @@ with string target values. Thus ``['Iris-setosa', 'Iris-versicolor',

It is okay to use a :class:`~palladium.interfaces.DatasetLoader` that is
programmed in Python together with an R model.

TODO Categorical variables and Rpy2Transform
26 changes: 26 additions & 0 deletions examples/R/config-iris-dataset-from-python.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Use this file in conjunction with config-iris.py by setting:
#
# export PALLADIUM_CONFIG=config-iris.py,config-iris-dataset-from-python.py

{
    # Training data: the ``palladium.dataset.Table`` factory is pointed
    # at ``iris.data`` with explicit column names; 'species' is the
    # target column.
    'dataset_loader_train': {
        '__factory__': 'palladium.dataset.Table',
        'path': 'iris.data',
        'names': [
            'sepal length',
            'sepal width',
            'petal length',
            'petal width',
            'species',
        ],
        'target_column': 'species',
        'sep': ',',
        # Limit the training set to 100 rows (presumably the first 100
        # rows of the file -- confirm against palladium.dataset.Table).
        'nrows': 100,
    },

    # Test data: refers to the training loader via '__copy__' and sets
    # 'skiprows'/'nrows' -- presumably so that the rows after the first
    # 100 are used for testing; confirm how '__copy__' merges keys.
    'dataset_loader_test': {
        '__copy__': 'dataset_loader_train',
        'skiprows': 100,
        'nrows': None,
    },
}
File renamed without changes.
48 changes: 48 additions & 0 deletions examples/R/config-tooth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
    # Metadata identifying this example service.
    'service_metadata': {
        'service_name': 'tooth',
        'service_version': '0.1',
    },

    # The training data comes from the ``dataset`` function defined in
    # the R script ``tooth.R``.
    'dataset_loader_train': {
        '__factory__': 'palladium.R.DatasetLoader',
        'scriptname': 'tooth.R',
        'funcname': 'dataset',
    },

    # NOTE(review): the test loader is configured identically to the
    # training loader, so testing uses the same R ``dataset`` function
    # as training.
    'dataset_loader_test': {
        '__factory__': 'palladium.R.DatasetLoader',
        'scriptname': 'tooth.R',
        'funcname': 'dataset',
    },

    # A scikit-learn pipeline with two steps: ``Rpy2Transform`` followed
    # by an R-backed regression model that is trained through the
    # ``train.randomForest`` function in ``tooth.R``.
    'model': {
        '__factory__': 'sklearn.pipeline.Pipeline',
        'steps': [
            ['rpy2', {
                '__factory__': 'palladium.R.Rpy2Transform',
            }],
            ['regressor', {
                '__factory__': 'palladium.R.RegressionModel',
                'scriptname': 'tooth.R',
                'funcname': 'train.randomForest',
            }],
        ],
    },

    # Models are persisted through a ``CachedUpdatePersister`` whose
    # implementation is a ``Database`` persister using a local SQLite
    # file.
    'model_persister': {
        '__factory__': 'palladium.persistence.CachedUpdatePersister',
        'impl': {
            '__factory__': 'palladium.persistence.Database',
            'url': 'sqlite:///tooth-model.db',
        },
    },

    # Pairs of (name, type name) for the prediction service --
    # presumably the expected request parameters and their types;
    # confirm against palladium.server.PredictService.
    'predict_service': {
        '__factory__': 'palladium.server.PredictService',
        'mapping': [
            ('supp', 'str'),
            ('dose', 'float'),
        ],
    },
}
32 changes: 25 additions & 7 deletions examples/R/test_R_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,34 @@
pytest_plugins = 'palladium'


# Request paths used for the 'predict' smoke test, keyed by the
# parametrized config filename(s).  Both iris configurations share the
# same query string.
_IRIS_PREDICT_PATH = (
    '/predict?'
    'sepal length=1.0&sepal width=1.1&petal length=0.777&petal width=5'
)
_TOOTH_PREDICT_PATH = '/predict?supp=OJ&dose=0.5'

config2path = {
    'config-iris.py': _IRIS_PREDICT_PATH,
    'config-iris.py,config-iris-dataset-from-python.py': _IRIS_PREDICT_PATH,
    'config-tooth.py': _TOOTH_PREDICT_PATH,
}


@pytest.mark.slow
def test_functional(flask_app_test):
@pytest.mark.parametrize(
'config_filename', [
'config-iris.py',
'config-iris.py,config-iris-dataset-from-python.py',
'config-tooth.py',
],
)
def test_functional(flask_app_test, config_filename):
config_fname = os.path.join(
os.path.dirname(__file__),
'config.py',
config_filename,
)
with flask_app_test.test_request_context():
run_smoke_tests_with_config(
config_fname, run=['fit', 'test', 'predict'])


if __name__ == '__main__':
test_functional()
config_fname,
run=['fit', 'test', 'predict'],
func_kwargs={'predict': {'path': config2path[config_filename]}},
)
18 changes: 18 additions & 0 deletions examples/R/tooth.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Install any required package that is not yet available, then attach
# randomForest so the functions below can use it.
packages_needed <- c("randomForest")
packages_missing <- setdiff(
  packages_needed,
  installed.packages()[, "Package"]
)
if (length(packages_missing) > 0) {
  install.packages(packages_missing, repos = 'http://cran.uni-muenster.de')
}

library(randomForest)

# Load the ToothGrowth data set and split it into predictors and target.
#
# Returns an unnamed list of two elements: columns 2 and 3 of
# ToothGrowth (the predictors), followed by column 1 (the target).
dataset <- function() {
  data(ToothGrowth)  # The Effect of Vitamin C on Tooth Growth in Guinea Pigs
  predictors <- ToothGrowth[, 2:3]
  target <- ToothGrowth[, 1]
  return(list(predictors, target))
}

# Fit a random forest on predictors `x` and target `y` and return the
# fitted model.
train.randomForest <- function(x, y) {
  model <- randomForest(x, y)
  return(model)
}
44 changes: 44 additions & 0 deletions palladium/R.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
from palladium.interfaces import DatasetLoader
from palladium.interfaces import Model
import numpy as np
from pandas import Categorical
from pandas import DataFrame
from pandas import Series
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.pandas2ri import py2ri
from rpy2.robjects.numpy2ri import numpy2ri
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


Expand Down Expand Up @@ -81,3 +84,44 @@ def predict(self, X):

def score(self, X, y):
return accuracy_score(self.predict(X), np.asarray(y))


class RegressionModel(AbstractModel):
    """A :class:`~palladium.interfaces.Model` for regression problems
    that uses an R model for training and prediction.
    """

    def predict(self, X):
        """Return the R model's predictions for *X* as a numpy array.

        *X* is converted with ``self._from_python`` and then passed,
        together with the fitted ``self.rmodel_``, to R's ``predict``.
        """
        X = self._from_python(X)
        return np.asarray(self.r['predict'](self.rmodel_, X))

    def score(self, X, y):
        """Return the R^2 score of the predictions for *X* given the
        true target values *y*.
        """
        # r2_score is not symmetric in its arguments: the true values
        # must come first and the predictions second.
        return r2_score(np.asarray(y), self.predict(X))


class Rpy2Transform(TransformerMixin):
    """Transformer that turns Python data back into an R data frame.

    When fitted on an object that exposes ``colnames`` (as opposed to a
    numpy array or pandas ``DataFrame``), the column names and the
    levels of every column that has a ``levels`` attribute are
    recorded.  :meth:`transform` then uses that information to rebuild
    categorical columns before converting the data with ``py2ri``.
    """

    def fit(self, X, y=None):
        """Record column names and factor levels of *X*.

        Nothing is recorded when *X* is a numpy array or a pandas
        ``DataFrame``.  *y* is ignored; it defaults to ``None`` so that
        the transformer can also be fitted without a target.

        Returns ``self``.
        """
        if isinstance(X, (np.ndarray, DataFrame)):
            return self
        # Map the position of each column that has levels to a tuple of
        # those levels.
        self.index2levels_ = {
            index: tuple(X[index].levels)
            for index in range(len(X.colnames))
            if hasattr(X[index], 'levels')
        }
        self.colnames_ = X.colnames
        return self

    def transform(self, X):
        """Convert *X* using the information recorded by :meth:`fit`.

        If levels were recorded, a numpy array is first wrapped in a
        ``DataFrame`` using the recorded column names; a ``DataFrame``
        then gets its recorded columns turned into ``Categorical``
        columns and is converted with ``py2ri``.  In all other cases
        *X* is returned unchanged.
        """
        fitted_on_r_data = hasattr(self, 'index2levels_')
        if isinstance(X, np.ndarray) and fitted_on_r_data:
            X = DataFrame(X, columns=self.colnames_)
        if isinstance(X, DataFrame) and fitted_on_r_data:
            # NOTE: this assigns into X, so a DataFrame passed in by the
            # caller has its columns replaced in place.
            for index, levels in self.index2levels_.items():
                colname = X.columns[index]
                X[colname] = Categorical(
                    X[colname],
                    categories=levels,
                )
            X = py2ri(X)
            # Deal with an rpy2 issue whereas colnames appear to get
            # mangled when calling py2ri:
            X.colnames = self.colnames_
        return X

0 comments on commit 0ea7149

Please sign in to comment.