Merge aa5f3d8 into 2bc0acf

ottogroup · Oct 25, 2018 · 0ea7149 · 0ea7149
2 parents 2bc0acf + aa5f3d8
commit 0ea7149
Show file tree

Hide file tree

Showing 7 changed files with 183 additions and 22 deletions.
diff --git a/docs/user/R.rst b/docs/user/R.rst
@@ -7,27 +7,34 @@ R support
 .. contents::
    :local:
 
-
-Palladium has support for using :class:`~palladium.interfaces.DatasetLoader` and
-:class:`~palladium.interfaces.Model` objects that are programmed in the R
-programming language.
+Palladium has support for using dataset loaders and models written
+in R.  There are wrapper classes for
+:class:`~palladium.interfaces.DatasetLoader` and
+:class:`~palladium.interfaces.Model` that can execute R code to do the
+actual work.
 
 To use Palladium's R support, you'll have to install R and the Python
 `rpy2 <https://pypi.python.org/pypi/rpy2>`_ package and `tzlocal
 <https://pypi.python.org/pypi/tzlocal>`_.
 
-An example is available in the ``examples/R`` folder in the source
-tree of Palladium (:download:`config.py <../../examples/R/config.py>`,
-:download:`iris.R <../../examples/R/iris.R>`, :download:`iris.data
-<../../examples/iris/iris.data>`).  It contains an example of a very
-simple dataset loader and model implemented in R:
+Two examples are available in the ``examples/R`` folder in the source
+tree of Palladium.  The first example fits the ``iris`` dataset loaded
+in R using the R ``randomForest`` package (:download:`config-iris.py
+<../../examples/R/config-iris.py>`, :download:`iris.R
+<../../examples/R/iris.R>`, :download:`iris.data
+<../../examples/iris/iris.data>`).
+
+In this example, function ``dataset`` is responsible for loading the
+dataset, while ``train.randomForest`` does the fitting:
 
 .. literalinclude:: ../../examples/R/iris.R
    :language: R
    :linenos:
 
 When configuring a dataset loader that is programmed in R, use the
-:class:`palladium.R.DatasetLoader`.  An example:
+:class:`palladium.R.DatasetLoader`.  Note how this points to the
+``dataset`` function inside the ``iris.R`` script that we defined
+above:
 
 .. code-block:: python
 
@@ -37,11 +44,9 @@ When configuring a dataset loader that is programmed in R, use the
       'funcname': 'dataset',
       },
 
-The ``scriptname`` points to the R script that contains the function
-``dataset``.
-
-R models are configured very similarly, using
-:class:`palladium.R.ClassificationModel`:
+R classification models are configured very similarly, using
+:class:`palladium.R.ClassificationModel`.  This time, we point to the
+``train.randomForest`` function that we defined in our R script.
 
 .. code-block:: python
 
@@ -61,3 +66,5 @@ with string target values.  Thus ``['Iris-setosa', 'Iris-versicolor',
 
 It is okay to use a :class:`~palladium.interfaces.DatasetLoader` that is
 programmed in Python together with an R model.
+
+TODO Categorical variables and Rpy2Transform
diff --git a/examples/R/config-iris-dataset-from-python.py b/examples/R/config-iris-dataset-from-python.py
@@ -0,0 +1,26 @@
+# Use this file in conjunction with config-iris.py by setting:
+#
+#   export PALLADIUM_CONFIG=config-iris.py,config-iris-dataset-from-python.py
+
+{
+    # Training loader: reads ``iris.data`` as a comma-separated table with
+    # the column names given below; ``species`` is the target column.
+    'dataset_loader_train': {
+        '__factory__': 'palladium.dataset.Table',
+        'path': 'iris.data',
+        'names': [
+            'sepal length',
+            'sepal width',
+            'petal length',
+            'petal width',
+            'species',
+        ],
+        'target_column': 'species',
+        'sep': ',',
+        # Presumably limits training to the first 100 rows of the file --
+        # TODO confirm how Table forwards ``nrows``.
+        'nrows': 100,
+    },
+
+    # Test loader: starts from the training loader's settings and
+    # overrides ``skiprows`` and ``nrows`` (presumably: skip the 100
+    # training rows and read everything after them).
+    'dataset_loader_test': {
+        '__copy__': 'dataset_loader_train',
+        'skiprows': 100,
+        'nrows': None,
+    },
+}
diff --git a/examples/R/config.py → examples/R/config-iris.py b/examples/R/config.py → examples/R/config-iris.py
diff --git a/examples/R/config-tooth.py b/examples/R/config-tooth.py
@@ -0,0 +1,48 @@
+{
+    'service_metadata': {
+        'service_name': 'tooth',
+        'service_version': '0.1',
+    },
+
+    # The training and the test loader both call the same ``dataset``
+    # function in tooth.R.
+    'dataset_loader_train': {
+        '__factory__': 'palladium.R.DatasetLoader',
+        'scriptname': 'tooth.R',
+        'funcname': 'dataset',
+    },
+
+    'dataset_loader_test': {
+        '__factory__': 'palladium.R.DatasetLoader',
+        'scriptname': 'tooth.R',
+        'funcname': 'dataset',
+    },
+
+    # Two-step pipeline: Rpy2Transform first, then the R regression model
+    # that is trained by ``train.randomForest`` from tooth.R.
+    'model': {
+        '__factory__': 'sklearn.pipeline.Pipeline',
+        'steps': [
+            ['rpy2', {
+                '__factory__': 'palladium.R.Rpy2Transform',
+            }],
+            ['regressor', {
+                '__factory__': 'palladium.R.RegressionModel',
+                'scriptname': 'tooth.R',
+                'funcname': 'train.randomForest',
+            }],
+        ],
+    },
+
+    # Database persister pointing at a SQLite file, wrapped in a
+    # CachedUpdatePersister.
+    'model_persister': {
+        '__factory__': 'palladium.persistence.CachedUpdatePersister',
+        'impl': {
+            '__factory__': 'palladium.persistence.Database',
+            'url': 'sqlite:///tooth-model.db',
+        },
+    },
+
+    # Request parameters of the predict service and their declared types.
+    'predict_service': {
+        '__factory__': 'palladium.server.PredictService',
+        'mapping': [
+            ('supp', 'str'),
+            ('dose', 'float'),
+        ],
+    },
+}
diff --git a/examples/R/test_R_functional.py b/examples/R/test_R_functional.py
@@ -10,16 +10,34 @@
 pytest_plugins = 'palladium'
 
 
+# Maps each config file name (or comma-separated list of config file
+# names) to the request path used for the 'predict' smoke test.  Each
+# value is written as two adjacent string literals, which Python joins
+# into a single '/predict?...' string.
+config2path = {
+    'config-iris.py': '/predict?'
+    'sepal length=1.0&sepal width=1.1&petal length=0.777&petal width=5',
+
+    'config-iris.py,config-iris-dataset-from-python.py': '/predict?'
+    'sepal length=1.0&sepal width=1.1&petal length=0.777&petal width=5',
+
+    'config-tooth.py': '/predict?'
+    'supp=OJ&dose=0.5',
+    }
+
+
 @pytest.mark.slow
-def test_functional(flask_app_test):
+@pytest.mark.parametrize(
+    'config_filename', [
+        'config-iris.py',
+        'config-iris.py,config-iris-dataset-from-python.py',
+        'config-tooth.py',
+        ],
+    )
+def test_functional(flask_app_test, config_filename):
+    """Run the fit/test/predict smoke tests once per example config.
+
+    ``config_filename`` is one of the keys of ``config2path``; the
+    matching value is handed to the ``predict`` step as its ``path``
+    keyword argument via ``func_kwargs``.
+    """
+    # NOTE(review): for a comma-separated ``config_filename`` only the
+    # first file name ends up prefixed with this directory -- confirm
+    # that run_smoke_tests_with_config resolves the others as intended.
     config_fname = os.path.join(
         os.path.dirname(__file__),
-        'config.py',
+        config_filename,
     )
     with flask_app_test.test_request_context():
         run_smoke_tests_with_config(
-            config_fname, run=['fit', 'test', 'predict'])
-
-
-if __name__ == '__main__':
-    test_functional()
+            config_fname,
+            run=['fit', 'test', 'predict'],
+            func_kwargs={'predict': {'path': config2path[config_filename]}},
+            )
diff --git a/examples/R/tooth.R b/examples/R/tooth.R
@@ -0,0 +1,18 @@
+# Install every required package that is not installed yet, then load
+# randomForest.
+packages_needed <- c("randomForest")
+packages_missing <-
+  packages_needed[!(packages_needed %in% installed.packages()[,"Package"])]
+# NOTE(review): the CRAN mirror is addressed over plain http -- consider
+# switching to an https mirror.
+if(length(packages_missing))
+  install.packages(packages_missing, repos='http://cran.uni-muenster.de')
+
+library(randomForest)
+
+# Load the ToothGrowth data and return list(x, y): x holds columns 2
+# and 3 of the data frame, y holds column 1.
+# NOTE(review): presumably x = (supp, dose) and y = len -- confirm
+# against the ToothGrowth documentation.
+dataset <- function() {
+    data(ToothGrowth) # The Effect of Vitamin C on Tooth Growth in Guinea Pigs
+    x <- ToothGrowth[,2:3]
+    y <- ToothGrowth[,1]
+    list(x, y)
+}
+
+# Fit a random forest on x and y using randomForest's default settings
+# and return the fitted model.
+train.randomForest <- function(x, y) {
+    randomForest(x, y)
+}
diff --git a/palladium/R.py b/palladium/R.py
@@ -4,13 +4,16 @@
 from palladium.interfaces import DatasetLoader
 from palladium.interfaces import Model
 import numpy as np
+from pandas import Categorical
 from pandas import DataFrame
 from pandas import Series
 from rpy2 import robjects
 from rpy2.robjects import pandas2ri
 from rpy2.robjects.pandas2ri import py2ri
 from rpy2.robjects.numpy2ri import numpy2ri
+from sklearn.base import TransformerMixin
 from sklearn.metrics import accuracy_score
+from sklearn.metrics import r2_score
 from sklearn.preprocessing import LabelEncoder
 
 
@@ -81,3 +84,44 @@ def predict(self, X):
 
     def score(self, X, y):
         return accuracy_score(self.predict(X), np.asarray(y))
+
+
+class RegressionModel(AbstractModel):
+    """A :class:`~palladium.interfaces.Model` for regression problems
+    that uses an R model for training and prediction.
+    """
+
+    def predict(self, X):
+        """Return the R model's predictions for ``X`` as a numpy array.
+
+        ``X`` is converted with ``self._from_python`` before it is
+        passed to R's ``predict`` together with ``self.rmodel_``.
+        """
+        X = self._from_python(X)
+        return np.asarray(self.r['predict'](self.rmodel_, X))
+
+    def score(self, X, y):
+        """Return the R^2 score of the predictions for ``X`` against
+        the true targets ``y``.
+        """
+        # r2_score expects (y_true, y_pred) and is not symmetric in its
+        # arguments, so the ground truth has to be passed first.
+        return r2_score(np.asarray(y), self.predict(X))
+
+
+class Rpy2Transform(TransformerMixin):
+    """Transformer that converts numpy/pandas input for use with R.
+
+    When fitted on an object that exposes ``colnames`` (presumably an
+    rpy2 data frame), the column names and the levels of every column
+    that has a ``levels`` attribute are remembered.  :meth:`transform`
+    uses them to turn numpy arrays and pandas data frames into an R
+    object with the same column names and categorical levels.  When
+    fitted on a numpy array or a pandas data frame, nothing is
+    recorded and :meth:`transform` returns its input unchanged.
+    """
+
+    def fit(self, X, y=None):
+        """Remember column names and categorical levels of ``X``.
+
+        ``y`` is ignored.  It defaults to ``None`` so that the
+        transformer can also be fitted without targets, e.g. through
+        ``fit_transform(X)``.  Returns ``self``.
+        """
+        if isinstance(X, (np.ndarray, DataFrame)):
+            return self
+        self.index2levels_ = {}
+        for index in range(len(X.colnames)):
+            column = X[index]
+            if hasattr(column, 'levels'):
+                self.index2levels_[index] = tuple(column.levels)
+        self.colnames_ = X.colnames
+        return self
+
+    def transform(self, X):
+        """Convert ``X`` using the column names and levels seen in
+        :meth:`fit`.
+
+        ``X`` is returned unchanged when nothing was recorded during
+        :meth:`fit` or when it is neither a numpy array nor a pandas
+        data frame.  Otherwise the result of ``py2ri`` is returned.
+        """
+        if not hasattr(self, 'index2levels_'):
+            return X
+        if isinstance(X, np.ndarray):
+            X = DataFrame(X, columns=self.colnames_)
+        elif isinstance(X, DataFrame):
+            # Work on a copy: the categorical columns are overwritten
+            # below and the caller's data frame must stay untouched.
+            X = X.copy()
+        else:
+            return X
+        for index, levels in self.index2levels_.items():
+            colname = X.columns[index]
+            X[colname] = Categorical(
+                X[colname],
+                categories=levels,
+                )
+        X = py2ri(X)
+        # Deal with an rpy2 issue where colnames appear to get
+        # mangled when calling py2ri:
+        X.colnames = self.colnames_
+        return X