Update runffx tool for Python 3 (#32)
* Bump version number for PyPI release, and include Py3.6 as supported.

* Update push_to_pypi to use twine.

* Update runffx tool for Python 3

* Added a Scikit-learn API, bumped FFX version number and Python/Sklearn numbers.

* Squeeze an unused dimension in y. Move a comment in api. Add a commented line for pushing to test-PyPI.

* Bump version number again and remember to rm old versions when uploading to PyPI.

* Corrected the count of GP nodes (complexity) and used it in an existing test; also added the code from the README as a test.

* Bump version number for PyPI
jmmcd authored and natekupp committed Apr 4, 2019
1 parent 77cc110 commit 9a8b3c9
Showing 8 changed files with 138 additions and 42 deletions.
30 changes: 24 additions & 6 deletions Readme.md
@@ -15,17 +15,17 @@ To install from PyPI, simply run:
    pip install ffx

## Usage
FFX can either be run in stand-alone mode, or within your existing Python code. It installs both a command-line utility `runffx` and the Python module `ffx`.
FFX can be run either in stand-alone mode or from within your existing Python code, using either its own API or a Scikit-learn style API. It installs both a command-line utility `runffx` and the Python module `ffx`.

__Standalone__

    runffx test train_X.csv train_y.csv test_X.csv test_y.csv

Use `runffx help` for more information on using the command-line utility.
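
The subcommands dispatched by `runffx` are `help`, `splitdata`, `aboutdata`, and `test`. A typical session might look like the following sketch (the data files `X.csv` and `y.csv` are hypothetical placeholders; `splitdata` writes train_/test_ splits of them):

    runffx aboutdata X.csv
    runffx splitdata X.csv y.csv
    runffx test train_X.csv train_y.csv test_X.csv test_y.csv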

__Python Module__
__Python Module (run interface)__

The following snippet is a simple example of how to use FFX. Note that all arguments are expected to be of type `numpy.ndarray` or `pandas.DataFrame`.
The FFX Python module exposes a function, `ffx.run()`. The following snippet is a simple example of its use. Note that all arguments are expected to be of type `numpy.ndarray` or `pandas.DataFrame`.

    import numpy as np
    import ffx
@@ -41,14 +41,22 @@ The following snippet is a simple example of how to use FFX. Note that all argum
    yhat = model.simulate(test_X)
    print(model)

Presently, the FFX Python module only exposes a single API method, `ffx.run()`.
__Scikit-Learn interface__

The FFX Python module also exposes a class, `ffx.FFXRegressor`, which provides a Scikit-learn API, in particular `fit(X, y)`, `predict(X)`, and `score(X, y)` methods. With this API, all of the models produced by FFX (the whole Pareto front) are accessible after fitting as `_models`, but `predict()` and `score()` use only the model of highest accuracy and highest complexity. Here is an example of usage.

    import numpy as np
    import ffx

    # This creates a dataset of 2 predictors
    X = np.random.random((20, 2))
    y = 0.1 * X[:, 0] + 0.5 * X[:, 1]

    train_X, test_X = X[:10], X[10:]
    train_y, test_y = y[:10], y[10:]

    FFX = ffx.FFXRegressor()
    FFX.fit(train_X, train_y)
    print("Prediction:", FFX.predict(test_X))
    print("Score:", FFX.score(test_X, test_y))
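
Because the whole Pareto front is stored in `_models` after fitting, you can also inspect every model FFX found. A minimal sketch (each model offers `numBases()`, `complexity()`, and a readable `str()`, as used in the tests below):

    for model in FFX._models:
        print(model.numBases(), model.complexity(), model)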



## Dependencies
* python (tested on 2.5, 2.6, 2.7, and 3.5)
* python (tested on 2.5, 2.6, 2.7, 3.5, 3.6, 3.7)
* numpy (1.6.0+)
* scipy (0.9.0+)
* scikit-learn (0.9+)
* scikit-learn (1.5+)
* pandas (optional, enables support for labeled `pandas.DataFrame` datasets)


23 changes: 23 additions & 0 deletions ffx/api.py
@@ -1,4 +1,27 @@
from sklearn.base import BaseEstimator, RegressorMixin
from . import core

"""api.py defines user interfaces to FFX. run() runs the complete method.
FFXRegressor is a Scikit-learn style regressor."""

def run(train_X, train_y, test_X, test_y, varnames=None, verbose=False):
    return core.MultiFFXModelFactory().build(train_X, train_y, test_X, test_y, varnames, verbose)

class FFXRegressor(BaseEstimator, RegressorMixin):
    """This class provides a Scikit-learn style estimator."""
    # Note: score(X, y) is inherited from RegressorMixin and returns the R^2 of the prediction.
    def __init__(self):
        pass
    def fit(self, X, y):
        # if X is a Pandas DataFrame, we don't have to pass in varnames.
        # otherwise we make up placeholder names, one per column of X.
        if hasattr(X, 'columns'):
            varnames = None
        else:
            varnames = ["X%d" % i for i in range(X.shape[1])]
        # run() returns the whole Pareto front; keep the last (most accurate,
        # most complex) model for predict() and score().
        self._models = run(X, y, X, y, varnames=varnames)
        self._model = self._models[-1]
    def predict(self, X):
        return self._model.simulate(X)
    def complexity(self):
        return self._model.complexity()

52 changes: 26 additions & 26 deletions ffx/bin/runffx
@@ -57,20 +57,20 @@ output is scalar. Values in a given row are separated by spaces.
#got the right number of args? If not, output help
num_args = len(args)
if num_args != 2:
print help, '\nGot %d args, need 2.' % num_args; return
print(help, '\nGot %d args, need 2.' % num_args); return
X_file, y_file = args[0], args[1]

if not (X_file.endswith('.csv') or X_file.endswith('.txt')):
print "INPUTS_FILE file '%s' needs to end with .csv or .txt." % X_file
print("INPUTS_FILE file '%s' needs to end with .csv or .txt." % X_file)
return
if not os.path.exists(X_file):
print "INPUTS_FILE file '%s' does not exist. Early exit." % X_file
print("INPUTS_FILE file '%s' does not exist. Early exit." % X_file)
return
if not (y_file.endswith('.csv') or y_file.endswith('.txt')):
print "OUTPUTS_FILE file '%s' needs to end with .csv or .txt." % y_file
print("OUTPUTS_FILE file '%s' needs to end with .csv or .txt." % y_file)
return
if not os.path.exists(y_file):
print "OUTPUTS_FILE file '%s' does not exist. Early exit." % y_file
print("OUTPUTS_FILE file '%s' does not exist. Early exit." % y_file)
return

#create the target output filenames, and ensure they don't exist
@@ -81,9 +81,9 @@ output is scalar. Values in a given row are separated by spaces.
test_y_file = join(y_file, 'test_')
for newfile in [train_X_file, train_y_file, test_X_file, test_y_file]:
if os.path.exists(newfile):
print "New file '%s' exists, and should not. Early exit." % newfile; return
print("New file '%s' exists, and should not. Early exit." % newfile); return

print "Begin runffx splitdata. INPUTS_FILE.csv=%s, OUTPUTS_FILE.csv=%s" % (X_file, y_file)
print("Begin runffx splitdata. INPUTS_FILE.csv=%s, OUTPUTS_FILE.csv=%s" % (X_file, y_file))

#create X, y
X = read(X_file, dim=2) #[sample_i][var_i] : float
@@ -104,20 +104,20 @@ output is scalar. Values in a given row are separated by spaces.
test_X = numpy.take(X, test_I, 0)
test_y = numpy.take(y, test_I)

print "There will be %d samples in training data, and %d samples in test data" % (len(train_y), len(test_y))
print("There will be %d samples in training data, and %d samples in test data" % (len(train_y), len(test_y)))

delimiter = ',' if X_file.endswith('.csv') else '\t'
numpy.savetxt(train_X_file, train_X, delimiter=delimiter)
numpy.savetxt(train_y_file, train_y, delimiter=delimiter)
numpy.savetxt(test_X_file, test_X, delimiter=delimiter)
numpy.savetxt(test_y_file, test_y, delimiter=delimiter)

print "Created these files:"
print " Training inputs: %s" % train_X_file
print " Training outputs: %s" % train_y_file
print " Testing inputs: %s" % test_X_file
print " Testing outputs: %s" % test_y_file
print "\nDone runffx splitdata."
print("Created these files:")
print(" Training inputs: %s" % train_X_file)
print(" Training outputs: %s" % train_y_file)
print(" Testing inputs: %s" % test_X_file)
print(" Testing outputs: %s" % test_y_file)
print("\nDone runffx splitdata.")

def aboutdata(args):
help = """
@@ -127,14 +127,14 @@ Simply prints the number of variables and number of samples for the given ascii
"""
if len(args) != 1:
print help, "\nGot %d arguments; need 1." % len(args); return
print(help, "\nGot %d arguments; need 1." % len(args)); return

d = numpy.shape(read(args[0]))
if len(d) == 1:
d = 1,d[0]
print "Data file: %s" % args[0]
print "Number of input variables: %d" % d[0]
print "Number of input samples: %d" % d[1]
print("Data file: %s" % args[0])
print("Number of input variables: %d" % d[0])
print("Number of input samples: %d" % d[1])


def testffx(args):
@@ -159,8 +159,8 @@ because the output is scalar. Values in a given row are separated by spaces.
#got the right number of args? If not, output help
num_args = len(args)
if not (4 <= num_args <= 5):
print help, '\nGot %d args. Need 4 or 5.' % num_args; return
print "Begin ffx test."
print(help, '\nGot %d args. Need 4 or 5.' % num_args); return
print("Begin ffx test.")

#get X/y
train_X, train_y, test_X, test_y = [read(f,dim) for f,dim in zip(args[:4],[2,1,2,1])]
@@ -183,8 +183,8 @@ because the output is scalar. Values in a given row are separated by spaces.
f.write('%10s, %13s, %s\n' %
('%d' % model.numBases(), '%.4f' % (model.test_nmse * 100.0), model))
elapsed_time = time.time() - start_time
print "Done. Runtime: %.1f seconds. Results are in: %s" % \
(elapsed_time, output_csv)
print("Done. Runtime: %.1f seconds. Results are in: %s" %
(elapsed_time, output_csv))

#=================================================================================
#utility functions
@@ -206,20 +206,20 @@ def read(filename, dim=None, **kwargs):
#=================================================================================
if __name__ == '__main__':
if len(sys.argv) == 1:
print USAGE
print(USAGE)
sys.exit(0)

toolname, args = sys.argv[1], sys.argv[2:]
if toolname == 'help':
print USAGE
print(USAGE)
elif toolname == 'splitdata':
splitdata(args)
elif toolname == 'aboutdata':
aboutdata(args)
elif toolname == 'test':
testffx(args)
else:
print "There is no toolname of '%s'." % toolname, USAGE
print("There is no toolname of '%s'." % toolname, USAGE)




31 changes: 23 additions & 8 deletions ffx/core.py
@@ -291,7 +291,7 @@ def complexity(self):
# the base itself.
num_complexity = 1 + sum(3 + b.complexity() for b in self.bases_n)
if self.bases_d:
denom_complexity = 1 + sum(2 + b.complexity()
denom_complexity = 1 + sum(3 + b.complexity()
for b in self.bases_d)
# add 1 for the division
return num_complexity + 1 + denom_complexity
@@ -398,17 +398,27 @@ def __str__(self):
raise 'Unknown op %d' % op

def complexity(self):
"""Return an integer measure of model complexity. It's intended to
measure the number of nodes in the GP tree corresponding to
the model. We assume the GP language includes: +, -, *, /,
MAX0, MIN0, LOG10 but not GTH, LTH. Thus, MAX0(x) returns the
value max(0, x) but contributes only 1 + complexity(x) to the
complexity count. GTH(thr, x) returns the value max(0, thr-x)
but because it would be implemented in GP as MAX0(thr-x) it contributes
3 + complexity(x) to the count."""
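# A worked example of the convention above, assuming complexity(x) == 1:
# MAX0(x) counts as 1 + 1 = 2 nodes, while GTH(thr, x), implemented as
# MAX0(thr - x), counts as 3 + 1 = 4 nodes (MAX0, '-', thr, and x).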

op = self.nonlin_op
if op == OP_ABS: return 1 + self.simple_base.complexity()
elif op == OP_MAX0: return 2 + self.simple_base.complexity()
elif op == OP_MIN0: return 2 + self.simple_base.complexity()
elif op == OP_LOG10: return 1 + self.simple_base.complexity()
elif op == OP_GTH: return 4 + self.simple_base.complexity()
elif op == OP_LTH: return 4 + self.simple_base.complexity()
else: raise 'Unknown op %d' % op
if op == OP_ABS: return 1 + self.simple_base.complexity()
elif op == OP_MAX0: return 1 + self.simple_base.complexity()
elif op == OP_MIN0: return 1 + self.simple_base.complexity()
elif op == OP_LOG10: return 1 + self.simple_base.complexity()
elif op == OP_GTH: return 3 + self.simple_base.complexity()
elif op == OP_LTH: return 3 + self.simple_base.complexity()
else: raise 'Unknown op %d' % op


class ProductBase:

"""e.g. x2^2 * log(x1^3)"""

def __init__(self, base1, base2):
@@ -639,6 +649,11 @@ def build(self, X, y, ss, varnames=None, verbose=False):
if self.nrow != len(y):
raise Exception('X sample count and y sample count do not match')

# if y has shape (N, 1) then we reshape to just (N,)
if len(y.shape) > 1:
assert y.shape[1] == 1
y = numpy.reshape(y, (y.shape[0],))

if self.ncol == 0:
print(' Corner case: no input vars, so return a ConstantModel')
return [ConstantModel(y.mean(), 0)]
6 changes: 5 additions & 1 deletion push_to_pypi.sh
@@ -1 +1,5 @@
python setup.py sdist bdist_egg register upload
rm dist/*
# they don't recommend using "setup.py register upload" anymore
python setup.py sdist bdist_egg bdist_wheel # just build
#twine upload --repository-url https://test.pypi.org/legacy/ dist/* # testpypi
twine upload dist/* # pip install twine if needed.
3 changes: 2 additions & 1 deletion setup.py
@@ -2,7 +2,7 @@
from setuptools import setup
setup(
name = "ffx",
version = "1.3.7",
version = "1.3.10",
author = "Trent McConaghy",
author_email = "gtrent@gmail.com",
maintainer = "Nate Kupp",
@@ -25,5 +25,6 @@
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7'
],
)
17 changes: 17 additions & 0 deletions tests/readme_test.py
@@ -0,0 +1,17 @@
# this is the tiny example in the README, and now it
# also prints out numBases and complexity for each model

import numpy as np
import ffx

train_X = np.array( [ (1.5,2,3), (4,5,6) ] ).T
train_y = np.array( [1,2,3])

test_X = np.array( [ (5.241,1.23, 3.125), (1.1,0.124,0.391) ] ).T
test_y = np.array( [3.03,0.9113,1.823])

models = ffx.run(train_X, train_y, test_X, test_y, ["a", "b"])
print("numBases: GP-complexity : model")
for model in models:
    yhat = model.simulate(test_X)
    print(model.numBases(), ":", model.complexity(), ": ", model)
18 changes: 18 additions & 0 deletions tests/test_sklearn_api.py
@@ -0,0 +1,18 @@
#!/usr/bin/env python

import numpy as np
import ffx

# This creates a dataset of 2 predictors
X = np.random.random((20, 2))
y = 0.1 * X[:, 0] + 0.5 * X[:, 1]

train_X, test_X = X[:10], X[10:]
train_y, test_y = y[:10], y[10:]

FFX = ffx.FFXRegressor()
FFX.fit(train_X, train_y)
print("Prediction:", FFX.predict(test_X))
print("Score:", FFX.score(test_X, test_y))
print("Complexity:", FFX.complexity())
print("Model:", FFX._model)
