From a54360ed48250fef0656f1be433473f436a06179 Mon Sep 17 00:00:00 2001 From: Nate Kupp Date: Thu, 19 Dec 2019 15:17:24 -0800 Subject: [PATCH] [6/N] Modernize FFX: add bases --- ffx/core/__init__.py | 156 +----------------------------------------- ffx/core/bases.py | 158 +++++++++++++++++++++++++++++++++++++++++++ ffx/core/utils.py | 27 ++++++++ 3 files changed, 188 insertions(+), 153 deletions(-) create mode 100644 ffx/core/bases.py create mode 100644 ffx/core/utils.py diff --git a/ffx/core/__init__.py b/ffx/core/__init__.py index a4e292b..52ebe12 100644 --- a/ffx/core/__init__.py +++ b/ffx/core/__init__.py @@ -12,6 +12,7 @@ from sklearn.linear_model import ElasticNet from .approach import Approach +from .bases import OperatorBase, ProductBase, SimpleBase from .constants import ( CONSIDER_DENOM, CONSIDER_EXPON, @@ -210,166 +211,15 @@ def complexity(self): # We have a leading constant, then for each base we have a # coefficient, a multiply, and a plus, plus the complexity of # the base itself. - num_complexity = 1 + sum(3 + b.complexity() for b in self.bases_n) + num_complexity = 1 + sum(3 + b.complexity for b in self.bases_n) if self.bases_d: - denom_complexity = 1 + sum(3 + b.complexity() for b in self.bases_d) + denom_complexity = 1 + sum(3 + b.complexity for b in self.bases_d) # add 1 for the division return num_complexity + 1 + denom_complexity else: return num_complexity -class SimpleBase: - """e.g. x4^2""" - - def __init__(self, var, exponent): - self.var = var - self.exponent = exponent - - def simulate(self, X): - """ - @arguments - X -- 2d array of [sample_i][var_i] : float - @return - y -- 1d array of [sample_i] : float - """ - return X[:, self.var] ** self.exponent - - def __str__(self): - if self.exponent == 1: - return 'x%d' % self.var - else: - return 'x%d^%g' % (self.var, self.exponent) - - def complexity(self): - if self.exponent == 1: - return 1 - else: - return 3 - - -class OperatorBase: - """e.g. log(x4^2)""" - - def __init__(self, simple_base, nonlin_op, thr=INF): - """ - @arguments - simple_base -- SimpleBase - nonlin_op -- one of OPS - thr -- None or float -- depends on nonlin_op - """ - self.simple_base = simple_base - self.nonlin_op = nonlin_op - self.thr = thr - - def simulate(self, X): - """ - @arguments - X -- 2d array of [sample_i][var_i] : float - @return - y -- 1d array of [sample_i] : float - """ - op = self.nonlin_op - ok = True - y_lin = self.simple_base.simulate(X) - - if op == OP_ABS: - ya = numpy.abs(y_lin) - elif op == OP_MAX0: - ya = numpy.clip(y_lin, 0.0, INF) - elif op == OP_MIN0: - ya = numpy.clip(y_lin, -INF, 0.0) - elif op == OP_LOG10: - # safeguard against: log() on values <= 0.0 - mn, mx = min(y_lin), max(y_lin) - if mn <= 0.0 or numpy.isnan(mn) or mx == INF or numpy.isnan(mx): - ok = False - else: - ya = numpy.log10(y_lin) - elif op == OP_GTH: - ya = numpy.clip(self.thr - y_lin, 0.0, INF) - elif op == OP_LTH: - ya = numpy.clip(y_lin - self.thr, 0.0, INF) - else: - raise 'Unknown op %d' % op - - if ok: # could always do ** exp, but faster ways if exp is 0,1 - y = ya - else: - y = INF * numpy.ones(X.shape[0], dtype=float) - return y - - def __str__(self): - op = self.nonlin_op - simple_s = str(self.simple_base) - if op == OP_ABS: - return 'abs(%s)' % simple_s - elif op == OP_MAX0: - return 'max(0, %s)' % simple_s - elif op == OP_MIN0: - return 'min(0, %s)' % simple_s - elif op == OP_LOG10: - return 'log10(%s)' % simple_s - elif op == OP_GTH: - return 'max(0,%s-%s)' % (coefStr(self.thr), simple_s) - elif op == OP_LTH: - return ('max(0,%s-%s)' % (simple_s, coefStr(self.thr))).replace('--', '+') - else: - raise 'Unknown op %d' % op - - def complexity(self): - """Return an integer measure of model complexity. It's intended to - measure the number of nodes in the GP tree corresponding to - the model. We assume the GP language includes: +, -, *, /, - MAX0, MIN0, LOG10 but not GTH, LTH. Thus, MAX0(x) returns the - value max(0, x) but contributes only 1 + complexity(x) to the - complexity count. GTH(thr, x) returns the value max(0, thr-x) - but because it would be implemented in GP as MAX0(thr-x) it contributes - 3 + complexity(x) to the count.""" - - op = self.nonlin_op - if op == OP_ABS: - return 1 + self.simple_base.complexity() - elif op == OP_MAX0: - return 1 + self.simple_base.complexity() - elif op == OP_MIN0: - return 1 + self.simple_base.complexity() - elif op == OP_LOG10: - return 1 + self.simple_base.complexity() - elif op == OP_GTH: - return 3 + self.simple_base.complexity() - elif op == OP_LTH: - return 3 + self.simple_base.complexity() - else: - raise 'Unknown op %d' % op - - -class ProductBase: - - """e.g. x2^2 * log(x1^3)""" - - def __init__(self, base1, base2): - self.base1 = base1 - self.base2 = base2 - - def simulate(self, X): - """ - @arguments - X -- 2d array of [sample_i][var_i] : float - @return - y -- 1d array of [sample_i] : float - """ - yhat1 = self.base1.simulate(X) - yhat2 = self.base2.simulate(X) - return yhat1 * yhat2 - - def __str__(self): - return '%s * %s' % (self.base1, self.base2) - - def complexity(self): - return 1 + self.base1.complexity() + self.base2.complexity() - - class ConstantModel(RegressorMixin): """e.g. 3.2""" diff --git a/ffx/core/bases.py b/ffx/core/bases.py new file mode 100644 index 0000000..87b9458 --- /dev/null +++ b/ffx/core/bases.py @@ -0,0 +1,158 @@ +import abc + +import numpy as np +import scipy +import six + +from .constants import INF, OP_ABS, OP_GTH, OP_LOG10, OP_LTH, OP_MAX0, OP_MIN0 +from .utils import coef_str + + +class Base(six.with_metaclass(abc.ABCMeta)): + @abc.abstractmethod + def simulate(self, X): + pass + + @abc.abstractproperty + def complexity(self): + '''Return an integer measure of model complexity. It's intended to + measure the number of nodes in the GP tree corresponding to + the model. We assume the GP language includes: +, -, *, /, + MAX0, MIN0, LOG10 but not GTH, LTH. Thus, MAX0(x) returns the + value max(0, x) but contributes only 1 + complexity(x) to the + complexity count. GTH(thr, x) returns the value max(0, thr-x) + but because it would be implemented in GP as MAX0(thr-x) it contributes + 3 + complexity(x) to the count.''' + + +class SimpleBase(Base): + '''e.g. x4^2''' + + def __init__(self, var, exponent): + self.var = var + self.exponent = exponent + + def simulate(self, X): + ''' + @arguments + X -- 2d array of [sample_i][var_i] : float + @return + y -- 1d array of [sample_i] : float + ''' + return X[:, self.var] ** self.exponent + + def __str__(self): + if self.exponent == 1: + return 'x%d' % self.var + else: + return 'x%d^%g' % (self.var, self.exponent) + + @property + def complexity(self): + return 1 if self.exponent == 1 else 3 + + +class OperatorBase(Base): + '''e.g. log(x4^2)''' + + def __init__(self, simple_base, nonlin_op, thr=INF): + ''' + @arguments + simple_base -- SimpleBase + nonlin_op -- one of OPS + thr -- None or float -- depends on nonlin_op + ''' + self.simple_base = simple_base + self.nonlin_op = nonlin_op + self.thr = thr + + def simulate(self, X): + ''' + @arguments + X -- 2d array of [sample_i][var_i] : float + @return + y -- 1d array of [sample_i] : float + ''' + op = self.nonlin_op + ok = True + y_lin = self.simple_base.simulate(X) + + if op == OP_ABS: + ya = np.abs(y_lin) + elif op == OP_MAX0: + ya = np.clip(y_lin, 0.0, INF) + elif op == OP_MIN0: + ya = np.clip(y_lin, -INF, 0.0) + elif op == OP_LOG10: + # safeguard against: log() on values <= 0.0 + mn, mx = min(y_lin), max(y_lin) + if mn <= 0.0 or scipy.isnan(mn) or mx == INF or scipy.isnan(mx): + ok = False + else: + ya = np.log10(y_lin) + elif op == OP_GTH: + ya = np.clip(self.thr - y_lin, 0.0, INF) + elif op == OP_LTH: + ya = np.clip(y_lin - self.thr, 0.0, INF) + else: + raise 'Unknown op %d' % op + + if ok: # could always do ** exp, but faster ways if exp is 0,1 + y = ya + else: + y = INF * np.ones(X.shape[0], dtype=float) + return y + + def __str__(self): + op = self.nonlin_op + simple_s = str(self.simple_base) + if op == OP_ABS: + return 'abs(%s)' % simple_s + elif op == OP_MAX0: + return 'max(0, %s)' % simple_s + elif op == OP_MIN0: + return 'min(0, %s)' % simple_s + elif op == OP_LOG10: + return 'log10(%s)' % simple_s + elif op == OP_GTH: + return 'max(0,%s-%s)' % (coef_str(self.thr), simple_s) + elif op == OP_LTH: + return ('max(0,%s-%s)' % (simple_s, coef_str(self.thr))).replace('--', '+') + else: + raise 'Unknown op %d' % op + + @property + def complexity(self): + op = self.nonlin_op + if op in [OP_ABS, OP_MAX0, OP_MIN0, OP_LOG10]: + return 1 + self.simple_base.complexity + elif op in [OP_GTH, OP_LTH]: + return 3 + self.simple_base.complexity + else: + raise 'Unknown op %d' % op + + +class ProductBase(Base): + '''e.g. x2^2 * log(x1^3)''' + + def __init__(self, base1, base2): + self.base1 = base1 + self.base2 = base2 + + def simulate(self, X): + ''' + @arguments + X -- 2d array of [sample_i][var_i] : float + @return + y -- 1d array of [sample_i] : float + ''' + yhat1 = self.base1.simulate(X) + yhat2 = self.base2.simulate(X) + return yhat1 * yhat2 + + def __str__(self): + return '%s * %s' % (self.base1, self.base2) + + @property + def complexity(self): + return 1 + self.base1.complexity + self.base2.complexity diff --git a/ffx/core/utils.py b/ffx/core/utils.py new file mode 100644 index 0000000..d3267b8 --- /dev/null +++ b/ffx/core/utils.py @@ -0,0 +1,27 @@ +import numpy as np + + +def coef_str(x): + '''Gracefully print a number to 3 significant digits. See _testcoef_str in + unit tests''' + if x == 0.0: + s = '0' + elif np.abs(x) < 1e-4: + s = ('%.2e' % x).replace('e-0', 'e-') + elif np.abs(x) < 1e-3: + s = '%.6f' % x + elif np.abs(x) < 1e-2: + s = '%.5f' % x + elif np.abs(x) < 1e-1: + s = '%.4f' % x + elif np.abs(x) < 1e0: + s = '%.3f' % x + elif np.abs(x) < 1e1: + s = '%.2f' % x + elif np.abs(x) < 1e2: + s = '%.1f' % x + elif np.abs(x) < 1e4: + s = '%.0f' % x + else: + s = ('%.2e' % x).replace('e+0', 'e') + return s