In [1]:
# Dependencies:
#  base_eq.ipynb
#  base_plot.ipynb

In [3]:
import json
import os

import sklearn.base as skb
import sklearn.preprocessing as skpr
import sklearn.pipeline as skpi
import sklearn.linear_model as sklm

# https://github.com/patrick-kidger/tools
import tools

import tensorflow as tf
tfer = tf.errors
tfe = tf.estimator
tfi = tf.initializers
tfk = tf.keras
tfla = tf.layers
tflog = tf.logging
tflo = tf.losses
tft = tf.train

# Convenience imports for those files running this one
import collections as co
import functools as ft
import itertools as it

In [7]:
### Grid hyperparameters
# Everything we do is on a grid

# The separation between points of the fine grid
fine_grid_sep = tools.Object(t=0.01, x=0.01)
# The separation between points of the coarse grid
coarse_grid_sep = tools.Object(t=0.1, x=0.1)
# The number of intervals in the coarse grid. Thus the coarse grid will contain
# (num_intervals.t + 1) * (num_intervals.x + 1) elements.
# So with num_intervals.t = 3, num_intervals.x = 3, it looks like:
#
# @ @ @ @
#
# @ @ @ @
#
# @ @ @ @
#
# @ @ @ @
num_intervals = tools.Object(t=7, x=7)


# Number of fine-grid intervals that fit into one coarse-grid cell.
# Floor division on floats is exposed to representation error (for example
# 0.3 // 0.1 evaluates to 2.0), so divide normally and nudge the quotient up by
# a small tolerance before truncating. For separations that divide evenly this
# always yields the intended whole number; otherwise it still rounds down.
fine_grid_fineness = tools.Object(
    t=int(coarse_grid_sep.t / fine_grid_sep.t + 1e-9),
    x=int(coarse_grid_sep.x / fine_grid_sep.x + 1e-9))
# Total extent of the coarse grid along each axis.
coarse_grid_size = tools.Object(t=num_intervals.t * coarse_grid_sep.t,
                                x=num_intervals.x * coarse_grid_sep.x)


### Grids to evaluate our solution on

def grid(point, grid_size, grid_fineness):
    """Builds a rectangular grid of (t, x) points anchored at :point:.

    :point: is the (t, x) location of the bottom left entry of the grid.
    :grid_size: gives the overall extent of the grid along each axis and
    :grid_fineness: the number of intervals each axis is divided into; both
    should be of the form tools.Object(t, x).

    Returns a list of (grid_fineness.t + 1) * (grid_fineness.x + 1) tuples,
    ordered so that t varies slowest and x varies fastest."""
    t_start, x_start = point
    t_values = np.linspace(t_start, t_start + grid_size.t, grid_fineness.t + 1)
    x_values = np.linspace(x_start, x_start + grid_size.x, grid_fineness.x + 1)
    # Cartesian product, outer loop over t and inner loop over x.
    return list(it.product(t_values, x_values))

def fine_grid(point):
    """Returns the fine grid covering one coarse cell, with its bottom left
    entry at :point:. The extent is one coarse grid separation along each axis
    and the subdivision is given by the fine_grid_fineness hyperparameter.
    """
    cell_extent = coarse_grid_sep
    return grid(point, grid_size=cell_extent, grid_fineness=fine_grid_fineness)

def coarse_grid(point):
    """Returns the coarse grid positioned so that the bottom left entry of its
    middle square is at the specified :point: (a (t, x) pair). The size and
    number of intervals come from the earlier hyperparameters.
    """
    t, x = point

    # How many whole coarse cells lie before the central one along each axis.
    cells_before_t = np.floor((num_intervals.t - 1) / 2)
    cells_before_x = np.floor((num_intervals.x - 1) / 2)

    # Step back by that many cells to find the grid's own bottom left corner.
    origin = (t - cells_before_t * coarse_grid_sep.t,
              x - cells_before_x * coarse_grid_sep.x)
    return grid(origin, coarse_grid_size, num_intervals)

In [8]:
### Data generation

def gen_one_peakon():
    """Samples a random single-peakon solution and a random (t, x) location.

    Returns ((t, x), peakon).
    """
    # Random solution to the CH equation
    speed = np.random.uniform(3, 10)
    solution = Peakon(c=speed)
    # Random time, then a random position within 2 of c * t (near the peak)
    time = np.random.uniform(0, 10)
    position = speed * time + np.random.uniform(-2, 2)
    location = (time, position)
    return location, solution

def gen_two_peakon():
    """Samples a random two-peakon solution and a random (t, x) location.

    Returns ((t, x), twopeakon).
    """
    # Random solution to the CH equation
    p1 = np.random.uniform(3, 10)
    p2 = np.random.uniform(3, 10)
    x1 = np.random.uniform(0, 3)
    x2 = np.random.uniform(3.001, 6)
    solution = TwoPeakon(x1, x2, p1, p2)

    # Random location near both of the peaks: take the interval spanning both
    # of x_i + p_i * t, padded by 0.5 on either side.
    t = np.random.uniform(0, 0.5)
    lower_edge = min(x1 - 0.5 + p1 * t, x2 - 0.5 + p2 * t)
    upper_edge = max(x1 + 0.5 + p1 * t, x2 + 0.5 + p2 * t)
    centre = (upper_edge + lower_edge) / 2
    half_width = (upper_edge - lower_edge) / 2
    # Cubing a uniform sample on [-1, 1] concentrates the draws around the
    # centre of the interval.
    skewed = np.random.uniform(-1, 1) ** 3
    x = centre + half_width * skewed
    return (t, x), solution

def sol_on_grid(point, solution):
    """Evaluates the :solution: on the coarse grid and on the fine grid placed
    at the specified :point:.

    Returns (X, y), where X (the features) is the solution on the coarse grid
    and y (the labels) is the solution on the fine grid.
    """
    coarse_points = coarse_grid(point)
    fine_points = fine_grid(point)
    features = solution.on_grid(coarse_points)
    labels = solution.on_grid(fine_points)
    return features, labels

def sol_at_point(point, solution):
    """Returns the values of the :solution: on a coarse grid and at a random
    point near the specified :point:.

    Returns (X, y). X is the result of solution.on_grid(cg, extra=2) with its
    last two entries overwritten by the normalised (t, x) offset of the
    interpolation point. y is a length-one array holding the value of the
    :solution: at the interpolation point.
    """
    
    cg = coarse_grid(point)
    
    # Random offset from the random location that we ask for predictions at. The
    # distribution is asymmetric because we're moving relative to :point:, which
    # is in the _bottom left_ of the central cell of the coarse grid. The asymmetric
    # distribution thus makes this relative to the centre of the central cell.
    #
    # This value is not scaled relative to the size of the grid as we expect
    # that the predictions should be scale invariant, and we do not want the
    # network to unnecessarily learn the size of coarse_grid_sep.
    x_offset = np.random.uniform(-0.5, 1.5)
    t_offset = np.random.uniform(-0.5, 1.5)
    
    # Features: the solution on the coarse grid and the point to interpolate at.
    # NOTE(review): assumes extra=2 reserves two trailing slots in the returned
    # array -- confirm against the on_grid implementation.
    X = solution.on_grid(cg, extra=2)
    # We tell the network the offset; as the network has no way of knowing the
    # location of the grid then adding a translation would only confuse it.
    X[-2] = t_offset - 0.5  # -0.5 to normalise
    X[-1] = x_offset - 0.5  # -0.5 to normalise
    
    t, x = point
    # Label: the solution at the interpolation point. This must evaluate the
    # :solution: that was passed in; the previous code referenced a name
    # `peakon` that is not defined in this function, so it either raised a
    # NameError or silently used an unrelated global.
    y = np.full(1, solution((t + t_offset * coarse_grid_sep.t, 
                             x + x_offset * coarse_grid_sep.x)))
    
    return X, y

# A particularly nice X, y that is right on the peak of the peakon.
# X_peak holds 64 values (8 x 8, the size of a coarse grid with seven intervals
# in each direction) and y_peak holds 121 values (11 x 11, the size of a fine
# grid with ten subdivisions in each direction).
# NOTE(review): presumably generated by sol_on_grid for a single peakon --
# confirm against the code that produced these numbers.
X_peak = np.array([0.71136994, 0.64367414, 0.58242045, 0.52699581, 0.47684553,
                   0.43146768, 0.3904081 , 0.35325586, 1.53965685, 1.39313912,
                   1.26056441, 1.14060584, 1.03206285, 0.93384908, 0.84498159,
                   0.76457096, 3.33236346, 3.01524715, 2.72830845, 2.46867557,
                   2.23375003, 2.02118061, 1.82883985, 1.65480272, 7.21241639,
                   6.52606422, 5.9050271 , 5.34308947, 4.83462728, 4.37455167,
                   3.95825804, 3.58157998, 3.81911647, 4.22077645, 4.66467938,
                   5.155268  , 5.69745227, 6.29665855, 6.95888391, 7.69075612,
                   1.76455206, 1.95013162, 2.15522875, 2.38189614, 2.63240234,
                   2.90925451, 3.21522348, 3.55337148, 0.81527861, 0.90102221,
                   0.99578354, 1.10051101, 1.21625277, 1.34416719, 1.48553448,
                   1.64176951, 0.37668439, 0.41630063, 0.46008335, 0.50847074,
                   0.56194707, 0.62104756, 0.68636371, 0.75854921])
y_peak = np.array([5.34308947, 5.28992485, 5.23728921, 5.18517732, 5.13358394,
                   5.08250393, 5.03193217, 4.98186361, 4.93229323, 4.8832161 ,
                   4.83462728, 5.77198627, 5.71455405, 5.65769329, 5.6013983 ,
                   5.54566345, 5.49048318, 5.43585196, 5.38176433, 5.32821488,
                   5.27519826, 5.22270916, 6.23531118, 6.1732688 , 6.11184375,
                   6.05102988, 5.99082113, 5.93121147, 5.87219493, 5.81376561,
                   5.75591767, 5.69864534, 5.64194287, 6.73582778, 6.66880518,
                   6.60244946, 6.53675399, 6.4717122 , 6.40731759, 6.34356371,
                   6.2804442 , 6.21795273, 6.15608307, 6.09482902, 7.27652151,
                   7.20411891, 7.13243673, 7.0614678 , 6.99120502, 6.92164137,
                   6.85276989, 6.78458369, 6.71707595, 6.65023993, 6.58406894,
                   7.58429925, 7.66052273, 7.70496678, 7.62830108, 7.55239822,
                   7.4772506 , 7.40285071, 7.32919112, 7.25626445, 7.18406341,
                   7.11258079, 7.0207356 , 7.09129517, 7.16256387, 7.23454883,
                   7.30725726, 7.38069641, 7.45487364, 7.52979637, 7.60547208,
                   7.68190835, 7.68351697, 6.49904846, 6.56436498, 6.63033795,
                   6.69697395, 6.76427966, 6.8322618 , 6.90092717, 6.97028264,
                   7.04033515, 7.11109169, 7.18255935, 6.01612613, 6.0765892 ,
                   6.13765994, 6.19934444, 6.26164889, 6.32457951, 6.38814259,
                   6.45234449, 6.51719163, 6.58269049, 6.64884763, 5.56908812,
                   5.62505839, 5.68159116, 5.7386921 , 5.79636692, 5.85462137,
                   5.9134613 , 5.97289257, 6.03292114, 6.093553  , 6.15479423,
                   5.155268  , 5.2070793 , 5.25941132, 5.31226928, 5.36565848,
                   5.41958424, 5.47405197, 5.5290671 , 5.58463515, 5.64076167,
                   5.69745227])


class BatchData:
    """Wrapper around a function that generates a single data point, and produces
    batches in the manner expected by TensorFlow.

    Calling an instance returns (features, labels), where features is the
    dictionary {'X': array} and labels is an array; both arrays have the batch
    as their leading dimension.
    """
    
    def __init__(self, gen_one_data, batch_size, batch_reuse=1):
        """The argument :gen_one_data: is the function to generate a single data
        point. It will be called without arguments and must return an (X, y)
        pair. The :batch_size: is the number of data points per batch; it has no
        default and must be given.
        
        There is also a :batch_reuse: argument. If generating data is intensive,
        then we may wish to use it several times to train our network before
        discarding it. This makes sense because we're working in a somewhat
        different context to that of normal machine learning, as we can generate
        an infinite amount of data (as that's just mathematical computations).
        The idea is roughly equivalent to reusing the same data between epochs.
        
        Calls to BatchData for batches of data will return the same batch of
        data :batch_reuse: times before generating a new batch of data.
        
        The :batch_reuse: argument may also be set to None to have BatchData
        endlessly give exactly the same batch of data. For example when
        comparing different neural networks: giving each network the same data
        in evaluation ensures a fair test.
        
        Note that one batch is generated immediately on construction.
        """
        
        self.gen_one_data = gen_one_data
        self.batch_size = batch_size
        self.batch_reuse = batch_reuse
        
        # How many times this batch of data has been served so far
        self._batch_reuse_index = 0
        
        # Will be assigned values in self._gen_data()
        self.X = None
        self.y = None
        self._gen_data()

    def __call__(self):
        """Returns a batch of data as (features, labels), generating a fresh
        batch first if the current one has already been served
        :batch_reuse: times."""
        
        if self.batch_reuse is not None:
            if self._batch_reuse_index >= self.batch_reuse:
                self._gen_data()
                # The freshly generated batch is being served for the first time
                self._batch_reuse_index = 1
            else:
                self._batch_reuse_index += 1
            
        return self.X, self.y
    
    def _gen_data(self):
        """Generates self.batch_size amount of data and stores it in self.X
        and self.y."""
        
        # The first data point is always generated, as its shape and dtype
        # determine those of the whole batch. np.asarray leaves arrays untouched
        # but also lets generators return plain scalars or lists.
        X_new, y_new = self.gen_one_data()
        X_new = np.asarray(X_new)
        y_new = np.asarray(y_new)
        
        X = np.empty((self.batch_size, *X_new.shape), dtype=X_new.dtype)
        y = np.empty((self.batch_size, *y_new.shape), dtype=y_new.dtype)
        
        # Strictly greater than zero: an empty batch has no row 0 to write to.
        if self.batch_size > 0:
            X[0] = X_new
            y[0] = y_new
            for i in range(1, self.batch_size):
                X_new, y_new = self.gen_one_data()
                X[i] = X_new
                y[i] = y_new
                
        # For some mad reason TensorFlow seems to demand that the features be 
        # wrapped in a dictionary but that the labels are not.
        self.X = {'X': X}
        self.y = y
        
    @classmethod
    def from_single_data(cls, X, y):
        """Creates a BatchData whose every batch is the single data point
        (:X:, :y:)."""
        return cls(gen_one_data=lambda: (X, y), batch_size=1)
        
    @classmethod
    def from_data(cls, X, y):
        """Creates a BatchData serving the fixed data set :X:, :y: as a single
        batch of size len(X). The two must have the same length."""
        assert len(X) == len(y)
        
        i = -1
        def gen_one_data():
            nonlocal i  # Yuck, but I think this is the cleanest way
            i += 1
            return X[i], y[i]
        
        # The generator above can only be walked through once, so the batch must
        # never be regenerated: batch_reuse=None serves the same batch forever.
        # (With the default batch_reuse the second call ran off the end of the
        # data and raised an IndexError.)
        return cls(gen_one_data, batch_size=len(X), batch_reuse=None)
    
    @classmethod
    def test(cls, gen_one_data, batch_size=1):
        """Generates :batch_size: amount of test data, where :gen_one_data: is a 
        function for generating one piece of data. The same batch is then
        served on every call."""
        return cls(gen_one_data, batch_size, None)

In [9]:
### Data preprocessing

# Not using sklearn's Pipelines as they don't provide good ways of processing the
# labels.
class Processor(tools.SubclassTrackerMixin('__name__')):
    """Base class for preprocessors.

    A processor is given a source of batches via data(), and calling the
    processor then returns a function which fetches a batch, passes it through
    transform() and returns the result. Subclasses must implement transform()
    and inverse_transform().

    A processor may also save some of its attributes to, and restore them from,
    a JSON checkpoint file in a model directory.
    """
    
    # Names of the attributes written to the checkpoint and restored by load()
    save_attr = []
    # Names of the attributes written to the checkpoint and passed back to
    # __init__ as keyword arguments by load_init()
    init_attr = []
    checkpoint_filename = 'processor-checkpoint'
    
    def __init__(self, *args, **kwargs):
        # Source of batches; set via data()
        self.batch_data = None
        # The most recently produced (transformed) features and labels
        self.X = None
        self.y = None
        # Whether the processor is in training mode; set via training()
        self._training = True
        super(Processor, self).__init__(*args, **kwargs)
        
    def data(self, batch_data):
        """Sets the callable :batch_data: that batches are fetched from. It is
        called without arguments and must return (features, labels), with the
        features wrapped in a dictionary under the key 'X'."""
        self.batch_data = batch_data
        return self  # for chaining
    
    def training(self, val):
        """Sets whether the processor is in training mode."""
        self._training = val
        return self  # for chaining
        
    def __call__(self, use_tf=True):
        """Returns a function of no arguments which fetches a batch of data,
        transforms it, and returns it as (features, labels).
        
        If :use_tf: is True, which it defaults to, then outputted data is 
        wrapped in tf.constant, which is the form TensorFlow is expecting it in.
        The :use_tf: argument may be set to False to instead output the raw data,
        if the regressor is not using TensorFlow.
        
        The returned function raises a RuntimeError if no data source has been
        set via the data() method."""
        def wrapped_call():
            if self.batch_data is None:
                raise RuntimeError("Must be passed data via 'data' method before "
                                   "being called.")
                
            X, y = self.batch_data()
            # Unwrap the features for processing, then wrap them up again
            X = X['X']
            X, y = self.transform(X, y)
            X = {'X': X}
            self.X = X
            self.y = y
            
            if use_tf:
                tf_x = {key: tf.constant(val) for key, val in self.X.items()}
                return tf_x, tf.constant(self.y)
            else:
                return X, y
        return wrapped_call
    
    def transform(self, X, y):
        """Processes the data. Must return the processed (X, y)."""
        raise NotImplementedError
        
    def inverse_transform(self, y):
        """Performs the inverse transform on the data."""
        raise NotImplementedError
    
    def save(self, model_dir, step):
        """Writes the attributes named in save_attr and init_attr to the
        checkpoint file in :model_dir:, as JSON. The :step: is only used in the
        log message."""
        write_dict = {'save': {name: getattr(self, name) for name in self.save_attr},
                      'init': {name: getattr(self, name) for name in self.init_attr}}
        with open(model_dir + '/' + self.checkpoint_filename, 'w') as f:
            f.write(json.dumps(write_dict))
        tflog.info('Saving processor checkpoint for {} into {}'.format(step, model_dir))
        
    @classmethod
    def _get_load_dict(cls, model_dir):
        """Reads and returns the contents of the checkpoint file in
        :model_dir:. Raises FileNotFoundError if there is no such file."""
        with open(model_dir + '/' + cls.checkpoint_filename, 'r') as f:
            return json.loads(f.read())
        
    def _load(self, load_dict):
        """Sets the attributes stored under the 'save' key of :load_dict:."""
        tflog.info("Processor checkpoint file found, restoring values.")
        training_val = self._training
        self._training = True  # to make sure that we can set values
        try:
            for key, val in load_dict['save'].items():
                setattr(self, key, val)
        finally:
            # Put the mode back even if restoring fails part way through, so
            # that a failed load cannot leave the processor in training mode.
            self._training = training_val
    
    def load(self, model_dir):
        """Restores the saved attributes from the checkpoint file in
        :model_dir:, if there is one. A missing file is logged and otherwise
        ignored."""
        try:
            load_dict = self._get_load_dict(model_dir)
        except FileNotFoundError:
            tflog.info("No processor checkpoint file '{}' found."
                       .format(model_dir + '/' + self.checkpoint_filename))
        else:
            self._load(load_dict)
            
    @classmethod
    def load_init(cls, model_dir):
        """Creates a new processor from the checkpoint file in :model_dir:,
        passing the stored 'init' values to the constructor and then restoring
        the stored 'save' values. Raises FileNotFoundError if there is no
        checkpoint file."""
        load_dict = cls._get_load_dict(model_dir)
        self = cls(**load_dict['init'])
        self._load(load_dict)
        return self
    
    
class IdentityProcess(Processor):
    """A pass-through processor. The data is left untouched, but going through
    a Processor means the conversion to TensorFlow (the use_tf argument of
    __call__) is still available.
    """
    
    def transform(self, X, y):
        """Returns the features and labels exactly as given."""
        untouched = (X, y)
        return untouched
    
    def inverse_transform(self, y):
        """Returns the values exactly as given."""
        return y
  
    
class ScaleDataOverall(Processor):
    """Scales data to roughly between -1 and 1, by subtracting a mean and
    dividing by the largest absolute deviation from it. Scaling is done across
    all batches: the mean and extent are running averages over the batches
    seen so far, weighted by :momentum:.

    The running statistics are only updated whilst in training mode; outside of
    training, assignments to X_mean and X_extent are silently ignored.
    """
    
    init_attr = ['momentum']
    save_attr = ['X_mean', 'X_extent']
    
    def __init__(self, momentum=0.99, **kwargs):
        """The :momentum: is the weight given to the existing running statistic
        when it is updated; the new batch gets weight (1 - momentum)."""
        self.momentum = momentum
        self._X_mean = None
        self._X_extent = None
        super(ScaleDataOverall, self).__init__(**kwargs)
        
    @property
    def X_mean(self):
        return self._X_mean
    
    @X_mean.setter
    def X_mean(self, value):
        if self._training:
            self._X_mean = value
        
    @property
    def X_extent(self):
        return self._X_extent
    
    @X_extent.setter
    def X_extent(self, value):
        if self._training:
            self._X_extent = value
    
    def transform(self, X, y):
        """Updates the running statistics from :X: (in training mode only) and
        returns (X, y) scaled by them. Note that :y: is scaled using the
        statistics of :X:."""
        batch_mean = np.mean(X)
        batch_extent = np.max(np.abs(X - batch_mean))
        
        # Test for the first batch explicitly. Relying on a TypeError from
        # arithmetic on None would also swallow any unrelated TypeError (for
        # instance a badly typed momentum) and silently reset the statistics.
        if self.X_mean is None or self.X_extent is None:
            self.X_mean = batch_mean
            self.X_extent = batch_extent
        else:
            keep = self.momentum
            self.X_mean = self.X_mean * keep + batch_mean * (1 - keep)
            self.X_extent = self.X_extent * keep + batch_extent * (1 - keep)
        
        # NOTE: if the very first batch is constant then the extent is zero and
        # the divisions below are divisions by zero.
        X_scaled = (X - self.X_mean) / self.X_extent
        y_scaled = (y - self.X_mean) / self.X_extent
        return X_scaled, y_scaled
    
    def inverse_transform(self, y):
        """Undoes the scaling applied to the labels by transform."""
        return (y * self.X_extent) + self.X_mean
    
    
class NormalisationOverall(Processor):
    """Normalises inputs by subtracting mean and dividing by standard deviation.
    Scaling is done across all batches: the mean and standard deviation are
    running averages over the batches seen so far, weighted by :momentum:.

    The running statistics are only updated whilst in training mode; outside of
    training, assignments to mean and stddev are silently ignored.
    """
    
    init_attr = ['momentum']
    save_attr = ['mean', 'stddev']
    
    def __init__(self, momentum=0.99, **kwargs):
        """The :momentum: is the weight given to the existing running statistic
        when it is updated; the new batch gets weight (1 - momentum)."""
        self.momentum = momentum
        self._mean = None
        self._stddev = None
        super(NormalisationOverall, self).__init__(**kwargs)
        
    @property
    def mean(self):
        return self._mean
    
    @mean.setter
    def mean(self, value):
        if self._training:
            self._mean = value
        
    @property
    def stddev(self):
        return self._stddev
    
    @stddev.setter
    def stddev(self, value):
        if self._training:
            self._stddev = value
            
    def transform(self, X, y):
        """Updates the running statistics from :X: (in training mode only) and
        returns (X, y) normalised by them. Note that :y: is normalised using
        the statistics of :X:."""
        batch_mean = np.mean(X)
        batch_stddev = np.sqrt(np.mean(np.square(X - batch_mean)))
        
        # Test for the first batch explicitly. Relying on a TypeError from
        # arithmetic on None would also swallow any unrelated TypeError (for
        # instance a badly typed momentum) and silently reset the statistics.
        if self.mean is None or self.stddev is None:
            self.mean = batch_mean
            self.stddev = batch_stddev
        else:
            keep = self.momentum
            self.mean = self.mean * keep + batch_mean * (1 - keep)
            self.stddev = self.stddev * keep + batch_stddev * (1 - keep)
            
        # NOTE: if the very first batch is constant then the standard deviation
        # is zero and the divisions below are divisions by zero.
        X_scaled = (X - self.mean) / self.stddev
        y_scaled = (y - self.mean) / self.stddev
        
        return X_scaled, y_scaled
    
    def inverse_transform(self, y):
        """Undoes the normalisation applied to the labels by transform."""
        return (y * self.stddev) + self.mean
    
    
### Processor Saving Hooks

class ProcessorSavingHook(tft.SessionRunHook):
    """Session hook which periodically writes the processor's checkpoint into
    the model directory whilst a session is running."""
    
    def __init__(self, processor, model_dir, save_secs=600, 
                 save_steps=None, **kwargs):
        """The :processor: is saved into :model_dir: every :save_secs: seconds
        or every :save_steps: steps; these two are handed to
        tf.train.SecondOrStepTimer. Any further keyword arguments go to the
        parent class."""
        self.processor = processor
        self.model_dir = model_dir
        self._timer = tft.SecondOrStepTimer(every_secs=save_secs,
                                            every_steps=save_steps)
        # Looked up in begin()
        self._global_step_tensor = None
        super().__init__(**kwargs)
    
    def begin(self):
        """Looks up the global step tensor."""
        self._global_step_tensor = tft.get_global_step()
        
    def after_create_session(self, session, coord):
        """Saves once straight away and starts the timer from the current
        step."""
        current_step = session.run(self._global_step_tensor)
        self._save(session, current_step)
        self._timer.update_last_triggered_step(current_step)
        
    def before_run(self, run_context):
        """Asks for the global step to be fetched alongside each run."""
        return tft.SessionRunArgs(self._global_step_tensor)
        
    def after_run(self, run_context, run_values):
        """Saves the processor if the timer says that it is due."""
        # The fetched step may be stale, so it is only used as a cheap first
        # check before the real value is looked up.
        expected_step = run_values.results + 1
        if not self._timer.should_trigger_for_step(expected_step):
            return
        
        actual_step = run_context.session.run(self._global_step_tensor)
        if self._timer.should_trigger_for_step(actual_step):
            self._timer.update_last_triggered_step(actual_step)
            self._save(run_context.session, actual_step)
            
    def end(self, session):
        """Saves one final time, unless the final step was already saved."""
        final_step = session.run(self._global_step_tensor)
        already_saved = final_step == self._timer.last_triggered_step()
        if not already_saved:
            self._save(session, final_step)
        
    def _save(self, session, step):
        """Writes the processor's checkpoint; :session: is unused."""
        self.processor.save(self.model_dir, step)

In [21]:
### DNN Construction

# Keras-inspired nice interface, just without the slow speed and lack of 
# multicore functionality of Keras...

# Lambda wrappers so that each call to model_fn in Sequential.compile creates a
# new TensorFlow graph and thus allows repeated training without redefining the
# Estimator
def dense(*args, **kwargs):
    """Deferred constructor for tf.layers.Dense: returns a function of no
    arguments which creates a new layer, with the given arguments, each time
    it is called."""
    def make_layer():
        return tfla.Dense(*args, **kwargs)
    return make_layer


def dropout(*args, **kwargs):
    """Deferred constructor for tf.layers.Dropout: returns a function of no
    arguments which creates a new layer, with the given arguments, each time
    it is called."""
    def make_layer():
        return tfla.Dropout(*args, **kwargs)
    return make_layer

# Not currently working... :(
def alpha_dropout(*args, **kwargs):
    """Deferred constructor for tf.keras.layers.AlphaDropout: returns a
    function of no arguments which creates a new layer, with the given
    arguments and dtype=tf.float32, each time it is called."""
    def make_layer():
        return tfk.layers.AlphaDropout(*args, **kwargs, dtype=tf.float32)
    return make_layer


def batch_normalization(*args, **kwargs):
    """Deferred constructor for tf.layers.BatchNormalization: returns a
    function of no arguments which creates a new layer, with the given
    arguments, each time it is called."""
    def make_layer():
        return tfla.BatchNormalization(*args, **kwargs)
    return make_layer


class Sequential:
    """An abstract description of a feed-forward neural network, built up one
    layer at a time. Expected usage is roughly:
    
    >>> model = Sequential()
    >>> model.add(dense(units=100, activation=tf.nn.relu))
    >>> model.add_train(dropout(rate=0.4))
    >>> model.add(dense(units=50, activation=tf.nn.relu))
    >>> model.add_train(dropout(rate=0.4))
    >>> model.add(dense(units=10, activation=tf.nn.relu))
    
    to define the neural network in the abstract (note that the output of the
    last layer is treated as the logits), followed by:
    
    >>> dnn = model.compile()
    
    to actually create it in TensorFlow. Here, 'dnn' is a tf.Estimator, so may
    be used like:
    
    >>> dnn.train(...)
    >>> dnn.predict(...)
    >>> dnn.evaluate(...)
    """
    
    def __init__(self):
        """Creates an empty Sequential. See Sequential.__doc__ for more info."""
        # Functions of no arguments, each creating one layer
        self._layer_funcs = []
        # For each layer, whether it must be told if the network is training
        self._layer_train = []
        # Extra keyword arguments for the estimator created in compile()
        self._kwargs = {}
        
    def add(self, layer):
        """Appends a layer which does not need to know whether the network is
        in training. Should be the result of dense, above.
        """
        self._layer_funcs.append(layer)
        self._layer_train.append(False)
        
    def add_train(self, layer):
        """Appends a layer which needs to know whether the network is in
        training or not. Should be the result of dropout, alpha_dropout or 
        batch_normalization, above.
        """
        self.add(layer)
        # Flip the flag that add() has just recorded for this layer
        self._layer_train[-1] = True
        
    def set_kwargs(self, **kwargs):
        """Replaces the custom keyword arguments passed to the estimators
        created during compile time.
        """
        self._kwargs = kwargs
        
    def update_kwargs(self, **kwargs):
        """Adds to (or overwrites) the custom keyword arguments passed to the
        estimators created during compile time.
        """
        self._kwargs.update(kwargs)
        
    def reset_kwargs(self):
        """Clears the custom keyword arguments passed to the estimators
        created during compile time.
        """
        self._kwargs = {}
        
    def compile(self, optimizer=None, loss_fn=tflo.mean_squared_error, 
                gradient_clip=None, **kwargs):
        """Turns the abstract neural network definition into a
        tf.estimator.Estimator.
        
        May be given an :optimizer:, defaulting to tf.train.AdamOptimizer().
        May be given a :loss_fn:, defaulting to tf.losses.mean_squared_error.
        May be given a :gradient_clip:, defaulting to no clipping. If given,
        gradients are clipped by this global norm.
        
        Any additional kwargs are passed into the creation of the
        tf.estimator.Estimator, combined with those stored on the Sequential
        via tools.update_without_overwrite.
        """
        
        # The default is created here, rather than in the signature, so that
        # separate calls to compile do not share an optimizer instance.
        if optimizer is None:
            optimizer = tft.AdamOptimizer()
            
        def model_fn(features, labels, mode):
            is_training = mode == tfe.ModeKeys.TRAIN
            
            # Feed the features through every layer in turn. Each layer is
            # created afresh on every call to model_fn.
            net = features["X"]
            for make_layer, needs_training_flag in zip(self._layer_funcs,
                                                       self._layer_train):
                layer = make_layer()
                if needs_training_flag:
                    net = layer(inputs=net, training=is_training)
                else:
                    net = layer(inputs=net)
            logits = net
            
            if mode == tfe.ModeKeys.PREDICT:
                return tfe.EstimatorSpec(mode=mode, predictions=logits)
            
            loss = loss_fn(labels, logits)
            
            if mode == tfe.ModeKeys.EVAL:
                return tfe.EstimatorSpec(mode=mode, loss=loss)
            
            if mode != tfe.ModeKeys.TRAIN:
                raise RuntimeError("mode '{}' not understood".format(mode))
            
            g_step = tft.get_global_step()
            # Make the training op depend on anything registered under
            # UPDATE_OPS, so that those ops run on every training step.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                if gradient_clip is None:
                    train_op = optimizer.minimize(loss=loss, global_step=g_step)
                else:
                    grads_and_vars = optimizer.compute_gradients(loss)
                    gradients, variables = zip(*grads_and_vars)
                    # Uncomment to print the global norm before clipping:
                    # gradients0 = tf.Print(gradients[0], [tf.global_norm(gradients)], 'Global norm: ')
                    # gradients = tuple([gradients0, *gradients[1:]])
                    clipped, _ = tf.clip_by_global_norm(gradients, 
                                                        gradient_clip)
                    train_op = optimizer.apply_gradients(zip(clipped, variables),
                                                         global_step=g_step)
            return tfe.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
                
        tools.update_without_overwrite(kwargs, self._kwargs)
        return tfe.Estimator(model_fn=model_fn, **kwargs)
    
    
def model_dir_str(model_dir, hidden_units, logits, drop_rate, drop_type, 
                  preprocessor, activation, uuid=None):
    """Appends a description of the network to :model_dir: and returns the
    resulting string.

    The description is made up of, in order: the hidden layer sizes (runs of
    equal sizes are written as '<size>x<repeats>'), the number of logits, 'D0'
    or 'A0' for the type of dropout followed by int(drop_rate * 10), the class
    name of the :preprocessor:, the name of the :activation: function and, if
    given, the :uuid:.

    Raises a RuntimeError if the :drop_type: is not recognised.
    """
    
    pieces = []
    
    # Collapse consecutive layers of the same size into a single entry
    for layer_size, run in it.groupby(hidden_units):
        layer_repeat = len(list(run))
        if layer_repeat == 1:
            pieces.append('{}_'.format(layer_size))
        else:
            pieces.append('{}x{}_'.format(layer_size, layer_repeat))
    pieces.append('{}_'.format(logits))
    
    if drop_type in ('dropout', 'normal'):
        pieces.append('D0')
    elif drop_type in ('alpha', 'alpha_dropout'):
        pieces.append('A0')
    else:
        raise RuntimeError("Unrecognised drop_type '{}'".format(drop_type))
    pieces.append('{}_'.format(int(drop_rate * 10)))
    pieces.append(preprocessor.__class__.__name__)
    
    # An activation wrapped in functools.partial carries its own alpha;
    # otherwise '02' is used.
    if isinstance(activation, ft.partial):
        activation_fn = activation.func
        alpha = str(activation.keywords['alpha']).replace('.', '')
    else:
        activation_fn = activation
        alpha = '02'
        
    pieces.append('_' + activation_fn.__name__.replace('_', ''))
    # The alpha is only written out for leaky relu
    if activation_fn is tf.nn.leaky_relu:
        pieces.append(alpha)

    if uuid not in (None, ''):
        pieces.append('_' + str(uuid))
    return model_dir + ''.join(pieces)


def dnn_hyperparameters_from_dir(dir_name):
    """Creates DNN hyperparameters from the name of the directory of the DNN.

    The :dir_name: should be a name of the form produced by model_dir_str
    (without any leading path).

    Returns (dnn_details, preprocessor_class, uuid): a dictionary of keyword
    arguments for create_dnn, the Processor subclass named in the directory
    name, and the trailing uuid (None if there isn't one).

    Raises a RuntimeError if the name cannot be parsed.
    """
    
    dnn_details = {}

    # The dropout marker separates the layer sizes from everything else. Split
    # on whichever marker comes first, and only once, so that a later
    # occurrence of 'D0' or 'A0' (in the uuid, say) cannot break the parse.
    drop_markers = {'D0': 'dropout', 'A0': 'alpha_dropout'}
    found = [(dir_name.find(marker), marker) for marker in drop_markers
             if marker in dir_name]
    if not found:
        raise RuntimeError("Bad dir_name string '{}'. Cannot split on 'D0' "
                           "or 'A0'.".format(dir_name))
    _, marker = min(found)
    units, rest = dir_name.split(marker, 1)
    dnn_details['drop_type'] = drop_markers[marker]
    
    units = units.split('_')[:-1]  # last element is ''
    rest = rest.split('_')
    
    try:
        all_units = []
        for unit in units:
            if 'x' in unit:
                unit_size, unit_repeat = unit.split('x')
                unit_size, unit_repeat = int(unit_size), int(unit_repeat)
                all_units.extend([unit_size for _ in range(unit_repeat)])
            else:
                all_units.append(int(unit))
        dnn_details['hidden_units'] = all_units[:-1]
        dnn_details['logits'] = all_units[-1]

        # model_dir_str writes int(drop_rate * 10), so undo the factor of ten
        # to recover the rate (e.g. 'D04' means a rate of 0.4).
        dnn_details['drop_rate'] = int(rest[0]) / 10
        preprocessor_name = rest[1]
        activation_name = rest[2].lower()
    except (ValueError, IndexError) as e:
        # Callers scanning a directory only expect a RuntimeError for names
        # which don't describe a DNN.
        raise RuntimeError("Bad dir_name string '{}'. Could not parse it: {}"
                           .format(dir_name, e))
        
    preprocessor_class = Processor.find_subclass(preprocessor_name)
    dnn_details['batch_norm'] = False
    
    # Not a great way to do this inversion, admittedly
    if activation_name == 'elu':
        dnn_details['activation'] = tf.nn.elu
    elif activation_name == 'relu':
        dnn_details['activation'] = tf.nn.relu
    elif activation_name == 'selu':
        dnn_details['activation'] = tf.nn.selu
    elif activation_name[:9] == 'leakyrelu':
        # The alpha was written with its decimal point removed; put the point
        # back after the first digit.
        try:
            alpha = float(str(activation_name[9]) + '.' + str(activation_name[10:]))
        except (ValueError, IndexError):
            raise RuntimeError("Activation '{}' not understood.".format(activation_name))
        dnn_details['activation'] = ft.partial(tf.nn.leaky_relu, alpha=alpha)
    else:
        raise RuntimeError("Activation '{}' not understood.".format(activation_name))
        
    remaining = rest[3:]
    if len(remaining) == 0:
        uuid = None
    elif len(remaining) == 1:
        uuid = remaining[0]
    else:
        raise RuntimeError("Bad dir_name string '{}'. Too many remaining "
                           "arguments: {}".format(dir_name, remaining))
        
    return dnn_details, preprocessor_class, uuid
    
    
def create_dnn(hidden_units, logits, activation=tf.nn.relu, 
               drop_rate=0.0, drop_type='dropout', model_dir=None, log_steps=100, 
               gradient_clip=None, batch_norm=False,
               kernel_initializer=tfi.truncated_normal(mean=0, stddev=0.05)
               ):
    """Shortcut for creating a simple DNN with dense, dropout and batch 
    normalization layers, and then compiling it.

    :hidden_units: is an iterable of the sizes of the hidden layers, and
    :logits: is the size of the final layer, which has no activation. Each
    hidden layer uses the given :activation: and :kernel_initializer:.

    If :drop_rate: is nonzero then every hidden layer is followed by dropout,
    of the kind given by :drop_type:; this is either 'dropout' or 'normal' for
    usual dropout, or 'alpha' or 'alpha_dropout' for alpha dropout. (These are
    the same names as are accepted by model_dir_str.) If :batch_norm: is True
    then batch normalization is applied to the inputs and after every hidden
    layer.

    The :model_dir: and :log_steps: are passed to the estimator, and the
    :gradient_clip: to Sequential.compile.

    Returns a tf.estimator.Estimator. Raises a RuntimeError if dropout is
    requested with an unrecognised :drop_type:.
    """

    # Work out which kind of dropout layer to use up front, so that a bad
    # drop_type is reported rather than silently resulting in no dropout.
    if drop_rate == 0:
        drop_layer = None
    elif drop_type in ('dropout', 'normal'):
        drop_layer = dropout
    elif drop_type in ('alpha', 'alpha_dropout'):
        drop_layer = alpha_dropout
    else:
        raise RuntimeError("Unrecognised drop_type '{}'".format(drop_type))

    model = Sequential()
    if batch_norm:
        model.add_train(batch_normalization())
    for units in hidden_units:
        model.add(dense(units=units, activation=activation,
                        kernel_initializer=kernel_initializer))
        if batch_norm:
            model.add_train(batch_normalization())
        if drop_layer is not None:
            model.add_train(drop_layer(rate=drop_rate))
    model.add(dense(units=logits))
    model.set_kwargs(model_dir=model_dir,
                     config=tfe.RunConfig(log_step_count_steps=log_steps))

    return model.compile(gradient_clip=gradient_clip)


def create_dnn_pro_from_model_dir(model_dir, log_steps=100, gradient_clip=None):
    """Creates a DNN and processor from their model_dir.

    The final component of :model_dir: (with any single trailing path
    separator removed) is parsed by dnn_hyperparameters_from_dir to recover
    the DNN hyperparameters, the preprocessor class and an optional uuid. The
    preprocessor is then restored via its load_init method, and the DNN is
    built with create_dnn.

    :model_dir:, :log_steps: and :gradient_clip: are forwarded to create_dnn
    as defaults; any of them that is also present in the parsed
    hyperparameters is taken from the parsed hyperparameters instead.

    Returns a 3-tuple (dnn, preprocessor, uuid).

    Any exception raised by dnn_hyperparameters_from_dir (e.g. RuntimeError
    for an unparseable directory name) or by load_init is propagated.
    """
    # endswith (rather than indexing [-1]) so an empty string doesn't raise
    # an IndexError here.
    if model_dir.endswith(('/', '\\')):
        model_dir = model_dir[:-1]
    model_dir_split = tools.split(['/', '\\'], model_dir)
    dir_name = model_dir_split[-1]
    dnn_details, preprocessor_class, uuid = dnn_hyperparameters_from_dir(dir_name)
    preprocessor = preprocessor_class.load_init(model_dir)

    # Previously these arguments were accepted but silently dropped. They are
    # supplied as defaults so that anything already encoded in dnn_details
    # keeps priority (and no duplicate-keyword TypeError can occur).
    dnn_kwargs = dict(model_dir=model_dir,
                      log_steps=log_steps,
                      gradient_clip=gradient_clip)
    dnn_kwargs.update(dnn_details)
    dnn = create_dnn(**dnn_kwargs)
    return dnn, preprocessor, uuid


def create_dnns_pros_from_dir(dir_, log_steps=100, gradient_clip=None, 
                              exclude_start=('.',), exclude_end=(),
                              exclude_in=()):
    """Loads every DNN/processor pair stored in the immediate subdirectories
    of :dir_:.

    Subdirectories are visited in sorted order. A subdirectory is skipped
    (with a warning) if its name starts with any entry of :exclude_start:,
    ends with any entry of :exclude_end:, or contains any entry of
    :exclude_in:. Subdirectories that fail to load with a FileNotFoundError or
    RuntimeError are also skipped with a warning.

    :log_steps: and :gradient_clip: are passed on to
    create_dnn_pro_from_model_dir.

    Returns a list of (dnn, processor) tuples.
    """

    _, child_dirs, _ = next(os.walk(dir_))
    child_dirs = sorted(child_dirs)
    root = dir_[:-1] if dir_[-1] in ('/', '\\') else dir_

    # Each rule is (predicate on the subdirectory name, warning template).
    # Rules are tried in order and the first match wins.
    exclusion_rules = (
        (lambda name: any(name.startswith(ex) for ex in exclude_start),
         "Excluding '{}' based on start."),
        (lambda name: any(name.endswith(ex) for ex in exclude_end),
         "Excluding '{}' based on end."),
        (lambda name: any(ex in name for ex in exclude_in),
         "Excluding '{}' based on containment."),
    )

    loaded = []
    for child in child_dirs:
        exclusion_msg = next((template for is_excluded, template
                              in exclusion_rules if is_excluded(child)), None)
        if exclusion_msg is not None:
            tflog.warn(exclusion_msg.format(child))
            continue

        child_path = root + '/' + child
        try:
            dnn, processor, uuid = create_dnn_pro_from_model_dir(
                child_path, log_steps=log_steps, gradient_clip=gradient_clip)
        except (FileNotFoundError, RuntimeError) as e:
            tflog.warn("Could not load DNN from '{}'. Error message: '{}'"
                       .format(child, e))
        else:
            loaded.append((dnn, processor))

    return loaded

In [9]:
### Simpler interpolation methods
# Useful to give a baseline to compare the neural network models against.

class _RegressorBase:
    """Base class for performing predictions based on just the input. Subclasses
    are expected to provide a predict_single classmethod specifying their
    predictions.
    
    Its predict and evaluate methods are designed to resemble that of
    tf.estimator.Estimator's, so that we can call them in the same way. (We don't
    actually inherit from tf.estimator.Estimator because none of what these 
    classes do uses TensorFlow, so messing around with model functions and
    EstimatorSpecs is just unnecessary faff and overhead.)
    """
    
    # Flag to tell test_regressor (and thus BatchData) that this regressor
    # doesn't use TensorFlow.
    use_tf = False
    
    @staticmethod
    def _index_tol(cg, point, tol=0.001):
        """Searches through a list of 2-tuples, :cg:, to find the first element 
        which is within tolerance :tol: of :point:. Essentially the index method
        for lists, except this one makes sense for high precision floating point
        numbers.
        """
        
        t, x = point
        for i, element in enumerate(cg):
            t2, x2 = element
            if max(np.abs(t - t2), np.abs(x - x2)) < tol:
                return i
        raise ValueError('{} is not in {}'.format(point, type(cg)))
        
    def _prepare(self, Xi):
        """Performs any necessary preparations on the data :Xi: before making 
        predictions.
        """
        pass
    
    def _interp(self, Xi, point):
        """Helper function for performing interpolation on a coarse
        grid :Xi:, giving the value of the interpolation at :point:.
        
        The spacing of the grid is known from the global hyperparameters
        defining the coarse grid size, whilst it isn't necessary to know its
        location.
        
        The argument :point: should be scaled to the grid size, i.e.
        coarse_grid_sep.
        """
        raise NotImplementedError
    
    def predict_single(self, Xi, y):
        """Makes a prediction corresponding to input feature :Xi:.
        
        It is given the true result :y:. Not to cheat and return perfect
        results, but to determine its shape etc.
        """
        raise NotImplementedError
    
    def predict(self, input_fn, yield_single_examples=False):
        """The argument :input_fn: should probably be an instance of BatchData.
        
        The argument :yield_single_examples: is there for compatibility with the
        interface for the usual TF Estimators and is ignored.
        """
        
        returnval = []
        X, y = input_fn()
        
        for Xi in X['X']:
            returnval.append(self.predict_single(Xi, y))
            
        returnval = np.array(returnval)
        while True:
            yield returnval
            
    def evaluate(self, input_fn, steps=None):
        """The argument :input_fn: should probably be an instance of BatchData.
        
        The number of steps is given by :steps:, with None corresponding to
        infinity. The evaluation will terminate earlier if input_fn raises a
        StopIteration or a tf.errors.OutOfRangeError."""
        
        if steps is None:
            steps = np.inf
        
        losses = []
        for step in tools.rangeinf(0, steps):
            try:
                X, y = input_fn()
            except (StopIteration, tfer.OutOfRangeError):
                break
            predictor = self.predict(lambda: (X, y))
            prediction = next(predictor)
            losses.append(np.mean(np.square(y - prediction)))
            
        return {'loss': np.mean(losses), 'global_step': None}


class BilinearInterpBase(_RegressorBase):
    """Base class providing bilinear interpolation on the coarse grid."""

    def _interp(self, Xi, point):
        """Bilinearly interpolates the coarse grid data :Xi: at :point:.

        The four coarse grid points surrounding :point: are located, the
        values of :Xi: there are looked up, and they are blended according to
        the fractional position of :point: within its grid square.
        """
        # Only the layout of the grid matters, not where it actually sits, so
        # WLOG treat it as anchored at (0, 0).
        origin_grid = coarse_grid((0, 0))
        t, x = point
        sep_t = coarse_grid_sep.t
        sep_x = coarse_grid_sep.x

        # Coordinates of the grid lines on either side of :point:.
        t_lo = tools.round_mult(t, sep_t, 'down')
        t_hi = tools.round_mult(t, sep_t, 'up')
        x_lo = tools.round_mult(x, sep_x, 'down')
        x_hi = tools.round_mult(x, sep_x, 'up')

        def value_at(t_corner, x_corner):
            # Value of :Xi: at the given corner of the enclosing grid square.
            return Xi[self._index_tol(origin_grid, (t_corner, x_corner))]

        v_lo_lo = value_at(t_lo, x_lo)
        v_hi_lo = value_at(t_hi, x_lo)
        v_lo_hi = value_at(t_lo, x_hi)
        v_hi_hi = value_at(t_hi, x_hi)

        # Fractional position of (t, x) within its grid square, measured from
        # the square's bottom-left corner.
        frac_t = (t % sep_t) / sep_t
        frac_x = (x % sep_x) / sep_x

        # Standard bilinear blend of the four corner values.
        return ((1 - frac_t) * (1 - frac_x) * v_lo_lo
                + frac_t * (1 - frac_x) * v_hi_lo
                + (1 - frac_t) * frac_x * v_lo_hi
                + frac_t * frac_x * v_hi_hi)
    
    
class PolyInterpBase(_RegressorBase):
    """Base class for performing polynomial interpolation.

    A bivariate polynomial in (t, x) of total degree at most :poly_deg: is
    least-squares fitted to the coarse grid data in _prepare, and evaluated by
    poly / _interp.
    """
    
    def __init__(self, poly_deg, *args, **kwargs):
        """Stores the polynomial degree :poly_deg:. Remaining arguments are
        passed up the inheritance chain."""
        self.poly_deg = poly_deg
        # Fitted coefficients; None until _prepare has been run.
        self._poly_coefs = None
        super(PolyInterpBase, self).__init__(*args, **kwargs)
        
    def poly(self, point):
        """Interprets its currently stored polynomial coefficients as a 
        polynomial, and evaluates them at the specified point.

        :point: is a (t, x) pair. The coefficients are consumed in the order
        constant term, then for each total degree 1..poly_deg the terms
        t^(d-k) * x^k for k = 0..d. This is the order in which sklearn's
        PolynomialFeatures emits features for the two inputs (t, x).

        Raises RuntimeError if _prepare has not been run yet, or if there are
        more stored coefficients than a polynomial of degree poly_deg has
        terms.
        """
        
        if self._poly_coefs is None:
            raise RuntimeError('Must run _prepare first!')
        
        t, x = point
        coefs = iter(self._poly_coefs)

        result = next(coefs)  # Intercept, i.e. constant term
        for power in range(1, self.poly_deg + 1):
            for x_power in range(0, power + 1):
                t_power = power - x_power
                coef = next(coefs)
                result += coef * (t ** t_power) * (x ** x_power)
        # Sanity check: every coefficient should now have been used.
        try:
            next_coef = next(coefs)
        except StopIteration:
            return result
        else:
            # Report the stored coefficients. (This used to reference an
            # undefined name, which raised NameError instead of this error.)
            raise RuntimeError('coef_: {coef_}, poly_deg: {poly_deg}, '
                               'coef that shouldn\'t exist: {next_coef}'
                               .format(coef_=self._poly_coefs, 
                                       poly_deg=self.poly_deg, 
                                       next_coef=next_coef))
    
    def _prepare(self, Xi):
        """Fits the polynomial to the coarse grid data :Xi: and stores the
        resulting coefficients for later use by poly."""
        # The bias column from PolynomialFeatures acts as the intercept, so
        # the linear regression itself must not fit another one.
        poly_features = skpr.PolynomialFeatures(degree=self.poly_deg, 
                                                include_bias=True)
        lin_reg = sklm.LinearRegression(fit_intercept=False)
        poly_pipe = skpi.Pipeline([('pf', poly_features), ('lr', lin_reg)])
        
        # The actual t, x values for the grid don't matter from this point 
        # onwards; so this is just a translation from wherever X was actually 
        # calculated. So WLOG assume it was around 0.
        cg = coarse_grid((0, 0))
        poly_pipe.fit(cg, Xi)
        self._poly_coefs = poly_pipe.named_steps['lr'].coef_
        
    
    def _interp(self, Xi, point):
        """Evaluates the fitted polynomial at :point:. :Xi: is unused here;
        the fit to it must already have been done by _prepare."""
        return self.poly(point)
    
    
class NearestInterpBase(_RegressorBase):
    """Base class providing nearest-neighbour interpolation on the coarse
    grid."""

    def _interp(self, Xi, point):
        """Returns the value of the coarse grid data :Xi: at the coarse grid
        point obtained by rounding each coordinate of :point: to a multiple of
        the corresponding coarse grid separation."""
        # Only the layout of the grid matters, not where it actually sits, so
        # WLOG treat it as anchored at (0, 0).
        origin_grid = coarse_grid((0, 0))
        t, x = point

        # Snap each coordinate onto the grid.
        snapped = (tools.round_mult(t, coarse_grid_sep.t, 'round'),
                   tools.round_mult(x, coarse_grid_sep.x, 'round'))

        # Look up the data at the snapped location.
        return Xi[self._index_tol(origin_grid, snapped)]
    
    
class FineGridPredictorMixin(_RegressorBase):
    """Provides the predict_single function for predictions on a fine grid."""
    
    def predict_single(self, Xi, y):
        """Returns a list holding the interpolated value of :Xi: at every
        point of the fine grid, in the order given by fine_grid.

        :y: is accepted for interface compatibility and is not used.
        """
        # Preparation depends only on Xi, so do it once up front rather than
        # once per fine grid point.
        self._prepare(Xi)
        # Translation doesn't matter at this point so WLOG the fine grid is
        # around 0, 0. (self._interp makes the same assumption; these
        # assumptions must be consistent)
        return [self._interp(Xi, point) for point in fine_grid((0, 0))]
    
    
class PointPredictorMixin(_RegressorBase):
    """Provides the predict_single function for predictions at a single point."""
    
    def predict_single(self, Xi, y=None):
        """Returns a one-element list holding the interpolated value at the
        location encoded in :Xi:.

        The last two entries of :Xi: are taken to be the t and x offsets of
        the point, in units of the coarse grid separation; everything before
        them is the coarse grid data.

        :y: is accepted (and ignored) so that this method can be called as
        predict_single(Xi, y), which is how _RegressorBase.predict calls it.
        It is optional so that existing single-argument calls keep working.
        """
        # Separate the location data and the grid data
        t_offset = Xi[-2]
        x_offset = Xi[-1]
        Xi = Xi[:-2]
        self._prepare(Xi)
        # Wrapped in a list for consistency: this network just happens to only
        # be trying to predict a single label.
        return [self._interp(Xi, (t_offset * coarse_grid_sep.t, 
                                  x_offset * coarse_grid_sep.x))]
    
    
class Perfect(_RegressorBase):
    """Cheating regressor: its prediction is always the true result it was
    handed."""

    def predict_single(self, Xi, y):
        """Ignores the input feature :Xi: and hands back :y: unchanged."""
        true_result = y
        return true_result

In [None]:
### Using regressors

def _use_regressor(regressor, processor):
    """Runs :regressor: on the data held by :processor: and summarises the
    outcome.

    :regressor: should probably be an instance of tf.estimator.Estimator or
    of _RegressorBase. If it has a use_tf attribute, that is passed to the
    processor when building the input function; otherwise True is used.

    :processor: should be a processor with data, i.e. one whose data function
    has already been called on some BatchData.

    Returns a tools.Object with fields:
      prediction    the first prediction, after the processor's
                    inverse_transform has been applied;
      X, y          the raw X and y of the processor's batch data;
      diff          prediction minus y;
      average_loss  the mean of the squared differences;
      loss          the sum of the squared differences.
    """

    use_tf = getattr(regressor, 'use_tf', True)

    prediction_stream = regressor.predict(input_fn=processor(use_tf),
                                          yield_single_examples=False)
    # Only the first batch of predictions is of interest; map it back to the
    # original (unprocessed) scale.
    postprocessed = processor.inverse_transform(next(prediction_stream))

    inputs = processor.batch_data.X
    targets = processor.batch_data.y
    residual = postprocessed - targets
    residual_sq = np.square(residual)

    return tools.Object(prediction=postprocessed,
                        X=inputs,
                        y=targets,
                        diff=residual,
                        average_loss=np.mean(residual_sq),
                        loss=np.sum(residual_sq))

In [None]:
### Using regressors in ensembles

class RegressorAverager:
    """Regressor whose prediction is the average of the predictions of other
    regressors.

    It is built from a sequence of (regressor, processor) pairs. A mask of
    the same length selects which of those pairs take part in the average.
    """

    use_tf = False

    def __init__(self, regressors_with_pro, mask=None, **kwargs):
        """Stores the (regressor, processor) pairs and the initial mask. A
        :mask: of None selects every regressor."""
        self.regressors_with_pro = regressors_with_pro
        self.mask = None
        if mask is not None:
            self.set_mask(mask)
        else:
            self.reset_mask()
        super(RegressorAverager, self).__init__(**kwargs)

    def set_mask(self, mask):
        """Replaces the mask; it must have one entry per regressor. Returns
        self so that calls may be chained."""
        assert len(mask) == len(self.regressors_with_pro)
        self.mask = mask
        return self  # for chaining

    def reset_mask(self):
        """Selects every regressor. Returns self so that calls may be
        chained."""
        self.mask = [True] * len(self.regressors_with_pro)
        return self  # for chaining

    def predict(self, input_fn, *args, **kwargs):
        """Generator yielding (forever) the average of the selected
        regressors' predictions on the batch produced by one call to
        :input_fn:, which should probably be an instance of BatchData.

        Additional arguments are accepted for interface compatibility and are
        ignored.
        """

        features, labels = input_fn()
        test_data = BatchData.from_single_data(features['X'], labels)

        total = tools.AddBase()
        num_selected = 0
        pairs_with_flags = zip(self.regressors_with_pro, self.mask)
        for (regressor, processor), selected in pairs_with_flags:
            if not selected:
                continue
            num_selected += 1
            if processor is None:
                # No preprocessing requested for this regressor.
                processor = IdentityProcess()
            processor.data(test_data)
            total += _use_regressor(regressor, processor).prediction
        average = total / num_selected

        while True:
            yield average

In [10]:
### Testing and evaluating regressors

def test_regressors(regressors_with_pro, gen_raw_data, batch_size=1):
    """Evaluates each of a list of (regressor, processor) pairs on freshly
    generated test data.

    The test data is built by BatchData.test from :gen_raw_data: and
    :batch_size:. A processor of None means the regressor's inputs receive no
    preprocessing. Returns whatever test_regressors_on_batch_data returns for
    that data.
    """

    return test_regressors_on_batch_data(
        regressors_with_pro, BatchData.test(gen_raw_data, batch_size))


def test_regressors_on_batch_data(regressors_with_pro, test_data):
    """Like test_regressors, except that the test data is supplied directly
    as the BatchData instance :test_data:.

    Returns a list with one result (as produced by _use_regressor) per
    (regressor, processor) pair, in the same order.
    """

    def _run_one(regressor, processor):
        # A missing processor means "no preprocessing".
        active = IdentityProcess() if processor is None else processor
        active.data(test_data)
        return _use_regressor(regressor, active)

    return [_run_one(regressor, processor)
            for regressor, processor in regressors_with_pro]

In [11]:
### Visualising the results of regressors

# Only plots fine grid style stuff at the moment
def plot_regressors(regressors_with_info, X, y):
    """Runs some regressors on the data :X:, :y: and plots their predictions.

    Each entry of :regressors_with_info: is indexed as (regressor, processor,
    name). One 3D axis is drawn per regressor, stacked vertically, showing
    :X: as a coarse grid and the regressor's prediction as a fine grid
    labelled with its name.

    Returns the list of results from test_regressors_on_batch_data.
    """

    num_plots = len(regressors_with_info)
    fig = plt.figure(figsize=(8, 8 * num_plots))

    test_data = BatchData.from_single_data(X, y)

    # Split each entry into the part needed for testing and its label.
    regressors_with_pro = [entry[:2] for entry in regressors_with_info]
    names = [entry[2] for entry in regressors_with_info]

    results = test_regressors_on_batch_data(regressors_with_pro, test_data)

    for row, (result, name) in enumerate(zip(results, names), start=1):
        ax = make_3d_ax_for_grid_plotting(fig, (num_plots, 1, row))
        grid_plot(ax, X, 'cg', '_nolegend_')
        grid_plot(ax, result.prediction, 'fg', name)
        ax.legend()

    return results