# Example - using Mixture of Experts layer to define Keras model

This example illustrates a practical application how multiple experts can be combined together to build an effective mixture model. 

In this example, a gating model (defined a single-layer neural network with noisy version of softmax activation), would channel each example to exactly 2 experts, and combine their results with corresponding weight.

In [1]:
import numpy as np
import keras
import tensorflow as tf
import time
import tensor2tensor
from tensor2tensor.utils.expert_utils import *
from customutils.CustomLayers import MixtureOfExpertsLayer
import keras
import keras.backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Generate synthetic data for practice
X = np.random.normal(scale = 1, size = 10**6).reshape(-1,20) #50K observations and 20 variables
y = X[:,0] + X[:,1] + X[:,0]* X[:,1]
X.shape, y.shape
print(np.var(y))


3.0329414499386727


In [3]:
# define model architecture that would be considered as single expert
class TimeLimit(keras.callbacks.Callback):
    def __init__(self, time_limit_seconds, verbose = 1):
        self.time_limit_seconds = time_limit_seconds
        self.verbose = verbose
        
    def on_train_begin(self,logs={}):
        self.time_start = time.time()
        
    def on_epoch_end(self,epoch,logs):
        if time.time() >= self.time_start + self.time_limit_seconds:
            self.model.stop_training = True
            if self.verbose > 0: print('Training terminated after epoch ', epoch+1)

        
callbacks_list = []

def build_model(input_dim = 20, hidden_layer_sizes = [4,1], lr_init = 0.1, lr_decay = 0.001, activation = 'tanh', output_dim = 1, random_state = 7, early_stopping = False, max_fit_time = 60, compile = False):
    tf.set_random_seed(random_state)
    np.random.seed(random_state)
    inputs  = keras.layers.Input((input_dim,), dtype='float32')
    for idx, hidden_layer_size in enumerate(hidden_layer_sizes):
        if idx == 0: hidden = keras.layers.Dense(hidden_layer_size, activation = activation)(inputs)
        else:        hidden = keras.layers.Dense(hidden_layer_size, activation = activation)(hidden)
    outputs = keras.layers.Dense(output_dim)(hidden) # no activation for the last layer
    model   = keras.Model(inputs, outputs)
    #model.summary()
    if compile ==True:
        model.compile(loss = 'mean_squared_error', optimizer = keras.optimizers.Adam(lr=lr_init, decay = lr_decay))

    # add callbacks to be utilized by the fit function - could check if 'callbacks_list' not in globals() 
    global callbacks_list 
    if 'callbacks_list' in globals(): callbacks_list.clear()
    
    if early_stopping == True:
        callbacks_list.append(keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=2, verbose=1, mode='auto'))
    if max_fit_time: 
        callbacks_list.append(TimeLimit(max_fit_time))
    
    return model  

In [4]:
def build_MoE_as_model(n_experts, input_dim, output_dim, model_function, use_top_K = None, loss = 'mse', optimizer = 'rmsprop', sparse_inputs = False, random_state = 7):
    # initialize models    
    models = []
    for i in range(n_experts):
        with tf.name_scope('Expert_' + str(1+i)):
            model = model_function(random_state = 13 + i * random_state) # creates a separate expert
            models.append(model)
    
    #inputs = keras.layers.Input((input_dim,), dtype='float32')
    inputs = keras.layers.Input((input_dim,), dtype='float', sparse = sparse_inputs)
    outputs= MixtureOfExpertsLayer(models, use_top_K = use_top_K, input_dim = input_dim)(inputs)
    model  = keras.Model(inputs, outputs)
    model.compile(loss = loss, optimizer = optimizer)
    
    #could also consider returning KerasRegressor wrapper
    return model

In [5]:
K.clear_session()
keras_model = build_MoE_as_model(
        10,                                                                   # number of experts to train
        20,                                                                   # input dimensions
        1,                                                                    # output dimensions 
        lambda **kwargs: build_model(hidden_layer_sizes = [20,10], **kwargs), # individual experts
        use_top_K = 2,                                                        # number of experts to use for each record 
        optimizer = keras.optimizers.Adam(lr = 0.01),                         # optimizer
        sparse_inputs = False                                                 # whether each model would use sparse inputs  
)
keras_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 20)                0         
_________________________________________________________________
mixture_of_experts_layer_1 ( ((None,), 1)              6410      
Total params: 6,410
Trainable params: 6,410
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Fit model to the synthetic data generated above
keras_model.fit(X,y, batch_size = 2**10, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1e4c2234240>

In [7]:
# Report model fit metrics
import sklearn
sklearn.metrics.r2_score(keras_model.predict(X, batch_size = 10**20), y)

0.9987138291250041

This model is attaining about 99.9% $R^2$ after less than 100 epochs (about 1 second per epoch on GPU)