In [1]:
import time
import collections
import multiprocessing
import inspect
import random
import numpy as np
import keras
import keras.backend as K

Using TensorFlow backend.


In [2]:
class Env:
    """Layer-wiring environment scored by briefly training the wired network.

    An episode consists of up to ``3*N`` actions, consumed three at a time.
    Each triple describes one planned layer:

    1. index of its first input layer (``previous_layer_a``),
    2. index of its second input layer (``previous_layer_b``),
    3. index into the list of merge methods (``merge_method``).

    For the two input slots, an action equal to the number of instructions
    recorded so far means "no input"; a smaller action selects that layer,
    which must already be planned (layer 0 is always available).  Anything
    else ends the episode with a reward of -100.  Once ``3*N`` actions have
    been collected the network is built and trained in a worker process and
    its score becomes the terminal reward.

    Observation layout (length ``output_size``):

    * ``[0:input_size]``   one-hot of the action just taken
    * ``[input_size]``     reward returned together with this observation
    * ``[input_size+1]``   1. when the episode ended, else 0.
    * ``[input_size+2]``   position inside the current triple (0, 1 or 2)
    * ``[input_size+3]``   number of instructions recorded before this step
    """

    # Number of discrete actions accepted by ``step``.
    input_size = 8
    # Length of the observation vector returned by ``reset`` and ``step``.
    output_size = input_size + 4

    @staticmethod
    def create_weights(N):
        """Create the initial weight store shared by all evaluated networks.

        Returns a dict of ``{'kernel': ndarray, 'bias': ndarray}`` entries:

        * ``'top'``: the final dense layer, kernel shape ``(64, 10)``;
        * ``(l, 'pre')``: a 1x1 convolution opening stage ``l`` (5 stages,
          ``2**(2+l)`` output channels);
        * ``(l, i, j)`` for ``0 <= i < j <= N``: the 3x3 convolution applied
          on the edge from layer ``i`` to layer ``j`` inside stage ``l``.
        """
        weights = {}
        weights['top'] = {
            'kernel': K.eval(keras.initializers.he_normal()((64,10))),
            'bias': K.eval(keras.initializers.zeros()((10,)))}
        l_size_last = 3
        for l in range(5):
            l_size = 2**(2+l)
            weights[(l,'pre')] = {
                'kernel': K.eval(keras.initializers.he_normal()(
                    (1,1,l_size_last,l_size))),
                'bias': K.eval(keras.initializers.zeros()((l_size,)))}
            for i in range(N+1):
                for j in range(i+1,N+1):
                    weights[(l,i,j)] = {
                        'kernel': K.eval(keras.initializers.he_normal()(
                            (3,3,l_size,l_size))),
                        'bias': K.eval(keras.initializers.zeros()((l_size,)))}
            l_size_last = l_size
        return weights

    @staticmethod
    def run(instructions, weights):
        """Build the network described by ``instructions``, train it for one
        epoch on a random CIFAR-10 subset and return ``(score, weights)``.

        ``score`` is ``val_acc**2 * 100`` (0. when the accuracy is NaN).
        ``weights`` is the input dict with every kernel/bias that took part in
        this network replaced by its trained value, unless that value
        contains NaN.

        Raises whatever Keras raises while building or fitting the model, and
        ``Exception('unknown merge method')`` / ``Exception('no output')`` for
        malformed instructions.
        """
        (X_train,Y_train),(X_test,Y_test) = keras.datasets.cifar10.load_data()
        X_train, X_test = X_train/255-0.5, X_test/255-0.5
        # Sampling is with replacement (randint), 4000 train / 1000 test.
        X_train_idx = np.random.randint(0, high=X_train.shape[0], size=(4000,))
        X_train,Y_train = X_train[X_train_idx], Y_train[X_train_idx]
        X_test_idx = np.random.randint(0, high=X_test.shape[0], size=(1000,))
        X_test,Y_test = X_test[X_test_idx], Y_test[X_test_idx]

        # Maps a weight-store key to the Keras layer initialised from it, so
        # the trained values can be copied back after fitting.
        weights_update = {}

        def conv_for(key):
            """Create the Conv2D layer seeded from ``weights[key]``."""
            w = weights[key]
            layer = keras.layers.Conv2D(
                w['kernel'].shape[-1],
                w['kernel'].shape[:2],
                padding='same',
                kernel_initializer=keras.initializers.constant(value=w['kernel']),
                bias_initializer=keras.initializers.constant(value=w['bias']))
            weights_update[key] = layer
            return layer

        X = X_input = keras.layers.Input(X_train.shape[1:])
        for l in range(5):
            X = conv_for((l,'pre'))(X)
            layers = {0:X}
            # Layers that feed at least one other layer in this stage.
            connected = set()
            for i,ins in enumerate(instructions):
                previous_layer_a_idx = ins['previous_layer_a']
                previous_layer_b_idx = ins['previous_layer_b']
                if previous_layer_a_idx is None and \
                    previous_layer_b_idx is None:
                    # Instruction plans no layer at all.
                    continue
                previous_layer_a = previous_layer_b = None
                if previous_layer_a_idx is not None:
                    connected.add(previous_layer_a_idx)
                    previous_layer_a = conv_for(
                        (l,previous_layer_a_idx,i+1))(layers[previous_layer_a_idx])
                if previous_layer_b_idx is not None:
                    if previous_layer_b_idx == previous_layer_a_idx:
                        # Both inputs are the same edge: reuse the one conv.
                        previous_layer_b = previous_layer_a
                    else:
                        connected.add(previous_layer_b_idx)
                        previous_layer_b = conv_for(
                            (l,previous_layer_b_idx,i+1))(layers[previous_layer_b_idx])
                if previous_layer_a_idx is not None and \
                    previous_layer_b_idx is not None:
                    previous_layer_ab = [previous_layer_a,previous_layer_b]
                    merge_method = ins['merge_method']
                    if merge_method == 'add':
                        X = keras.layers.Add()(previous_layer_ab)
                    elif merge_method == 'sub':
                        X = keras.layers.Subtract()(previous_layer_ab)
                    elif merge_method == 'mul':
                        X = keras.layers.Multiply()(previous_layer_ab)
                    elif merge_method == 'mul_sigmoid(a)':
                        X = keras.layers.Multiply()([
                            keras.layers.Lambda(lambda x: K.sigmoid(x))(
                                keras.layers.BatchNormalization()(previous_layer_a)),
                            previous_layer_b])
                    elif merge_method == 'mul_1-sigmoid(a)':
                        X = keras.layers.Multiply()([
                            keras.layers.Lambda(lambda x: 1.-K.sigmoid(x))(
                                keras.layers.BatchNormalization()(previous_layer_a)),
                            previous_layer_b])
                    elif merge_method == 'avg':
                        X = keras.layers.Average()(previous_layer_ab)
                    else:
                        raise Exception('unknown merge method')
                elif previous_layer_a_idx is not None:
                    X = previous_layer_a
                else:
                    X = previous_layer_b
                X = keras.layers.BatchNormalization()(X)
                X = keras.layers.Activation('relu')(X)
                layers[i+1] = X
            # Every layer nobody consumes becomes part of the stage output.
            not_connected = set(layers.keys()) - connected
            not_connected = [layers[nc] for nc in not_connected]
            if not not_connected:
                raise Exception('no output')
            elif len(not_connected) == 1:
                X = not_connected[0]
            else:
                X = keras.layers.Average()(not_connected)
            X = keras.layers.MaxPooling2D()(X)
        X = keras.layers.GlobalAveragePooling2D()(X)
        w = weights['top']
        o = keras.layers.Dense(
            w['kernel'].shape[-1],
            kernel_initializer=keras.initializers.constant(value=w['kernel']),
            bias_initializer=keras.initializers.constant(value=w['bias']),
            activation='softmax')
        weights_update['top'] = o
        X = o(X)
        M = keras.Model(X_input, X)
        M_optimizer = keras.optimizers.SGD(momentum=0.9, nesterov=True)
        M.compile(M_optimizer, 'sparse_categorical_crossentropy', ['acc'])
        hist = M.fit(
            X_train, Y_train,
            validation_data=(X_test,Y_test),
            batch_size=64, epochs=1)
        score = hist.history['val_acc'][-1]
        # ``score == score`` is False only for NaN.
        score = score**2 * 100 if score == score else 0.
        for k,v in weights_update.items():
            w = v.get_weights()
            if not np.any(np.isnan(w[0])):
                weights[k]['kernel'] = w[0]
            if not np.any(np.isnan(w[1])):
                weights[k]['bias'] = w[1]
        return score, weights

    def __init__(self, N=6):
        """Create the environment for networks of up to ``N`` planned layers.

        The initial weights are created in a short-lived worker process (as
        is every evaluation), so the backend is only ever used in workers.
        """
        self.N = N
        p = multiprocessing.Pool(processes=1)
        try:
            self.weights = p.apply(__class__.create_weights, [self.N])
        finally:
            p.terminate()
            p.join()
        self.reset()

    def run_in_process(self, instructions):
        """Evaluate ``instructions`` via ``run`` in a fresh worker process.

        Returns the score.  On success ``self.weights`` is replaced by the
        updated weights; if the evaluation raises, the score is 0. and the
        weights are left unchanged.  ``KeyboardInterrupt`` / ``SystemExit``
        are not treated as a failed evaluation and propagate to the caller
        (after the worker has been torn down).
        """
        print(instructions)
        p = multiprocessing.Pool(processes=1)
        try:
            score, weights = p.apply(
                __class__.run,
                [instructions, self.weights])
        except Exception as error:
            # Best effort: an unbuildable/untrainable network simply scores 0,
            # but say why so the failure is not invisible.
            print('evaluation failed, scoring 0:', repr(error))
            score, weights = 0., self.weights
        finally:
            p.terminate()
            p.join()
        self.weights = weights
        return score

    def reset(self):
        """Start a new episode and return the all-zero initial observation."""
        self.actions = []
        self.instructions = []
        # Layer indices that may be used as an input; 0 is the stage input.
        self.planned_layers = {0}
        return np.zeros((__class__.output_size,))

    def _plan_input(self, key, action):
        """Record ``action`` as input slot ``key`` of the newest instruction.

        Raises ``ValueError`` when the action selects a layer that is not
        planned or lies beyond the "no input" index.  The slot is written
        before validation, matching what the observation history has seen.
        """
        if action < len(self.instructions):
            self.instructions[-1][key] = action
            if action not in self.planned_layers:
                raise ValueError('input layer is not planned')
            self.planned_layers.add(len(self.instructions))
        elif action == len(self.instructions):
            self.instructions[-1][key] = None
        else:
            raise ValueError('input index out of range')

    def step(self, action):
        """Consume one action.

        Returns ``(observation, reward, done, info)``:

        * invalid action: reward -100., ``done`` True;
        * valid action that completes the ``3*N``-th step: the network is
          evaluated, reward is its score (-100. if the score is not
          positive), ``done`` True;
        * any other valid action: reward 1., ``done`` False.
        """
        result = np.zeros((__class__.output_size),)
        result[action] = 1
        result[__class__.input_size+2] = len(self.actions)%3
        result[__class__.input_size+3] = len(self.instructions)
        if len(self.actions) < 3*self.N:
            phase = len(self.actions) % 3
            try:
                if phase == 0:
                    self.instructions.append({})
                    self._plan_input('previous_layer_a', action)
                elif phase == 1:
                    self._plan_input('previous_layer_b', action)
                else:
                    # An out-of-range action raises IndexError -> penalty.
                    self.instructions[-1]['merge_method'] = \
                        ['add','mul_sigmoid(a)','mul_1-sigmoid(a)'][action]
                        #['add','sub','mul','avg', \
                        #'mul_sigmoid(a)','mul_1-sigmoid(a)'][action]
            except Exception:
                result[__class__.input_size] = -100.
                result[__class__.input_size+1] = 1.
                return result, -100., True, {}
            finally:
                # The action is logged even when it ended the episode.
                self.actions.append(action)
        if len(self.actions) >= 3*self.N:
            score = self.run_in_process(self.instructions)
            score = score if score > 0. else -100.
            result[__class__.input_size] = score
            result[__class__.input_size+1] = 1.
            return result, score, True, {}
        result[__class__.input_size] = 1.
        result[__class__.input_size+1] = 0.
        return result, 1., False, {}

In [3]:
# Build the environment once; its constructor creates the shared weights.
env = Env()
# Env.input_size is the number of actions, Env.output_size the observation
# length (see Env.step).
env_input_size, env_output_size = Env.input_size, Env.output_size
# History length handed to the controller: one full episode (3 actions per
# planned layer).
env_prev_states = 3 * env.N
env.reset()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [4]:
# One recorded transition: the state before the action, the state after it,
# the action taken and the reward attached to it.
Model_Sample = collections.namedtuple(
    typename='Model_Sample',
    field_names=('state', 'state_1', 'action', 'reward'))

In [5]:
class Controller:
    """Actor-critic controller with a curiosity (forward/inverse model) head.

    All sub-models consume a sequence of observation vectors of length
    ``input_n`` and share one BatchNormalization + bidirectional LSTM trunk:

    * policy head: softmax over ``output_n`` actions;
    * value head: a single scalar;
    * feature head (``cfeature``): 128-d tanh embedding of a state history;
    * inverse model (``creverse``): predicts the action from the embeddings
      of the history before and after it;
    * forward model (``cforward``): outputs the Euclidean distance between a
      transformed embedding of the "before" history and the embedding of the
      "after" history.  It is trained towards 0 and its output, scaled by
      ``curiosity``, is used as the intrinsic reward.
    """

    def __init__(self, input_n, output_n, beta=0.0001, curiosity=1.):
        """Build and compile all sub-models.

        Args:
            input_n: length of one observation vector.
            output_n: number of discrete actions.
            beta: weight of the entropy bonus in the actor-critic loss.
            curiosity: multiplier applied to the forward-model error in
                ``get_intrinsic_reward``.
        """

        self.__curiosity = curiosity

        self.__input_n = input_n
        self.__output_n = output_n
        # Layer instances are created once and reused across the models
        # below so that those models share weights.
        self.__l_shared = [
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(
                keras.layers.LSTM(32, return_sequences=False))]
        self.__l_policy = [
            keras.layers.Dense(32, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(self.__output_n),
            keras.layers.Softmax()]
        self.__l_value = [
            keras.layers.Dense(32, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(1)]
        self.__l_cfeature = [
            keras.layers.Dense(128, activation='tanh'),
            keras.layers.Dense(128, activation='tanh')]
        self.__l_cforward = [
            keras.layers.Dense(512, activation='tanh'),
            keras.layers.Dense(128, activation='tanh')]
        self.__l_creverse = [
            keras.layers.Dense(128, activation='tanh'),
            keras.layers.Dense(self.__output_n, activation='softmax')]
        def apply_layers(x, layers):
            """Feed ``x`` through ``layers`` in order and return the result."""
            last_layer = x
            for l in layers:
                last_layer = l(last_layer)
            return last_layer

        # Policy-only model, used for inference in get_action_prob.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, self.__l_shared)
        m = apply_layers(m, self.__l_policy)
        self.__m_policy = keras.models.Model([m_input], [m])
        self.__m_policy.compile('adam', 'mse')

        # Value-only model, used for inference in evalute_value.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, self.__l_shared)
        m = apply_layers(m, self.__l_value)
        self.__m_value = keras.models.Model([m_input], [m])
        self.__m_value.compile('adam', 'mse')

        # Joint value + policy model embedded in the training model.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m_shared = apply_layers(m, self.__l_shared)
        m_policy = apply_layers(m_shared, self.__l_policy)
        m_value = apply_layers(m_shared, self.__l_value)
        self.__m_value_policy = keras.models.Model([m_input], [m_value, m_policy])
        self.__m_value_policy.compile('adam', 'mse')

        # State-history embedding used by both curiosity models.
        m_input_s = keras.layers.Input((None, self.__input_n,))
        m_shared_s = apply_layers(m_input_s, self.__l_shared)
        m_shared_s = apply_layers(m_shared_s, self.__l_cfeature)
        self.__m_cfeature = keras.models.Model([m_input_s], [m_shared_s])
        self.__m_cfeature.compile('adam', 'mse')

        # Inverse model: (embedding(s), embedding(s')) -> action distribution.
        m_input_s = keras.layers.Input((None, self.__input_n,))
        m_input_s_1 = keras.layers.Input((None, self.__input_n,))
        m_shared_s = self.__m_cfeature(m_input_s)
        m_shared_s_1 = self.__m_cfeature(m_input_s_1)
        m = keras.layers.Concatenate()([m_shared_s, m_shared_s_1])
        m = apply_layers(m, self.__l_creverse)
        self.__m_creverse = keras.models.Model(
            [m_input_s, m_input_s_1], [m])
        self.__m_creverse.compile('adam', 'mse')

        # Forward model: distance between the transformed embedding of s and
        # the embedding of s'.  Note the action is not an input here.
        m_input_s = keras.layers.Input((None, self.__input_n,))
        m_input_s_1 = keras.layers.Input((None, self.__input_n,))
        m_shared_s = self.__m_cfeature(m_input_s)
        m_shared_s = apply_layers(m_shared_s, self.__l_cforward)
        m_shared_s_1 = self.__m_cfeature(m_input_s_1)
        m = keras.layers.Lambda(
            lambda x: K.sqrt(K.sum(K.square(x[0]-x[1]), axis=-1, keepdims=True)),
            output_shape=(1,))([m_shared_s, m_shared_s_1])
        self.__m_cforward = keras.models.Model(
            [m_input_s, m_input_s_1], [m])
        self.__m_cforward.compile('adam', 'mse')

        # Training model with three outputs:
        #   [value | policy] -> actor-critic loss,
        #   forward distance  -> 'mse' (targets are zeros, see train),
        #   inverse model     -> 'categorical_crossentropy' on the action.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m_value, m_policy = self.__m_value_policy(m)
        m = keras.layers.Concatenate()([m_value, m_policy])
        m_input_s_1 = keras.layers.Input((None, self.__input_n,))
        m_cforward = self.__m_cforward([m_input, m_input_s_1])
        m_creverse = self.__m_creverse([m_input, m_input_s_1])
        self.__m_optimizer = keras.optimizers.Nadam(clipnorm=5.)
        self.__m_train = keras.models.Model(
            [m_input, m_input_s_1], [m, m_cforward, m_creverse])
        self.__m_train.compile(self.__m_optimizer, [
            lambda y_true, y_pred: __class__.__loss(y_true, y_pred, beta),
            'mse', 'categorical_crossentropy'])

        self.__m_policy.summary()
        self.__m_value.summary()
        self.__m_train.summary()

    @staticmethod
    def __loss(y_true, y_pred, beta):
        """Actor-critic loss with an entropy bonus.

        Args:
            y_true: column 0 holds the return, the remaining columns the
                one-hot encoded action.
            y_pred: column 0 holds the predicted value, the remaining columns
                the predicted action probabilities.
            beta: weight of the entropy bonus.

        Returns:
            ``action_loss + value_loss - beta * entropy`` as a scalar tensor.
        """
        r, action_onehot = y_true[:,:1], y_true[:,1:]
        value, policy = y_pred[:,:1], y_pred[:,1:]
        advantage = r - value
        log_policy = K.log(policy + K.epsilon())
        log_choosen_action_prob = K.sum(action_onehot * log_policy, axis=-1, keepdims=True)
        # The advantage is a fixed weight for the policy gradient: without
        # stop_gradient the actor term would also back-propagate into the
        # value head and pull the value estimate away from the return.
        action_loss = -K.mean(log_choosen_action_prob * K.stop_gradient(advantage))
        value_loss = 0.5 * K.mean(K.square(advantage))
        entropy = K.mean(-K.sum(policy * log_policy, axis=-1, keepdims=True))
        # Emits the entropy on every evaluation of the loss (monitoring aid).
        entropy = K.print_tensor(entropy, message='policy entropy')
        return action_loss + value_loss - beta * entropy

    def train(self, samples, epochs=1, verbose=False):
        """Fit the training model on ``samples``.

        Each sample must expose ``state`` and ``state_1`` (state histories),
        ``action`` (int) and ``reward`` (the return used as value target).
        Returns ``None``.
        """
        self.__m_train.fit(
            x=[
                np.array([s.state for s in samples], dtype=np.float32),
                np.array([s.state_1 for s in samples], dtype=np.float32)],
            y=[
                # Actor-critic target: [return | one-hot action].
                np.hstack([
                    np.reshape(np.array([s.reward for s in samples], dtype=np.float32), (-1, 1)),
                    keras.utils.to_categorical([s.action for s in samples], num_classes=self.__output_n)]),
                # Forward-model distance is driven towards zero.
                np.zeros((len(samples),1)),
                # Inverse-model target: the action actually taken.
                keras.utils.to_categorical([s.action for s in samples], num_classes=self.__output_n)],
            batch_size=64,
            epochs=epochs,
            verbose=verbose)

    def evalute_value(self, state, verbose=False):
        """Return the value head's scalar prediction for one state history."""
        v = self.__m_value.predict(
            np.array([state], dtype=np.float32))[0,0]
        if verbose:
            print(v)
        return v

    def get_action_prob(self, state, verbose=False):
        """Return the action probabilities for one state history."""
        action_prob = self.__m_policy.predict(
            np.array([state], dtype=np.float32))[0]
        if verbose:
            print(action_prob)
        return action_prob

    def get_intrinsic_reward(self, state_0, state_1, action, verbose=False):
        """Return the forward-model distance for the transition
        ``state_0 -> state_1`` multiplied by the curiosity factor.

        ``action`` is accepted for interface symmetry but is not used.
        """
        r = self.__m_cforward.predict([
            np.array([state_0], dtype=np.float32),
            np.array([state_1], dtype=np.float32)])[0,0] * self.__curiosity
        if verbose:
            print(r)
        return r

In [6]:
class MultiprocessController:
    """Proxy that hosts a ``Controller`` in a separate (CPU-only) process.

    Every public method forwards its arguments over a pipe to the worker,
    which calls the ``Controller`` method of the same name and sends the
    result back.  Calls are synchronous.  If the worker process has died,
    a call raises ``EOFError`` instead of blocking forever.
    """

    # Commands the worker forwards to the Controller method of that name.
    # Any other command makes the worker loop exit.
    _COMMANDS = (
        'train', 'evalute_value', 'get_action_prob', 'get_intrinsic_reward')

    @staticmethod
    def run(pipe, init_args, init_kwargs):
        """Worker entry point: build the controller, then serve commands.

        Each received message is ``(name, args, kwargs)``.  The worker
        replies with the return value of ``controller.<name>(*args,
        **kwargs)`` and stops on the first message whose name is unknown.
        """
        import os
        # enforce to run on cpu
        os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
        os.environ["CUDA_VISIBLE_DEVICES"] = ''
        controller = Controller(*init_args, **init_kwargs)
        pipe.send('controller inited')
        while True:
            cmd = pipe.recv()
            if cmd[0] not in __class__._COMMANDS:
                break
            pipe.send(getattr(controller, cmd[0])(*cmd[1], **cmd[2]))

    def __init__(self, *args, **kwargs):
        """Start the worker; ``args``/``kwargs`` go to ``Controller(...)``.

        Blocks until the worker reports that the controller is built and
        prints that report.
        """
        pipe_l, pipe_r = multiprocessing.Pipe(duplex=True)
        self.pipe = pipe_l
        self.process = multiprocessing.Process(
            target=__class__.run,
            args=[pipe_r, args, kwargs],
            daemon=True)
        self.process.start()
        # Drop the parent's copy of the worker's end.  While it stays open,
        # recv() can never see end-of-file, so a crashed worker would leave
        # the caller blocked forever.
        pipe_r.close()
        print(self.pipe.recv())

    def _call(self, name, args, kwargs):
        """Send one command to the worker and wait for its reply."""
        self.pipe.send((name, args, kwargs))
        return self.pipe.recv()

    def train(self, *args, **kwargs):
        """Forward to ``Controller.train`` in the worker."""
        return self._call('train', args, kwargs)

    def evalute_value(self, *args, **kwargs):
        """Forward to ``Controller.evalute_value`` in the worker."""
        return self._call('evalute_value', args, kwargs)

    def get_action_prob(self, *args, **kwargs):
        """Forward to ``Controller.get_action_prob`` in the worker."""
        return self._call('get_action_prob', args, kwargs)

    def get_intrinsic_reward(self, *args, **kwargs):
        """Forward to ``Controller.get_intrinsic_reward`` in the worker."""
        return self._call('get_intrinsic_reward', args, kwargs)

In [7]:
def play(env, m, gamma=0.98, max_steps=1000, n_prev_states=8, verbose=False):
    """Roll out the policy of ``m`` in ``env`` and return training samples.

    Args:
        env: object with ``reset() -> state`` and
            ``step(action) -> (state, reward, done, info)``.
        m: controller exposing ``get_action_prob(history)``,
            ``get_intrinsic_reward(history, next_history, action)`` and
            ``evalute_value(history)``.
        gamma: discount factor for the returns.
        max_steps: total number of environment steps to take; finished
            episodes are followed by a reset and a new episode.
        n_prev_states: length of every state history (left-padded with zero
            states).
        verbose: print action-probability spread and the longest episode.

    Returns:
        A list of ``Model_Sample`` whose ``state``/``state_1`` are the state
        histories before/after the action and whose ``reward`` is the
        discounted return ``r_t + gamma * R_{t+1}``.  ``R`` after a terminal
        transition is 0; for an episode cut off by ``max_steps`` it is the
        value estimate of the history ending in the last reached state.
    """
    state_0 = env.reset()
    state_null = np.zeros_like(state_0)
    state_queue = []
    episode = []
    samples = []
    action_probs = []
    gamelen = 0
    gamelen_max = 0

    def pad_history(states):
        """Left-pad ``states`` with zero states up to ``n_prev_states``."""
        return [state_null]*(max(0,n_prev_states-len(states))) + states

    def get_prev_states(episode, idx, get_state_1=False):
        """History ending at transition ``idx`` (its ``state`` or, with
        ``get_state_1``, its ``state_1``), read from raw episode entries."""
        window = episode[max(0,(idx-n_prev_states)+1):idx+1]
        return pad_history(
            [e.state_1 if get_state_1 else e.state for e in window])

    def add_to_samples(episode, done):
        """Turn raw transitions into samples carrying discounted returns."""
        if done:
            discounted_reward = 0.
        else:
            # Bootstrap from the state reached *after* the last action.
            discounted_reward = m.evalute_value(
                get_prev_states(episode, len(episode)-1, get_state_1=True))
        # Walk backwards so that entries at indices <= i are still raw
        # transitions when their histories are assembled.  Every transition,
        # including the last one, contributes its own reward.
        for i in reversed(range(len(episode))):
            discounted_reward = episode[i].reward + \
                gamma * discounted_reward
            episode[i] = Model_Sample(
                get_prev_states(episode, i),
                get_prev_states(episode, i, get_state_1=True),
                episode[i].action,
                discounted_reward)
        samples.extend(episode)

    for i in range(max_steps):
        state_queue.append(state_0)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = pad_history(state_queue)
        action_prob = m.get_action_prob(state_queue_padded)
        action_probs.append(action_prob)
        action = int(np.random.choice(
            list(range(action_prob.shape[-1])),
            p=action_prob))
        state_1, reward, done, _ = env.step(action)
        state_1_queue = state_queue + [state_1]
        if len(state_1_queue) > n_prev_states:
            state_1_queue.pop(0)
        state_1_queue_padded = pad_history(state_1_queue)
        reward_intrinsic = m.get_intrinsic_reward(
            state_queue_padded, state_1_queue_padded, action)
        reward += reward_intrinsic
        episode.append(Model_Sample(state_0, state_1, action, reward))
        state_0 = state_1
        gamelen += 1
        if done:
            add_to_samples(episode, True)
            episode = []
            state_0 = env.reset()
            state_null = np.zeros_like(state_0)
            state_queue = []
            gamelen_max = max(gamelen_max, gamelen)
            gamelen = 0
    if episode:
        # Episode was cut off by max_steps.
        add_to_samples(episode, False)
        gamelen_max = max(gamelen_max, gamelen)
    if verbose:
        print('std[action_prob]', np.mean(np.std(action_probs, ddof=1, axis=0)))
        print('max game len', gamelen_max)
    return samples

In [8]:
# The controller reads environment observations (length env_output_size) and
# chooses among env_input_size actions; it lives in its own process.
m = MultiprocessController(
    env_output_size,
    env_input_size,
    beta=1e-3,
    curiosity=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, 12)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, None, 12)          48        
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                11520     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
activation_1 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 264       
__________

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [None]:
replays = []
for i in range(500):
    # Each rollout takes as many steps as three maximum-length episodes
    # (an episode ends after at most 3 * env.N actions).
    samples = play(
        env, m,
        max_steps=3 * 3 * env.N,
        n_prev_states=env_prev_states,
        verbose=True)
    # One pass over the freshly collected samples.
    m.train(samples, verbose=True, epochs=1)
    print('epoch {} completed'.format(i + 1))

std[action_prob] 0.002530775
max game len 4
epoch 1 completed
std[action_prob] 0.03062577
max game len 3
epoch 2 completed
std[action_prob] 0.023420041
max game len 2
epoch 3 completed
std[action_prob] 0.03688792
max game len 3
epoch 4 completed
std[action_prob] 3.0522827e-08
max game len 1
epoch 5 completed
std[action_prob] 0.05887877
max game len 3
epoch 6 completed
std[action_prob] 0.053177975
max game len 3
epoch 7 completed
std[action_prob] 0.03587527
max game len 3
epoch 8 completed
std[action_prob] 0.057982706
max game len 4
epoch 9 completed
std[action_prob] 0.028948687
max game len 3
epoch 10 completed
std[action_prob] 3.8572164e-08
max game len 1
epoch 11 completed
std[action_prob] 0.054516155
max game len 3
epoch 12 completed
std[action_prob] 0.048451856
max game len 3
epoch 13 completed
std[action_prob] 0.0011014083
max game len 2
epoch 14 completed
std[action_prob] 0.061253436
max game len 3
epoch 15 completed
std[action_prob] 0.068100594
max game len 3
epoch 16 completed
