In [None]:
import gym
import tensorflow as tf
import numpy as np
import random
import matplotlib.pyplot as plt


In [None]:
# Create the CartPole-v1 environment that the training loop steps and renders.
env = gym.make('CartPole-v1')

In [None]:
# Hyper-parameters read (and, for eg, updated) by the training loop.
l = 0.98 # discount factor: weight of the best next-state Q-value in the target (called "lambda" here)
a = 0.1 # step size used to move the current Q-value towards its bootstrapped target
eg = 0.9 # epsilon-greedy: probability of picking a random action; decayed after every episode
egMin = 0.01 # lower bound that eg is never decayed below

In [None]:
class Agent(tf.keras.Model):
  """Small fully connected Q-network: one hidden layer, one output per action.

  `call` maps a batch of 4-feature inputs to 2 values (one per action).
  """

  def __init__(self):
    super(Agent, self).__init__(name='mon_agent')
    # Hidden layer with 8 ReLU units.
    self.dense1 = tf.keras.layers.Dense(8, activation='relu', input_shape=(4,))
    # Linear output: Q-values are regression targets, so they must not be
    # clamped to >= 0, and a ReLU here stops learning (zero gradient) as soon
    # as a unit's pre-activation turns negative.
    self.dense2 = tf.keras.layers.Dense(2, activation=None)

  def call(self, inputs):
    """Return the per-action values for a batch of inputs."""
    hidden = self.dense1(inputs)
    return self.dense2(hidden)


In [None]:
class Coach(tf.keras.Model):
    """Training wrapper that exposes only the selected action's value.

    The wrapped agent's output is multiplied element-wise by a mask, so a loss
    computed on the result only sees the entries the mask leaves non-zero.
    """

    def __init__(self, agent):
        """Store the model whose output will be masked.

        Args:
            agent: callable model mapping a batch of states to per-action values.
        """
        super(Coach, self).__init__()
        self.agent = agent

    def call(self, inputs):
        """Return `agent(states) * masks` for `inputs == [states, masks]`."""
        states, masks = inputs
        # Use the wrapped instance, not whatever global happens to be named
        # `agent`, so the Coach really trains the model it was given.
        return self.agent(states) * masks


In [None]:
def showHistory(history) :
    """Plot the loss and accuracy curves of a Keras training run side by side.

    Args:
        history: object whose `.history` dict holds 'loss' and 'accuracy'
            sequences (as returned by `Model.fit` when that metric is compiled in).
    """
    # Create the two-panel figure at its final size in one call; a separate
    # plt.figure() beforehand would only leave an extra empty figure behind.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15.0, 7.0))
    axes[0].plot(history.history['loss'], label="loss")
    axes[0].legend()
    axes[1].plot(history.history['accuracy'], label="accuracy")
    axes[1].legend()
    plt.show()


In [None]:
# Q-network used to pick actions; built for batches of 4-feature states.
agent = Agent()
agent.compile(optimizer=tf.keras.optimizers.Adam())
agent.build(tf.TensorShape([None,4]))

# Training wrapper around the same agent: fits masked Q-values with MSE.
# The 'accuracy' metric is kept because showHistory reads history.history['accuracy'].
coach = Coach(agent)
coach.compile(optimizer=tf.keras.optimizers.Adam(),
    loss="mse",
    metrics=['accuracy'])
# Call once on symbolic inputs so the wrapper is built before summary().
# Shapes must be tuples: (4) and (2) are plain ints, (4,) and (2,) are shapes.
coach([tf.keras.Input(shape=(4,)), tf.keras.Input(shape=(2,))])


agent.summary()
coach.summary()

#print(agent([[0,0,0,0]]));


In [None]:
# Training loop: for each of 99 iterations, collect 500 environment steps with
# an epsilon-greedy policy, then fit the coach on the collected targets.
# Assumes the classic gym API used here: reset() returns the observation and
# step() returns (observation, reward, done, info).

# Only referenced by commented-out debugging code; kept so those names exist.
s0 = None
o0 = None
for epi in range(1,100) :
    memory = { "states":[], "masks":[], "actions":[], "values":[]}
    state = env.reset()
    c0 = 0  # number of greedy actions taken in this iteration
    c1 = 0  # number of random actions taken in this iteration

    # * * * explore * * *
    for step in range(500):
        env.render()

        # One forward pass per step, on an explicit (1, 4) float32 batch rather
        # than relying on a Python list being converted into a batch.
        Qs = agent(np.array([state], dtype=np.float32))[0]
        if random.random() > eg :
            c0 += 1
            action = int(np.argmax(Qs))
        else :
            c1 += 1
            action = random.randrange(2)
        Q = Qs[action]

        # One-hot mask of the chosen action; the coach multiplies predictions
        # by it so only that action's value contributes to the loss.
        mask = np.zeros(2)
        mask[action] = 1
        memory["states"].append(state)
        memory["actions"].append(action)
        memory["masks"].append(mask)

        state, rewards, done, info = env.step(action)

        if not done :
            # Move the current estimate towards reward + discounted best next value.
            next_Qs = agent(np.array([state], dtype=np.float32))[0]
            Q = Q + a * (rewards + l * np.max(next_Qs) - Q)
            Q = Q.numpy()
        else :
            # Episode ended: the target for the last action is pinned to 0,
            # and a fresh episode starts within the same 500-step budget.
            Q = 0
            print('.', end='')
            state = env.reset()

        memory["values"].append(mask * Q)

    # Decay exploration, then report the greedy/random split for this iteration.
    eg = max(egMin, eg*.9)
    print("\r\neG: ", eg, "; ", c0,"-",c1,"/",c0+c1)

    # * * * learn * * *
    history = coach.fit(
        [np.array(memory["states"]), np.array(memory["masks"])],
        np.array(memory["values"]),
        epochs=10, batch_size=64, verbose=0)

    #showHistory(history)
    
    
    

In [None]:
# Write the agent's current weights to 'CartPoleV1.tf'. The commented-out lines
# are the optional environment shutdown and the matching reload call.
#env.close()
agent.save_weights('CartPoleV1.tf')
#agent.load_weights('CartPoleV1.tf')


In [None]:
# Optional: uncomment to lower the exploration rate (eg) before re-running the training loop.
#eg = 0.05