In [None]:
import gym
import tensorflow as tf
import numpy as np
import random
import matplotlib.pyplot as plt


In [None]:
# Create the CartPole-v1 environment; `env` is shared by the training and play cells below.
env = gym.make('CartPole-v1')

In [None]:
# --- Hyper-parameters shared by the cells below ---
# Q-learning coefficients: l is "lambda" (intended as the discount on the
# bootstrapped next-state value), a is the "learning rate".
l, a = 0.98, 0.15
# Epsilon-greedy exploration: starting probability of a random action and its floor.
eg, egMin = 0.10, 0.05
# Replay memory capacity (500000 was a previous setting) and mini-batch size.
maxBufferSize, batchSize = 50000, 64

In [None]:
class Agent(tf.keras.Model):
  """Q-network: two 24-unit ReLU layers with dropout, then 2 linear outputs.

  The model is built elsewhere in this notebook for inputs of shape
  (batch, 4) and returns one value per action, shape (batch, 2).
  """

  def __init__(self):
    super(Agent, self).__init__(name='mon_agent')
    self.dense1 = tf.keras.layers.Dense(24, activation='relu')
    self.dropout1 = tf.keras.layers.Dropout(0.5)
    self.dense2 = tf.keras.layers.Dense(24, activation='relu')
    self.dropout2 = tf.keras.layers.Dropout(0.5)
    self.dense3 = tf.keras.layers.Dense(2, activation='linear')

  def call(self, inputs, training=None):
    """Run the forward pass.

    Args:
      inputs: batch of states.
      training: forwarded to the Dropout layers so that dropout is applied
        only when Keras (or the caller) asks for training behaviour. With
        the default of None, Keras resolves the mode itself, exactly as it
        did before this argument existed.

    Returns:
      Tensor holding one estimated value per action for every input state.
    """
    hidden = self.dense1(inputs)
    hidden = self.dropout1(hidden, training=training)
    hidden = self.dense2(hidden)
    hidden = self.dropout2(hidden, training=training)
    return self.dense3(hidden)


In [None]:
class Coach(tf.keras.Model):
    """Training wrapper that masks the agent's outputs.

    The wrapped agent predicts one value per action; multiplying by a mask
    zeroes the outputs that should not contribute to the loss, so `fit` can
    be given masked targets of the same shape.
    """

    def __init__(self, agent):
        """Store the model whose outputs will be masked.

        Args:
            agent: callable Keras model mapping states to per-action values.
        """
        super(Coach, self).__init__()
        self.agent = agent

    def call(self, inputs, **kwargs):
        """Return `agent(states) * masks`.

        Args:
            inputs: a pair `[states, masks]`.
            **kwargs: forwarded unchanged to the wrapped agent (for example
                the `training` flag supplied by Keras).
        """
        states, masks = inputs
        # Use the agent given to the constructor, not whatever happens to be
        # bound to the notebook-level name `agent` at call time.
        return self.agent(states, **kwargs) * masks


In [None]:
def showHistory(history) :
    """Plot the loss and accuracy curves of a Keras training run side by side.

    Args:
        history: object returned by `Model.fit`; its `history` dict must
            contain the keys 'loss' and 'accuracy'.
    """
    # Create the figure once, directly at its final size. The previous
    # version also called plt.figure() first, which left an extra empty
    # figure behind on every call.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15.0, 7.0))
    for ax, key in zip(axes, ('loss', 'accuracy')):
        ax.plot(history.history[key], label=key)
        ax.legend()
    plt.show()


In [None]:
# Q-network, built for batches of 4-feature states.
agent = Agent()
agent.compile(optimizer=tf.keras.optimizers.Adam())
agent.build(tf.TensorShape([None, 4]))

# Training wrapper around the agent (masked outputs, MSE loss).
# A learning rate of 1e-6 was tried previously; the Adam default is used now.
coach = Coach(agent)
coach.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="mse",
    metrics=['accuracy'],
)

# Call the wrapper once on symbolic inputs (4 state features, 2 mask entries)
# so that it is built before summary() is requested.
state_input = tf.keras.Input(shape=(4,))
mask_input = tf.keras.Input(shape=(2,))
coach([state_input, mask_input])

agent.summary()
coach.summary()


In [None]:
#env.close()
#agent.save_weights('CartPoleV1.tf')
#agent.load_weights('CartPoleV1.tf')
#history = coach.fit(
#    [np.array(memory["states"]), np.array(memory["masks"])],
#    np.array(memory["values"]),
#    epochs=10, batch_size=64, verbose=1)
#by = [ agent([b['sp']])          for b in batch ]
#print(by)
#np.array(by * bm),
#by = [ b["m"] * ( b['r'] if b['d'] else b['r'] + a * np.max(agent([b['sp']])[0]) )
#           for b in batch ]
#print(by, bm)
#print(pix)
def drawAgent(agent, p=0.0, dp=0.0) :
    """Show which action the agent prefers over a grid of states.

    The first two state components are held at `p` and `dp`; the third is
    swept over [-0.24, 0.24) along the x axis and the fourth over [-2, 2)
    along the y axis. Each cell is annotated with '<' when output 0 is
    larger, '>' when output 1 is larger and '0' when they are equal.

    Args:
        agent: callable mapping an (n, 4) float array to (n, 2) action values.
        p: value used for state[0] in every grid cell.
        dp: value used for state[1] in every grid cell.
    """
    rx = np.arange(-0.24,0.24,0.02, np.float32)
    ry = np.arange(-2,2,0.2, np.float32)

    # Evaluate the whole grid with a single forward pass instead of one model
    # call per cell. gx/gy have shape (len(ry), len(rx)), so row index = y.
    gx, gy = np.meshgrid(rx, ry)
    grid = np.stack(
        [np.full_like(gx, p), np.full_like(gx, dp), gx, gy], axis=-1
    ).reshape(-1, 4)
    qValues = np.asarray(agent(grid))
    pix = np.sign(qValues[:, 1] - qValues[:, 0]).reshape(gx.shape)

    fig, ax = plt.subplots()
    fig.set_size_inches(15.0, 7.0)
    ax.imshow(pix)
    ax.set_xticks(np.arange(len(rx)))
    ax.set_yticks(np.arange(len(ry)))
    ax.set_xticklabels([round(tick,2) for tick in rx])
    ax.set_yticklabels([round(tick,2) for tick in ry])
    # Axis meaning follows the CartPole observation layout
    # (NOTE(review): state[3] as angular velocity is taken from the Gym docs).
    ax.set_xlabel("state[2] (pole angle)")
    ax.set_ylabel("state[3] (pole angular velocity)")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    for row in range(len(ry)):
        for col in range(len(rx)):
            value = pix[row][col]
            ax.text(col, row, '<' if value<0 else '>' if value>0 else '0', ha="center", va="center", color="w")
    plt.show()

# Preference map of the freshly initialised (untrained) agent.
drawAgent(agent)


In [None]:
# Replay memory (one dict per transition) and the per-fit loss / accuracy logs.
memory, loss, accuracy = [], [], []

In [None]:
# Training loop. Every outer iteration plays 1000 environment steps with an
# epsilon-greedy policy, appends each transition to `memory`, and fits the
# network on one random mini-batch whenever an episode ends.
for epi in range(1,100) :
    state = env.reset()
    # Greedy / random action counters.
    c0 = 0
    c1 = 0
    # * * * explore * * *
    print(epi, " => ", eg)
    nbTry = 0   # episodes finished during this iteration
    slm = 0     # longest episode (in steps) during this iteration
    slc = 0     # length of the episode currently being played
    for step in range(1000):
        env.render()
        if random.random() > eg :
            # Exploit: take the action with the highest predicted value.
            c0 += 1
            Qs = agent([state])[0]
            action = np.argmax(Qs)
        else :
            # Explore: uniformly random action (no forward pass needed).
            c1 += 1
            action = random.randrange(2)

        # One-hot mask selecting the chosen action's output.
        mask = np.zeros(2)
        mask[action] = 1
        mem = { "s": state, "a": action, "m": mask}
        state, rewards, done, info = env.step(action)

        # Reward shaping: the final step of an episode is scored -1; every
        # other step is scored (environment reward - 1), i.e. 0 assuming the
        # environment pays 1 per step.
        if done : rewards = 0.0
        rewards -= 1.0
        mem["r"] = rewards
        mem["sp"] = state
        mem["d"] = done
        memory.append(mem)

        slc += 1
        if done :
            # Termination cause: '-' position out of range, 'a' angle out of
            # range, 'len?' anything else.
            print('-' if abs(state[0])>2.4 else 'a' if abs(state[2])>0.2094 else 'len?', end='')
            if slc>slm : slm = slc
            slc = 0
            nbTry += 1
            state = env.reset()

            if len(memory)>500 :
                # Keep a random subset once the buffer exceeds its capacity.
                if len(memory) > maxBufferSize :
                    memory = random.sample(memory, maxBufferSize)

                # * * * learn * * *
                batch = random.sample(memory, batchSize)
                bs = [ b["s"] for b in batch ]
                bm = [ b["m"] for b in batch ]
                # Best next-state value for the whole batch in one forward pass.
                nextBest = np.max(np.asarray(agent(np.array([ b["sp"] for b in batch ]))), axis=1)
                # Masked Bellman target: r for terminal transitions, otherwise
                # r + l * max_a' Q(s', a'). The discount is `l`; the previous
                # code multiplied by the learning-rate constant `a` instead.
                by = [ b["m"] * ( b['r'] if b['d'] else b['r'] + l * q )
                       for b, q in zip(batch, nextBest) ]

                history = coach.fit(
                    [np.array(bs), np.array(bm)],
                    np.array(by),
                    verbose=0)
                loss.append(history.history['loss'])
                accuracy.append(history.history['accuracy'])
    drawAgent(agent)
    # Guard the average against an iteration in which no episode finished.
    print("len max : ", slm, " len moy", 1000/max(nbTry, 1))
    # Decay exploration by 5% per iteration, never below egMin.
    eg = max(egMin, eg*.95)
    
    
    

In [None]:
#eg = 0.05
# Close the environment once training is finished.
# NOTE(review): the cells below call env.reset() / env.render() again — confirm
# the installed gym version allows reusing the environment after close().
env.close()

In [None]:
# Scratch check of the chained conditional used to label the preference map.
# Uses its own variable names: assigning to `a` here would overwrite the
# notebook-level hyper-parameter `a` that other cells read.
sign_probe = 0
sign_char = '<' if sign_probe<0 else '>' if sign_probe>0 else '0'
print(sign_char)

In [None]:
# Step-by-step greedy roll-out: play one episode with the current agent,
# printing the state and action values and redrawing the preference map at
# every step. Press Enter to advance to the next step.
state = env.reset()
done = False
while not done :
    env.render()

    Qs = agent([state])[0]
    action = np.argmax(Qs)
    Q = Qs[action]
    print(state, " ", Qs.numpy(), " ", action)
    drawAgent(agent, state[0], state[1])

    state, rewards, done, info = env.step(action)
    input()

In [None]:
# Debug cell: play a single epsilon-greedy episode, record its transitions and
# print the training inputs / targets that would be built from them.
# No fitting is done here. NOTE: this cell replaces the contents of `memory`.
memory = []
loss = []
accuracy = []

# Greedy / random action counters, initialised here so the cell also runs
# when the training cell has not been executed first.
c0 = 0
c1 = 0

state = env.reset()
for step in range(1000):
    env.render()
    # Action values are needed in both branches because they are printed below.
    Qs = agent([state])[0]
    if random.random() > eg :
        c0 += 1
        action = np.argmax(Qs)
    else :
        c1 += 1
        action = random.randrange(2)

    mask = np.zeros(2)
    mask[action] = 1
    mem = { "s": state, "a": action, "m": mask}
    print(state, " ", Qs.numpy(), " ", action)
    drawAgent(agent)
    state, rewards, done, info = env.step(action)

    # Same reward shaping as the training cell, so transitions stored in the
    # shared `memory` use one reward scale: -1 on the final step, otherwise
    # (environment reward - 1).
    if done : rewards = 0.0
    rewards -= 1.0
    mem["r"] = rewards
    mem["sp"] = state
    mem["d"] = done
    memory.append(mem)

    if done :
        print(memory)
        break


batch = memory
bs = [ b["s"] for b in batch ]
bm = [ b["m"] for b in batch ]
# Best next-state value for every transition in one forward pass.
nextBest = np.max(np.asarray(agent(np.array([ b["sp"] for b in batch ]))), axis=1)
# Masked Bellman target using the discount `l`; the previous code multiplied
# by `a`, the learning-rate constant, which another cell also overwrote.
by = [ b["m"] * ( b['r'] if b['d'] else b['r'] + l * q )
       for b, q in zip(batch, nextBest) ]

print (bs, bm, by)

drawAgent(agent)
