In [None]:
import gym
import tensorflow as tf
import numpy as np
import random
import matplotlib.pyplot as plt


In [None]:
# Create the CartPole-v1 environment that the training loop interacts with.
env = gym.make('CartPole-v1')

In [None]:
# Hyperparameters

LEARNING_RATE = 0.001 # learning rate passed to the Adam optimizer
GAMMA = 0.99 # discount factor used to turn rewards into advantages

MEMORY_SIZE = 100 # minimum number of stored transitions before a training step


In [None]:
class Agent(tf.keras.Model):
    """Small feed-forward policy network over two discrete actions.

    The network is one hidden ReLU layer of 32 units followed by a linear
    layer of 2 units. `call` turns the linear outputs into action
    probabilities and log-probabilities.
    """

    def __init__(self):
        super(Agent, self).__init__()

        self.dense1 = tf.keras.layers.Dense(32, activation='relu')
        # Linear output layer: softmax and log-softmax are both derived from
        # these raw scores in `call`, which is numerically safer than taking
        # log(softmax(x)) (that yields -inf when a probability underflows).
        self.dense2 = tf.keras.layers.Dense(2)

    def call(self, inputs):
        """Run the policy on a batch of states.

        Args:
            inputs: batch of states, anything `tf.convert_to_tensor` accepts.

        Returns:
            A pair `(probs, logs_pi)`: the action probabilities and their
            logarithms, one row per input state.
        """
        hidden = self.dense1(tf.convert_to_tensor(inputs))
        scores = self.dense2(hidden)

        probs = tf.nn.softmax(scores)
        logs_pi = tf.nn.log_softmax(scores)
        return probs, logs_pi

    def explore(self, inputs):
        """Sample one action per state from the current policy.

        Returns:
            A pair `(action, logs_pi)`, both squeezed, so a single input state
            gives a scalar action and a vector of log-probabilities.
        """
        _, logs_pi = self(inputs)
        # tf.random.categorical expects (unnormalised) log-probabilities;
        # feeding it probabilities would sample from the wrong distribution.
        sampled = tf.random.categorical(logs_pi, 1)
        return tf.squeeze(sampled), tf.squeeze(logs_pi)

    def exploit(self, inputs):
        """Return the most probable action for each state (greedy policy).

        The result is squeezed like in `explore`, so a single input state
        gives a scalar action.
        """
        probs, _ = self(inputs)
        # Reduce over the action axis; the default axis=0 would reduce over
        # the batch instead.
        return tf.squeeze(tf.math.argmax(probs, axis=-1))


In [None]:
# Instantiate the policy network and build its weights for batches of
# 4-dimensional inputs so that `summary()` can report the layers.
agent = Agent()
agent.build(tf.TensorShape([None,4]))
# The model is not compiled: the training cell drives the optimizer itself
# instead of going through `fit`.
#agent.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
#    loss="mse",
#    metrics=['accuracy'])

agent.summary()

In [None]:
# Global buffer of recorded transitions (one dict per environment step),
# filled by the training loop and read when computing the loss.
memory = [] 

In [None]:

class LossFromMemory():
    """Callable policy-gradient loss computed over a buffer of transitions.

    Each entry of the buffer is a dict providing at least the keys
    's' (state), 'a' (chosen action) and 'ad' (advantage).

    Args:
        memory: list of transition dicts; it is read every time the loss is
            called, so later additions to the list are taken into account.
        model: policy to evaluate. It must return `(_, log_probabilities)`
            when called on a batch of states. When omitted, the module-level
            `agent` is looked up at call time.

    Attributes:
        approx_ent: mean of `-log pi(a|s)` over the buffer, refreshed by each
            call; `None` until the loss has been evaluated once.
    """

    def __init__(self, memory, model=None):
        self.memory = memory
        self.model = model
        self.approx_ent = None

    def __call__(self):
        """Return the scalar loss `-mean(log pi(a|s) * advantage)`."""
        # Use the buffer given to the constructor rather than whatever global
        # happens to be named `memory`.
        transitions = self.memory
        policy = agent if self.model is None else self.model

        tf_adv = tf.convert_to_tensor(
            [mem['ad'] for mem in transitions], dtype=tf.float32
        )
        # A single (N, state_dim) float32 array, so the model receives one
        # batched input instead of a Python list of N separate arrays.
        states = np.asarray([mem['s'] for mem in transitions], dtype=np.float32)
        actions = [int(mem['a']) for mem in transitions]

        # Recompute the log-probabilities so the result is differentiable
        # with respect to the policy weights.
        _, logs_pi = policy(states)

        # Keep only the log-probability of the action that was taken.
        action_mask = tf.one_hot(actions, logs_pi.shape[-1])
        logp_a_op = tf.reduce_sum(logs_pi * action_mask, axis=1)

        pi_loss = -tf.reduce_mean(logp_a_op * tf_adv)

        # Sample-based estimate of the policy entropy.
        self.approx_ent = tf.reduce_mean(-logp_a_op)

        return pi_loss

# Training loop: play one episode per epoch, turn its rewards into discounted
# returns, then take one policy-gradient step once enough transitions have
# been collected.
# NOTE(review): relies on the pre-0.26 gym API (reset() returns the
# observation, step() returns a 4-tuple) - confirm the installed gym version.

# A single optimizer for the whole run, so that Adam's moment estimates
# persist from one update to the next instead of being reset every time.
optimizer = tf.optimizers.Adam(learning_rate=LEARNING_RATE)

for epoch in range(1,500) :
    print("Epoch : ", epoch)
    state = env.reset()

    step = 0
    epoch_memory = [] # transitions of the current episode

    while True :
        step += 1

        env.render()

        action, logs_pi = agent.explore([state]) # sample an action from the policy

        state_next, reward, done, info = env.step(action.numpy())
        if done : reward = -1.0 # penalise the step that ends the episode

        epoch_memory.append({ "s": state, "a": action, "r": reward, "l": logs_pi, "d": done, "sn": state_next }) # record the step

        state = state_next

        if done :
            break
        print('.', end='')

    print() # end the line of progress dots

    # Discounted return of every step, accumulated backwards from the end of
    # the episode; stored under 'ad' for the loss.
    advantage = 0
    for mem in reversed(epoch_memory) :
        advantage = advantage * GAMMA + mem['r']
        mem['ad'] = advantage

    memory.extend(epoch_memory) # add the episode to the global buffer

    if len(memory) >= MEMORY_SIZE :

        pi_loss = LossFromMemory(memory)

        # Explicit gradient step: gives access to the loss value and does not
        # depend on what `Optimizer.minimize` returns.
        with tf.GradientTape() as tape :
            loss_value = pi_loss()
        gradients = tape.gradient(loss_value, agent.trainable_variables)
        optimizer.apply_gradients(zip(gradients, agent.trainable_variables))

        print(loss_value.numpy(), pi_loss.approx_ent.numpy())

        # The gradient estimate is only valid for data generated by the
        # current policy, so drop the transitions once they have been used.
        memory.clear()
            

In [None]:
# Close the environment once training is finished.
env.close()
