## N-ième est Mathis Deep Q Learning

In [12]:
import gym
import gym_sch
import numpy as np
import random
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.losses import MeanSquaredError, MSE
from collections import deque

In [13]:
from time import time

def timeit(func):
    def new_func(*args, **kwargs):
        init_time = time()
        res = func(*args, **kwargs)
        print("{0:10s} : {1:5f}s.".format(func.__name__, time()-init_time))
        return res
    return new_func

In [14]:
class Agent():
    def __init__(self, input_size, output_size):
        tf.keras.backend.clear_session()
        self.gamma = 0.9
        self.epsilon = 1.0
        self.epsilon_decay = 0.9998
        self.learning_rate = 0.002
        self.rho = 0.9
        self.input_size = input_size
        self.output_size = output_size
        self.sequential_replay = deque([], maxlen = 500)
        
        input_layer = keras.Input(shape=(self.input_size,), name="obs")
        hidden = Dense(24, activation="elu", kernel_initializer = "he_uniform")(input_layer)
        hidden2 = Dense(24, activation="elu")(hidden)
        output_layer = Dense(self.output_size, activation="linear")(hidden2)
        self.model = keras.Model(inputs=input_layer, outputs=output_layer)
        
        opt = RMSprop(learning_rate=self.learning_rate, rho=self.rho)
        #opt = Adam(self.learning_rate)
        self.optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")
        #self.loss_func = MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)
        self.loss_func = MSE
        
    def update_sequential_replay(self, state, action, reward, done, next_state):
        self.sequential_replay.append((state, action, reward, done, next_state))
    
    def train(self, batch_size = 32, epochs = 1, use_loss_scale = False, use_grad_clip = False):
        if len(self.sequential_replay) < batch_size:
            return
        for _ in range(epochs):
            states, q_targets, actions = self.get_prepared_batch(batch_size)
            self.compute_grad(states, q_targets, actions, use_loss_scale, use_grad_clip)
    
    def get_prepared_batch(self, batch_size = 100):        
        batch_indices = np.random.randint(0, high = len(self.sequential_replay), size = batch_size)
        batch_buffer = [self.sequential_replay[ind] for ind in batch_indices]                           
        states, actions, rewards, dones, next_states = [np.array([obs[ind] for obs in batch_buffer]) for ind in range(5)]
        
        states = tf.constant(states, dtype="float32")
        rewards = tf.reshape(tf.constant(rewards.T, dtype="float32"),(100,1))
        next_states = tf.constant(next_states, dtype="float32")
        
        q_targets = tf.math.add(rewards,self.gamma * tf.reduce_max(self.model(next_states), axis=1, keepdims=True))
        return states, q_targets, actions
        
    @tf.function
    def compute_grad(self, states, q_targets, actions, use_loss_scale = False, grad_clip = 0):
        gradients = None
        mask = tf.one_hot(actions, self.output_size)
        with tf.GradientTape() as tape: 
            q_value = self.model(states)
            q_value_choose = tf.reduce_sum(mask*q_value, axis = 1, keepdims = True)
            loss = self.loss_func(q_targets, q_value_choose)
        if use_loss_scale:
            scaled_loss = self.optimizer.get_scaled_loss(loss)
            scaled_grad = tape.gradient(scaled_loss, self.model.trainable_variables)
            gradient = self.optimizer.get_unscaled_gradients(scaled_grad)
        else:
            gradients = tape.gradient(loss, self.model.trainable_variables)
        if grad_clip:
            gradients = [tf.clip_by_norm(g, grad_clip) for g in gradients]
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        return gradients
        
    def get_best_action(self, state, rand=True):
        self.epsilon *= self.epsilon_decay

        if rand and np.random.rand() <= self.epsilon:
            return random.randrange(self.output_size)
        
        state = tf.constant(state)
        act_values = self.model(state)

        action = tf.math.argmax(act_values[0]).numpy()
        return action

In [24]:
def gen_new_demand(env):
    k = random.randrange(21)
    env.products["PainDeMieEmballe"]["demand"] = k
    env.products["PainDeMieSansCrouteEmballe"]["demand"] = 20-k

@timeit
def train(episodes, agent, env):
    scores = list()
    t = time()
    for i in range(episodes+1):
        env.reset()
        gen_new_demand(env)
        reward = None
        while not env.done:
            state = np.array([env.observation_space])
            action = agent.get_best_action(state)
            state, action, reward, done, next_state = env.step([action])
            agent.update_sequential_replay(state, action[0], reward, done, next_state)
        if i%5==0:
            grad = agent.train(100)
        if i%500 == 0:
            score = 0
            for k in range(21):
                env.reset()
                env.products["PainDeMieEmballe"]["demand"] = k
                env.products["PainDeMieEmballeEmballe"]["demand"] = 20-k
                while not env.done:
                    state = np.array([env.observation_space])
                    action = agent.get_best_action(state, rand = False)
                    env.step([action])
                score += env.score
            scores.append(score/11)
            print("episode: {0:5d}/{1:5d}, score: {2:6.2f}, time: {3:4.2f}s"
                  .format(i, episodes, scores[-1], time()-t))
            t = time()
            
    fig, ax = plt.subplots()
    ax.plot(scores)
    ax.plot([0]*len(scores), color = 'r')
    print(max(scores))
        

In [25]:
env = gym.make("sch-v0")
env.from_json("PraindeMine.json")
agent = Agent(env.observation_space.__len__(), env.action_space.__len__())

In [26]:
train(40000, agent, env)

KeyError: 'PainDeMieEmballeEmballe'

In [None]:
for i in range(11):
    env.reset()
    env.products["P3"]["demand"] = i
    env.products["P4"]["demand"] = 10-i
    while not env.done:
        state = np.array([env.observation_space])
        action = agent.get_best_action(state, rand = False)
        env.step([action])
    print("Demand : {:2d}:{:2d} Score : {}.".format(i,10-i,env.score))

In [None]:
print(env.print_actions())

In [None]:
grads

In [None]:
agent.epsilon

In [None]:
a = np.eye(5,5)