## N-ième test Mathis Deep Q Learning

In [1]:
import gym
import gym_sch
from agentsch import *

In [2]:
from time import time

def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function is called with the same positional and keyword
    arguments and its return value is passed through unchanged. After the
    call returns, one line ``"<name> : <seconds>s."`` is printed. Nothing is
    printed if ``func`` raises, because the exception propagates first.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same call signature as ``func``.
    """
    # Function-scope import so this cell does not depend on another import cell.
    from functools import wraps

    # wraps() copies __name__, __doc__, etc. onto the wrapper, so the decorated
    # function still identifies itself correctly (help(), tracebacks, logs).
    @wraps(func)
    def new_func(*args, **kwargs):
        init_time = time()
        res = func(*args, **kwargs)
        print("{0:10s} : {1:5f}s.".format(func.__name__, time()-init_time))
        return res
    return new_func

In [3]:
def gen_new_demand(env, k):
    """Configure ``env`` in place for the demand split ``k`` / ``20 - k``.

    - The "stock" of "Farine" and "Plastique" is set to 20.
    - The "demand" of "PainDeMieEmballe" is set to ``k`` and the "demand" of
      "PainDeMieSansCrouteEmballe" to ``20 - k``.
    - The "is_on" flag of the machines "Four", "Decrouteur" and "Emballeur"
      is set to True.

    Args:
        env: object exposing ``products`` and ``machines`` mappings whose
            values are themselves item-assignable mappings.
        k: demand assigned to "PainDeMieEmballe".

    Returns:
        None; ``env`` is mutated.
    """
    catalogue = env.products
    for raw_material in ("Farine", "Plastique"):
        catalogue[raw_material]["stock"] = 20

    catalogue["PainDeMieEmballe"]["demand"] = k
    catalogue["PainDeMieSansCrouteEmballe"]["demand"] = 20 - k

    plant = env.machines
    for machine_name in ("Four", "Decrouteur", "Emballeur"):
        plant[machine_name]["is_on"] = True

def _greedy_episode_score(agent, env, k):
    """Play one episode without exploration for demand split ``k``; return ``env.score``."""
    env.reset()
    gen_new_demand(env, k)
    compt = 0
    # NOTE(review): same loop condition as the training rollout in train();
    # it stops only once env.done is true AND at least 10 steps were taken.
    while not env.done or compt<10:
        state = np.array([env.observation_space])
        action = agent.get_best_action(state, rand = False)
        env.step([action])
        compt += 1
    return env.score


@timeit
def train(episodes, agent, env):
    """Train ``agent`` on ``env`` and plot the periodic evaluation scores.

    The loop runs ``episodes + 1`` iterations (i = 0 .. episodes). In each one:

    1. the environment is reset and given a random demand split
       ``gen_new_demand(env, random.randrange(21))``;
    2. an episode is rolled out with ``agent.get_best_action(state)`` and every
       transition returned by ``env.step`` is pushed to the agent through
       ``agent.update_sequential_replay``;
    3. every 5th iteration ``agent.train`` is called on a batch of 100;
    4. every 500th iteration the agent is evaluated with ``rand=False`` on the
       21 demand splits k = 0..20, the mean ``env.score`` is recorded and a
       progress line is printed.

    The recorded scores are then plotted against a red zero line and the best
    score is printed. This report is produced in a ``finally`` block so that a
    run stopped early (e.g. by KeyboardInterrupt) still reports the scores
    gathered so far; the exception itself still propagates.

    Relies on ``random``, ``np`` and ``plt`` being available in the notebook
    namespace; they are not imported explicitly in this file (presumably
    provided by ``from agentsch import *`` - confirm).

    Args:
        episodes: index of the last training iteration.
        agent: object providing ``get_best_action``,
            ``update_sequential_replay``, ``train`` and ``epsilon``.
        env: environment providing ``reset``, ``step``, ``done``,
            ``observation_space``, ``score``, ``products`` and ``machines``.

    Returns:
        None.
    """
    scores = list()
    t = time()
    try:
        for i in range(episodes+1):
            env.reset()
            gen_new_demand(env, random.randrange(21))
            compt = 0
            # NOTE(review): with `or`, the loop ends only when env.done is true
            # AND compt >= 10, i.e. every episode takes at least 10 steps and an
            # episode that never finishes never terminates. Confirm that a
            # minimum of 10 steps (`or`) is intended rather than a cap (`and`).
            while not env.done or compt<10:
                state = np.array([env.observation_space])
                action = agent.get_best_action(state)
                state, action, reward, done, next_state = env.step([action])
                agent.update_sequential_replay(state, action[0], reward, done, next_state)
                compt += 1
            if i%5==0:
                agent.train(batch_size = 100, epochs = 1, use_loss_scale = True, use_grad_clip = False)
            if i%500 == 0:
                # Mean greedy score over the 21 demand splits 0/20 .. 20/0.
                score = sum(_greedy_episode_score(agent, env, k) for k in range(21))
                scores.append(score/21)
                print("episode: {0:5d}/{1:5d}, score: {2:7.2f}, time: {3:5.2f}s, eps: {4:4f}"
                      .format(i, episodes, scores[-1], time()-t, agent.epsilon))
                t = time()
    finally:
        # Guard against an empty history (no evaluation happened yet):
        # max([]) would raise and hide the real error.
        if scores:
            fig, ax = plt.subplots()
            ax.plot(scores)
            ax.plot([0]*len(scores), color = 'r')
            ax.set_xlabel("evaluation round (one every 500 episodes)")
            ax.set_ylabel("mean score over the 21 demand splits")
            print(max(scores))
        

In [4]:
# Create the "sch-v0" environment and configure it from the JSON file.
env = gym.make("sch-v0")
env.from_json("PraindeMine.json")
# The agent is built from the lengths of the observation and action spaces;
# len() is the idiomatic spelling of a direct __len__() call.
agent = Agent(len(env.observation_space), len(env.action_space))

In [5]:
# Launch training for 200000 episodes; train() prints a progress line every 500 episodes.
train(episodes=200000, agent=agent, env=env)

episode:     0/200000, score: -100.00, time:  0.36s, eps: 0.999940
episode:   500/200000, score:  -96.19, time:  1.32s, eps: 0.972641
episode:  1000/200000, score:  -80.81, time:  0.98s, eps: 0.945184
episode:  1500/200000, score: -100.00, time:  0.96s, eps: 0.917675
episode:  2000/200000, score:  -50.62, time:  1.23s, eps: 0.891181
episode:  2500/200000, score:  -74.90, time:  1.42s, eps: 0.864849
episode:  3000/200000, score:  -70.43, time:  1.59s, eps: 0.839410
episode:  3500/200000, score:  -54.90, time:  1.85s, eps: 0.813192
episode:  4000/200000, score:  -37.14, time:  1.99s, eps: 0.788228
episode:  4500/200000, score:  -32.57, time:  2.20s, eps: 0.762970
episode:  5000/200000, score:  -45.95, time:  2.37s, eps: 0.738404
episode:  5500/200000, score:  -48.33, time:  2.48s, eps: 0.715486
episode:  6000/200000, score:  -17.19, time:  2.93s, eps: 0.691583
episode:  6500/200000, score:  -45.38, time:  2.91s, eps: 0.668777


KeyboardInterrupt: 

In [None]:
# Per-split report: play one episode without exploration (rand=False) for each
# demand split k / 20-k and print the resulting environment score.
for k in range(21):
    env.reset()
    # Use the same scenario set-up as the evaluation inside train(): besides the
    # two demands, gen_new_demand also sets the raw-material stock and switches
    # the machines on, so these scores are comparable with the training log.
    gen_new_demand(env, k)
    compt = 0
    # NOTE(review): same loop condition as in train(); it stops only once
    # env.done is true AND at least 10 steps were taken.
    while not env.done or compt<10:
        state = np.array([env.observation_space])
        action = agent.get_best_action(state, rand = False)
        env.step([action])
        compt += 1
    print("Demand : {:2d}:{:2d} Score : {}.".format(k,20-k,env.score))

In [None]:
# Print the environment's actions using its own helper.
# NOTE(review): presumably this lists the actions of the last episode played
# above - confirm against the environment implementation.
env.print_actions()

In [None]:
# Reset the environment and put the whole demand (20) on
# "PainDeMieSansCrouteEmballe", none on "PainDeMieEmballe".
# Unlike gen_new_demand, this does not touch the stock or the machine state.
env.reset()
env.products["PainDeMieEmballe"]["demand"] = 0
env.products["PainDeMieSansCrouteEmballe"]["demand"] = 20