In [37]:
import os
import time

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
from stable_baselines3 import DQN, PPO, A2C, SAC, DDPG

from src.agent.utils import get_train_test_sets
from src.environment.DiscreteBattery import DiscreteBattery

# DQN

In [38]:
# Number of days requested for the training and test splits below.
N_train = 300
N_test = 100
# Environment used for every DQN / Q-table evaluation in this notebook; it is built
# with a discrete action space and the first N_train day indices.
test_env = DiscreteBattery(discrete_action_space=True, days=list(range(N_train)))
# Split helper returns four sequences indexed by day. Only the test_* pair is used
# by the evaluation cells (assigned to test_env.Ppv / test_env.Pconso).
# NOTE(review): presumably Ppv / Pconso are PV-production and consumption profiles — confirm.
train_Ppvs, train_Pconsos, test_Ppvs, test_Pconsos = get_train_test_sets(test_env, N_train=N_train, N_test=N_test)

In [39]:
# Evaluate the five saved checkpoints named "dqn_lr_1e-3_eps1e-1_<seed>" on the test
# days and print, per seed, the mean over days of the negated cumulated reward.
for seed in [0, 1, 2, 3, 4]:
    model = DQN.load(f"cleps/dqn_lr_1e-3_eps1e-1_{seed}")
    # Sized from N_test (was a literal 100) so the buffer always matches the loop
    # below and the mean is not diluted by untouched zeros if N_test changes.
    rewards = np.zeros(N_test)
    for day in range(N_test):
        # Inject this test day's profiles before resetting.
        # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
        test_env.Ppv = test_Ppvs[day]
        test_env.Pconso = test_Pconsos[day]
        obs, info = test_env.reset(change=False)

        cumulated_reward = 0
        # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
        for _ in range(143):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            cumulated_reward += reward
        # Store the negated return for this day.
        rewards[day] = -cumulated_reward
    print(rewards.mean())

39.998704248999985
39.61742491999998
39.284732129999995
40.17158335966666
39.014186534000004


In [24]:
# Evaluate the five saved checkpoints named "dqn_lr_1e-3_eps2e-1_<seed>" on the test
# days and print, per seed, the mean over days of the negated cumulated reward.
for seed in [0, 1, 2, 3, 4]:
    model = DQN.load(f"cleps/dqn_lr_1e-3_eps2e-1_{seed}")
    # Sized from N_test (was a literal 100) so the buffer always matches the loop
    # below and the mean is not diluted by untouched zeros if N_test changes.
    rewards = np.zeros(N_test)
    for day in range(N_test):
        # Inject this test day's profiles before resetting.
        # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
        test_env.Ppv = test_Ppvs[day]
        test_env.Pconso = test_Pconsos[day]
        obs, info = test_env.reset(change=False)

        cumulated_reward = 0
        # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
        for _ in range(143):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            cumulated_reward += reward
        # Store the negated return for this day.
        rewards[day] = -cumulated_reward
    print(rewards.mean())

40.76508645099998
40.79360924466665
38.91041946366666
39.76417003333332
39.37478524466666


In [25]:
# Evaluate the five saved checkpoints named "dqn_lr_1e-4_eps1e-1_<seed>" on the test
# days and print, per seed, the mean over days of the negated cumulated reward.
for seed in [0, 1, 2, 3, 4]:
    model = DQN.load(f"cleps/dqn_lr_1e-4_eps1e-1_{seed}")
    # Sized from N_test (was a literal 100) so the buffer always matches the loop
    # below and the mean is not diluted by untouched zeros if N_test changes.
    rewards = np.zeros(N_test)
    for day in range(N_test):
        # Inject this test day's profiles before resetting.
        # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
        test_env.Ppv = test_Ppvs[day]
        test_env.Pconso = test_Pconsos[day]
        obs, info = test_env.reset(change=False)

        cumulated_reward = 0
        # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
        for _ in range(143):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            cumulated_reward += reward
        # Store the negated return for this day.
        rewards[day] = -cumulated_reward
    print(rewards.mean())

39.36708730766666
38.85296593033333
39.60588589699999
39.830707800333315
39.619948281999996


In [26]:
# Evaluate the five saved checkpoints named "dqn_lr_1e-4_eps2e-1_<seed>" on the test
# days and print, per seed, the mean over days of the negated cumulated reward.
for seed in [0, 1, 2, 3, 4]:
    model = DQN.load(f"cleps/dqn_lr_1e-4_eps2e-1_{seed}")
    # Sized from N_test (was a literal 100) so the buffer always matches the loop
    # below and the mean is not diluted by untouched zeros if N_test changes.
    rewards = np.zeros(N_test)
    for day in range(N_test):
        # Inject this test day's profiles before resetting.
        # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
        test_env.Ppv = test_Ppvs[day]
        test_env.Pconso = test_Pconsos[day]
        obs, info = test_env.reset(change=False)

        cumulated_reward = 0
        # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
        for _ in range(143):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            cumulated_reward += reward
        # Store the negated return for this day.
        rewards[day] = -cumulated_reward
    print(rewards.mean())

40.332106533333324
38.80476589166666
39.2697211
39.29970959299999
39.56055886666665


In [40]:
# Location of the saved Q-table (a .npy array). The default is the original
# author's absolute path, which only exists on that machine; set the
# PI_Q_TABLE_PATH environment variable to point at your own copy of
# PI-q-table.npy when running elsewhere.
Q_TABLE_PATH = os.environ.get(
    "PI_Q_TABLE_PATH",
    "/Users/luweber/Projets/clean-battery/data/experiments/bill_min/agent_003/PI-q-table.npy",
)
q_table = np.load(Q_TABLE_PATH)

In [41]:
# Evaluate the tabular policy stored in `q_table` on the test days and print the
# mean over days of the negated cumulated reward.
# Sized from N_test (was a literal 100) so the buffer always matches the loop below.
rewards = np.zeros(N_test)
for day in range(N_test):
    # Inject this test day's profiles before resetting.
    # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
    test_env.Ppv = test_Ppvs[day]
    test_env.Pconso = test_Pconsos[day]
    obs, info = test_env.reset(change=False)

    cumulated_reward = 0
    # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
    for _ in range(143):
        X = obs['SOC']
        T = obs['time']
        D = obs['delta']
        # The table is indexed (time, delta, SOC) and the action with the smallest
        # entry is taken.
        # NOTE(review): argmin suggests the table stores costs, not rewards — confirm.
        action = q_table[T, D, X].argmin()
        obs, reward, terminated, truncated, info = test_env.step(action)
        cumulated_reward += reward
    # Store the negated return for this day.
    rewards[day] = -cumulated_reward
print(rewards.mean())

37.191714058666655


In [42]:
# Baseline: apply the same fixed action at every step of every test day and print
# the mean over days of the negated cumulated reward.
# NOTE(review): action index 20 is presumably the "do nothing" action of the
# discrete action space — confirm against DiscreteBattery.
action = 20
# Sized from N_test (was a literal 100) so the buffer always matches the loop below.
rewards = np.zeros(N_test)
for day in range(N_test):
    # Inject this test day's profiles before resetting.
    # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
    test_env.Ppv = test_Ppvs[day]
    test_env.Pconso = test_Pconsos[day]
    obs, info = test_env.reset(change=False)

    cumulated_reward = 0
    # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
    # The observation is not read because the action does not depend on it.
    for _ in range(143):
        obs, reward, terminated, truncated, info = test_env.step(action)
        cumulated_reward += reward
    # Store the negated return for this day.
    rewards[day] = -cumulated_reward
print(rewards.mean())

39.61717133333332


# PPO

In [32]:
# Rebuild the environment with a non-discrete action space, then evaluate the five
# saved "ppo_<seed>" checkpoints on the test days and print, per seed, the mean
# over days of the negated cumulated reward.
test_env = DiscreteBattery(discrete_action_space=False)
for seed in [0, 1, 2, 3, 4]:
    model = PPO.load(f"cleps/ppo_{seed}")
    # Sized from N_test (was a literal 100) so the buffer always matches the loop
    # below and the mean is not diluted by untouched zeros if N_test changes.
    rewards = np.zeros(N_test)
    for day in range(N_test):
        # Inject this test day's profiles before resetting.
        # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
        test_env.Ppv = test_Ppvs[day]
        test_env.Pconso = test_Pconsos[day]
        obs, info = test_env.reset(change=False)

        cumulated_reward = 0
        # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
        for _ in range(143):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            cumulated_reward += reward
        # Store the negated return for this day.
        rewards[day] = -cumulated_reward
    print(rewards.mean())

42.31138291175564
43.330022547376586
41.71753078193807
41.64914796439629
42.56744387098881


# A2C

In [33]:
# Rebuild the environment with a non-discrete action space, then evaluate the five
# saved "a2c_<seed>" checkpoints on the test days and print, per seed, the mean
# over days of the negated cumulated reward.
test_env = DiscreteBattery(discrete_action_space=False)
for seed in [0, 1, 2, 3, 4]:
    model = A2C.load(f"cleps/a2c_{seed}")
    # Sized from N_test (was a literal 100) so the buffer always matches the loop
    # below and the mean is not diluted by untouched zeros if N_test changes.
    rewards = np.zeros(N_test)
    for day in range(N_test):
        # Inject this test day's profiles before resetting.
        # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
        test_env.Ppv = test_Ppvs[day]
        test_env.Pconso = test_Pconsos[day]
        obs, info = test_env.reset(change=False)

        cumulated_reward = 0
        # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
        for _ in range(143):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            cumulated_reward += reward
        # Store the negated return for this day.
        rewards[day] = -cumulated_reward
    print(rewards.mean())

39.61717133333332
43.29683799999998
45.75025808884283
43.31271797662972
44.04301290750703


# SAC

In [35]:
# Rebuild the environment with a non-discrete action space, then evaluate the five
# saved "sac_<seed>" checkpoints on the test days and print, per seed, the mean
# over days of the negated cumulated reward.
test_env = DiscreteBattery(discrete_action_space=False)
for seed in [0, 1, 2, 3, 4]:
    model = SAC.load(f"cleps/sac_{seed}")
    # Sized from N_test (was a literal 100) so the buffer always matches the loop
    # below and the mean is not diluted by untouched zeros if N_test changes.
    rewards = np.zeros(N_test)
    for day in range(N_test):
        # Inject this test day's profiles before resetting.
        # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
        test_env.Ppv = test_Ppvs[day]
        test_env.Pconso = test_Pconsos[day]
        obs, info = test_env.reset(change=False)

        cumulated_reward = 0
        # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
        for _ in range(143):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            cumulated_reward += reward
        # Store the negated return for this day.
        rewards[day] = -cumulated_reward
    print(rewards.mean())

39.77742388659553
39.380497879414044
39.36470531048472
39.01615124268644
39.391096494162895


# DDPG

In [20]:
# Rebuild the environment with a non-discrete action space, then evaluate the five
# saved "ddpg_<seed>" checkpoints on the test days and print, per seed, the mean
# over days of the negated cumulated reward.
test_env = DiscreteBattery(discrete_action_space=False)
for seed in [0, 1, 2, 3, 4]:
    model = DDPG.load(f"cleps/ddpg_{seed}")
    # Sized from N_test (was a literal 100) so the buffer always matches the loop
    # below and the mean is not diluted by untouched zeros if N_test changes.
    rewards = np.zeros(N_test)
    for day in range(N_test):
        # Inject this test day's profiles before resetting.
        # NOTE(review): reset(change=False) presumably keeps the injected day — confirm.
        test_env.Ppv = test_Ppvs[day]
        test_env.Pconso = test_Pconsos[day]
        obs, info = test_env.reset(change=False)

        cumulated_reward = 0
        # Fixed-length rollout of 143 steps; terminated/truncated are not inspected.
        for _ in range(143):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            cumulated_reward += reward
        # Store the negated return for this day.
        rewards[day] = -cumulated_reward
    print(rewards.mean())

39.482393720716644
39.70285278919432
39.24226006943459
39.246914450468005
39.418292223957636
