# Smart Charging Using Reinforcement Learning

In [144]:
import math
import numpy as np
import gym
from gym import Env
from gym.spaces import Discrete, Box
from keras.models import Sequential
from keras.layers import Dense, Flatten
from tensorflow.keras.optimizers.legacy import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [145]:
class EVChargingEnvironment(Env):
    """Gym environment for charging an electric vehicle over a fixed number of intervals.

    An observation is the two-element array ``[battery_level, time_index]``; the
    battery level is truncated to an integer when written into the observation.
    On every step the agent selects one of the discrete charging rates. Charging
    yields a negative reward that depends on the current time interval and the
    selected rate; not charging yields a reward of 0. After the last interval an
    energy demand is drawn from a normal distribution and a large penalty is
    added if the battery level does not cover it.

    The number of actions and the battery capacity are read from the module-level
    constants ``NUM_ACTIONS`` and ``BATTERY_LIMIT`` when the environment is built.
    """

    def __init__(self):
        self.action_space = Discrete(NUM_ACTIONS)
        self.battery_limit = BATTERY_LIMIT
        self.battery_level = self.battery_limit
        self.energy_demand = 0

        # Cost coefficient of each charging interval; the number of coefficients
        # defines how many steps an episode has.
        self.time_coefficients = [0.9, 0.91, 0.93, 0.94, 0.96, 0.97, 0.99, 1.0]
        self.num_intervals = len(self.time_coefficients)

        # A state consists of battery level and time interval. A single
        # two-dimensional Box is used so that `observation_space` is a real gym
        # space (an array of Box objects is not). The time index reaches
        # `num_intervals` in the terminal observation, hence the upper bound.
        self.observation_space = Box(
            low=np.array([0, 0], dtype=np.float32),
            high=np.array([self.battery_limit, self.num_intervals], dtype=np.float32),
            dtype=np.float32,
        )

        # Initialize first state
        self.current_time_index = 0
        self.state = np.array([self.battery_level, self.current_time_index])

        # Energy added per 15-minute interval for charging powers of
        # 0, 7, 14 and 22 kW (power divided by 4).
        self.charging_rates = [0, 1.75, 3.5, 5.5]
        # Scaled rate used inside the exponential cost term.
        self.charging_rates_scaled = [0, 0.25, 0.5, 1]

    def step(self, action):
        """Apply one charging action and advance to the next time interval.

        Args:
            action: Index into ``charging_rates`` / ``charging_rates_scaled``.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is the
            environment's own observation array (updated in place), ``reward``
            is the charging cost (plus the shortfall penalty on the final step),
            ``done`` is True after the last interval and ``info`` is an empty dict.

        Raises:
            IndexError: If called again after the episode finished without
                calling ``reset`` (there is no cost coefficient for that index).
        """
        charging_rate = self.charging_rates[action]
        charging_rate_scaled = self.charging_rates_scaled[action]
        time_coefficient = self.time_coefficients[self.current_time_index]

        # Charge the battery without exceeding its capacity.
        self.battery_level = min(self.battery_level + charging_rate, self.battery_limit)

        # Charging costs grow exponentially with the rate; not charging is free.
        if charging_rate == 0:
            reward = 0
        else:
            reward = round(-time_coefficient * math.exp(charging_rate_scaled), 2)

        # Update time index. The index is always a valid interval here (the
        # coefficient lookup above would have failed otherwise), so no wrap-around
        # is needed.
        self.current_time_index += 1
        self.state[1] = self.current_time_index

        # Check if simulation is complete
        done = self.current_time_index == self.num_intervals

        if done:
            # Get energy demand from normal distribution
            self.energy_demand = np.random.normal(30, 5)

            if self.battery_level < self.energy_demand:
                # A penalty of 1000 dwarfs the per-step charging costs, so running
                # out of energy is far worse than charging at the highest rate.
                reward -= 1000.0

        # Update battery level
        self.state[0] = int(self.battery_level)

        return self.state, reward, done, {}

    def reset(self):
        """Start a new episode and return the initial observation.

        The time index is set back to 0 and the most recently drawn energy demand
        is subtracted from the battery level (never going below 0). The battery is
        not refilled, so the level carries over from the previous episode.
        """
        self.current_time_index = 0

        self.battery_level = max(0, self.battery_level - self.energy_demand)

        self.state = np.array([int(self.battery_level), self.current_time_index])
        return self.state

In [146]:
def build_model(states, actions):
    """Build the Q-network: a small fully connected model with one output per action.

    Args:
        states: Shape of a single observation (for example
            ``env.observation_space.shape``) or a plain integer for a flat
            observation of that length.
        actions: Number of discrete actions; size of the linear output layer.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    # Derive the input shape from `states` instead of hard-coding it, so the
    # network follows the environment's observation shape.
    observation_shape = tuple(int(dim) for dim in np.atleast_1d(states))

    model = Sequential()
    # The leading 1 is the number of observations passed per step; it matches
    # the window_length=1 used for the agent's memory in build_agent.
    model.add(Flatten(input_shape=(1,) + observation_shape))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(4, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

def build_agent(model, actions):
    """Create a DQN agent around ``model``.

    Args:
        model: Keras model that outputs one value per action.
        actions: Number of discrete actions the agent can choose from.

    Returns:
        An uncompiled ``DQNAgent`` using a Boltzmann Q-policy and a sequential
        memory holding up to 50000 entries with a window length of 1.
    """
    replay_memory = SequentialMemory(limit=50000, window_length=1)
    exploration_policy = BoltzmannQPolicy()
    return DQNAgent(
        model=model,
        memory=replay_memory,
        policy=exploration_policy,
        nb_actions=actions,
        nb_steps_warmup=10000,
        target_model_update=1e-2,
    )

NUM_ACTIONS = 4 # Action space contains 4 actions: zero, low, medium, high charging
NUM_TIME_INTERVALS = 8 # charging every 15 minutes from 2pm to 4pm
BATTERY_LIMIT = 58 # Assumption: 58 kWh (Netto) battery capacity

env = EVChargingEnvironment()

states = env.observation_space.shape
actions = env.action_space.n

# Build the Q-network and the agent, then train the agent on the environment.
model = build_model(states, actions)
dqn = build_agent(model, actions)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# triggers a warning when the optimizer is constructed.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
training_results = dqn.fit(env, nb_steps=100000, visualize=False, verbose=1)

  super().__init__(name, **kwargs)
2023-08-15 10:45:26.168100: W tensorflow/c/c_api.cc:304] Operation '{name:'dense_96/bias/Assign' id:15735 op device:{requested: '', assigned: ''} def:{{{node dense_96/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_96/bias, dense_96/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


Training for 100000 steps ...
Interval 1 (0 steps performed)


  updates=self.state_updates,
2023-08-15 10:45:27.337876: W tensorflow/c/c_api.cc:304] Operation '{name:'dense_98/BiasAdd' id:15788 op device:{requested: '', assigned: ''} def:{{{node dense_98/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_98/MatMul, dense_98/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-08-15 10:45:27.807936: W tensorflow/c/c_api.cc:304] Operation '{name:'count_122/Assign' id:16016 op device:{requested: '', assigned: ''} def:{{{node count_122/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](count_122, count_122/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. E

1250 episodes - episode_reward: -991.714 [-1013.150, -9.760]

Interval 2 (10000 steps performed)
    1/10000 [..............................] - ETA: 59s - reward: 0.0000e+00

2023-08-15 10:46:00.562294: W tensorflow/c/c_api.cc:304] Operation '{name:'dense_98_1/BiasAdd' id:15862 op device:{requested: '', assigned: ''} def:{{{node dense_98_1/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_98_1/MatMul, dense_98_1/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-08-15 10:46:01.245068: W tensorflow/c/c_api.cc:304] Operation '{name:'loss_123/AddN' id:16122 op device:{requested: '', assigned: ''} def:{{{node loss_123/AddN}} = AddN[N=2, T=DT_FLOAT, _has_manual_control_dependencies=true](loss_123/mul, loss_123/mul_1)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
20

1250 episodes - episode_reward: -767.984 [-1018.570, -7.760] - loss: 16807.130 - mae: 504.335 - mean_q: -586.513

Interval 3 (20000 steps performed)
1250 episodes - episode_reward: -19.187 [-20.670, -6.500] - loss: 4268.744 - mae: 380.253 - mean_q: -456.139

Interval 4 (30000 steps performed)
1250 episodes - episode_reward: -20.552 [-20.670, -16.600] - loss: 2492.550 - mae: 264.975 - mean_q: -302.937

Interval 5 (40000 steps performed)
1250 episodes - episode_reward: -20.600 [-20.670, -18.690] - loss: 1897.483 - mae: 191.787 - mean_q: -207.176

Interval 6 (50000 steps performed)
1250 episodes - episode_reward: -20.282 [-20.670, -16.520] - loss: 1337.285 - mae: 96.013 - mean_q: -90.752

Interval 7 (60000 steps performed)
1250 episodes - episode_reward: -17.431 [-20.670, -12.520] - loss: 161.138 - mae: 24.012 - mean_q: -15.668

Interval 8 (70000 steps performed)
1250 episodes - episode_reward: -18.078 [-1014.550, -12.160] - loss: 16.185 - mae: 19.471 - mean_q: -12.652

Interval 9 (80000 

In [147]:
NUM_ACTIONS = 4 # Action space contains 4 actions: zero, low, medium, high charging
NUM_TIME_INTERVALS = 8 # charging every 15 minutes from 2pm to 4pm
BATTERY_LIMIT = 58 # Assumption: 58 kWh (Netto) battery capacity

env = EVChargingEnvironment()

episodes = 10

# Sanity check of the environment: play a few episodes with randomly sampled
# actions and print every transition together with the episode score.
for episode in range(1, episodes+1):
    state = env.reset()
    score = 0
    print(f"Episode {episode}\nInitial State: {state}")

    while True:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward

        print(f"Action: {action} | New State: {n_state} | Reward: {reward} | Done: {done}")

        if done:
            break

    print(f"Episode {episode} - Score: {round(score,2)}\n{'*' * 50}")


Episode 1
Initial State: [58  0]
Action: 0 | New State: [58  1] | Reward: 0 | Done: False
Action: 0 | New State: [58  2] | Reward: 0 | Done: False
Action: 0 | New State: [58  3] | Reward: 0 | Done: False
Action: 0 | New State: [58  4] | Reward: 0 | Done: False
Action: 2 | New State: [58  5] | Reward: -1.58 | Done: False
Action: 3 | New State: [58  6] | Reward: -2.64 | Done: False
Action: 1 | New State: [58  7] | Reward: -1.27 | Done: False
Action: 3 | New State: [58  8] | Reward: -2.72 | Done: True
Episode 1 - Score: -8.21
**************************************************
Episode 2
Initial State: [20  0]
Action: 2 | New State: [24  1] | Reward: -1.48 | Done: False
Action: 0 | New State: [24  2] | Reward: 0 | Done: False
Action: 1 | New State: [25  3] | Reward: -1.19 | Done: False
Action: 0 | New State: [25  4] | Reward: 0 | Done: False
Action: 1 | New State: [27  5] | Reward: -1.23 | Done: False
Action: 3 | New State: [33  6] | Reward: -2.64 | Done: False
Action: 3 | New State: [38  7

In [148]:
# Run the trained agent on the current `env` for 150 test episodes without rendering.
results = dqn.test(
    env,
    nb_episodes=150,
    visualize=False,
)

Testing for 150 episodes ...
Episode 1: reward: -20.670, steps: 8
Episode 2: reward: -13.580, steps: 8
Episode 3: reward: -12.520, steps: 8
Episode 4: reward: -12.520, steps: 8
Episode 5: reward: -12.520, steps: 8
Episode 6: reward: -12.520, steps: 8
Episode 7: reward: -15.690, steps: 8
Episode 8: reward: -12.520, steps: 8
Episode 9: reward: -12.520, steps: 8
Episode 10: reward: -12.520, steps: 8
Episode 11: reward: -12.520, steps: 8
Episode 12: reward: -12.520, steps: 8
Episode 13: reward: -12.520, steps: 8
Episode 14: reward: -12.520, steps: 8
Episode 15: reward: -12.520, steps: 8
Episode 16: reward: -15.690, steps: 8
Episode 17: reward: -16.720, steps: 8
Episode 18: reward: -12.520, steps: 8
Episode 19: reward: -12.520, steps: 8
Episode 20: reward: -13.590, steps: 8
Episode 21: reward: -12.520, steps: 8
Episode 22: reward: -16.700, steps: 8
Episode 23: reward: -15.690, steps: 8
Episode 24: reward: -14.650, steps: 8
Episode 25: reward: -17.730, steps: 8
Episode 26: reward: -12.520, s

In [149]:
NUM_ACTIONS = 4 # Action space contains 4 actions: zero, low, medium, high charging
NUM_TIME_INTERVALS = 8 # charging every 15 minutes from 2pm to 4pm
BATTERY_LIMIT = 58 # Assumption: 58 kWh (Netto) battery capacity

env = EVChargingEnvironment()
final_battery_levels = []
episode_rewards = []
episodes = 1000

# Evaluate the greedy policy of the trained agent: in every state pick the
# action with the highest Q-value and record the episode reward and the
# battery level at the end of the episode.
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    rewards = 0

    learned_policy = []
    print(f"Episode {episode}\nInitial State: {state}")

    while not done:
        # compute_q_values is given the observation with a leading axis of
        # length 1 (one observation per step).
        state = np.expand_dims(state, axis=0)
        q_values = dqn.compute_q_values(state)  # Get Q-values from the DQN
        learned_action = np.argmax(q_values)  # Choose action with highest Q-value
        learned_policy.append(learned_action)
        state, reward, done, _ = env.step(learned_action)
        rewards += reward
    episode_rewards.append(rewards)
    # `final_battery_levels` is a list, so the value has to be appended
    # (calling the list raises a TypeError).
    final_battery_levels.append(state[0])

    # Report this cell's own accumulator (`rewards`) as the episode score.
    print(f"Episode {episode} - Actions: {learned_policy} - Score: {round(rewards,2)}\n{'*' * 50}")


Episode 1
Initial State: [58  0]
Episode 1 - Actions: [2, 2, 2, 2, 2, 2, 2, 2] - Score: -12.52
**************************************************
Episode 2
Initial State: [29  0]
Episode 2 - Actions: [2, 2, 2, 2, 2, 2, 2, 2] - Score: -12.52
**************************************************
Episode 3
Initial State: [25  0]
Episode 3 - Actions: [2, 2, 2, 2, 2, 2, 2, 2] - Score: -12.52
**************************************************
Episode 4
Initial State: [17  0]
Episode 4 - Actions: [2, 2, 2, 2, 2, 2, 2, 3] - Score: -13.59
**************************************************
Episode 5
Initial State: [24  0]
Episode 5 - Actions: [2, 2, 2, 2, 2, 2, 2, 2] - Score: -12.52
**************************************************
Episode 6
Initial State: [27  0]
Episode 6 - Actions: [2, 2, 2, 2, 2, 2, 2, 2] - Score: -12.52
**************************************************
Episode 7
Initial State: [28  0]
Episode 7 - Actions: [2, 2, 2, 2, 2, 2, 2, 2] - Score: -12.52
*****************************

In [151]:
NUM_ACTIONS = 4 # Action space contains 4 actions: zero, low, medium, high charging
NUM_TIME_INTERVALS = 8 # charging every 15 minutes from 2pm to 4pm
BATTERY_LIMIT = 58 # Assumption: 58 kWh (Netto) battery capacity

env = EVChargingEnvironment()
final_battery_levels = []
episode_rewards = []
episodes = 1000

# Greedy evaluation of the trained agent: per episode, print and store the
# total reward and the battery level in the final state.
for episode in range(1, episodes+1):
    state = env.reset()
    rewards = 0
    learned_policy = []

    while True:
        # The DQN expects the observation with a leading axis of length 1.
        q_values = dqn.compute_q_values(np.expand_dims(state, axis=0))
        learned_action = np.argmax(q_values)  # Greedy action: highest Q-value
        learned_policy.append(learned_action)
        state, reward, done, _ = env.step(learned_action)
        rewards += reward
        if done:
            break

    episode_rewards.append(rewards)
    final_battery_levels.append(state[0])
    print(rewards)
    print(state[0])

    


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


-12.520000000000001
58
-12.520000000000001
58
-12.520000000000001
53
-12.520000000000001
52
-12.520000000000001
52
-12.520000000000001
47
-12.520000000000001
54
-12.520000000000001
51
-15.69
47
-18.73
46
-13.590000000000002
46
-12.520000000000001
49
-12.520000000000001
48
-14.65
45
-12.520000000000001
48
-12.520000000000001
53
-12.520000000000001
48
-12.520000000000001
49
-14.65
46
-12.520000000000001
48
-15.69
47
-12.520000000000001
50
-12.520000000000001
51
-17.73
45
-15.69
47
-16.72
46
-14.65
47
-15.69
45
-12.520000000000001
47
-12.520000000000001
48
-14.65
47
-18.73
45
-17.73
46
-12.520000000000001
46
-12.520000000000001
45
-13.590000000000002
46
-13.590000000000002
46
-12.520000000000001
50
-14.65
46
-19.7
46
-19.7
46
-14.65
47
-15.69
45
-12.520000000000001
46
-13.58
45
-16.72
46
-16.72
46
-12.520000000000001
53
-12.520000000000001
51
-12.520000000000001
49
-12.520000000000001
48
-13.58
45
-16.7
45
-18.73
46
-19.7
46
-18.73
46
-12.520000000000001
49
-12.520000000000001
47
-12.5200

In [133]:
num_episodes = 1000

states = []
all_scores = []

# Follow the greedy policy of the trained agent on the existing `env` and
# report, per episode, the chosen actions, the score and the final battery level.
for episode in range(num_episodes):
    state = env.reset()
    scores = 0
    learned_policy = []

    while True:
        # The DQN expects the observation with a leading axis of length 1.
        q_values = dqn.compute_q_values(np.expand_dims(state, axis=0))
        learned_action = np.argmax(q_values)  # Greedy action: highest Q-value
        learned_policy.append(learned_action)
        state, reward, done, _ = env.step(learned_action)
        scores += reward
        if done:
            break

    print(f"Episode {episode + 1}: Learned Policy: {learned_policy}")
    print(f"Episode {episode + 1}: Score: {scores}")
    print(f"Episode {episode + 1}: Final battery level: {state[0]}") 
    states.append(state[0])
    all_scores.append(scores)



Episode 1: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 1: Score: -1000.0
Episode 1: Final battery level: 0
Episode 2: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 2: Score: -1000.0
Episode 2: Final battery level: 0
Episode 3: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 3: Score: -1000.0
Episode 3: Final battery level: 0
Episode 4: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 4: Score: -1000.0
Episode 4: Final battery level: 0
Episode 5: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 5: Score: -1000.0
Episode 5: Final battery level: 0
Episode 6: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 6: Score: -1000.0
Episode 6: Final battery level: 0
Episode 7: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 7: Score: -1000.0
Episode 7: Final battery level: 0
Episode 8: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 8: Score: -1000.0
Episode 8: Final battery level: 0
Episode 9: Learned Policy: [0, 0, 0, 0, 0, 0, 0, 0]
Episode 9: Score: -1000.0
Episode 9: Final battery l

KeyboardInterrupt: 