In [None]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import random
import math
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import itertools
from keras.losses import MeanSquaredError

### Create Environment

We define the environment according to the task:

![grafik-2.png](attachment:grafik-2.png)

In [None]:
# define the environment
class SmartCharging(gym.Env):
    """Environment for charging one electric vehicle between 2 p.m. and 4 p.m.

    The charging window is split into eight 15-minute slots. In each slot the
    agent picks one of four charging powers (0, 7, 14 or 22 kW) and pays a cost
    that grows exponentially with the chosen power. The ninth call to ``step``
    closes the episode: a normally distributed energy demand (mean 30 kWh,
    standard deviation 5 kWh) is drawn and compared with the battery level.

    Besides the simulation state, the environment accumulates statistics over
    all episodes (start levels, levels after charging, energy balances, chosen
    charging rates, total charging costs) for later evaluation and plotting.
    These statistics are not cleared by ``reset``.

    Note: ``step`` returns ``(observation, reward, terminated)`` and ``reset``
    returns only the observation. This is a reduced form of the gymnasium
    signatures and is what the game loops in this file expect.
    """

    def __init__(self, render_mode=None):
        """Set up spaces, statistics and a random initial battery level.

        Args:
            render_mode: accepted for gymnasium compatibility; not used.
        """
        # action - kw: zero(0) - 0, low(1) - 7, medium(2) - 14, high(3) - 22
        self.actions_to_kw = [0, 7, 14, 22]
        self.action_space = spaces.Discrete(4)

        # average EV capacity 72 kWh: https://ev-database.org/cheatsheet/useable-battery-capacity-electric-car
        self.battery_capacity = 72
        # time steps: 0 - 2 p.m., 1 - 2:15 p.m., ... , 7 - 3:45 p.m., 8 - 4 p.m.
        self.time_step = 0
        # observation = [battery status in kWh, time step]; a single Box is a
        # valid gymnasium space (an ndarray of Box objects is not). The time
        # component can reach 9 because step() increments it once more when the
        # episode terminates.
        self.observation_space = spaces.Box(
            low=np.array([0.0, 0.0], dtype=np.float32),
            high=np.array([self.battery_capacity, 9.0], dtype=np.float32),
            dtype=np.float32,
        )

        # statistics collected over all episodes
        self.running_out_counter = 0
        self.start_kwh = []
        self.kwh_after_loading = []
        self.energy_balance = []
        self.chosen_load_rates = [0, 0, 0, 0]
        self.total_charging_costs = 0

        # 15 minute charging slots
        self.time_slot = 0.25
        # random start battery status in kWh to simulate energy left from previous shift
        # (between 0 and 60 kWh for more realistic setting, at first it was 0 and 72 kWh)
        self.battery_status = random.randint(0, self.battery_capacity - 12)

    def _get_obs(self):
        """Return the current observation as ``np.array([battery_status, time_step])``."""
        return np.array([self.battery_status, self.time_step])

    def step(self, action):
        """Advance the environment by one step.

        For time steps 0-7 the chosen charging rate is applied for one slot and
        the (negative) charging cost is returned as reward. At time step 8 the
        action is ignored, the energy demand of the shift is drawn and the
        episode terminates.

        Args:
            action: index into ``actions_to_kw`` (0-3).

        Returns:
            Tuple ``(observation, reward, terminated)``.
        """
        terminated = False

        # loading time is over, determine demand and check if charged energy was sufficient
        if self.time_step == 8:

            self.kwh_after_loading.append(self.battery_status)
            energy_demand = np.random.normal(loc=30, scale=5)
            penalty_factor = 1
            # total length of the charging window in hours
            sum_of_time_coeffs = 2

            # append new energy balance
            self.energy_balance.append(self.battery_status - energy_demand)

            # charged energy was not sufficient for the demand
            if energy_demand > self.battery_status:

                self.battery_status = 0
                # to really penalize running out of energy, make penalty as high as maximum charging costs multiplied
                # by some factor. Factor used to be 10 for the first models, then changed to 1
                reward = (-1) * sum_of_time_coeffs * math.exp(self.actions_to_kw[3]) * penalty_factor
                # increase running out of energy counter
                self.running_out_counter += 1
            # charged energy was sufficient
            else:

                self.battery_status -= energy_demand
                reward = 0
                if self.battery_status > self.battery_capacity/2:
                    # finishing a shift with left energy more than half of the capacity is not loading efficient.
                    # Define a penalty for this case which is equal to the running out of energy penalty if the medium
                    # load was chosen during the whole loading period before
                    reward = ((-1) * sum_of_time_coeffs * \
                              (math.exp(self.actions_to_kw[3]) - math.exp(self.actions_to_kw[2])) * penalty_factor)

            terminated = True
            self.time_step += 1
        # loading time is not over, adjust the current state
        else:

            # increase action tracker
            self.chosen_load_rates[action] += 1

            charged_kWh = self.time_slot * self.actions_to_kw[action]
            time_coeff = self.time_slot

            # if battery status would exceed capacity, it is only proportionally charged->adjust energy and time coefficient
            # (this branch cannot be reached with the 0 kW action, so the division is safe)
            if self.battery_status + charged_kWh > self.battery_capacity:

                charged_kWh = self.battery_capacity - self.battery_status
                time_coeff = charged_kWh/self.actions_to_kw[action]

            self.battery_status += charged_kWh
            self.time_step += 1

            reward = (-1) * time_coeff * math.exp(self.actions_to_kw[action])
            self.total_charging_costs += (-1) * reward

        return self._get_obs(), reward, terminated

    def reset(self, seed=None, options=None):
        """Start a new episode with a random battery level between 0 and 60 kWh.

        Args:
            seed: optional seed; when given, both ``random`` and ``np.random``
                (the two generators this environment draws from) are seeded.
            options: accepted for gymnasium compatibility; not used.

        Returns:
            The initial observation.
        """
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

        # make random initialization of start battery status
        self.battery_status = random.randint(0, self.battery_capacity - 12)
        self.start_kwh.append(self.battery_status)
        self.time_step = 0

        return self._get_obs()

    def get_running_out_counter(self):
        """Return how many episodes ended with less energy than demanded."""
        return self.running_out_counter

    def get_energy_balance(self):
        """Return the per-episode values of (energy after charging - demand)."""
        return self.energy_balance

    def get_total_charging_costs(self):
        """Return the charging costs summed over all episodes."""
        return self.total_charging_costs

    def get_mean_of_non_negative_energy_balance(self):
        """Return the mean of all energy balances >= 0, or -1 if there are none."""
        non_negative_values = [value for value in self.energy_balance if value >= 0]

        if len(non_negative_values) == 0:
            return -1

        return np.mean(non_negative_values)

    def get_chosen_load_rate_tracker(self):
        """Return the counts of how often each charging rate was chosen."""
        return self.chosen_load_rates

    def plot_energy_balances(self, filename):
        """Plot start level, level after charging and balance after the shift.

        Args:
            filename: path of the image file to write.
        """
        # work on local array copies: the instance attributes must stay lists
        # because step() and reset() append to them
        energy_balance = np.array(self.energy_balance)
        start_kwh = np.array(self.start_kwh)
        days = range(len(energy_balance))

        plt.figure(figsize=(14, 6))
        plt.plot(energy_balance, marker='o', linestyle='--', color='g', label='After Shift Energy Balance')
        plt.plot(start_kwh, marker='x', linestyle='--', color='r', label='Start Energy Before Loading')
        plt.plot(self.kwh_after_loading, marker='x', linestyle='--', color='b', label='Energy After Loading, Before Shift')
        plt.axhline(y=0, color='black', linestyle='-', linewidth=1, label='Zero Line')

        plt.fill_between(days,
                     energy_balance, start_kwh,
                     where=(start_kwh > energy_balance),
                     facecolor='green', alpha=0.3, interpolate=True,
                     label='Before Loading > After Shift')

        plt.fill_between(days,
                     energy_balance, start_kwh,
                     where=(energy_balance > start_kwh),
                     facecolor='red', alpha=0.3, interpolate=True,
                     label='After Shift > Before Loading')
        plt.xlabel('Days')
        plt.ylabel('kWh')
        plt.title('After Shift Energy Balance')
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.tight_layout(rect=[0, 0, 1, 1])
        # save once; the tight bounding box keeps the legend, which sits
        # outside the axes, inside the image
        plt.savefig(filename, bbox_inches='tight')
        plt.close()

    def plot_chosen_load_rate_counts(self, filename):
        """Save a bar chart of how often each charging rate was chosen.

        Args:
            filename: path of the image file to write.
        """
        categories = ['0 kW', '7 kW', '14 kW', '22 kW']
        plt.figure(figsize=(8, 6))
        plt.bar(categories, self.chosen_load_rates, color='skyblue', edgecolor='black')

        plt.xlabel('Load Rates')
        plt.ylabel('Counts')
        plt.title('Counts Of The Chosen Load Rates')
        plt.savefig(filename)
        plt.close()
    

### Deep Q-Network

We implement a Deep Q-Network to solve the learning task, using a replay buffer and a target network. The code is taken from workshop 11 of the 2024 AAA lecture. All rights by Ramin Ahadi https://is3.uni-koeln.de/en/team/doctoral-researchers/ramin-ahadi. We extended and adjusted the code for our purposes, i.e. defining the second network and using a separate target network. 

In [None]:
class ReplayBuffer(object):
    """Ring buffer of transitions (state, action, reward, next state, done).

    Once ``max_size`` transitions have been stored, the oldest entries are
    overwritten. With ``discrete=True`` actions are stored one-hot encoded as
    int8, otherwise as float32.
    """

    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        """Allocate zero-filled storage for ``max_size`` transitions.

        Args:
            max_size: number of transitions the buffer can hold.
            input_shape: length of one state vector.
            n_actions: width of one stored action row.
            discrete: whether actions are integer indices to be one-hot encoded.
        """
        self.mem_size = max_size
        self.mem_cntr = 0
        self.discrete = discrete

        state_table_shape = (self.mem_size, input_shape)
        self.state_memory = np.zeros(state_table_shape)
        self.new_state_memory = np.zeros(state_table_shape)

        action_dtype = np.int8 if self.discrete else np.float32
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=action_dtype)

        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.mem_cntr % self.mem_size

        self.state_memory[slot] = state
        self.new_state_memory[slot] = state_

        if not self.discrete:
            self.action_memory[slot] = action
        else:
            # store one hot encoding of the action index
            one_hot = np.zeros(self.action_memory.shape[1])
            one_hot[action] = 1.0
            self.action_memory[slot] = one_hot

        self.reward_memory[slot] = reward
        # stored inverted: 0 for terminal transitions, 1 otherwise
        self.terminal_memory[slot] = 1 - done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, terminal)`` where
            ``terminal`` holds the inverted done flags (see ``store_transition``).
        """
        filled = min(self.mem_cntr, self.mem_size)
        picks = np.random.choice(filled, batch_size)

        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )

def build_dqn(model_nr, lr, n_actions, input_dims):
    """Build and compile a fully connected Q-network.

    Args:
        model_nr: 1 selects the simpler network (hidden layers 256-256); any
            other value selects the more complex one (256-512-256).
        lr: learning rate of the Adam optimizer.
        n_actions: number of output units (one Q-value per action).
        input_dims: length of the state vector fed into the network.

    Returns:
        The compiled Sequential model (MSE loss).
    """
    hidden_units = [256, 256] if model_nr == 1 else [256, 512, 256]

    layers = []
    for position, units in enumerate(hidden_units):
        # only the first layer declares the input shape
        if position == 0:
            layers.append(Dense(units, input_shape=(input_dims,)))
        else:
            layers.append(Dense(units))
        layers.append(Activation('relu'))
    # linear output layer
    layers.append(Dense(n_actions))

    model = Sequential(layers)
    model.compile(optimizer=Adam(learning_rate=lr), loss='mse')

    return model


class Agent(object):
    """Deep Q-Network agent with replay buffer and separate target network.

    Actions are chosen epsilon-greedily with the online network ``q_eval``;
    the bootstrap targets in ``learn`` come from ``q_target_network``, which
    only changes when ``update_target_network`` is called.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=0.996,  epsilon_end=0.01,
                 mem_size=10000, fname='dqn_model.h5', network_nr=1):
        """Create the networks and the replay buffer.

        Args:
            alpha: learning rate passed to ``build_dqn``.
            gamma: discount factor for future rewards.
            n_actions: number of discrete actions.
            epsilon: initial probability of choosing a random action.
            batch_size: number of transitions sampled per learning step.
            input_dims: length of the state vector.
            epsilon_dec: factor epsilon is multiplied with after each learning step.
            epsilon_end: lower bound for epsilon.
            mem_size: capacity of the replay buffer.
            fname: file the online network is saved to / loaded from.
            network_nr: network architecture selector passed to ``build_dqn``.
        """
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
                                   discrete=True)
        self.q_eval = build_dqn(network_nr, alpha, n_actions, input_dims)
        # init target network with the same weights as the online network
        self.q_target_network = build_dqn(network_nr, alpha, n_actions, input_dims)
        self.q_target_network.set_weights(self.q_eval.get_weights())

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        """Return a random action with probability epsilon, else the greedy one.

        Args:
            state: 1-D state vector.
        """
        # the network expects a batch dimension
        state = state[np.newaxis, :]
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            # verbose=0: no progress bar for every single prediction
            actions = self.q_eval.predict(state, verbose=0)
            action = np.argmax(actions)

        return action

    def learn(self):
        """Run one training step on a sampled batch and decay epsilon.

        Does nothing until more than ``batch_size`` transitions were stored.
        """
        if self.memory.mem_cntr > self.batch_size:
            # note: `done` is the inverted flag from the buffer
            # (0 for terminal transitions, 1 otherwise)
            state, action, reward, new_state, done = \
                                          self.memory.sample_buffer(self.batch_size)

            # turn the one-hot encoded actions back into action indices
            action_values = np.array(self.action_space, dtype=np.int8)
            action_indices = np.dot(action, action_values)

            q_eval = self.q_eval.predict(state, verbose=0)

            # determine next values with target network
            q_next = self.q_target_network.predict(new_state, verbose=0)

            # only the entry of the action actually taken gets a new target,
            # all other outputs keep their current prediction
            q_target = q_eval.copy()

            batch_index = np.arange(self.batch_size, dtype=np.int32)

            # multiplying by `done` removes the bootstrap term for terminal transitions
            q_target[batch_index, action_indices] = reward + \
                                  self.gamma*np.max(q_next, axis=1)*done

            _ = self.q_eval.fit(state, q_target, verbose=0)

            # decay epsilon but never let it fall below the configured minimum
            self.epsilon = max(self.epsilon*self.epsilon_dec, self.epsilon_min)

    def update_target_network(self):
        """Copy the weights of the online network into the target network."""
        self.q_target_network.set_weights(self.q_eval.get_weights())

    def save_model(self):
        """Save the online network to ``self.model_file``."""
        self.q_eval.save(self.model_file)

    def load_model(self):
        """Replace the online network with the model stored in ``self.model_file``.

        The target network is left untouched.
        """
        #self.q_eval = load_model(self.model_file)
        self.q_eval = load_model(self.model_file, custom_objects={'mse': MeanSquaredError()})
        

def plotLearning(x, scores, epsilons, filename, lines=None, custom_yaxis=False):
    """Save a plot of epsilon and the running average score per game.

    Epsilon is drawn on the left y-axis, the running average of the scores
    (current game plus up to 20 preceding games) on the right y-axis.

    Args:
        x: x-values, one per game.
        scores: total reward per game.
        epsilons: epsilon value per game.
        filename: path of the image file to write.
        lines: optional x-positions at which vertical lines are drawn.
        custom_yaxis: if True, fix the score axis to [-3.5e10, 0.1e10].
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, label="1")
    # second axes on top of the first one, without frame, for the scores
    ax2 = fig.add_subplot(111, label="2", frame_on=False)

    ax.plot(x, epsilons, color="C0")
    ax.set_xlabel("Game", color="C0")
    ax.set_ylabel("Epsilon", color="C0")
    ax.tick_params(axis='x', colors="C0")
    ax.tick_params(axis='y', colors="C0")

    N = len(scores)
    running_avg = np.empty(N)
    for t in range(N):
        running_avg[t] = np.mean(scores[max(0, t-20):(t+1)])

    ax2.scatter(x, running_avg, color="C1")
    #ax2.xaxis.tick_top()
    ax2.axes.get_xaxis().set_visible(False)
    ax2.yaxis.tick_right()
    #ax2.set_xlabel('x label 2', color="C1")
    ax2.set_ylabel('Score', color="C1")
    #ax2.xaxis.set_label_position('top')
    ax2.yaxis.set_label_position('right')
    if custom_yaxis:
        ax2.set_ylim([-3.5*1e10, 0.1*1e10])

    #ax2.tick_params(axis='x', colors="C1")
    ax2.tick_params(axis='y', colors="C1")

    if lines is not None:
        for line in lines:
            plt.axvline(x=line)

    plt.savefig(filename)
    # release the figure; this function is called once per run of a grid
    # search and open figures would otherwise pile up
    plt.close(fig)


In [None]:
# train model for specified number of steps and hyperparameters and save results
def play_the_game_training(network_nr, learning_rate, gamma, epsilon, epsilon_end, n_games,model_prefix=''):
    """Train a DQN agent in the SmartCharging environment and save the results.

    The model, the learning-curve plot and the per-game scores are written to
    ``data/RL``; the file names encode ``model_prefix`` and the hyperparameters.

    Args:
        network_nr: network architecture selector passed on to the agent.
        learning_rate: learning rate of the agent's optimizer.
        gamma: discount factor.
        epsilon: initial exploration rate.
        epsilon_end: minimum exploration rate.
        n_games: number of episodes to train.
        model_prefix: extra tag inserted into all output file names.

    Returns:
        The mean score of the last (up to) 50 games.
    """
    import os

    # all outputs go to data/RL; create it up front so saving does not fail
    # because the directory is missing
    os.makedirs('data/RL', exist_ok=True)

    env = SmartCharging()

    agent = Agent(gamma=gamma, epsilon=epsilon, alpha=learning_rate, input_dims=2,
                  n_actions=4, mem_size=1000000, batch_size=64, epsilon_end=epsilon_end, 
                  fname=f'data/RL/dqn_model{model_prefix}_network{network_nr}_lr{learning_rate}_gamma{gamma}_epsilon{epsilon}.h5', 
                  network_nr=network_nr)

    scores = []
    eps_history = []

    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        print(observation)
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, int(done))
            observation = observation_
            agent.learn()

        eps_history.append(agent.epsilon)
        scores.append(score)

        # every four games update target network weights
        if i % 4 == 0:
            agent.update_target_network()

        avg_score = np.mean(scores[max(0, i-100):(i+1)])
        print('episode: ', i,'score: %.2f' % score,
              ' average score %.2f' % avg_score)

        # periodic checkpoint
        if i % 10 == 0 and i > 0:
            agent.save_model()

    # persist the final weights as well; the periodic checkpoint alone would
    # drop the training of the last episodes (and save nothing for short runs)
    if n_games > 0:
        agent.save_model()

    # save scores plot
    filename = f'data/RL/smartCharging{model_prefix}_network{network_nr}_lr{learning_rate}_gamma{gamma}_epsilon{epsilon}.png'
    x = [i+1 for i in range(n_games)]
    plotLearning(x, scores, eps_history, filename, custom_yaxis=False)
    
    # save scores
    avg_score = np.mean(scores[-min(50, n_games):])
    filename = f'data/RL/smartCharging_Scores_avg{avg_score}{model_prefix}_network{network_nr}_lr{learning_rate}_gamma{gamma}_epsilon{epsilon}.txt'
    with open(filename, 'w') as file:
        for item in scores:
            file.write(f"{item}\n")

    return avg_score

In [None]:
# load trained mode for specified hyperparameters and run it for specified number of steps in the environment, 
# save results
def play_the_game_prod(network_nr, learning_rate, gamma, epsilon, epsilon_end, n_games,model_prefix=''):
    """Run a trained model greedily in the SmartCharging environment.

    Loads the model whose file name matches ``model_prefix`` and the given
    hyperparameters, plays ``n_games`` episodes without exploration and
    without learning, and writes score plots, a score/statistics text file
    and the two evaluation plots of the environment to ``data/RL``.

    Args:
        network_nr: network architecture selector (part of the file name).
        learning_rate: learning rate used in training (part of the file name).
        gamma: discount factor used in training (part of the file name).
        epsilon: start epsilon used in training; only used for file names,
            the agent itself runs with epsilon 0.
        epsilon_end: not used here.
        n_games: number of episodes to play.
        model_prefix: extra tag that is part of all file names.

    Returns:
        The mean score over all played games.
    """
    # suffix shared by every file belonging to this hyperparameter combination
    run_tag = f'{model_prefix}_network{network_nr}_lr{learning_rate}_gamma{gamma}_epsilon{epsilon}'

    env = SmartCharging()
    agent = Agent(gamma=gamma, epsilon=0, alpha=learning_rate, input_dims=2,
                  n_actions=4, mem_size=1000000, batch_size=64, epsilon_end=0,
                  fname=f'data/RL/dqn_model{run_tag}.h5',
                  network_nr=network_nr)
    agent.load_model()

    scores, eps_history = [], []

    for i in range(n_games):
        observation = env.reset()
        score = 0
        done = False
        while not done:
            print(observation)
            action = agent.choose_action(observation)
            observation, reward, done = env.step(action)
            score += reward

        eps_history.append(agent.epsilon)
        scores.append(score)

        avg_score = np.mean(scores[max(0, i-100):(i+1)])
        print('episode: ', i,'score: %.2f' % score,
              ' average score %.2f' % avg_score)

    x = list(range(1, n_games + 1))
    # scores plot with automatic y-axis
    plotLearning(x, scores, eps_history,
                 f'data/RL/prod_smartCharging{run_tag}.png', custom_yaxis=False)
    # scores plot with y-axis scale comparable to training
    plotLearning(x, scores, eps_history,
                 f'data/RL/prod_comparable_smartCharging{run_tag}.png', custom_yaxis=True)

    # scores followed by the summary statistics of the environment
    avg_score = np.mean(scores)
    with open(f'data/RL/prod_smartCharging_Scores_avg{avg_score}{run_tag}.txt', 'w') as file:
        file.writelines(f"{item}\n" for item in scores)
        file.write(f"running out of energy: {env.get_running_out_counter()} times\n")
        file.write(f"mean of non negative energy balances: {env.get_mean_of_non_negative_energy_balance()} kWh\n")
        file.write(f"total charging costs: {env.get_total_charging_costs()}\n")

    # performance interpretation plots
    env.plot_energy_balances(f'data/RL/prod_energyBalances{run_tag}.png')
    env.plot_chosen_load_rate_counts(f'data/RL/prod_loadRateCounts{run_tag}.png')

    return avg_score

In [None]:
# conduct model training or application for all combinations of specified hyperparameters
def gridsearch(networks,learning_rates,gammas,epsilons,n_games,training,model_prefix=''):
    """Train or apply a model for every combination of the given hyperparameters.

    Prints the best average score and the parameter combination that reached it.

    Args:
        networks: network architecture selectors to try.
        learning_rates: learning rates to try.
        gammas: discount factors to try.
        epsilons: start values for epsilon to try.
        n_games: number of episodes per combination.
        training: if truthy, train new models; otherwise run saved models.
        model_prefix: extra tag that is part of all file names.
    """
    run_games = play_the_game_training if training else play_the_game_prod

    # None marks "no score seen yet"; a numeric sentinel such as 0 would be
    # indistinguishable from a real score of 0
    best_avg_score = None
    best_parameters = 'empty'

    for network, lr, gamma, epsilon in itertools.product(networks, learning_rates, gammas, epsilons):

        # keep a minimum amount of exploration only if exploration is used at all
        epsilon_end = 0.25 if epsilon > 0 else 0

        avg_score = run_games(network, lr, gamma, epsilon, epsilon_end, n_games, model_prefix)

        if best_avg_score is None or avg_score > best_avg_score:
            best_avg_score = avg_score
            best_parameters = f"Network: {network}, Learning Rate: {lr}, Gamma: {gamma}, Epsilon: {epsilon}"

    # without any combination the report stays at 0 / 'empty'
    max_avg_score = 0 if best_avg_score is None else best_avg_score
    print(f'Max average score: {max_avg_score}')
    print('with parameters: ' + best_parameters)
    

We train models with all combinations of hyperparameters defined below and select the best, considering convergence and average total rewards per game in the last few rounds of training: 

In [None]:
# first training sweep: simpler network, all combinations of the values below
n_games = 1500
networks = [1]
learning_rates = [0.0005, 0.05, 0.1]
# discount rate of future rewards
gammas = [0.95, 0.7]
# start value of epsilon, i.e. the fraction of randomly chosen actions
epsilons = [0, 0.4]

gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=True)

The best results and convergence were given by parameters: learning_rate = 0.05, gamma = 0.7, epsilon = 0:
![smartCharging_network1_lr0.05_gamma0.7_epsilon0.png](attachment:smartCharging_network1_lr0.05_gamma0.7_epsilon0.png)

And by parameters: learning_rate = 0.05, gamma = 0.7, epsilon = 0.4:
![smartCharging_network1_lr0.05_gamma0.7_epsilon0.4.png](attachment:smartCharging_network1_lr0.05_gamma0.7_epsilon0.4.png)
Note: At this point we didn't have a minimum epsilon, later we changed the minimum epsilon to 0.25 in case the start epsilon is greater than zero.

In the other models the convergence did not always look too bad but the performance in terms of average total rewards per game in the last few rounds of training was not as good as in the two selected models. For the other models it looked more like this:

learning_rate = 0.1, gamma = 0.7, epsilon = 0:
![smartCharging_network1_lr0.1_gamma0.7_epsilon0.png](attachment:smartCharging_network1_lr0.1_gamma0.7_epsilon0.png)

learning_rate = 0.1, gamma = 0.95, epsilon = 0.4: 
![smartCharging_network1_lr0.1_gamma0.95_epsilon0.4.png](attachment:smartCharging_network1_lr0.1_gamma0.95_epsilon0.4.png)

learning_rate = 0.0005, gamma = 0.7, epsilon = 0:
![smartCharging_network1_lr0.0005_gamma0.7_epsilon0.png](attachment:smartCharging_network1_lr0.0005_gamma0.7_epsilon0.png)

learning_rate = 0.0005, gamma = 0.95, epsilon = 0.4: 
![smartCharging_network1_lr0.0005_gamma0.95_epsilon0.4.png](attachment:smartCharging_network1_lr0.0005_gamma0.95_epsilon0.4.png)

Note that the convergence towards the end is not influenced by the epsilon parameter, as epsilon is zero or almost zero after 200 games. 
We now run the two best parameter combinations with the more complex neural network (containing an additional layer and additional nodes) and a minimum epsilon in the case of start epsilon = 0.4 to maintain some exploration:

In [None]:
# training with the more complex network for the two selected combinations
n_games = 1500
networks = [2]
learning_rates = [0.05]
gammas = [0.7]
epsilons = [0, 0.4]

gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=True)

For the complex network, the parameter combination learning_rate = 0.05, gamma = 0.7, epsilon = 0.4 did not perform too badly. It looks like it converges overall, but with some 'noise', which is probably caused by the minimum epsilon of 0.1 that is kept during the whole training and that we did not use in the models with the simpler network:
![smartCharging_network2_lr0.05_gamma0.7_epsilon0.4.png](attachment:smartCharging_network2_lr0.05_gamma0.7_epsilon0.4.png)

For the parameter combination learning_rate = 0.05, gamma = 0.7, epsilon = 0, training did not really converge. A possible reason could be overfitting because of the more complex neural network. It could also be that an initial bias persists without exploration, which the e-greedy strategy was able to overcome: 
![smartCharging_network2_lr0.05_gamma0.7_epsilon0.png](attachment:smartCharging_network2_lr0.05_gamma0.7_epsilon0.png)
Admittedly, at this point it is not clear if the more complex network performs worse in general, maybe different hyperparameters would work better or the one trained with e-greedy strategy already would outperform the previous selected models in a productive setting. But at the same time there is no strong evidence, so we just stick to the two previous selected models. 

So we will now run the two selected models in a productive setting for 100 simulated days and look closer at the performance and chosen actions:

In [None]:
# apply the two selected models (simple network) for 100 simulated days
n_games = 100
networks = [1]
learning_rates = [0.05]
gammas = [0.7]
epsilons = [0, 0.4]

gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=False)

First selected model (training parameters: learning_rate = 0.05, gamma = 0.7, epsilon = 0):

![prod_energyBalances_network1_lr0.05_gamma0.7_epsilon0.png](attachment:prod_energyBalances_network1_lr0.05_gamma0.7_epsilon0.png)

![prod_loadRateCounts_network1_lr0.05_gamma0.7_epsilon0.png](attachment:prod_loadRateCounts_network1_lr0.05_gamma0.7_epsilon0.png)

Second selected model (training parameters: learning_rate = 0.05, gamma = 0.7, epsilon = 0.4):
![prod_energyBalances_network1_lr0.05_gamma0.7_epsilon0.4.png](attachment:prod_energyBalances_network1_lr0.05_gamma0.7_epsilon0.4.png)

![prod_loadRateCounts_network1_lr0.05_gamma0.7_epsilon0.4.png](attachment:prod_loadRateCounts_network1_lr0.05_gamma0.7_epsilon0.4.png)

It is striking, that in both cases the agent never runs out of energy during the shift, which was one of the goals. But it is also striking, that the vehicle is often charged to full capacity and often ends a shift with more than half (36 kWh) of the maximum capacity (72 kWh), which indicates room for improvement concerning the charging cost efficiency. This is also confirmed by the chosen charging rates which are mostly only the medium and high rate. 

The total charging cost over 100 days are very similar in both models, ratio:

In [None]:
# 'total charging costs' values copied from the saved score files
# model 1: 'prod_smartCharging_Scores_avg-3415615700.9592605_network1_lr0.05_gamma0.7_epsilon0.txt'
cost_model_1 = 83534432682.9123
# model 2: 'prod_smartCharging_Scores_avg-2788565543.9834995_network1_lr0.05_gamma0.7_epsilon0.4.txt'
cost_model_2 = 85336201338.59041

# ratio of the total costs of model 1 to those of model 2
cost_ratio_model_1_to_2 = cost_model_1/cost_model_2
print(cost_ratio_model_1_to_2)

We also try two of the previously discarded models in the productive setting and notice that they have the same issues:

In [None]:
# apply one of the discarded simple-network models for 100 simulated days
n_games = 100
networks = [1]
learning_rates = [0.1]
gammas = [0.95]
epsilons = [0.4]

gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=False)

![prod_energyBalances_network1_lr0.1_gamma0.95_epsilon0.4.png](attachment:prod_energyBalances_network1_lr0.1_gamma0.95_epsilon0.4.png)

![prod_loadRateCounts_network1_lr0.1_gamma0.95_epsilon0.4.png](attachment:prod_loadRateCounts_network1_lr0.1_gamma0.95_epsilon0.4.png)

In [None]:
# apply the complex-network model trained with epsilon 0.4 for 100 simulated days
n_games = 100
networks = [2]
learning_rates = [0.05]
gammas = [0.7]
epsilons = [0.4]

gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=False)

![prod_energyBalances_network2_lr0.05_gamma0.7_epsilon0.4.png](attachment:prod_energyBalances_network2_lr0.05_gamma0.7_epsilon0.4.png)

Here it even happens that we run out of energy once. 

![prod_loadRateCounts_network2_lr0.05_gamma0.7_epsilon0.4.png](attachment:prod_loadRateCounts_network2_lr0.05_gamma0.7_epsilon0.4.png)

Since the other models don't perform better, we stick to our previous selection and try to design a different penalty (negative rewards), that put less focus on avoiding running out of energy.

We decrease the penalty for running out of energy from $(-1)*2*e^{22}*10$ to $(-1)*2*e^{22}$ (-> 2 hours of charging at highest rate). We also want to make more use of exploration in epsilon 0.4 case -> min_epsilon 0.1. We again train the two selected models in this new setting:

In [None]:
# all model and plot file names of this run carry the tag '_lowerPenalty'
networks = [1]
learning_rates = [0.05]
gammas = [0.7]
epsilons = [0,0.4]

# first train for 1500 games ...
n_games = 1500
gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=True,
           model_prefix='_lowerPenalty')

# ... then apply the trained models for 100 simulated days
n_games = 100
gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=False,
           model_prefix='_lowerPenalty')

First selected model (training parameters: learning_rate = 0.05, gamma = 0.7, epsilon = 0):

![prod_energyBalances_lowerPenalty_network1_lr0.05_gamma0.7_epsilon0.png](attachment:prod_energyBalances_lowerPenalty_network1_lr0.05_gamma0.7_epsilon0.png)

![prod_loadRateCounts_lowerPenalty_network1_lr0.05_gamma0.7_epsilon0.png](attachment:prod_loadRateCounts_lowerPenalty_network1_lr0.05_gamma0.7_epsilon0.png)

Second selected model (training parameters: learning_rate = 0.05, gamma = 0.7, epsilon = 0.4):

![prod_energyBalances_lowerPenalty_network1_lr0.05_gamma0.7_epsilon0.4.png](attachment:prod_energyBalances_lowerPenalty_network1_lr0.05_gamma0.7_epsilon0.4.png)

![prod_loadRateCounts_lowerPenalty_network1_lr0.05_gamma0.7_epsilon0.4.png](attachment:prod_loadRateCounts_lowerPenalty_network1_lr0.05_gamma0.7_epsilon0.4.png)

We observe that the focus is now more on the medium and low charging rates and no longer on the high one. But especially with the first model the vehicle is still often charged to full capacity, and we end up with a relatively high energy balance after a shift. This already looks better with the second model. In both cases, however, we now sometimes run out of energy and are nevertheless not very cost efficient in charging. We try to tweak it further.

We introduce a penalty for finishing a shift with battery status of over 50%. The penalty ($= (-1)*2*(e^{22}-e^{14})$) is designed in a way that charging for the full two hours with medium capacity and then finishing the shift with over 50% battery is as expensive as running out of energy. We hope that this is an incentive for the model to consider the lower rate or no charging more often. Since the exploration with the e-greedy policy seemed to have a positive effect in the last run, we increase exploration in the epsilon 0.4 case -> min_epsilon 0.25:

In [None]:
# all model and plot file names of this run carry the tag '_lowerAndOver50Penalty'
networks = [1]
learning_rates = [0.05]
gammas = [0.7]
epsilons = [0,0.4]

# first train for 1500 games ...
n_games = 1500
gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=True,
           model_prefix='_lowerAndOver50Penalty')

# ... then apply the trained models for 100 simulated days
n_games = 100
gridsearch(networks=networks, learning_rates=learning_rates, gammas=gammas,
           epsilons=epsilons, n_games=n_games, training=False,
           model_prefix='_lowerAndOver50Penalty')

We look at the runs in the productive setting after training the models in the adjusted environment, and we realise that the performance now looks much better:

First selected model (training parameters: learning_rate = 0.05, gamma = 0.7, epsilon = 0):
![prod_energyBalances_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.png](attachment:prod_energyBalances_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.png)

![prod_loadRateCounts_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.png](attachment:prod_loadRateCounts_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.png)

The first model runs out of energy a few times, but only just misses the demand, so it only runs slightly into the red. The vehicle is rarely fully charged and the shift usually ends with less than 30 kWh of charge. In addition, the low charging rate or no charging is now used much more frequently. This is really what we wanted to achieve. Now the two goals of avoiding running out of energy and minimising the charging cost seem well balanced. 

Total charging cost over 100 days is only 12% of cost from the initial model version, which never ran out of energy:

In [None]:
# 'total charging costs' values copied from the saved score files
# adjusted rewards: 'prod_smartCharging_Scores_avg-1103218197.8833294_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.txt'
cost_model_1_now = 9951475722.353336
# initial rewards: 'prod_smartCharging_Scores_avg-3415615700.9592605_network1_lr0.05_gamma0.7_epsilon0.txt'
cost_model_1_old = 83534432682.9123

# share of the initial total costs that is left with the adjusted rewards
cost_share_model_1 = cost_model_1_now/cost_model_1_old
print(cost_share_model_1)

Second selected model (training parameters: learning_rate = 0.05, gamma = 0.7, epsilon = 0.4):
![prod_energyBalances_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.4.png](attachment:prod_energyBalances_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.4.png)

![prod_loadRateCounts_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.4.png](attachment:prod_loadRateCounts_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.4.png)

The second model seems to focus even more on charging cost efficiency. It runs out of energy much more often and to a somewhat greater extent, but still not to a catastrophic degree (max. approx. -15 kWh). However, the shifts are ended much more clearly with a charge level below 30 kWh. Furthermore we observe that the low charging rate is now really dominating.

Total charging cost over 100 days is essentially nothing of the cost from the initial model version, which never ran out of energy:

In [None]:
# 'total charging costs' values copied from the saved score files
# adjusted rewards: 'prod_smartCharging_Scores_avg-2581381854.819368_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.4.txt'
cost_model_2_now = 26865769.03048192
# initial rewards: 'prod_smartCharging_Scores_avg-2788565543.9834995_network1_lr0.05_gamma0.7_epsilon0.4.txt'
cost_model_2_old = 85336201338.59041

# share of the initial total costs that is left with the adjusted rewards
cost_share_model_2 = cost_model_2_now/cost_model_2_old
print(cost_share_model_2)

It has been shown that reward design plays a key role here. We have achieved satisfactory results, but at the same time it should be noted that not all possibilities have been exhausted. To counteract the excessive overspending of available energy, for example, the penalty could be made dynamic. Currently, running out of energy is always equally expensive. The same applies to ending the shift with an unnecessarily high charge level, which indicates wasteful charging behaviour. The hyperparameter optimisation of both the neural network itself and the Deep Q-Network learning process could also be designed in more detail.

We propose the model 'dqn_model_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.h5' as the final smart charging agent, because in our test it only ran out of energy a few times and to a small extent. At the same time we could reduce the charging cost to about 12% of the initial version, which strictly avoids running out of energy. This was the best improvement and performance we could achieve with respect to our two goals of not running out of energy and minimising the charging cost.

In [None]:
# file of the model that was finally selected as smart charging agent
filename = 'data/RL/dqn_model_lowerAndOver50Penalty_network1_lr0.05_gamma0.7_epsilon0.h5'
# load it; the loss name 'mse' is mapped to a MeanSquaredError instance
smart_charging_agent = load_model(
    filename,
    custom_objects={'mse': MeanSquaredError()},
)