In [None]:
# IMPORTS
import numpy as np
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LeakyReLU, ELU
from tensorflow.keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from CustomKerasRL2Callbacks_torqueCtrl import StoreEpisodeLogger, randomSpeedProfile
from gym.wrappers import FlattenObservation
from gym.core import Wrapper
from gym.spaces import Box, Tuple
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))
import gym_electric_motor as gem
from gym_electric_motor.reward_functions import WeightedSumOfErrors
from gym_electric_motor.physical_systems import ExternalSpeedLoad
from gym_electric_motor.reference_generators import ConstReferenceGenerator

In [None]:
class TransformObservationWrapper(Wrapper):
    """
    Post-process the outputs of the GEM simulation (observation, reward, done flag).

    The wrapped environment is stepped / reset as usual. Its state vector is then reduced to
    the features the agent is supposed to see, and the reward and done flag returned by GEM
    are replaced by the zone-based reward computed in ``_shaped_reward``. The reference and
    the ``info`` dict are passed through unchanged.

    Layout of the observation handed to the agent (all GEM quantities are already normalized):
        [0]    angular velocity                         (state[0])
        [1:3]  currents in dq                           (state[5:7])
        [3:5]  requested voltage in dq                  (switching command via abc_to_dq_space)
        [5:7]  0.1 * cos(eps), 0.1 * sin(eps)           with eps = state[12] * pi
        [7]    2 * i_s - 1                              with i_s = sqrt(i_d^2 + i_q^2)

    Attributes:
        subactions: switching commands of the converter mapped from {0, 1} to {-1, +1}.
        gamma: discount factor read from the reward function of the wrapped environment;
            it scales every reward band.
        test: flag initialised to False; it is not read inside this class
            (presumably consumed by external code -- confirm before removing).
        _obs_logger: concatenation of the raw GEM state and reference of the latest
            step / reset (not read inside this class; presumably used by a logging callback).
    """

    # scaling applied to the cos / sin representation of the electrical angle
    _ANGLE_SCALE = 0.1
    # reward-shaping thresholds on the normalized quantities; presumably 15 A and 240 A
    # divided by a 270 A current limit -- confirm against the motor limits in use
    _ID_BOUNDARY = 15 / 270
    _DANGERZONE_BOUNDARY = 240 / 270
    # absolute torque tolerance, divided by the torque limit of the environment at runtime
    _TORQUE_TOLERANCE = 5

    def __init__(self, environment):
        """
        Wrap ``environment`` and declare the reduced observation space.

        Args:
            environment: GEM environment whose observation space is a
                (state space, reference space) pair.
        """
        super().__init__(environment)
        state_space = environment.observation_space[0]
        lower_bound = np.concatenate(([state_space.low[0]],   # angular velocity
                                      state_space.low[5:7],    # currents in dq
                                      state_space.low[10:12],  # voltages in dq
                                      [-1, -1],                # angles in cos, sin
                                      [-1]))                   # stator current
        upper_bound = np.concatenate(([state_space.high[0]],
                                      state_space.high[5:7],
                                      state_space.high[10:12],
                                      [+1, +1],
                                      [+1]))
        self.observation_space = Tuple((Box(lower_bound, upper_bound),
                                        environment.observation_space[1]))  # reference torque

        # map the binary switching states {0, 1} to {-1, +1}: -(-1)^0 = -1, -(-1)^1 = +1
        self.subactions = -np.power(-1, self.env.physical_system._converter._subactions)
        self.gamma = self.env.reward_function._gamma
        self.test = False

    def _build_observation(self, state, u_abc):
        """Assemble the agent observation from the raw GEM state and the applied switching command."""
        eps = state[12] * np.pi
        angles = [self._ANGLE_SCALE * np.cos(eps), self._ANGLE_SCALE * np.sin(eps)]

        # transform the action information to the dq-frame
        u_dq = self.env.physical_system.abc_to_dq_space(u_abc, epsilon_el=eps)

        # stator current i_s from the dq currents
        current_total = np.sqrt(state[5] ** 2 + state[6] ** 2)

        # please note that all values are already normalized!
        return np.concatenate(([state[0]],
                               state[5:7],
                               u_dq,
                               angles,
                               [2 * current_total - 1]))

    def _shaped_reward(self, state, ref):
        """
        Compute the zone-based reward and the termination flag.

        The zones are checked in the order E, D, C, B, A; the first matching one applies.
        Every non-terminal zone scales its term by (1 - gamma) / 2 and adds a zone-specific
        offset (D: -(1 - gamma), C: -(1 - gamma) / 2, B: 0, A: +(1 - gamma) / 2).

        Returns:
            (reward, terminated) -- ``terminated`` is True only in zone E.
        """
        i_d = state[5]
        i_q = state[6]
        current_total = np.sqrt(i_d ** 2 + i_q ** 2)
        e_T_abs = np.abs(ref[0] - state[1])  # state[1] is the torque, ref[0] its reference
        torque_boundary = self._TORQUE_TOLERANCE / self.env.limits[1]

        if current_total > 1:  # region E, "Error zone", set the terminal flag
            return -1, True

        if current_total > self._DANGERZONE_BOUNDARY:  # region D, "Danger zone", short time overcurrent
            reward_offset = - (1 - self.gamma)
            rew = (1 - (current_total - self._DANGERZONE_BOUNDARY) / (1 - self._DANGERZONE_BOUNDARY)) \
                * (1 - self.gamma) / 2 + reward_offset

        elif i_d > self._ID_BOUNDARY:  # region C, saturation of the permanent magnet
            reward_offset = - (1 - self.gamma) / 2
            rew = (1 - (i_d - self._ID_BOUNDARY) / (self._DANGERZONE_BOUNDARY - self._ID_BOUNDARY)) \
                * (1 - self.gamma) / 2 + reward_offset

        elif e_T_abs > torque_boundary:  # region B, "Basic zone", torque is not yet accurate
            reward_offset = 0
            rew = (1 - e_T_abs / 2) * (1 - self.gamma) / 2 + reward_offset

        else:  # region A, "Awesome zone", torque is accurate and current needs to be minimized
            reward_offset = (1 - self.gamma) / 2
            rew = (1 - current_total) * (1 - self.gamma) / 2 + reward_offset

        return rew, False

    def step(self, action):
        """
        Step the wrapped environment and replace observation, reward and done flag.

        Args:
            action: index into ``self.subactions`` (the discrete switching command).

        Returns:
            ((observation, reference), reward, terminated, info)
        """
        # reward and done flag of GEM are discarded, they are recomputed below
        (state, ref), _, _, info = self.env.step(action)

        self._obs_logger = np.concatenate((state, ref))

        observable_state = self._build_observation(state, self.subactions[action])
        rew, term = self._shaped_reward(state, ref)

        return (observable_state, ref), rew, term, info

    def reset(self, **kwargs):
        """
        Reset the wrapped environment and return the transformed initial observation.

        ``kwargs`` are accepted for signature compatibility but are not forwarded to the
        wrapped environment.

        Returns:
            (observation, reference)
        """
        state, ref = self.env.reset()

        self._obs_logger = np.concatenate((state, ref))

        # it is assumed that immediately after the reset no voltage is applied
        observable_state = self._build_observation(state, self.subactions[0])

        return (observable_state, ref)

In [None]:
# Define the environment parameters
# The torque reference is constant: its value is drawn once from a uniform distribution
# over [-1, 1) when this cell is executed. No seed is set in this notebook, so the value
# differs between kernel runs. (The value is presumably a normalized torque -- confirm.)
torque_ref_generator = ConstReferenceGenerator(reference_state='torque', reference_value=np.random.uniform(-1, 1))

# Electrical parameters of the PMSM (BRUSA)
motor_parameter = {
    "p": 3,             # number of pole pairs
    "r_s": 17.932e-3,   # stator resistance in Ohm
    "l_d": 0.37e-3,     # d-axis inductance in H
    "l_q": 1.2e-3,      # q-axis inductance in H
    "psi_p": 65.65e-3,  # magnetic flux of the permanent magnet in Vs
}

# supply voltage, reused as nominal voltage
u_sup = 350

# nominal operating point (omega presumably converts 12000 rpm to rad/s)
nominal_values = {
    "omega": 12000 * 2 * np.pi / 60,
    "i": 240,
    "u": u_sup,
}

# limits equal the nominal values, except for a higher current limit and an added torque limit
limit_values = {**nominal_values, "i": 270, "torque": 200}

# simulation / controller sampling time
sampling_time = 50e-6

In [None]:
# define the training routine
def train_agent(param_dict):
    """
    Build the PMSM torque-control environment, train a DQN agent on it and store the weights.

    Besides ``param_dict`` the function reads the notebook-level names ``nominal_values``,
    ``limit_values``, ``motor_parameter``, ``u_sup``, ``sampling_time`` and
    ``torque_ref_generator``; the corresponding cell has to be executed first.

    Args:
        param_dict: dict with the following keys
            subfolder_name: folder for the episode logs and the final weights (created if missing)
            gamma: discount factor, handed to the reward function and the DQN agent
            alpha0, alpha1: initial learning rate of Adam / values handed to the logger
                as ``lr_max`` and ``lr_min``
            lr_reduction_start, lr_reduction_interval: handed to the logger as
                ``nb_steps_start`` and ``nb_steps_reduction``
            epsilon0, epsilon1, nb_policy_annealing_steps: start value, end value and
                duration of the linear epsilon annealing
            layers, neurons: number of hidden layers and units per hidden layer
            target_update_parameter: ``target_model_update`` of the DQN agent
            batch_size, memory_size: minibatch size and replay memory capacity
            nb_episode_steps, nb_training_steps: maximum steps per episode / total training steps
            activation_fcn: name of the hidden activation; 'leaky_relu' and 'elu' are added
                as separate parameterized layers, any other name is passed to ``Dense``
            activation_fcn_parameter: ``alpha`` of LeakyReLU / ELU (unused otherwise)

    Raises:
        KeyError: if one of the keys listed above is missing.
    """

    # unpack the parameters
    subfolder_name = param_dict["subfolder_name"]

    gamma = param_dict["gamma"]

    alpha0 = param_dict["alpha0"]
    alpha1 = param_dict["alpha1"]
    lr_reduction_start = param_dict["lr_reduction_start"]
    lr_reduction_interval = param_dict["lr_reduction_interval"]

    epsilon0 = param_dict["epsilon0"]
    epsilon1 = param_dict["epsilon1"]
    nb_policy_annealing_steps = param_dict["nb_policy_annealing_steps"]

    layers = param_dict["layers"]
    neurons = param_dict["neurons"]
    target_update_parameter = param_dict["target_update_parameter"]

    batch_size = param_dict["batch_size"]
    memory_size = param_dict["memory_size"]

    nb_episode_steps = param_dict["nb_episode_steps"]
    nb_training_steps = param_dict["nb_training_steps"]

    activation_fcn = param_dict["activation_fcn"]
    activation_fcn_parameter = param_dict["activation_fcn_parameter"]

    # hide all GPUs from TensorFlow so that the training runs on the CPU
    tf.config.set_visible_devices([], 'GPU')

    # create the subfolder if it does not exist yet
    Path(subfolder_name).mkdir(parents=True, exist_ok=True)

    random_profile_generator = randomSpeedProfile(maxSpeed=nominal_values["omega"],
                                                  epsLength=nb_episode_steps)

    # create the PMSM environment
    env = gem.make("Finite-TC-PMSM-v0",
                   motor=dict(
                       motor_parameter=motor_parameter,
                       limit_values=limit_values,
                       nominal_values=nominal_values,
                   ),
                   supply=dict(u_nominal=u_sup),
                   load=ExternalSpeedLoad(random_profile_generator.randomProfile,
                                          tau=sampling_time),
                   tau=sampling_time,
                   # the reward distribution will be overwritten by means of the wrapper,
                   # only gamma is picked up from this reward function
                   reward_function=WeightedSumOfErrors(reward_weights={'torque': 1},
                                                       gamma=gamma),
                   reference_generator=torque_ref_generator,
                   ode_solver='scipy.solve_ivp'
                   )

    # reset once before wrapping; the returned observation is not needed here
    env.reset()
    limits = env.physical_system.limits

    # wrap the environment to preprocess the observation as desired (to overwrite observation, reward, done flag)
    env = FlattenObservation(TransformObservationWrapper(env))

    # create the feedforward multilayer perceptron to be used as DQN
    # parameterized activations are appended as their own layer, so the Dense layer stays linear
    if activation_fcn in ("leaky_relu", "elu"):
        dense_activation_fcn = 'linear'
    else:
        dense_activation_fcn = activation_fcn

    nb_actions = env.action_space.n
    window_length = 1
    model = Sequential()
    model.add(Flatten(input_shape=(window_length,) + env.observation_space.shape))
    for _ in range(layers):
        model.add(Dense(neurons, activation=dense_activation_fcn))
        if activation_fcn == 'leaky_relu':
            model.add(LeakyReLU(alpha=activation_fcn_parameter))
        elif activation_fcn == 'elu':
            model.add(ELU(alpha=activation_fcn_parameter))
    # one linear output per discrete action (the Q-values)
    model.add(Dense(nb_actions,
                    activation='linear'
                    ))

    # define the DQN agent
    memory = SequentialMemory(limit=memory_size, window_length=window_length)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(eps=epsilon0),
                                  attr='eps',
                                  value_max=epsilon0,
                                  value_min=epsilon1,
                                  value_test=0,
                                  nb_steps=nb_policy_annealing_steps)
    agent = DQNAgent(model=model,
                     nb_actions=nb_actions,
                     gamma=gamma,
                     batch_size=batch_size,
                     memory=memory,
                     memory_interval=1,
                     policy=policy,
                     train_interval=1,
                     target_model_update=target_update_parameter,
                     enable_double_dqn=False)

    # compile the agent; `learning_rate` is the supported keyword, `lr` is a deprecated alias
    agent.compile(Adam(learning_rate=alpha0), metrics=['mse'])

    # define the logger to save the episode data
    logger = StoreEpisodeLogger(folder_name=subfolder_name,
                                file_name="training_episode",
                                tau=sampling_time,
                                limits=limits, training=True,
                                lr_max=alpha0, lr_min=alpha1,
                                nb_steps_start=lr_reduction_start,
                                nb_steps_reduction=lr_reduction_interval,
                                speed_generator=random_profile_generator,
                                create_eps_logs=True)

    # start training the agent (this will take a while, about 3 days on my local machine!)
    agent.fit(env,
              nb_steps=nb_training_steps,
              action_repetition=1,
              verbose=0,
              visualize=False,
              nb_max_episode_steps=nb_episode_steps,
              log_interval=10000,
              callbacks=[logger])

    # save the network weights after training such that they can be reused
    agent.save_weights(filepath=str(Path(subfolder_name) / "weights.hdf5"), overwrite=True)


In [None]:
# Hyperparameters of the training run, grouped the same way train_agent unpacks them
param_dict = dict(
    # output folder for logs and weights
    subfolder_name="DQ_DTC_agent0",

    # discount factor
    gamma=0.868,

    # learning rate schedule
    alpha0=2.887e-5,
    alpha1=1.736e-5,
    lr_reduction_start=460000,
    lr_reduction_interval=2710000,

    # network size
    layers=10,
    neurons=560,

    # epsilon-greedy exploration schedule
    epsilon0=2.119e-1,
    epsilon1=1.774e-1,
    nb_policy_annealing_steps=2210000,

    # replay memory and target network
    memory_size=365000,
    batch_size=32,
    target_update_parameter=2.096e-1,

    # training length
    nb_training_steps=3000000,
    nb_episode_steps=14900,

    # hidden activation, one of the following:
    # 'softplus', 'leaky_relu', 'elu', 'selu', 'sigmoid', 'tanh'
    activation_fcn="leaky_relu",
    activation_fcn_parameter=0.3425,
)

# Run the training with the hyperparameters defined above. The logs and the final
# network weights are written to the folder named by param_dict["subfolder_name"].
train_agent(param_dict)

In [None]:
from Plot_TimeDomain_torqueCtrl import plot_episode

# Plot episode 0 of the training run stored in "DQ_DTC_agent0".
# The resulting pdf is saved to the "Plots" folder, which is created if it does not exist yet.
plot_episode(
    training_folder="DQ_DTC_agent0",
    episode_number=0,
    episode_type="training_episode",
)