# Load Dependencies

In [1]:
from __future__ import division
import warnings
warnings.filterwarnings('ignore')

import json
import multiprocessing
import os
import time
from threading import Thread

import gym
import numpy as np
import tensorflow as tf
from PIL import Image
from keras import backend as K

from keras.layers import (Activation, 
                          Convolution2D, 
                          Dense, 
                          Flatten, 
                          Permute)
from keras.models import (Sequential, 
                          load_model)
from keras.optimizers import Adam
from matplotlib import pyplot as plt
from rl.agents.dqn import DQNAgent
from rl.callbacks import (FileLogger, 
                          ModelIntervalCheckpoint, 
                          WandbLogger)
from rl.core import Processor
from rl.memory import SequentialMemory
from rl.policy import (EpsGreedyQPolicy, 
                       LinearAnnealedPolicy)

import import_ipynb
from cbrUtil import AtariProcessor

Using TensorFlow backend.


importing Jupyter notebook from cbrUtil.ipynb


# Build TrainAgent

In [2]:
class TrainAgent():
    """
    This class implements the "Training Agent" of the proposed PENG 
    Architecture.

    It wires a gym Atari environment, a convolutional Q-network and a
    keras-rl ``DQNAgent`` together. The agent is either trained from
    scratch (baseline mode) or started from a previously trained model
    (transfer / "injection" mode).

    Attributes
    ----------
    env_name : str
        exact name of the Atari environment.
    name_of_run : str
        ``<env_name>_<description>_<timestamp>``; passed to WandbLogger.
    transferMode : boolean
        true=agent learns from previous model. 
        false=agent learns from scratch.
    transfer_architecture : keras model or None
        model loaded from the architecture path given to ``__init__``
        (``None`` in baseline mode).
    transfer_policy : str or None
        path to the weights that are injected into the architecture
        (``None`` in baseline mode).
    transfer_game_nb_actions : int or None
        number of actions from transfer game (``None`` in baseline mode).
    model : keras model or None
        network built by ``buildModelBaseline``.
    injectionModel : keras model or None
        network built by ``buildModelInjection``.
    dqn : DQNAgent
        the compiled agent used for training.
    history
        return value of the last ``dqn.fit`` call.

    Methods
    -------
    printInitialization(self):
        Prints the initial values.
    
    buildModelBaseline(self):
        builds the keras model if no transfer happens.
    
    buildModelInjection(self):
        builds the keras model if transfer happens.
    
    configAgentInjection(self):
        configs the transfer agent model.
    
    compileAgentInjection(self):
        compiles the transfer agent.
    
    getModelBaselineSummary (property):
        returns keras model summary.
    
    getModelInjectionSummary (property):
        returns keras model summary for transfer.
    
    configAgentBaseline(self):
        configs the agent model.
    
    compileAgentBaseline(self):
        compiles the agent.
    
    createCallbacks(self):
        creates keras-rl callbacks.
    
    trainingBaseline(self):
        start agent training.
    
    trainingInjection(self):
        start transfer agent training.
    """

    # Fixed values written into the last weight array of the freshly
    # added output layer in buildModelInjection. They are hard-coded for
    # testing purposes so that runs are repeatable; a random alternative
    # would be np.random.uniform(low=0.5, high=0.7) per entry.
    _FIXED_OUTPUT_BIAS = (0.6130022623869411,
                          0.5847831013889547,
                          0.5576423909034692,
                          0.5527961438581136,
                          0.6381476456945895,
                          0.6665673062642212,
                          0.6184783155260986,
                          0.5933045164230435,
                          0.6347319971692273,
                          0.578295870829989,
                          0.6404208736445819,
                          0.6931395285466848,
                          0.5181019754212713,
                          0.5116250271400286,
                          0.6176034837902878,
                          0.650234751059707,
                          0.6022634782996388,
                          0.684100237641685)

    def __init__(self,
                 env_name,
                 name_of_run,
                 transferMode = False,
                 transfer_architecture = None,
                 transfer_policy = None,
                 transfer_game_nb_actions = None):
        
        """
        Initializes the environment and builds, configures and compiles
        the agent for the requested mode.
        
        Parameters
        ----------
        env_name : str
            exact name of the Atari environment.
        name_of_run : str
            description of the current run will be used in WandbLogger.
        transferMode : boolean
            true=agent learns from previous model. 
            false=agent learns from scratch.
        transfer_architecture : str
            path to the architecture to learn from.
        transfer_policy : str
            path to the policy that is injected into the architecture.
        transfer_game_nb_actions : int
            number of actions from transfer game

        Raises
        ------
        ValueError
            if ``transferMode`` is true and one of the three transfer
            arguments is ``None``.
        """

        self.INPUT_SHAPE = (84,84)
        self.WINDOW_LENGTH = 4
        
        #init gym environment with a fixed seed for reproduction
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        np.random.seed(123)
        self.env.seed(123)
        
        #get nb_actions for the learning env
        self.nb_actions = self.env.action_space.n
        
        #init model, memory, processor, policy, dqn and history
        #same for baseline and transfer
        self.model = None
        self.memory = None
        self.processor = None
        self.policy = None
        self.dqn = None
        self.history = None
        
        #model to be trained when transfermode = true
        self.injectionModel = None
        
        #truthy->agent learns from similar model
        #falsy->agent learns from scratch
        self.transferMode = transferMode

        #transfer settings; stay None in baseline mode so that the
        #attributes always exist
        self.transfer_architecture = None
        self.transfer_policy = None
        self.transfer_game_nb_actions = None
        
        #defines the path to the ModelRepo of the environment  
        self.path = './KC/ModelRepo/Atari_'+ self.env_name
        self.path_to_architecture = self.path+'/Architecture/'
        self.path_to_policy = self.path+'/Policy/'
        
        #bring some uniqueness to the saving name
        self.TIME = str(int(time.time()))
        self.SAVE_NAME = self.env_name+'_'+self.TIME
        self.name_of_run = (self.env_name+
                            '_'+name_of_run+
                            '_'+self.TIME)
        
        #for building the keras-rl callbacks
        self.policy_filename = None
        self.ckpnt_policy_filename = None
        self.architecture_filename_start = None
        self.architecture_filename_end = None
        self.callbacks=None
        
        #print environment inits
        self.printInitialization()
        
        if not self.transferMode:
            print("Starting normal training mode.")
            self.buildModelBaseline()
            self.configAgentBaseline()
            self.compileAgentBaseline()
            self.createCallbacks() #same in all scenarios
        else:
            print("Starting transfer training mode.")
            #most similar model = architecture+policy

            #check the arguments before load_model consumes them, so a
            #missing value is reported clearly
            self._validateTransferArguments(transfer_architecture,
                                            transfer_policy,
                                            transfer_game_nb_actions)
            
            self.transfer_architecture = load_model(
                                                transfer_architecture)
            self.transfer_policy = transfer_policy

            #nb_actions of most similar model
            self.transfer_game_nb_actions = transfer_game_nb_actions
            
            self.buildModelInjection()
            self.configAgentInjection()
            self.compileAgentInjection()
            self.createCallbacks()

    @staticmethod
    def _validateTransferArguments(transfer_architecture,
                                   transfer_policy,
                                   transfer_game_nb_actions):
        """
        Raises ValueError if any transfer argument is None.
        """
        given = (('transfer_architecture', transfer_architecture),
                 ('transfer_policy', transfer_policy),
                 ('transfer_game_nb_actions', transfer_game_nb_actions))
        missing = [name for name, value in given if value is None]
        if missing:
            raise ValueError('transferMode requires: '
                             + ', '.join(missing))
        
    def printInitialization(self):
        """
        Prints the initial values (environment, number of actions,
        repository path, run name and save name).
        """
        print(f"Env:{self.env_name}\n"
              f"nb_actions:{self.nb_actions}\n"
              f"Path:{self.path}\n"
              f"Name of Run: {self.name_of_run}\n"
              f"Save Name: {self.SAVE_NAME}")
    
    def buildModelBaseline(self):
        """
        builds the keras model if no transfer happens.

        The network is three convolution layers followed by a dense
        layer of 512 units and a linear output layer with one unit per
        action. The result is stored in ``self.model``.

        Raises
        ------
        RuntimeError
            if the backend reports an unknown image dim ordering.
        """
        input_shape = (self.WINDOW_LENGTH,) + self.INPUT_SHAPE
        #query the ordering once, through the same accessor for both
        #branches
        dim_ordering = K.common.image_dim_ordering()
        model = Sequential()
        if dim_ordering == 'tf':
            # (width, height, channels)
            model.add(Permute((2, 3, 1), input_shape=input_shape))
        elif dim_ordering == 'th':
            # (channels, width, height)
            model.add(Permute((1, 2, 3), input_shape=input_shape))
        else:
            raise RuntimeError('Unknown image_dim_ordering.')
        model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
        model.add(Activation('relu'))
        model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
        model.add(Activation('relu'))
        model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(self.nb_actions))
        model.add(Activation('linear'))
        self.model = model
        
    def buildModelInjection(self):
        """
        builds the keras model if transfer happens.

        Same number of actions: a fresh baseline model is built and the
        transfer weights are loaded into it. Otherwise all layers of
        the loaded architecture except the last two are reused and a
        new output layer sized for this environment is appended. The
        result is stored in ``self.injectionModel``.

        Raises
        ------
        IndexError
            if the new output layer needs more fixed values than
            ``_FIXED_OUTPUT_BIAS`` provides.
        """
        #model with same nb_actions: -> just insert the weights
        if self.transfer_game_nb_actions == self.nb_actions:
            print("Same numb actions")
            self.buildModelBaseline()
            self.transfer_architecture = self.model
            self.transfer_architecture.load_weights(
                                                self.transfer_policy)
            self.injectionModel = self.transfer_architecture
            return

        self.transfer_architecture.load_weights(self.transfer_policy)

        #transformArchitecture: keep everything but the last two layers
        newModel = Sequential()
        for layer in self.transfer_architecture.layers[:-2]:
            newModel.add(layer)

        #timestamp in the names keeps them distinct from reused layers
        newModel.add(Dense(units=self.nb_actions,
                           kernel_initializer='random_uniform',
                           name='dense_'+self.TIME))
        newModel.add(Activation('linear',
                                name='activation_'+self.TIME))

        #changeLastLayerWeights - for testing purpose.
        #weights[-1] is the last weight array of the model; with the
        #default Dense settings this should be the bias vector of the
        #new output layer.
        weights = newModel.get_weights()
        last_array = weights[-1]
        needed = len(last_array)
        if needed > len(self._FIXED_OUTPUT_BIAS):
            raise IndexError(
                f'only {len(self._FIXED_OUTPUT_BIAS)} fixed output '
                f'values are defined, but {needed} are required')
        for index, value in enumerate(self._FIXED_OUTPUT_BIAS[:needed]):
            last_array[index] = value
        newModel.set_weights(weights)
        self.injectionModel = newModel

    def _configAgent(self, value_max, q_injection):
        """
        Creates replay memory, processor and exploration policy.

        Parameters
        ----------
        value_max : float
            start value of the linearly annealed epsilon.
        q_injection : boolean
            forwarded as ``q_Injection`` to EpsGreedyQPolicy; its
            meaning is defined by that class, not here.
        """
        self.memory = SequentialMemory(limit=1000000,
                                       window_length=self.WINDOW_LENGTH)
        self.processor = AtariProcessor()
        inner_policy = EpsGreedyQPolicy(action_size=self.nb_actions,
                                        q_Injection=q_injection)
        self.policy = LinearAnnealedPolicy(inner_policy,
                                           attr='eps',
                                           value_max=value_max,
                                           value_min=.1,
                                           value_test=.05,
                                           nb_steps=1000000)

    def _compileAgent(self, model):
        """
        Wraps ``model`` in a DQNAgent, compiles it with Adam and stores
        it in ``self.dqn``.
        """
        #intelligent adaptation of the learning rate could be 
        #beneficial
        self.dqn = DQNAgent(model=model,
                            nb_actions=self.nb_actions,
                            policy=self.policy,
                            memory=self.memory,
                            processor=self.processor,
                            nb_steps_warmup=50000,
                            gamma=.99,
                            target_model_update=10000,
                            train_interval=4, delta_clip=1.)
        self.dqn.compile(Adam(lr=.00025), metrics=['mae'])

    def _runTraining(self):
        """
        Saves the agent, trains for 2,000,000 steps, then saves the
        final weights and the agent again. The fit result is kept in
        ``self.history``.
        """
        #create the target directories first so a missing folder does
        #not abort the run at save time
        for directory in (self.path_to_architecture,
                          self.path_to_policy):
            os.makedirs(directory, exist_ok=True)

        self.dqn.save(self.architecture_filename_start)

        self.history = self.dqn.fit(self.env,
                                    callbacks=self.callbacks,
                                    nb_steps=2000000,
                                    log_interval=10000)

        self.dqn.save_weights(self.policy_filename, overwrite=True)
        self.dqn.save(self.architecture_filename_end)
            
    def configAgentInjection(self):
        """
        configs the transfer agent model.
        """
        #eps=0.5 worked so far for model injection = value_max
        self._configAgent(value_max=0.5, q_injection=False)
        
    def compileAgentInjection(self):
        """
        compiles the transfer agent (uses ``self.injectionModel``).
        """
        self._compileAgent(self.injectionModel)
    
    @property
    def getModelBaselineSummary(self):
        """
        returns keras model summary.
        """
        return self.model.summary()
    
    @property
    def getModelInjectionSummary(self):
        """
        returns keras model summary for transfer.
        """
        return self.injectionModel.summary()
    
    def configAgentBaseline(self):
        """
        configs the agent model.
        """
        self._configAgent(value_max=1., q_injection=True)

    def compileAgentBaseline(self):
        """
        compiles the agent model (uses ``self.model``).
        """
        self._compileAgent(self.model)
        
    def createCallbacks(self):
        """
        creates keras-rl callbacks.

        Also sets the file names for the final policy weights, the
        interval checkpoints and the start/end architecture snapshots.
        """
        self.policy_filename = (self.path_to_policy +
                                f'{self.SAVE_NAME}_weights_END.h5f')

        #'{step}' is deliberately left unformatted; it is handed to
        #ModelIntervalCheckpoint as part of the file path
        self.ckpnt_policy_filename = (self.path_to_policy +
                                      self.SAVE_NAME +
                                      '_weights_{step}.h5f')

        self.architecture_filename_start = (self.path_to_architecture +
                                            f'{self.SAVE_NAME}_start.h5')

        self.architecture_filename_end = (self.path_to_architecture +
                                          f'{self.SAVE_NAME}_end.h5')

        self.callbacks = [
            ModelIntervalCheckpoint(self.ckpnt_policy_filename,
                                    interval=20000),
            WandbLogger(self.name_of_run),
        ]
      
    def trainingBaseline(self):
        """
        start agent training.
        """
        self._runTraining()
        
    def trainingInjection(self):
        """
        start transfer agent training.
        """
        #try to use different stopping criterion
        self._runTraining()

# Training...
The next code cells show how to use this notebook standalone.

In [3]:
#Game whose trained model gets injected when transfer mode is on
INJECT = 'MsPacman-v0'

#Exact gym id of the environment to train on
ENV_NAME = 'Seaquest-v0' #with ideal q-value range

#True  -> a similar game was found, perform model injection
#False -> no transfer
TRANSFER_MODE = False

#Label of the run as it appears in WandbLogger
#(without transfer: Baseline or Q-Inject)
NAME_OF_RUN = f'Inject({INJECT})' if TRANSFER_MODE else 'Q-Inject'

#Architecture file of the most similar game that training
#should build on
TRANSFER_ARCHITECTURE = ('./KC/ModelRepo/Atari_MsPacman-v0/Architecture/'
                         'MsPacman-v0_1579634126_end.h5')

#Most successful policy weights belonging to that architecture
TRANSFER_POLICY = ('./KC/ModelRepo/Atari_MsPacman-v0/Policy/'
                   'MsPacman-v0_1579119635_weights_1840000.h5f')

#Number of actions of the most similar game
TRANSFER_GAME_NB_ACTIONS = 9


In [4]:
#Create the training agent from the configuration constants;
#the constructor already builds and compiles the agent
trainAgent = TrainAgent(
    ENV_NAME,
    NAME_OF_RUN,
    transferMode=TRANSFER_MODE,
    transfer_architecture=TRANSFER_ARCHITECTURE,
    transfer_policy=TRANSFER_POLICY,
    transfer_game_nb_actions=TRANSFER_GAME_NB_ACTIONS,
)

Env:Seaquest-v0
nb_actions:18
Path:./KC/ModelRepo/Atari_Seaquest-v0
Name of Run: Seaquest-v0_Q-Inject_1581107363
Save Name: Seaquest-v0_1581107363
Starting normal training mode.


In [5]:
#Run the training routine that matches the configured mode
if not TRANSFER_MODE:
    #no similar game available -> learn from scratch
    print("Normal Learning")
    trainAgent.trainingBaseline()
else:
    #a similar game was found -> continue from the injected model
    print("Transfer Learning")
    trainAgent.trainingInjection()


Normal Learning
Saved model
Training for 2000000 steps ...
Interval 1 (0 steps performed)
18 episodes - episode_reward: 0.611 [0.000, 3.000] - ale.lives: 2.525

Interval 2 (10000 steps performed)
13 episodes - episode_reward: 3.692 [1.000, 9.000] - ale.lives: 2.425

Interval 3 (20000 steps performed)
15 episodes - episode_reward: 3.467 [1.000, 6.000] - ale.lives: 2.500

Interval 4 (30000 steps performed)
12 episodes - episode_reward: 7.083 [1.000, 17.000] - ale.lives: 2.618

Interval 5 (40000 steps performed)
14 episodes - episode_reward: 3.357 [0.000, 9.000] - ale.lives: 2.470

Interval 6 (50000 steps performed)
18 episodes - episode_reward: 1.222 [0.000, 4.000] - loss: 0.002 - mae: 0.008 - mean_q: 0.016 - mean_eps: 0.951 - ale.lives: 2.477

Interval 7 (60000 steps performed)
14 episodes - episode_reward: 3.214 [0.000, 8.000] - loss: 0.002 - mae: 0.006 - mean_q: 0.013 - mean_eps: 0.942 - ale.lives: 2.470

Interval 8 (70000 steps performed)
11 episodes - episode_reward: 6.364 [2.000, 1

7 episodes - episode_reward: 14.286 [4.000, 22.000] - loss: 0.003 - mae: 0.424 - mean_q: 0.484 - mean_eps: 0.690 - ale.lives: 2.428

Interval 36 (350000 steps performed)
18 episodes - episode_reward: 2.500 [0.000, 7.000] - loss: 0.003 - mae: 0.436 - mean_q: 0.498 - mean_eps: 0.681 - ale.lives: 2.507

Interval 37 (360000 steps performed)
6 episodes - episode_reward: 16.167 [6.000, 25.000] - loss: 0.003 - mae: 0.469 - mean_q: 0.536 - mean_eps: 0.672 - ale.lives: 2.410

Interval 38 (370000 steps performed)
8 episodes - episode_reward: 15.625 [8.000, 22.000] - loss: 0.004 - mae: 0.487 - mean_q: 0.553 - mean_eps: 0.663 - ale.lives: 2.611

Interval 39 (380000 steps performed)
9 episodes - episode_reward: 12.444 [4.000, 18.000] - loss: 0.003 - mae: 0.510 - mean_q: 0.577 - mean_eps: 0.654 - ale.lives: 2.475

Interval 40 (390000 steps performed)
9 episodes - episode_reward: 12.889 [4.000, 19.000] - loss: 0.004 - mae: 0.538 - mean_q: 0.608 - mean_eps: 0.645 - ale.lives: 2.498

Interval 41 (40000

17 episodes - episode_reward: 2.471 [0.000, 7.000] - loss: 0.005 - mae: 1.324 - mean_q: 1.450 - mean_eps: 0.393 - ale.lives: 2.416

Interval 69 (680000 steps performed)
6 episodes - episode_reward: 18.667 [11.000, 31.000] - loss: 0.005 - mae: 1.347 - mean_q: 1.471 - mean_eps: 0.384 - ale.lives: 2.495

Interval 70 (690000 steps performed)
8 episodes - episode_reward: 16.625 [6.000, 30.000] - loss: 0.006 - mae: 1.370 - mean_q: 1.495 - mean_eps: 0.375 - ale.lives: 2.462

Interval 71 (700000 steps performed)
16 episodes - episode_reward: 4.000 [0.000, 12.000] - loss: 0.006 - mae: 1.389 - mean_q: 1.516 - mean_eps: 0.366 - ale.lives: 2.607

Interval 72 (710000 steps performed)
15 episodes - episode_reward: 4.667 [0.000, 14.000] - loss: 0.005 - mae: 1.390 - mean_q: 1.516 - mean_eps: 0.357 - ale.lives: 2.583

Interval 73 (720000 steps performed)
8 episodes - episode_reward: 12.250 [4.000, 25.000] - loss: 0.005 - mae: 1.420 - mean_q: 1.548 - mean_eps: 0.348 - ale.lives: 2.538

Interval 74 (7300

7 episodes - episode_reward: 18.143 [10.000, 24.000] - loss: 0.006 - mae: 1.866 - mean_q: 2.016 - mean_eps: 0.100 - ale.lives: 2.629

Interval 102 (1010000 steps performed)
5 episodes - episode_reward: 27.000 [13.000, 38.000] - loss: 0.006 - mae: 1.885 - mean_q: 2.035 - mean_eps: 0.100 - ale.lives: 2.579

Interval 103 (1020000 steps performed)
8 episodes - episode_reward: 14.375 [11.000, 22.000] - loss: 0.006 - mae: 1.897 - mean_q: 2.047 - mean_eps: 0.100 - ale.lives: 2.798

Interval 104 (1030000 steps performed)
7 episodes - episode_reward: 15.286 [13.000, 19.000] - loss: 0.006 - mae: 1.880 - mean_q: 2.029 - mean_eps: 0.100 - ale.lives: 2.290

Interval 105 (1040000 steps performed)
8 episodes - episode_reward: 16.875 [10.000, 25.000] - loss: 0.006 - mae: 1.890 - mean_q: 2.040 - mean_eps: 0.100 - ale.lives: 2.210

Interval 106 (1050000 steps performed)
7 episodes - episode_reward: 16.857 [10.000, 26.000] - loss: 0.006 - mae: 1.898 - mean_q: 2.048 - mean_eps: 0.100 - ale.lives: 2.377

I

7 episodes - episode_reward: 17.000 [8.000, 24.000] - loss: 0.005 - mae: 2.120 - mean_q: 2.279 - mean_eps: 0.100 - ale.lives: 2.540

Interval 135 (1340000 steps performed)
8 episodes - episode_reward: 13.625 [4.000, 19.000] - loss: 0.005 - mae: 2.134 - mean_q: 2.295 - mean_eps: 0.100 - ale.lives: 2.576

Interval 136 (1350000 steps performed)
7 episodes - episode_reward: 16.857 [11.000, 34.000] - loss: 0.005 - mae: 2.143 - mean_q: 2.306 - mean_eps: 0.100 - ale.lives: 2.393

Interval 137 (1360000 steps performed)
7 episodes - episode_reward: 16.286 [8.000, 27.000] - loss: 0.005 - mae: 2.154 - mean_q: 2.317 - mean_eps: 0.100 - ale.lives: 2.479

Interval 138 (1370000 steps performed)
9 episodes - episode_reward: 16.444 [13.000, 21.000] - loss: 0.005 - mae: 2.149 - mean_q: 2.311 - mean_eps: 0.100 - ale.lives: 2.357

Interval 139 (1380000 steps performed)
6 episodes - episode_reward: 18.833 [13.000, 24.000] - loss: 0.005 - mae: 2.154 - mean_q: 2.316 - mean_eps: 0.100 - ale.lives: 2.577

Inte

6 episodes - episode_reward: 27.167 [20.000, 36.000] - loss: 0.005 - mae: 2.227 - mean_q: 2.394 - mean_eps: 0.100 - ale.lives: 2.520

Interval 168 (1670000 steps performed)
5 episodes - episode_reward: 30.400 [20.000, 37.000] - loss: 0.005 - mae: 2.224 - mean_q: 2.390 - mean_eps: 0.100 - ale.lives: 2.536

Interval 169 (1680000 steps performed)
7 episodes - episode_reward: 25.571 [15.000, 37.000] - loss: 0.005 - mae: 2.223 - mean_q: 2.390 - mean_eps: 0.100 - ale.lives: 2.458

Interval 170 (1690000 steps performed)
6 episodes - episode_reward: 24.333 [16.000, 40.000] - loss: 0.005 - mae: 2.229 - mean_q: 2.394 - mean_eps: 0.100 - ale.lives: 2.364

Interval 171 (1700000 steps performed)
6 episodes - episode_reward: 28.000 [18.000, 40.000] - loss: 0.005 - mae: 2.228 - mean_q: 2.394 - mean_eps: 0.100 - ale.lives: 2.576

Interval 172 (1710000 steps performed)
4 episodes - episode_reward: 38.000 [30.000, 53.000] - loss: 0.005 - mae: 2.232 - mean_q: 2.399 - mean_eps: 0.100 - ale.lives: 2.548

I

done, took 43384.642 seconds
Saved model
