# Load Dependencies

In [1]:
from __future__ import division
import warnings
warnings.filterwarnings('ignore')

import json
import multiprocessing
import os
import time
from threading import Thread

import gym
import numpy as np
import tensorflow as tf
from PIL import Image
from keras import backend as K

from keras.layers import (Activation, 
                          Convolution2D, 
                          Dense, 
                          Flatten, 
                          Permute)
from keras.models import (Sequential, 
                          load_model)
from keras.optimizers import Adam
from matplotlib import pyplot as plt
from rl.agents.dqn import DQNAgent
from rl.callbacks import (FileLogger, 
                          ModelIntervalCheckpoint, 
                          WandbLogger)
from rl.core import Processor
from rl.memory import SequentialMemory
from rl.policy import (EpsGreedyQPolicy, 
                       LinearAnnealedPolicy)

import import_ipynb
from cbrUtil import AtariProcessor
import timeout_decorator

Using TensorFlow backend.


importing Jupyter notebook from cbrUtil.ipynb


# Load case base, most similar case and query case

In [2]:
# Name of the target (query) Atari environment for this run.
ENV_NAME = 'Alien-v0'

# Most similar stored case, mapped to its similarity score.
mostSimCase = {
    'Seaquest-v0': 0.5,
}

# Case base (model repository). TestAgent.checkActionSize compares these
# values with the query case's number of actions.
caseBase = {
    'Assault-v0': 7,
    'BreakoutDeterministic-v4': 4,
    'MsPacman-v0': 9,
    'Seaquest-v0': 18,
    'SpaceInvaders-v0': 6,
}

# Query case: target environment mapped to its number of actions.
queryCase = {
    'Alien-v0': 18,
}

# cbrTest

- [x] check if nb_actions are same
    - if same
        - [x] do nothing
    - if not same
        - [x] replace the last layer so it matches the target environment's number of actions, and initialise the new layer's weights
- [x] plug in all policies from the most similar case and determine the highest-reward policy

In [2]:
class TestAgent():
    """
    Test agent of the proposed PENG Architecture: plugs every stored policy
    of the most similar case into a DQN agent, runs it for one episode on
    the query environment and records the achieved reward.

    NOTE(review): earlier documentation called this class the "Training
    Agent"; the class name and getTrainPolicy() suggest it is the testing
    stage that feeds the training agent -- confirm.

    Attributes
    ----------
    transferMode : bool
        False when no similar case was supplied (nothing is tested),
        True otherwise.

    mostSimCase : dict
        name of the most similar case with similarity

    caseBase : dict
        current model repository

    queryCase : dict
        current target gameplay task with number of actions

    env_name : str
        exact name of the Atari environment (the key of queryCase).

    testedPolicies : dict
        results collected by testChecker(); empty when nothing was tested.

    trainPolicy : list
        result of getTrainPolicy(); empty when nothing was tested.


    Methods
    -------
    loadSimArchitecture(self):
        loads current architecture of most similar case.

    getTestingCase(self):
        loads current testing case.

    checkActionSize(self):
        checks if action space is equal.

    testChecker(self):
        builds DRL agent and checks env.

    testAgent(self, dqn ,verbose=1):
        runs environment for one episode in order to get reward.

    getTrainPolicy(self):
        returns policy for the training agent.
    """

    def __init__(self, 
                 mostSimCase, 
                 caseBase, 
                 queryCase):
        """
        Initializes the agent and immediately tests all stored policies.

        Parameters
        ----------
        mostSimCase : dict
            name of the most similar case with similarity

        caseBase : dict
            current model repository

        queryCase : dict
            current target gameplay task with number of actions
        """
        # Give the result attributes a value in both modes, so reading them
        # after a "no similar case" run does not raise AttributeError.
        self.testedPolicies = {}
        self.trainPolicy = []

        if len(mostSimCase)==0:
            print("No similar case...")
            self.transferMode = False
            return

        # Timestamp used to give the replacement output layers unique names.
        self.TIME = str(int(time.time()))
        self.INPUT_SHAPE = (84,84)
        self.WINDOW_LENGTH = 4
        self.transferMode = True
        self.mostSimCase = mostSimCase
        self.caseBase = caseBase
        self.queryCase = queryCase

        # Both are filled in as side effects of getTestingCase() and
        # checkActionSize() below.
        self.query_environment = None
        self.testing_environment = None

        self.path_to_games = './KC/ModelRepo/Atari_'
        self.path_to_architecture = '/Architecture/'
        self.path_to_policy = '/Policy/'

        self.testingCase = self.getTestingCase()
        self.testingCaseActions = None
        self.same_action_size = self.checkActionSize()

        self.env_name = self.query_environment
        self.env = gym.make(self.env_name)
        np.random.seed(123)
        self.env.seed(123)
        #get nb_actions for the learning env
        self.nb_actions = self.env.action_space.n

        self.model = None
        self.memory = None
        self.processor = None
        self.policy = None
        self.dqn = None
        self.history = None

        self.similar_architecture_path = (self.path_to_games+
                                          self.testing_environment+
                                          self.path_to_architecture)

        self.similar_policy_path = (self.path_to_games+
                                    self.testing_environment+
                                    self.path_to_policy)

        self.similar_architecture = self.loadSimArchitecture()

        self.testedPolicies = self.testChecker()
        self.trainPolicy = self.getTrainPolicy()

    def loadSimArchitecture(self):
        """
        loads current architecture of most similar case.

        Returns the model loaded from the last file in the architecture
        directory whose name contains 'h5', or None if there is no such
        file. Only that file is deserialised.
        """
        candidates = [name
                      for name in os.listdir(self.similar_architecture_path)
                      if 'h5' in name]
        if not candidates:
            return None
        return load_model(self.similar_architecture_path+candidates[-1])

    def getTestingCase(self):
        """
        loads current testing case.

        Uses the first entry of mostSimCase, stores its name in
        self.testing_environment and returns {name: caseBase[name]}.
        Returns an empty dict when mostSimCase is empty.
        """
        for case in self.mostSimCase:
            self.testing_environment = case
            return {case: self.caseBase[case]}
        return {}

    def checkActionSize(self):
        """
        checks if action space is equal.

        Side effects: sets self.query_environment to the (last) key of
        queryCase and self.testingCaseActions to the case-base value of the
        testing case.

        Returns a dict mapping each testing case to True when its case-base
        value equals the query case's value.
        """
        action_size = {}
        env_query = None
        for game in self.queryCase:
            env_query = game
            self.query_environment = env_query
        for game in self.testingCase:
            self.testingCaseActions = self.caseBase[game]
            action_size[game] = (self.caseBase[game]
                                 == self.queryCase[env_query])
        return action_size

    def _policyFiles(self):
        """Lists the policy directory, skipping names containing 'ipynb'."""
        return [pol for pol in os.listdir(self.similar_policy_path)
                if 'ipynb' not in pol]

    def _buildAgent(self, model):
        """Wraps `model` in a compiled DQNAgent with the fixed test setup."""
        memory = SequentialMemory(limit=1000000, 
                                  window_length=self.WINDOW_LENGTH)

        processor = AtariProcessor()

        #eps=0.5 worked so far for model injection 
        policy = LinearAnnealedPolicy(EpsGreedyQPolicy(action_size=self.nb_actions,
                                                       q_Injection = False), 
                                      attr='eps', 
                                      value_max=1.0, 
                                      value_min=.1, 
                                      value_test=.05,
                                      nb_steps=1000000)

        dqn = DQNAgent(model=model, 
                       nb_actions=self.nb_actions, 
                       policy=policy, 
                       memory=memory,
                       processor=processor, 
                       nb_steps_warmup=50000, 
                       gamma=.99, 
                       target_model_update=10000,
                       train_interval=4, delta_clip=1.)

        dqn.compile(Adam(lr=.00025), metrics=['mae'])
        return dqn

    def _episodeReward(self, dqn, pol):
        """
        Runs one test episode and returns its reward, or None on failure.

        A failure (e.g. the testAgent timeout) is reported and the policy
        is skipped, so a reward from an earlier policy can never be
        attributed to `pol`.
        """
        try:
            hist = self.testAgent(dqn)
            return hist.history['episode_reward'][0]
        except Exception as e:
            # print instead of warnings.warn: the notebook silences warnings.
            print(f"Skipping policy {pol}: test episode failed "
                  f"({type(e).__name__}: {e})")
            return None

    def _testUnchangedArchitecture(self):
        """
        Tests every stored policy with the architecture used as-is.

        Returns {policy file: reward}, sorted by descending reward.
        """
        testedPolicies = {}
        for pol in self._policyFiles():
            self.similar_architecture.load_weights(self.similar_policy_path+pol)
            newModel = Sequential()
            for layer in self.similar_architecture.layers:
                newModel.add(layer)

            reward = self._episodeReward(self._buildAgent(newModel), pol)
            if reward is not None:
                testedPolicies[pol] = reward

        return dict(sorted(testedPolicies.items(),
                           key=lambda item: item[1],
                           reverse=True))

    def _testAdaptedArchitecture(self):
        """
        Tests every stored policy after replacing the output layers.

        Returns a dict that holds an empty dict per policy file plus, for
        the i-th successfully tested policy, 'reward<i>' (episode reward)
        and 'weights<i>' (the random values written into the new layer).
        """
        testedPolicies = {}
        ite = 0
        for pol in self._policyFiles():
            testedPolicies[pol] = {}
            self.similar_architecture.load_weights(self.similar_policy_path+pol)

            newModel = Sequential()
            # Drop the last two layers -- presumably the source game's
            # output Dense + Activation; confirm against the stored model.
            for layer in self.similar_architecture.layers[:-2]:
                newModel.add(layer)

            newModel.add(Dense(units=self.nb_actions,
                               kernel_initializer='random_uniform',
                               name='dense_'+self.TIME))

            newModel.add(Activation('linear',
                                    name='activation_'+self.TIME))

            # Overwrite the last weight array of the model with random
            # values in [0.5, 0.7). NOTE(review): weights[-1] is presumably
            # the bias vector of the new Dense layer -- confirm.
            weg = []
            weights = newModel.get_weights()
            for weight in range(len(weights[-1])):
                random_weight = np.random.uniform(low=0.5, high=0.7)
                weg.append(random_weight)
                weights[-1][weight] = random_weight
            newModel.set_weights(weights)

            reward = self._episodeReward(self._buildAgent(newModel), pol)
            if reward is not None:
                testedPolicies['reward'+str(ite)] = reward
                testedPolicies['weights'+str(ite)] = weg
                ite += 1

        return testedPolicies

    def testChecker(self):
        """
        builds DRL agent and checks env.

        Depending on whether the action sizes match, the stored policies
        are tested either with the unchanged architecture or with replaced
        output layers. Returns the collected results (an empty dict when
        there is no testing case).
        """
        for same_size in self.same_action_size.values():
            if same_size:
                print("Nothing to change for architecture")
                return self._testUnchangedArchitecture()
            print("Changing last layers for architecture, and testing it.")
            return self._testAdaptedArchitecture()
        return {}

    @timeout_decorator.timeout(10)
    def testAgent(self, dqn ,verbose=1):
        """
        runs environment for one episode in order to get reward.

        The call is guarded by timeout_decorator.timeout(10); the timeout
        exception propagates to the caller.
        """
        history = dqn.test(self.env, 
                           nb_episodes=1, 
                           visualize=False,
                           verbose=verbose) #0 = nothing to show
        return history

    def getTrainPolicy(self):
        """
        returns policy for the training agent.

        Returns a one-element list holding the first key of
        self.testedPolicies (and prints its value), or an empty list when
        nothing was tested.
        """
        for pol in self.testedPolicies:
            print(f"Similar Policy achieved reward of: {self.testedPolicies[pol]}")
            return [pol]
        return []

In [None]:
# Constructing the agent runs the complete policy test (see __init__).
tester = TestAgent(
    mostSimCase=mostSimCase,
    caseBase=caseBase,
    queryCase=queryCase,
)

Nothing to change for architecture
Testing for 1 episodes ...
Episode 1: reward: 3.000, steps: 457
Testing for 1 episodes ...
Episode 1: reward: 18.000, steps: 1135
Testing for 1 episodes ...
Episode 1: reward: 15.000, steps: 690
Testing for 1 episodes ...
Episode 1: reward: 16.000, steps: 821
Testing for 1 episodes ...
Episode 1: reward: 14.000, steps: 1061
Testing for 1 episodes ...
Episode 1: reward: 31.000, steps: 1141
Testing for 1 episodes ...
Episode 1: reward: 15.000, steps: 455
Testing for 1 episodes ...
Episode 1: reward: 15.000, steps: 813
Testing for 1 episodes ...


In [None]:
# Display the results collected by TestAgent.testChecker().
tester.testedPolicies
# Alternative: show the policy selected by TestAgent.getTrainPolicy().
#tester.trainPolicy