In [1]:
import random, numpy, math, gym, sys
from keras import backend as K

import tensorflow as tf

Using TensorFlow backend.


In [2]:
# Absolute-error threshold at which huber_loss switches from its quadratic
# branch to its linear branch.
HUBER_LOSS_DELTA = 1.0
# Learning rate handed to the RMSprop optimizer when the Q-network is compiled.
LEARNING_RATE = 0.00025

In [3]:
def to_grayscale(img):
    """Average an image over its third axis and return the result as uint8.

    Args:
        img: array-like with at least three axes; axis 2 is averaged away
            (presumably the colour channels of an RGB frame).

    Returns:
        A numpy array with axis 2 removed and dtype ``uint8``; the mean is
        truncated towards zero by the integer cast.
    """
    # The notebook imports the package as ``numpy`` (never as ``np``), so use
    # that name rather than relying on ``np`` leaking in from a wildcard import.
    return numpy.mean(img, axis=2).astype(numpy.uint8)

def downsample(img):
    """Halve the resolution of ``img`` by keeping every second row and column.

    Only the first two axes are subsampled; any trailing axes are returned
    untouched.
    """
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

def preprocess(img):
    """Shrink a raw frame with ``downsample`` and then reduce it with ``to_grayscale``."""
    half_resolution = downsample(img)
    return to_grayscale(half_resolution)

In [4]:
def huber_loss(y_true, y_pred):
    """Mean Huber loss between ``y_true`` and ``y_pred``.

    Element-wise, the loss is ``0.5 * err**2`` where ``|err|`` is below
    ``HUBER_LOSS_DELTA`` and ``HUBER_LOSS_DELTA * (|err| - 0.5 * HUBER_LOSS_DELTA)``
    elsewhere; the mean over all elements is returned.
    """
    residual = y_true - y_pred
    magnitude = K.abs(residual)

    quadratic = 0.5 * K.square(residual)
    linear = HUBER_LOSS_DELTA * (magnitude - 0.5 * HUBER_LOSS_DELTA)

    # The Keras backend exposes no element-wise select, so use TensorFlow's directly.
    per_element = tf.where(magnitude < HUBER_LOSS_DELTA, quadratic, linear)
    return K.mean(per_element)


In [5]:
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *

# Input shape of the Q-network's first layer. The last axis of 4 matches the
# four copies of a frame stacked along axis 2 in Brain.predictOne. The same
# value is assigned again in the agent configuration cell.
ATARI_SHAPE = (105, 80, 4)
class Brain:
    """Pair of identically-built Keras networks: an online model and a target model.

    ``model`` is the network that gets trained; ``model_`` is a second network
    with the same architecture whose weights are only changed by
    ``updateTargetModel``.
    """

    def __init__(self, stateCnt, actionCnt):
        """Store the sizes and build both networks.

        Args:
            stateCnt: stored on the instance; not used when building the model.
            actionCnt: number of units in the output layer (one Q-value per action).
        """
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        self.model_ = self._createModel()

    def _createModel(self):
        """Build and compile one convolutional Q-network.

        Returns:
            A compiled ``Sequential`` model taking inputs of ``ATARI_SHAPE`` and
            producing ``self.actionCnt`` linear outputs.
        """
        model = Sequential()
        # Scale raw pixel values into [0, 1] inside the graph.
        model.add(Lambda(lambda x: x / 255.0, input_shape=ATARI_SHAPE))
        model.add(Conv2D(16, kernel_size=8, activation='relu', strides=(4, 4)))
        model.add(Conv2D(32, kernel_size=4, activation='relu', strides=(2, 2)))
        model.add(Flatten())
        model.add(Dense(units=256, activation='relu'))
        # Use the instance's action count, not whatever global ``actionCnt``
        # happens to exist in the notebook namespace.
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        """Fit the online model on inputs ``x`` and targets ``y`` (batch size 64)."""
        self.model.fit(x, y, batch_size=64, epochs=epochs, verbose=verbose)

    def predict(self, s, target=False):
        """Run a batch ``s`` through the target model if ``target`` else the online model."""
        network = self.model_ if target else self.model
        return network.predict(s)

    def predictOne(self, s, target=False):
        """Predict for a single 2-D frame.

        The frame is repeated four times along a new axis 2 and given a leading
        batch axis of size 1 before prediction.

        Args:
            s: a single 2-D frame (e.g. the output of ``preprocess``).
            target: use the target model instead of the online model.

        Returns:
            The flattened prediction for that one input.
        """
        stacked = numpy.stack([s, s, s, s], axis=2)
        # Add the batch axis without hard-coding the frame dimensions.
        return self.predict(stacked[numpy.newaxis, ...], target=target).flatten()

    def updateTargetModel(self):
        """Copy the online model's weights into the target model."""
        self.model_.set_weights(self.model.get_weights())

In [6]:
class Memory:   # stored as ( s, a, r, s_ )
    """Bounded FIFO buffer of samples with uniform random sampling.

    Attributes:
        capacity: maximum number of samples retained.
        samples: list of stored samples, oldest first.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` samples."""
        self.capacity = capacity
        # Per-instance list: a class-level list would be shared by every
        # Memory object, so two buffers would silently alias each other.
        self.samples = []

    def add(self, sample):
        """Append ``sample``; drop the oldest entry once capacity is exceeded."""
        self.samples.append(sample)

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        """Return up to ``n`` distinct stored samples chosen at random.

        If fewer than ``n`` samples are stored, all of them are returned
        (in random order).
        """
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

    def isFull(self):
        """Return True once the number of stored samples has reached ``capacity``."""
        return len(self.samples) >= self.capacity

In [7]:
# Capacity passed to Memory by both Agent and RandomAgent.
MEMORY_CAPACITY = 100000
# Number of transitions requested from memory on each Agent.replay call.
BATCH_SIZE = 32
# Same value as the ATARI_SHAPE defined next to Brain; used here for the
# all-zero placeholder state in Agent.replay.
ATARI_SHAPE = (105, 80, 4)
# Discount factor applied to the target network's max Q-value in Agent.replay.
GAMMA = 0.99

# Exploration rate: starts at MAX_EPSILON and decays exponentially towards
# MIN_EPSILON as Agent.steps grows (see Agent.observe).
MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.001      # speed of decay

# Agent.observe copies the online weights into the target network whenever
# its step counter is a multiple of this value.
UPDATE_TARGET_FREQUENCY = 1000

class Agent:
    """Epsilon-greedy DQN agent backed by a Brain (networks) and a Memory (replay buffer).

    Attributes:
        steps: number of transitions observed so far.
        epsilon: current probability of choosing a random action.
    """
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        """Build the networks and an empty replay buffer.

        Args:
            stateCnt: forwarded to Brain and stored on the instance.
            actionCnt: number of discrete actions to choose between.
        """
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)

    @staticmethod
    def _stackFrame(frame):
        """Repeat a 2-D frame four times along a new axis 2."""
        return numpy.stack([frame, frame, frame, frame], axis=2)

    def act(self, s):
        """Pick an action for frame ``s``: random with probability epsilon, else greedy."""
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        else:
            return numpy.argmax(self.brain.predictOne(s))

    def printInfos(self):
        """Print the exploration rate, buffer fill level, buffer capacity and step count."""
        print(f' epsilon: {self.epsilon}')
        print(f' lenMemory: {len(self.memory.samples)}')
        print(f' capacityMemory: {self.memory.capacity}')
        print(f' steps: {self.steps}')

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store a transition, periodically sync the target network, and decay epsilon."""
        self.memory.add(sample)

        # Checked before the increment, so the very first observation (step 0)
        # also synchronises the target network.
        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # Slowly decrease epsilon based on our experience.
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):
        """Sample a batch from memory and run one training pass on the online network.

        For each sampled transition ``(s, a, r, s_)`` the target for action
        ``a`` is ``r`` when ``s_`` is None (terminal), otherwise
        ``r + GAMMA * max(Q_target(s_))``; targets for the other actions are
        the online network's current predictions.
        """
        batch = self.memory.sample(BATCH_SIZE)
        if not batch:
            # Nothing stored yet; predicting on an empty array would fail.
            return

        # Placeholder fed to the target network for terminal transitions; its
        # prediction is never used because those targets are just the reward.
        no_state = numpy.zeros(ATARI_SHAPE)

        states = numpy.array([self._stackFrame(o[0]) for o in batch])
        states_ = numpy.array([
            no_state if o[3] is None else self._stackFrame(o[3])
            for o in batch
        ])

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_, target=True)

        # The training inputs are exactly the stacked states built above, so
        # reuse them (as float64, matching a zeros-initialised buffer) instead
        # of stacking every frame a second time.
        x = states.astype(numpy.float64)
        y = numpy.zeros((len(batch), self.actionCnt))

        for i, o in enumerate(batch):
            a, r, s_ = o[1], o[2], o[3]
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])
            y[i] = t

        self.brain.train(x, y)
class RandomAgent:
    """Agent that acts uniformly at random and only records what it observes.

    Exposes the same ``act`` / ``observe`` / ``replay`` / ``printInfos``
    methods as ``Agent`` so it can be passed to ``Environment.run``.
    """

    def __init__(self, actionCnt):
        """Remember the number of actions and create an empty replay buffer."""
        self.actionCnt = actionCnt
        # Created per instance (as Agent does) so a new RandomAgent never
        # starts with transitions left over from a previous one.
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a uniformly random action index; the state ``s`` is ignored."""
        return random.randint(0, self.actionCnt-1)

    def printInfos(self):
        """Print a marker line plus the buffer fill level and capacity."""
        print(f' random: true')
        print(f' lenMemory: {len(self.memory.samples)}')
        print(f' capacityMemory: {self.memory.capacity}')

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store the transition in memory."""
        self.memory.add(sample)

    def replay(self):
        """No-op: a random agent has nothing to train."""
        pass

In [8]:
class Environment:
    """Thin wrapper around a gym environment that plays one episode per ``run`` call."""

    def __init__(self, problem):
        """Create the gym environment named ``problem``."""
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        """Play a single episode with ``agent`` and print the total reward.

        At every step the agent chooses an action, observes the resulting
        transition ``(s, a, r, s_)`` and is asked to replay. On the terminal
        step ``s_`` is None.

        Args:
            agent: object providing ``act(s)``, ``observe(sample)``,
                ``replay()`` and ``printInfos()``.
        """
        s = preprocess(self.env.reset())
        R = 0

        while True:
            a = agent.act(s)
            raw_s_, r, done, info = self.env.step(a)

            # A terminal transition carries no next state, so skip the
            # preprocessing work for a frame that would be thrown away.
            s_ = None if done else preprocess(raw_s_)

            agent.observe((s, a, r, s_))
            s = s_
            R += r

            agent.replay()

            if done:
                print(f'reward: {R}')
                agent.printInfos()
                break

In [9]:
# Driver: build the environment and agents, pre-fill the replay memory with
# random play, then train the DQN agent until interrupted.
PROBLEM = 'BreakoutDeterministic-v4'
env = Environment(PROBLEM)

# First dimension of the observation shape; Agent and Brain store it but do
# not otherwise read it.
stateCnt  = env.env.observation_space.shape[0]
# Number of discrete actions; sizes the network's output layer.
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)


try:
    # Play whole episodes with random actions until at least 90000 transitions
    # are stored (MEMORY_CAPACITY is 100000, so the buffer is not literally
    # full when the message below is printed).
    while len(randomAgent.memory.samples) < 90000:
        env.run(randomAgent)
    print('memory full, broke out of while loop 1')
    # Hand the collected transitions to the learning agent (same list object,
    # no copy) and drop the reference to the random agent.
    agent.memory.samples = randomAgent.memory.samples
    randomAgent = None
    print('killed random agent')


    # Train indefinitely; the loop only ends via an exception such as
    # KeyboardInterrupt.
    while True:
         #print('while loop 2')
         env.run(agent)

finally:
    # Always persist the online network, including on interrupt.
    # NOTE(review): the file name says "cartpole" but PROBLEM is Breakout —
    # confirm the intended name before relying on this path.
    agent.brain.model.save("cartpole-dqn.h5")

memory full, broke out of while loop 1
killed random agent


KeyboardInterrupt: 