In [2]:
import random, numpy, math, gym, sys
from keras import backend as K

import tensorflow as tf

Using TensorFlow backend.


In [3]:
# Error magnitude at which huber_loss switches from its quadratic branch
# to its linear branch.
HUBER_LOSS_DELTA = 1.0
# Learning rate handed to the RMSprop optimizer in Brain._createModel.
LEARNING_RATE = 0.00025

In [4]:
def to_grayscale(img):
    """Collapse the channel axis of an image into a single grayscale plane.

    Args:
        img: array indexed as (rows, cols, channels).

    Returns:
        A (rows, cols) uint8 array holding the per-pixel channel mean; the
        fractional part of the mean is dropped by the integer cast.
    """
    # Use the `numpy` module imported at the top of the notebook. The `np`
    # alias is never imported explicitly here, so relying on it only works
    # if some star import happens to leak it into the namespace.
    return numpy.mean(img, axis=2).astype(numpy.uint8)

def downsample(img):
    """Halve the resolution of `img` by keeping every second row and column.

    Any axes after the first two are left untouched.
    """
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

def preprocess(img):
    """Turn a raw frame into a half-resolution grayscale image.

    The frame is downsampled first and the result is then converted to
    grayscale.
    """
    half_resolution = downsample(img)
    return to_grayscale(half_resolution)

In [5]:
def huber_loss(y_true, y_pred):
    """Mean Huber loss between targets and predictions.

    Errors with magnitude below HUBER_LOSS_DELTA are penalised quadratically,
    all others linearly; the per-element losses are then averaged.
    """
    residual = y_true - y_pred
    magnitude = K.abs(residual)

    quadratic = 0.5 * K.square(residual)
    linear = HUBER_LOSS_DELTA * (magnitude - 0.5 * HUBER_LOSS_DELTA)

    # Element-wise selection between the two branches is done with
    # TensorFlow's `where` directly rather than through the Keras backend.
    per_element = tf.where(magnitude < HUBER_LOSS_DELTA, quadratic, linear)
    return K.mean(per_element)


In [6]:
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *

# Input shape of the Q-network: (rows, cols, stacked frames). Used as the
# `input_shape` of the first layer built in Brain._createModel.
ATARI_SHAPE = (105, 80, 4)
class Brain:
    """Pair of identically built Q-networks: an online model and a target model.

    `model` is the network that gets trained; `model_` is the target network,
    which only changes when `updateTargetModel` copies the online weights
    into it.
    """

    def __init__(self, stateCnt, actionCnt):
        """Store the dimensions and build both networks.

        Args:
            stateCnt: stored on the instance; not used when building the model.
            actionCnt: number of outputs (one Q-value per action).
        """
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        self.model_ = self._createModel()

    def _createModel(self):
        """Build and compile one convolutional Q-network.

        Inputs of shape ATARI_SHAPE are scaled by 1/255, passed through two
        convolutional layers and one dense hidden layer, and mapped to
        `self.actionCnt` linear outputs. The model is compiled with
        `huber_loss` and RMSprop at LEARNING_RATE.
        """
        model = Sequential()
        model.add(Lambda(lambda x: x / 255.0, input_shape=ATARI_SHAPE))
        model.add(Conv2D(16, kernel_size=8, activation='relu', strides=(4, 4)))
        model.add(Conv2D(32, kernel_size=4, activation='relu', strides=(2, 2)))
        model.add(Flatten())
        model.add(Dense(units=256, activation='relu'))
        # Size the output layer from the instance, not from a notebook-level
        # global, so the class works regardless of which cells have run.
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        """Fit the online model on inputs `x` and targets `y`."""
        self.model.fit(x, y, batch_size=64, epochs=epochs, verbose=verbose)

    def predict(self, s, target=False):
        """Return predictions for the batch `s`.

        Args:
            s: batch of inputs passed unchanged to the model's `predict`.
            target: when true, query the target model instead of the online one.
        """
        network = self.model_ if target else self.model
        return network.predict(s)

    def predictOne(self, s, target=False):
        """Return the flattened prediction for a single 2-D frame `s`.

        The frame is repeated four times along a new last axis to fill the
        stacked-frame dimension, then given a leading batch axis of size 1.
        """
        stacked = numpy.stack([s, s, s, s], axis=2)
        batch_of_one = numpy.expand_dims(stacked, axis=0)
        return self.predict(batch_of_one, target=target).flatten()

    def updateTargetModel(self):
        """Copy the online model's weights into the target model."""
        self.model_.set_weights(self.model.get_weights())

In [7]:
class Memory:   # stored as ( s, a, r, s_ )
    """Bounded first-in-first-out store of transitions.

    Attributes:
        capacity: maximum number of samples kept.
        samples: list of stored samples, oldest first.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        # Per-instance list: a class-level list would be shared by every
        # Memory object, so separate buffers would silently see each other's
        # data.
        self.samples = []

    def add(self, sample):
        """Append `sample`, discarding the oldest entries beyond capacity."""
        self.samples.append(sample)

        # Trim in place (callers may hold a reference to the list) and remove
        # the whole excess in case `samples` was replaced by a longer list.
        overflow = len(self.samples) - self.capacity
        if overflow > 0:
            del self.samples[:overflow]

    def sample(self, n):
        """Return up to `n` distinct stored samples chosen at random."""
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

    def isFull(self):
        """Return True once the number of stored samples reaches capacity."""
        return len(self.samples) >= self.capacity

In [8]:
MEMORY_CAPACITY = 100000   # maximum number of transitions a Memory keeps
BATCH_SIZE = 32            # transitions requested per Agent.replay call
ATARI_SHAPE = (105, 80, 4) # same value as the definition next to Brain
GAMMA = 0.99               # discount applied to the target network's best next-state value

MAX_EPSILON = 1            # exploration rate before any step has been observed
MIN_EPSILON = 0.01         # value the exploration rate decays towards
LAMBDA = 0.001      # speed of decay: rate of the exponential epsilon decay per observed step

UPDATE_TARGET_FREQUENCY = 1000   # observed steps between target-network weight copies

class Agent:
    """Epsilon-greedy agent that learns Q-values from replayed transitions.

    Attributes:
        steps: number of transitions observed so far.
        epsilon: current probability of choosing a random action.
        brain: the Brain holding the online and target networks.
        memory: the Memory holding (s, a, r, s_) transitions.
    """
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)

    @staticmethod
    def _stackFrame(frame):
        """Repeat one 2-D frame four times along a new last axis."""
        return numpy.stack([frame, frame, frame, frame], axis=2)

    def act(self, s):
        """Pick an action for frame `s`.

        With probability `epsilon` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        return numpy.argmax(self.brain.predictOne(s))

    def printInfos(self):
        """Print the exploration rate, memory usage and step counter."""
        print(f' epsilon: {self.epsilon}')
        print(f' lenMemory: {len(self.memory.samples)}')
        print(f' capacityMemory: {self.memory.capacity}')
        print(f' steps: {self.steps}')

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store a transition, sync the target network if due, decay epsilon."""
        self.memory.add(sample)

        # The check runs before the counter is incremented, so the very first
        # observation (steps == 0) also triggers a sync.
        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # Slowly decrease epsilon based on our experience.
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):
        """Train the online network on one random batch from memory.

        For each sampled transition the target for the taken action is `r`
        when the next state is None (terminal), otherwise
        `r + GAMMA * max(target-network prediction for the next state)`.
        The other actions keep the online network's current predictions.
        Does nothing when the memory is empty.
        """
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)
        if batchLen == 0:
            # Nothing to learn from yet; predicting on an empty array would fail.
            return

        # Placeholder fed to the target network for terminal transitions; its
        # prediction is never used because those targets are just `r`.
        no_state = numpy.zeros(ATARI_SHAPE)

        states = numpy.array([self._stackFrame(o[0]) for o in batch])
        states_ = numpy.array([
            no_state if o[3] is None else self._stackFrame(o[3])
            for o in batch
        ])

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_, target=True)

        # Training inputs are the already stacked states (as float64, like the
        # zero-initialised buffer used previously); no need to stack again.
        x = states.astype(numpy.float64)
        y = numpy.zeros((batchLen, self.actionCnt))

        for i, o in enumerate(batch):
            a = o[1]; r = o[2]; s_ = o[3]
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])
            y[i] = t

        self.brain.train(x, y)
class RandomAgent:
    """Agent that acts uniformly at random and only records transitions.

    It exposes the same act/observe/replay/printInfos methods as Agent.
    """

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt
        # Created per instance: a class-level Memory would be built when the
        # class is defined and shared by every RandomAgent.
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a uniformly random action index; the frame `s` is ignored."""
        return random.randint(0, self.actionCnt-1)

    def printInfos(self):
        """Print a marker that this agent is random, plus memory usage."""
        print(f' random: true')
        print(f' lenMemory: {len(self.memory.samples)}')
        print(f' capacityMemory: {self.memory.capacity}')

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store the transition in memory."""
        self.memory.add(sample)

    def replay(self):
        """No-op: the random agent does not learn."""
        pass

In [9]:
class Environment:
    """Wraps a gym environment and plays single episodes with an agent."""

    def __init__(self, problem):
        """Create the gym environment registered under the id `problem`."""
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        """Play one episode, feeding every transition to `agent`.

        On each step the agent picks an action, the transition
        (s, a, r, s_) is passed to `agent.observe` and `agent.replay` is
        called. Frames go through `preprocess`; the next state of the final
        transition is None. When the episode ends the total reward and the
        agent's info are printed.

        Args:
            agent: object providing act, observe, replay and printInfos.

        Returns:
            The sum of the rewards collected during the episode.
        """
        s = preprocess(self.env.reset())
        R = 0

        while True:
            a = agent.act(s)
            # `step` is expected to return (observation, reward, done, info).
            s_, r, done, _info = self.env.step(a)
            # A terminal transition carries no next state, so its frame does
            # not need preprocessing.
            s_ = None if done else preprocess(s_)

            agent.observe((s, a, r, s_))
            s = s_
            R += r

            agent.replay()

            if done:
                print(f'reward: {R}')
                agent.printInfos()
                return R

In [None]:
# Gym environment id to train on.
PROBLEM = 'BreakoutDeterministic-v4'
env = Environment(PROBLEM)

# First dimension of the observation shape; Agent and Brain only store it.
stateCnt  = env.env.observation_space.shape[0]
# Number of discrete actions the environment accepts.
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)


try:
    # Phase 1: play episodes with random actions until at least 90000
    # transitions have been recorded (below the 100000 memory capacity).
    while len(randomAgent.memory.samples) < 90000:
        env.run(randomAgent)
    print('memory full, broke out of while loop 1')
    # Hand the collected transitions to the learning agent; this shares the
    # same list object rather than copying it.
    agent.memory.samples = randomAgent.memory.samples
    randomAgent = None
    print('killed random agent')


    # Phase 2: train without an end condition; the loop only stops when an
    # exception is raised (for example by interrupting the kernel).
    while True:
         #print('while loop 2')
         env.run(agent)

finally:
    # Runs however the try block exits, so the online network is always saved.
    # NOTE(review): the file name says "cartpole" although PROBLEM is a
    # Breakout environment - confirm whether the name should change.
    agent.brain.model.save("cartpole-dqn.h5")

reward: 1.0
 random: true
 lenMemory: 179
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 378
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 566
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 713
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 911
 capacityMemory: 100000
reward: 4.0
 random: true
 lenMemory: 1212
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 1352
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 1495
 capacityMemory: 100000
reward: 5.0
 random: true
 lenMemory: 1800
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 1938
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 2071
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 2268
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 2401
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 2553
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 2691
 capacityMemory: 100000


reward: 1.0
 random: true
 lenMemory: 22265
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 22390
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 22529
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 22688
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 22849
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 22981
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 23118
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 23256
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 23464
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 23598
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 23766
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 23953
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 24124
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 24336
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 24479
 cap

reward: 0.0
 random: true
 lenMemory: 43554
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 43689
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 43909
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 44132
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 44293
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 44454
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 44621
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 44759
 capacityMemory: 100000
reward: 3.0
 random: true
 lenMemory: 44993
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 45135
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 45296
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 45422
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 45646
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 45822
 capacityMemory: 100000
reward: 5.0
 random: true
 lenMemory: 46168
 cap

reward: 2.0
 random: true
 lenMemory: 64326
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 64517
 capacityMemory: 100000
reward: 5.0
 random: true
 lenMemory: 64834
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 65018
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 65244
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 65402
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 65579
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 65742
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 65899
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 66034
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 66191
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 66351
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 66483
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 66624
 capacityMemory: 100000
reward: 4.0
 random: true
 lenMemory: 66938
 cap

reward: 3.0
 random: true
 lenMemory: 85477
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 85644
 capacityMemory: 100000
reward: 4.0
 random: true
 lenMemory: 85935
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 86076
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 86252
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 86444
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 86654
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 86868
 capacityMemory: 100000
reward: 0.0
 random: true
 lenMemory: 86995
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 87164
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 87384
 capacityMemory: 100000
reward: 2.0
 random: true
 lenMemory: 87572
 capacityMemory: 100000
reward: 3.0
 random: true
 lenMemory: 87857
 capacityMemory: 100000
reward: 1.0
 random: true
 lenMemory: 88038
 capacityMemory: 100000
reward: 4.0
 random: true
 lenMemory: 88363
 cap

reward: 1.0
 epsilon: 0.010000007037236153
 lenMemory: 100000
 capacityMemory: 100000
 steps: 18762
reward: 2.0
 epsilon: 0.010000004826953393
 lenMemory: 100000
 capacityMemory: 100000
 steps: 19139
reward: 8.0
 epsilon: 0.010000002467524697
 lenMemory: 100000
 capacityMemory: 100000
 steps: 19810
reward: 1.0
 epsilon: 0.010000001850055246
 lenMemory: 100000
 capacityMemory: 100000
 steps: 20098
reward: 1.0
 epsilon: 0.010000001582831769
 lenMemory: 100000
 capacityMemory: 100000
 steps: 20254
reward: 3.0
 epsilon: 0.010000001270252247
 lenMemory: 100000
 capacityMemory: 100000
 steps: 20474
reward: 3.0
 epsilon: 0.010000000921470515
 lenMemory: 100000
 capacityMemory: 100000
 steps: 20795
reward: 4.0
 epsilon: 0.010000000708372992
 lenMemory: 100000
 capacityMemory: 100000
 steps: 21058
reward: 3.0
 epsilon: 0.010000000566213254
 lenMemory: 100000
 capacityMemory: 100000
 steps: 21282
reward: 3.0
 epsilon: 0.010000000431803567
 lenMemory: 100000
 capacityMemory: 100000
 steps: 21553


reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 48440
reward: 7.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 48828
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 49038
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 49295
reward: 6.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 49640
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 49852
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 50099
reward: 6.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 50699
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 50970
reward: 8.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 51310
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 51780
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps

reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 86612
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 86853
reward: 5.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 87131
reward: 6.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 87476
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 87707
reward: 6.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 88068
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 88293
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 88585
reward: 8.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 88975
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 89185
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 89395
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps

reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 115370
reward: 8.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 115787
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 116016
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 116246
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 116520
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 116730
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 116970
reward: 8.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 117389
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 117644
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 117884
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 118181
reward: 6.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 10

reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 151930
reward: 2.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 152116
reward: 6.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 152472
reward: 6.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 152811
reward: 5.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 153110
reward: 5.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 153397
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 153667
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 154126
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 154973
reward: 8.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 155394
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 155686
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 10

reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 187239
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 187515
reward: 2.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 187906
reward: 5.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 189552
reward: 5.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 190016
reward: 1.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 191674
reward: 3.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 192290
reward: 5.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 192920
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 193246
reward: 6.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 193581
reward: 5.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 100000
 steps: 193904
reward: 4.0
 epsilon: 0.01
 lenMemory: 100000
 capacityMemory: 10