In [1]:
import random
import numpy as np

In [2]:
e = 1
iterations = 20000
max_memory = 1000000
hidden_size = 30
num_actions = 500
input_size = 1
batch_size = 128
totalSteps = 0
learningRate = 0.00025
learnStart = 100
updateTarget = 5

In [3]:
class environment():
    def __init__(self, p, s):
        self.p = p
        self.s = s
        self.time = 0
        self.done = False
        
    def random_action(self):
        return np.random.randint(1, self.s + 1)
    
    def step(self, amount):
        if(np.random.rand() < self.p):
            self.s += amount
        else:
            self.s -= amount
        r = 0
        done = False
        if(self.s <= 0):
            done = True
            r = -1
        if(self.s >= 500):
            done = True
            r = 1
        self.time += 1
        return self.s, r, done

In [4]:
class ExperienceReplay(object):
    def __init__(self, max_memory, discount):
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):
        # memory[i] = [[state_t, action_t, reward_t, state_t+1], game_over?]
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=batch_size):
        indices = random.sample(np.arange(len(self.memory)), min(batch_size,len(self.memory)) )
        miniBatch = []
        for index in indices:
            miniBatch.append(self.memory[index])
        return miniBatch

In [5]:
from keras.models import Sequential
from keras import optimizers
from keras.layers.core import Dense, Activation, Dropout
class DeepQ:
    def __init__(self):
        self.model = self.createModel('relu', learningRate)
        self.target_model = self.createModel('relu', learningRate)

    def createModel(self, activationType, learningRate):
            layerSize = hidden_size
            model = Sequential()
            model.add(Dense(layerSize, input_shape=(input_size, ), init='lecun_uniform'))
            model.add(Activation(activationType))
            model.add(Dropout(0.1))
            model.add(Dense(layerSize, init='lecun_uniform'))
            model.add(Activation(activationType))
            model.add(Dropout(0.1))
            model.add(Dense(layerSize, init='lecun_uniform'))
            model.add(Activation(activationType))
            model.add(Dropout(0.1))
            model.add(Dense(layerSize, init='lecun_uniform'))
            model.add(Activation(activationType))
            model.add(Dropout(0.1))
            model.add(Dense(num_actions, init='uniform'))
            model.add(Activation("softmax"))
            optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
            model.compile(loss="mse", optimizer=optimizer)
            #print model.summary()
            return model
    def getAction(self, s):
        state = np.array(s)
        qValues = self.model.predict(state.reshape(1, 1))[0]
        x = np.argmax(qValues) + 1 #outputs number from 1-500 inclusive
        if x > s:
            x = np.random.randint(1, s + 1)
        return x
    def updateTarget(self):
        self.target_model = self.model
    def trainModel(self, batch, discount):
        X_batch = np.empty((0, input_size), dtype = np.float64)
        Y_batch = np.empty((0, num_actions), dtype = np.float64)
        for sample in batch:
            state = np.array([sample[0][0]])
            action = sample[0][1]
            reward = sample[0][2]
            newState = np.array([sample[0][3]])
            isFinal = sample[1]
            qValues = self.model.predict(state.reshape(1,len(state)))[0]
            bestAction = np.argmax(self.target_model.predict(newState.reshape(1,len(newState)))[0])
            qValuesNewState = self.model.predict(newState.reshape(1,len(newState)))[0]
            targetValue = reward + discount * qValuesNewState[bestAction]

            X_batch = np.append(X_batch, np.array([state.copy()]), axis=0)
            Y_sample = qValues.copy()
            Y_sample[action] = targetValue
            Y_batch = np.append(Y_batch, np.array([Y_sample]), axis=0)
            if isFinal:
                X_batch = np.append(X_batch, np.array([newState.copy()]), axis=0)
                Y_batch = np.append(Y_batch, np.array([[reward]*num_actions]), axis=0)
        return self.model.train_on_batch(X_batch, Y_batch)
    def save(self):
        self.model.save_weights('my_model_weights.h5')
    def load(self):
        self.model.load_weights('my_model_weights.h5')

Using Theano backend.
Using gpu device 0: GeForce GTX 970 (CNMeM is disabled, cuDNN not available)


In [None]:
DQN = DeepQ()
strat = []
for i in xrange(1, 500):
    x = DQN.getAction(i)
    strat.append(x)
print strat

[1, 1, 3, 1, 2, 1, 5, 2, 7, 9, 5, 8, 12, 4, 11, 3, 13, 2, 2, 15, 14, 18, 7, 2, 1, 22, 7, 5, 29, 7, 29, 19, 2, 1, 9, 20, 12, 12, 17, 26, 37, 24, 6, 23, 37, 24, 18, 19, 47, 18, 38, 37, 6, 48, 53, 55, 49, 7, 58, 41, 33, 43, 61, 15, 8, 31, 56, 26, 20, 38, 5, 28, 20, 67, 53, 69, 41, 7, 76, 3, 24, 51, 80, 55, 8, 41, 14, 64, 17, 79, 90, 87, 89, 84, 10, 90, 90, 12, 82, 95, 37, 11, 34, 7, 13, 12, 36, 93, 20, 68, 35, 56, 12, 76, 37, 9, 88, 80, 87, 15, 81, 119, 60, 118, 96, 108, 99, 60, 8, 20, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 13

In [None]:
exp_replay = ExperienceReplay(max_memory, 0.99)
DQN = DeepQ()
history = []
for i_episode in range(iterations):
    #print i_episode, time / (i_episode+1)
    env = environment(0.3, 200)
    loss = 0
    if e > 0.05:
        e *= 0.991
    while(env.done == False):
        #env.render()
        #print observation
        s = env.s #get state
        #Select action
        if(np.random.rand() < e):
            action = env.random_action()
        else:
            action = DQN.getAction(s)          
        ss, r, done = env.step(action)
        #remeber this state and action for later training
        exp_replay.remember([s, action, r, ss], done)
        #train model
        s = ss
        totalSteps += 1
        if learnStart < totalSteps:
            if totalSteps % updateTarget == 0:
                DQN.updateTarget()
            loss += DQN.trainModel(exp_replay.get_batch(batch_size), 0.99)
        if done:
            t = env.time
            
            if s >= 500:
                history.append(1)
            else:
                history.append(0)
            if(i_episode % 10 == 9):
                print sum(history[len(history) - min(len(history), 100): len(history)])
                print "Episode {} finished after {} timesteps with loss {} and exploration value {}: end={}".format(i_episode, t+1,
                                                                                                    loss / (t + 1), e, s)
            break
#env.render(close=True)

0
Episode 9 finished after 8 timesteps with loss 0 and exploration value 0.913558883041: end=0
1
Episode 19 finished after 14 timesteps with loss 0.0831435394606 and exploration value 0.834589832783: end=0


In [None]:
print sum(history[len(history)-300:len(history)]) / 300.0
DQN.save()

In [None]:
strat = []
for i in xrange(1, 500):
    x = DQN.getAction(i)
    strat.append(x)
print strat