In [1]:
import random
import numpy as np

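Create a neural net:
Input: the current amount of money (1-500)
Output: the amount to bet (1-input)
The objective is to maximize the probability that a bankroll of 200 reaches 500 before it drops to 0.
For a fixed betting strategy bet[i], this probability can be calculated with a linear equation solver.
Let P be the probability of winning a single bet and p(i) the probability of reaching 500 when starting from i:
p(i) = P * p(i+bet[i]) + (1 - P) * p(i-bet[i])
P * p(i+bet[i]) + (1 - P) * p(i-bet[i]) - p(i) = 0
with the boundary conditions p(0) = 0 and p(500) = 1.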

In [2]:
# --- Exploration and training schedule -------------------------------------
e = 1                  # exploration rate: a random bet is drawn when rand() < e
iterations = 20000     # number of training episodes
totalSteps = 0         # running count of environment steps across all episodes
learnStart = 100       # training begins once totalSteps exceeds this value
updateTarget = 1       # the target network is refreshed every `updateTarget` steps

# --- Experience replay ------------------------------------------------------
max_memory = 100       # maximum number of transitions kept in the replay buffer
batch_size = 128       # upper bound on the number of transitions per training batch

# --- Network ----------------------------------------------------------------
input_size = 1         # the network sees a single number: the current bankroll
hidden_size = 30       # units in each hidden layer
num_actions = 500      # width of the output layer (one unit per bet size)
learningRate = 0.0025  # RMSprop learning rate

In [3]:
class environment(object):
    """Betting game on an integer bankroll with absorbing ends at 0 and GOAL.

    Each step the player stakes `amount`; with probability `p` the bankroll
    grows by the stake, otherwise it shrinks by it. The episode is over once
    the bankroll is <= 0 or >= GOAL.

    Attributes:
        p:    probability of winning a single bet.
        s:    current bankroll (the state).
        time: number of steps taken so far.
        done: True once the bankroll has left the open interval (0, GOAL).

    Note: `step` reads the module-level list `history` (goodness values of
    previous episodes) to build its reward baseline, so that name must exist
    before `step` is called.
    """

    GOAL = 500  # target bankroll; states are 0..GOAL inclusive

    def __init__(self, p, s):
        self.p = p
        self.s = s
        self.time = 0
        self.done = False

    def random_action(self):
        """Return a uniformly random bet between 1 and the current bankroll."""
        return np.random.randint(1, self.s + 1)

    def probability(self, bets):
        """Solve for the probability of reaching GOAL from every bankroll.

        Args:
            bets: indexable of length >= GOAL; bets[i] is the stake placed
                with bankroll i (index 0 is ignored). Entries are truncated
                to int so float-valued strategies are accepted.

        Returns:
            Array x of length GOAL + 1 where x[i] is the probability of
            reaching GOAL before 0 when starting from i under `bets`.
        """
        goal = self.GOAL
        size = goal + 1
        a = np.zeros((size, size))
        b = np.zeros(size)
        a[0, 0] = 1        # boundary: x[0] = 0
        a[goal, goal] = 1  # boundary: x[goal] = 1
        b[goal] = 1
        for i in range(1, goal):
            bet = int(bets[i])
            # Row i encodes p * x[i + bet] + (1 - p) * x[i - bet] - x[i] = 0,
            # with overshoots clamped onto the absorbing states. The write
            # order matters when indices coincide (e.g. bet == 0).
            a[i, min(i + bet, goal)] = self.p
            a[i, max(i - bet, 0)] = 1 - self.p
            a[i, i] = -1
        return np.linalg.solve(a, b)

    def getBets(self, model):
        """Return the model's bet for every bankroll 1..GOAL-1 (index 0 is a 0 placeholder)."""
        return [0] + [model.getAction(i) for i in range(1, self.GOAL)]

    def goodness(self, probs):
        """Score a strategy as the sum of its per-state success probabilities."""
        return sum(probs)

    def explore(self, bets, e):
        """Perturb `bets` in place with noise scaled by `e` and return it.

        A value drawn from [0, 15 * e) is added to each bet; the result is
        truncated to an int (bets are used as array offsets) and clamped to
        the valid range 1..i for bankroll i.
        """
        for i in range(1, self.GOAL):
            noisy = int(bets[i] + np.random.rand() * e * 15)
            bets[i] = max(1, min(i, noisy))
        return bets

    def step(self, amount, model, e):
        """Bet `amount` from the current bankroll and advance the game.

        The reward is the goodness of the model's strategy with the current
        state's bet replaced by `amount`, minus the mean of the last (up to)
        100 entries of the global `history`; it is 0 while `history` is empty.

        Args:
            amount: stake to place (expected to be in 1..self.s).
            model:  object exposing getAction(bankroll), used to fill in the
                bets for all other states.
            e:      exploration rate; currently unused because the
                exploration of the surrounding bets is disabled.

        Returns:
            (new bankroll, reward, done)
        """
        bets = self.getBets(model)
        bets[self.s] = amount
        quality = self.goodness(self.probability(bets))

        recent = history[-100:]
        r = quality - sum(recent) / len(recent) if recent else 0

        if np.random.rand() < self.p:
            self.s += amount
        else:
            self.s -= amount

        # Keep the instance flag in sync so callers may poll env.done.
        self.done = bool(self.s <= 0 or self.s >= self.GOAL)
        self.time += 1
        return self.s, r, self.done

In [4]:
class ExperienceReplay(object):
    """Bounded first-in-first-out buffer of transitions for replay training.

    Attributes:
        max_memory: maximum number of stored transitions.
        memory:     list of [[state_t, action_t, reward_t, state_t+1], game_over].
        discount:   discount factor stored alongside the buffer (not used by
            the buffer itself).
    """

    def __init__(self, max_memory, discount):
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):
        """Store one transition, evicting the oldest when the buffer is full.

        Args:
            states:    [state_t, action_t, reward_t, state_t+1].
            game_over: whether state_t+1 ended the episode.
        """
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=batch_size):
        """Return up to `batch_size` stored transitions, sampled without replacement.

        Args:
            model:      unused; kept for interface compatibility.
            batch_size: maximum batch size. The default is the value of the
                module-level `batch_size` at class-definition time.

        Returns:
            A new list of min(batch_size, len(memory)) entries from `memory`.
        """
        sample_size = min(batch_size, len(self.memory))
        # Sample the list itself: random.sample needs a real sequence, and
        # this picks the same entries as sampling indices and looking them up.
        return random.sample(self.memory, sample_size)

In [5]:
from keras.models import Sequential
from keras import optimizers
from keras.layers.core import Dense, Activation, Dropout
class DeepQ(object):
    """Q-network agent with a separate target network.

    Output unit k of the network corresponds to betting k + 1, so a bet
    `action` is stored at output index `action - 1`.

    Attributes:
        model:        the online network that is trained and used to act.
        target_model: network used to pick the best next action when
            building training targets; refreshed by `updateTarget`.
    """

    def __init__(self):
        self.model = self.createModel('relu', learningRate)
        self.target_model = self.createModel('relu', learningRate)

    def createModel(self, activationType, learningRate):
        """Build and compile the network.

        Two dense hidden layers of `hidden_size` units with the given
        activation, followed by a `num_actions`-wide softmax output; compiled
        with MSE loss and RMSprop.

        NOTE(review): a softmax output restricts predictions to values in
        [0, 1] that sum to 1, while the training targets are
        reward + discounted value -- confirm whether a linear output layer
        was intended.
        """
        layerSize = hidden_size
        model = Sequential()
        model.add(Dense(layerSize, input_shape=(input_size, ), init='lecun_uniform'))
        model.add(Activation(activationType))
        model.add(Dense(layerSize, init='lecun_uniform'))
        model.add(Activation(activationType))
        model.add(Dense(num_actions, init='lecun_uniform'))
        model.add(Activation("softmax"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def getAction(self, s):
        """Return the bet for bankroll `s`, always within 1..s.

        The greedy bet is argmax + 1 of the online network's outputs; if it
        exceeds the bankroll, a uniformly random affordable bet is returned.
        """
        state = np.array(s)
        qValues = self.model.predict(state.reshape(1, 1))[0]
        x = np.argmax(qValues) + 1  # output index k means "bet k + 1"
        if x > s:
            x = np.random.randint(1, s + 1)
        return x

    def updateTarget(self):
        """Copy the online network's weights into the target network.

        The weights are copied rather than rebinding the attribute, so the
        two networks remain distinct objects.
        """
        self.target_model.set_weights(self.model.get_weights())

    def trainModel(self, batch, discount):
        """Run one training step on a batch of replayed transitions.

        Args:
            batch:    iterable of [[state, action, reward, newState], isFinal].
            discount: discount factor applied to the bootstrapped value.

        Returns:
            The value returned by `train_on_batch`, or 0.0 for an empty batch.

        For each transition the target vector equals the current predictions
        with the entry of the taken action replaced by
          - `reward`                                   if the transition is final,
          - `reward + discount * Q(newState, best)`    otherwise,
        where `best` is the target network's argmax at newState and Q is the
        online network. Final transitions additionally contribute a sample
        that maps newState to `reward` for every action.
        """
        batch = list(batch)
        if not batch:
            return 0.0

        count = len(batch)
        states = np.array([sample[0][0] for sample in batch],
                          dtype=np.float64).reshape(count, -1)
        newStates = np.array([sample[0][3] for sample in batch],
                             dtype=np.float64).reshape(count, -1)

        # One forward pass per network instead of three per sample.
        qStates = self.model.predict(states)
        qNewOnline = self.model.predict(newStates)
        qNewTarget = self.target_model.predict(newStates)
        numOutputs = qStates.shape[1]

        inputs = []
        targets = []
        for k, sample in enumerate(batch):
            action = sample[0][1]
            reward = sample[0][2]
            isFinal = sample[1]

            if isFinal:
                # No future value beyond a terminal state.
                targetValue = reward
            else:
                bestAction = np.argmax(qNewTarget[k])
                targetValue = reward + discount * qNewOnline[k][bestAction]

            Y_sample = np.array(qStates[k], dtype=np.float64)
            Y_sample[action - 1] = targetValue  # bet `action` lives at index action - 1
            inputs.append(states[k])
            targets.append(Y_sample)

            if isFinal:
                inputs.append(newStates[k])
                targets.append(np.full(numOutputs, reward, dtype=np.float64))

        return self.model.train_on_batch(np.array(inputs), np.array(targets))

    def save(self):
        """Write the online network's weights to 'my_model_weights.h5'."""
        self.model.save_weights('my_model_weights.h5', overwrite=True)

    def load(self):
        """Load the online network's weights from 'my_model_weights.h5'."""
        self.model.load_weights('my_model_weights.h5')

Using Theano backend.
Using gpu device 0: GeForce GTX 970 (CNMeM is disabled, cuDNN not available)


In [None]:
# Bets chosen by a freshly initialised (untrained) network for every
# bankroll from 1 to 499.
DQN = DeepQ()
strat = [DQN.getAction(bankroll) for bankroll in range(1, 500)]
print(strat)

[1, 2, 1, 1, 3, 1, 3, 8, 2, 10, 2, 2, 1, 13, 10, 16, 13, 5, 4, 3, 10, 13, 10, 12, 12, 23, 15, 13, 18, 13, 12, 16, 29, 9, 21, 13, 12, 2, 7, 30, 34, 7, 12, 10, 39, 17, 14, 39, 10, 36, 20, 49, 10, 53, 53, 28, 2, 47, 43, 38, 47, 59, 6, 31, 48, 50, 57, 13, 67, 28, 11, 40, 66, 53, 36, 50, 11, 34, 25, 55, 38, 25, 32, 64, 14, 59, 27, 40, 61, 1, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 9

In [None]:
# Train the agent. Each episode starts from a random bankroll with a win
# probability of 0.3 and runs until the bankroll leaves the range (0, 500).
discount = 0.99
exp_replay = ExperienceReplay(max_memory, discount)
DQN = DeepQ()
# Goodness of the greedy strategy after each finished episode.
# environment.step reads this list to build its reward baseline.
history = []
for i_episode in range(iterations):
    R = 0     # cumulative reward of the episode
    env = environment(0.3, np.random.randint(1, 500))
    loss = 0  # accumulated training loss of the episode
    if e > 0.05:
        e *= 0.991  # decay exploration once per episode, down to a floor of about 0.05
    done = False
    while not done:
        s = env.s
        # Epsilon-greedy action selection.
        if np.random.rand() < e:
            action = env.random_action()
        else:
            action = DQN.getAction(s)
        ss, r, done = env.step(action, DQN, e)
        R += r
        # Remember this transition for later training.
        exp_replay.remember([s, action, r, ss], done)
        totalSteps += 1
        if learnStart < totalSteps:
            if totalSteps % updateTarget == 0:
                DQN.updateTarget()
            # get_batch takes (model, batch_size): pass both explicitly.
            loss += DQN.trainModel(exp_replay.get_batch(DQN, batch_size), discount)

    t = env.time
    # Evaluate the greedy strategy once and reuse the value for both the
    # history entry and the progress report.
    episode_goodness = env.goodness(env.probability(env.getBets(DQN)))
    history.append(episode_goodness)
    if i_episode % 5 == 4:
        DQN.save()
        print(sum(history[-100:]))
        print("Episode {} finished after {} timesteps with goodness {} and exploration value {}: reward={}".format(
            i_episode, t + 1, episode_goodness, e, r))

409.300360348
Episode 4 finished after 9 timesteps with goodness 81.842199151 and exploration value 0.955802742746: reward=0.248092010105
819.555699031
Episode 9 finished after 5 timesteps with goodness 81.8088083639 and exploration value 0.913558883041: reward=-0.149992216346


In [None]:
# Mean goodness over the most recent episodes (at most 300). Dividing by the
# number of entries actually present keeps the mean correct when fewer than
# 300 episodes have been recorded.
recent_history = history[-300:]
print(sum(recent_history) / float(len(recent_history)) if recent_history else 0.0)

In [None]:
# Bets chosen by the current network for every bankroll from 1 to 499.
strat = [DQN.getAction(bankroll) for bankroll in range(1, 500)]
print(strat)

In [None]:
# Sanity check: success probabilities when always betting 1 with a win
# probability of 0.3. The win probability is taken from the environment, so
# probability() receives only the bets; they must be integers because they
# are used as array offsets.
env = environment(0.3, 200)
bets = np.ones(501, dtype=int)
print(env.probability(bets))