In [1]:
import theano as theano
import theano.tensor as T
from theano.gradient import grad_clip
import numpy as np
import time
import operator
import sys
from time import time
import glob

In [3]:
class GRUTheano:
    """Two-layer GRU network expressed as a Theano graph and trained with RMSprop.

    Every input row is first projected by ``E``, then passed through two
    stacked GRU layers (slices 0-2 of ``U``/``W``/``b`` belong to the first
    layer, slices 3-5 to the second), and finally mapped to the output with
    ``relu(V.dot(s_t2) + c)``.

    Compiled functions created by ``__theano_build__``:
        predict(x)        -- the output ``o`` for every step of ``x``.
        predict_class(x)  -- ``argmax`` of ``o`` along axis 1.
        ce_error(x, y)    -- the training cost. Despite the name, this is the
                             summed squared error ``sum((o - y) ** 2)``.
        bptt(x, y)        -- gradients ``[dE, dU, dW, db, dV, dc]`` of the cost.
        sgd_step(x, y, learning_rate, decay=0.9)
                          -- applies one RMSprop update to every parameter and
                             its cache; compiled with an empty output list.
    """
    
    def __init__(self, x_dim,y_dim, hidden_dim=128, bptt_truncate=-1):
        """Create randomly initialised parameters and compile the Theano functions.

        Args:
            x_dim: length of one input row (``E`` has shape ``(hidden_dim, x_dim)``).
            y_dim: length of one output row (``V`` has shape ``(y_dim, hidden_dim)``).
            hidden_dim: size of each GRU layer's hidden state.
            bptt_truncate: forwarded to ``theano.scan`` as ``truncate_gradient``.
        """
        # Assign instance variables
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Initialize the network parameters: uniform in +/- sqrt(1/fan_in),
        # biases at zero. U, W and b stack the 3 gates of both GRU layers (6 slices).
        E = np.random.uniform(-np.sqrt(1./x_dim), np.sqrt(1./x_dim), (hidden_dim, x_dim))
        U = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (y_dim, hidden_dim))
        b = np.zeros((6, hidden_dim))
        c = np.zeros(y_dim)
        # Theano: Created shared variables (cast to theano.config.floatX)
        self.E = theano.shared(name='E', value=E.astype(theano.config.floatX))
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.b = theano.shared(name='b', value=b.astype(theano.config.floatX))
        self.c = theano.shared(name='c', value=c.astype(theano.config.floatX))
        # RMSprop caches: one zero-initialised running average of squared
        # gradients per parameter, updated by sgd_step.
        self.mE = theano.shared(name='mE', value=np.zeros(E.shape).astype(theano.config.floatX))
        self.mU = theano.shared(name='mU', value=np.zeros(U.shape).astype(theano.config.floatX))
        self.mV = theano.shared(name='mV', value=np.zeros(V.shape).astype(theano.config.floatX))
        self.mW = theano.shared(name='mW', value=np.zeros(W.shape).astype(theano.config.floatX))
        self.mb = theano.shared(name='mb', value=np.zeros(b.shape).astype(theano.config.floatX))
        self.mc = theano.shared(name='mc', value=np.zeros(c.shape).astype(theano.config.floatX))
        # We store the Theano graph here
        # (nothing in this class populates the dict; it is left empty)
        self.theano = {}
        self.__theano_build__()
    
    def __theano_build__(self):
        """Build the symbolic forward pass, cost and gradients, and compile them.

        Attaches ``predict``, ``predict_class``, ``ce_error``, ``bptt`` and
        ``sgd_step`` to the instance as compiled Theano functions.
        """
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c
        
        # Symbolic inputs: both are float64 matrices (dmatrix), independent of floatX.
        x = T.dmatrix('x')
        y = T.dmatrix('y')
        
        def forward_prop_step(x_t, s_t1_prev, s_t2_prev):
            """One scan step: map input row ``x_t`` and both previous hidden
            states to ``[o_t, s_t1, s_t2]``."""
            # This is how we calculated the hidden state in a simple RNN. No longer!
            # s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))
            
            # Word embedding layer
            # (here a dense projection of the input row by E)
            x_e = E.dot(x_t)
            
            # GRU Layer 1: update gate z, reset gate r, candidate c, new state s
            z_t1 = T.nnet.sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
            r_t1 = T.nnet.sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
            c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
            s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev
            
            # GRU Layer 2: same gates, fed by layer 1's new state
            z_t2 = T.nnet.sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
            r_t2 = T.nnet.sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
            c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
            s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev
            
            # Final output calculation (relu, so outputs are never negative)
            o_t = T.nnet.relu(V.dot(s_t2) + c)

            return [o_t, s_t1, s_t2]
        
        # Run the step over x; the output o is not fed back (None), and both
        # hidden states start from zero vectors.
        [o, s, s2], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None, 
                          dict(initial=T.zeros(self.hidden_dim)),
                          dict(initial=T.zeros(self.hidden_dim))])
        
        # Index of the largest output at every step, and the summed squared
        # difference between the outputs and the targets y.
        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.pow(o-y,2))
        
        # Total cost (could add regularization here)
        cost = o_error
        
        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)
        
        # Assign functions
        # NOTE: ce_error returns the squared-error cost above, not a cross-entropy.
        self.predict = theano.function([x], o)
        self.predict_class = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], cost)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])
        
        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')
        
        # rmsprop cache updates: exponential moving average of squared gradients
        mE = decay * self.mE + (1 - decay) * dE ** 2
        mU = decay * self.mU + (1 - decay) * dU ** 2
        mW = decay * self.mW + (1 - decay) * dW ** 2
        mV = decay * self.mV + (1 - decay) * dV ** 2
        mb = decay * self.mb + (1 - decay) * db ** 2
        mc = decay * self.mc + (1 - decay) * dc ** 2
        
        # One RMSprop step: each gradient is scaled by sqrt(cache + 1e-6) using
        # the freshly computed cache; decay defaults to 0.9 when not supplied.
        self.sgd_step = theano.function(
            [x, y, learning_rate, theano.In(decay, value=0.9)],
            [], 
            updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mE, mE),
                     (self.mU, mU),
                     (self.mW, mW),
                     (self.mV, mV),
                     (self.mb, mb),
                     (self.mc, mc)
                    ])
        
        

In [4]:
def num_to_move(num):
    """Translate an action index (0-7) into its bet-range label.

    Returns None when `num` equals none of the eight known indices.
    """
    labels = (
        "bet 0-25",
        "bet 25-50",
        "bet 50-75",
        "bet 75-100",
        "bet 100-125",
        "bet 125-150",
        "bet 150-175",
        "bet 175-200",
    )
    # Compare with == (rather than indexing) so that any value which compares
    # equal to an index is accepted and everything else falls through to None.
    for index, label in enumerate(labels):
        if num == index:
            return label
    return None
def save_model_parameters_theano(model, outfile):
    """Write the six trainable parameters of `model` to an ``.npz`` archive.

    Args:
        model: object exposing shared variables ``E``, ``U``, ``W``, ``V``,
            ``b`` and ``c``, each with a ``get_value()`` method.
        outfile: destination passed straight to ``np.savez``.

    The arrays are stored under the keys "E", "U", "W", "V", "b" and "c".
    """
    params = dict(
        (name, getattr(model, name).get_value())
        for name in ("E", "U", "W", "V", "b", "c")
    )
    np.savez(outfile, **params)
    # Parenthesised single-argument form: valid on Python 2 and Python 3 alike.
    print("Saved model parameters to %s." % outfile)
def load_model_parameters_theano(path, modelClass=GRUTheano):
    """Rebuild a model from an ``.npz`` archive written by the save helper.

    Args:
        path: archive containing the arrays "E", "U", "W", "V", "b" and "c".
        modelClass: class instantiated as
            ``modelClass(x_dim, y_dim, hidden_dim=hidden_dim)``.

    Returns:
        A new `modelClass` instance whose parameters are set to the stored arrays.

    Raises:
        ValueError: if the hidden size implied by ``E`` disagrees with ``V``.
    """
    # Close the archive once the arrays are read, so no file handle leaks.
    with np.load(path) as npzfile:
        E, U, W, V, b, c = [npzfile[name] for name in ("E", "U", "W", "V", "b", "c")]
    hidden_dim, x_dim = E.shape[0], E.shape[1]
    y_dim = V.shape[0]
    # Previously hidden_dim was silently overwritten from V; a mismatch would
    # build a model whose size disagrees with the loaded E.
    if V.shape[1] != hidden_dim:
        raise ValueError(
            "Inconsistent hidden_dim in %s: E implies %d, V implies %d"
            % (path, hidden_dim, V.shape[1]))
    print("Building model model from %s with hidden_dim=%d x_dim=%d y_dim=%d " % (path, hidden_dim, x_dim, y_dim))
    sys.stdout.flush()
    model = modelClass(x_dim, y_dim, hidden_dim=hidden_dim)
    for shared, value in ((model.E, E), (model.U, U), (model.W, W),
                          (model.V, V), (model.b, b), (model.c, c)):
        shared.set_value(value)
    return model
def predict_hand(hand):
    """Return the model's last-step output after each step of `hand`.

    `hand` is an iterable of ``(state, action, reward, done)`` tuples. For
    every step the action is appended to the state, the resulting row is
    added to the history, and the module-level `model` is asked to predict
    on the whole history so far; the final row of that prediction is kept.
    """
    history = None
    outputs = []
    for observation, action, _reward, _done in hand:
        row = np.append(observation, action)[np.newaxis, :]
        if history is None:
            history = row
        else:
            history = np.concatenate((history, row), axis=0)
        outputs.append(model.predict(history)[-1])
    return outputs
def predict_hand_moves(hand):
    """Return the predicted move label after each step of `hand`.

    `hand` is an iterable of ``(state, action, reward, done)`` tuples. For
    every step the action is appended to the state, the row is added to the
    history, and the module-level `model` predicts on the history so far.
    The index of the largest value in the final prediction row is converted
    to text with `num_to_move`.

    Returns:
        A list with one label per step (empty for an empty hand). The
        original built this list but never returned it.
    """
    states = None
    moves = []
    for state, action, reward, done in hand:
        row = np.array([np.append(state, action)])
        if states is None:
            states = row
        else:
            states = np.append(states, row, axis=0)
        best = np.argmax(model.predict(states)[-1])
        moves.append(num_to_move(best))
    return moves
## HELPER FUNCTIONS

In [5]:
from time import time
from time import sleep
import commands
import os
# --- Mutable training state shared with train() ---
experience_replay = []  # memories of experiences with high reward magnitudes
recent_memory = []      # recent memories from parsed files; sampled randomly
                        # from this to reduce overestimation
states_seen = 0         # number of non-initial states processed so far
counts = {}             # how often each move label has been seen
count = 0               # not referenced by the code visible in this notebook

# --- Buffer sizes ---
recent_memory_size = 400  # recent_memory is drained back down to this size
replay_memory = 100       # experience_replay grows until it exceeds this
memory = 30               # max rows kept in one state/target training window

# --- Learning hyper-parameters ---
lr = 1e-5           # learning rate handed to model.sgd_step
gamma = 0.95        # discount applied to the estimated next-state value
reward_length = 30  # give reward as an average over many states to reduce
                    # risky behavior

def _slide_window(window, row, limit):
    """Return `window` with `row` appended, dropping the oldest row first
    when `window` already holds `limit` rows."""
    kept = window if len(window) < limit else window[1:]
    return np.append(kept, np.array([row]), axis=0)


def train(base_dir="/Users/michaelgump/pokerbots-2017"):
    """Play against every opponent config and fit `model` on the parsed games.

    For each ``config_files/*.txt`` under `base_dir`:
      1. the config is copied to ``config.txt`` and the engine
         (``java -jar engine.jar -Q``) is run from `base_dir`;
      2. every pickled game in ``selftraining/*.p`` is converted into
         sliding windows of states and Q-value targets;
      3. `model` is updated from randomly drawn recent windows and, once more
         than 500 states have been seen, from one replayed experience;
      4. the game file is deleted.
    Finally the parameters are saved to ``modelbottest/model.npz``.

    Args:
        base_dir: root of the pokerbots checkout. The default reproduces the
            paths that were previously hard-coded.

    Relies on module-level names: `model` and `model_value` (networks with
    ``predict`` / ``sgd_step``), the hyper-parameters `memory`, `lr`, `gamma`,
    `reward_length`, `recent_memory_size`, `replay_memory`, and the buffers
    `experience_replay`, `recent_memory`, `counts`, `states_seen`.

    Fixes relative to the previous version: `pickle` was never imported;
    `reward` was read before assignment (the unpacked value is `r`); the
    module-level buffers and counter were rebound without ``global`` and
    raised UnboundLocalError; the reward window held ``reward_length + 1``
    entries; game files were opened without being closed.
    """
    import pickle
    import subprocess

    # The lists are only mutated in place; the integer counter must be rebound.
    global states_seen

    # Iterate config files to train on multiple other bots with different strategies.
    for config in glob.glob(os.path.join(base_dir, "config_files", "*.txt")):

        # Write it to the main directory so the engine will run it.
        with open(config, 'rb') as src:
            content = src.readlines()
        with open(os.path.join(base_dir, "config.txt"), "wb") as dst:
            dst.writelines(content)

        # Run the engine to completion; its combined stdout/stderr is captured
        # and discarded, as it was with commands.getstatusoutput.
        engine = subprocess.Popen('java -jar engine.jar -Q', shell=True, cwd=base_dir,
                                  stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        engine.communicate()
        sleep(20)
        files = glob.glob(os.path.join(base_dir, "selftraining", "*.p"))

        last_rewards = []

        # Iterate parsed games.
        for f in files:
            print(f)
            # SECURITY: pickle.load can execute arbitrary code. These files are
            # expected to come from the local engine run only.
            with open(f, "rb") as game_file:
                states, rewards = pickle.load(game_file)

            train_states = np.array([])
            train_targets = np.array([])

            for i in range(len(states)):

                state = states[i]
                r, action, done = rewards[i]

                if len(train_states) == 0:
                    # First state of the game: seed both windows.
                    train_states = np.array([state])
                    target = model.predict(states)[-1]
                    train_targets = np.array([target])
                else:
                    # NOTE(review): this and the model_value call below predict
                    # on the full `states` of the game, not on the
                    # `train_states` window, so both return the same value for
                    # every step of a file. Presumably the window was intended
                    # -- confirm before changing.
                    qval = model.predict(states)[-1]

                    train_states = _slide_window(train_states, state, memory)

                    # Use a version of the network from many iterations ago for
                    # estimating the value of the next state, to reduce
                    # overestimation.
                    newQ = model_value.predict(states)[-1]
                    maxQ = np.max(newQ)

                    # Smooth the reward over the last `reward_length` raw
                    # rewards. The divisor stays `reward_length` even while the
                    # window is still filling, as before (damps early rewards).
                    last_rewards = (last_rewards + [float(r)])[-reward_length:]
                    reward = float(sum(last_rewards)) / reward_length

                    if done:
                        update = reward
                    else:
                        update = reward + gamma * maxQ

                    text = num_to_move(action)
                    counts[text] = counts.get(text, 0) + 1

                    # Target = current Q-values with the taken action's entry
                    # replaced by the bootstrapped estimate.
                    target = np.array(qval, dtype=float)
                    target[action] = float(update)

                    train_targets = _slide_window(train_targets, target, memory)

                    if len(experience_replay) > replay_memory:
                        experience_replay.sort(key=lambda entry: entry[2])
                        # NOTE(review): the raw reward `r` is compared against
                        # stored *smoothed* rewards, and ordering is by signed
                        # value rather than magnitude -- confirm intent.
                        if float(r) > experience_replay[0][2]:
                            experience_replay.pop(0)
                            experience_replay.append((train_states, train_targets, reward))
                    else:
                        experience_replay.append((train_states, train_targets, reward))
                    states_seen += 1
                recent_memory.append((train_states, train_targets))

            # Sample recent memory randomly until it is back to its size limit.
            if len(recent_memory) > recent_memory_size:
                for _ in range(len(recent_memory) - recent_memory_size):
                    inputs, outputs = recent_memory.pop(np.random.randint(len(recent_memory)))
                    model.sgd_step(inputs, outputs, lr)

            # Don't start replaying experiences until a memory has been assembled.
            if states_seen > 500:
                inputs, outputs, _ = experience_replay[np.random.randint(len(experience_replay))]
                model.sgd_step(inputs, outputs, lr)

            os.remove(f)

    save_model_parameters_theano(model, os.path.join(base_dir, "modelbottest", "model.npz"))