In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, MaxPooling2D, Conv2D, Reshape, LeakyReLU
from keras.optimizers import Adam
from keras import backend as K

# def custom_activation(x):
#    return (124 * K.sigmoid(x)) - 62

class DQNAgent:
    """Deep Q-learning agent with an experience-replay buffer.

    The agent owns a small convolutional Q-network that maps a board state to
    one Q-value per action, and selects among the *valid* actions only.

    The ``env`` passed to the constructor must expose ``size`` (board edge
    length) and ``action_space`` (a sized collection of actions).
    """

    def __init__(self, env):
        # Replay buffer; the deque silently drops the oldest transitions once
        # it holds 1,000,000 entries.
        self.memory = deque(maxlen=1000000)
        self.gamma = 1  # discount rate (1 => undiscounted returns)
        # Exploration rate for epsilon-greedy action selection. It is kept
        # constant: no decay schedule is applied anywhere in this class.
        self.epsilon = 0.05
        self.learning_rate = 0.001
        self.model = self._build_model(env)
        self.size = env.size
        self.actions_size = len(env.action_space)

    def _build_model(self, env):
        """Build and compile the convolutional Q-network.

        The network takes inputs of shape ``(env.size, env.size, 3)`` and
        outputs ``len(env.action_space)`` linear Q-values.
        """
        nb_actions = len(env.action_space)

        def leaky_relu(x):
            # Leaky ReLU (negative slope 0.1) built from the Keras backend that
            # this cell already imports as K, so the class does not depend on
            # `tf` being imported by some other cell.
            return K.relu(x, alpha=0.1)

        model = Sequential()
        # Only the first layer needs input_shape; Sequential infers the rest.
        model.add(Conv2D(64, kernel_size=(3, 3),
                         input_shape=(env.size, env.size, 3),
                         activation=leaky_relu))
        model.add(Conv2D(64, kernel_size=(3, 3), activation=leaky_relu))
        model.add(Conv2D(128, kernel_size=(2, 2), activation=leaky_relu))
        model.add(Conv2D(128, kernel_size=(3, 3), activation=leaky_relu))
        model.add(Flatten())

        # Output layer: no activation, i.e. unbounded linear Q-values.
        model.add(Dense(nb_actions))

        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done, valid_actions_next):
        """Append one transition to the replay buffer.

        Both states are transposed and wrapped in a leading batch axis of
        length 1 before being stored, so they can be fed to the model as-is.
        (Presumably the environment returns channel-first states and the
        transpose makes them channel-last -- confirm against the env.)
        """
        transition = (
            np.array([state.T]),
            action,
            reward,
            np.array([next_state.T]),
            done,
            valid_actions_next,
        )
        self.memory.append(transition)

    def act(self, state, valid_actions, eps_greedy=True):
        """Pick an action from ``valid_actions``.

        With ``eps_greedy=True`` a uniformly random valid action is returned
        with probability ``self.epsilon``; otherwise (and always when
        ``eps_greedy=False``) the valid action with the highest predicted
        Q-value is returned.
        """
        if eps_greedy and np.random.rand() <= self.epsilon:  # explore
            return random.choice(valid_actions)
        # Restrict the argmax to the Q-values of the valid actions.
        act_values = self.model.predict(np.array([state.T]))[0, valid_actions]
        return valid_actions[np.argmax(act_values)]

    def replay(self, batch_size):
        """Fit the model on one minibatch sampled from the replay buffer.

        For every sampled transition the regression target equals the model's
        current prediction, except for the taken action, whose target is
        ``reward`` for terminal transitions and
        ``reward + gamma * max_a' Q(next_state, a')`` (maximum over the valid
        next actions) otherwise.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        minibatch = random.sample(self.memory, batch_size)

        states = np.zeros((batch_size, self.size, self.size, 3))
        next_states = np.zeros((batch_size, self.size, self.size, 3))
        for row, (state, _, _, next_state, _, _) in enumerate(minibatch):
            # Stored states carry a leading batch axis of length 1.
            states[row] = state[0]
            next_states[row] = next_state[0]

        # Two batched forward passes instead of two predict() calls per
        # transition; the per-row values are the same.
        targets = self.model.predict(states)
        next_q_values = self.model.predict(next_states)

        for row, (_, action, reward, _, done, valid_actions_next) in enumerate(minibatch):
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(next_q_values[row, valid_actions_next])
            targets[row][action] = target

        self.model.fit(states, targets, epochs=1, verbose=0)

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)

Using TensorFlow backend.


In [2]:
# with tf.Session() as sess:  print(custom_activation(-100.0).eval()) 

In [2]:
import gym
import gym_reversi

import tensorflow as tf
from keras import backend

# Create a TensorFlow session with explicit device counts and register it as
# the session the Keras backend should use.
device_limits = {'GPU': 1, 'CPU': 4}
config = tf.ConfigProto(device_count=device_limits)
sess = tf.Session(config=config)
backend.set_session(sess)


def opponent(board, avail):
    """Scripted opponent: pick a position where ``avail`` is maximal.

    ``board`` is accepted for interface compatibility but is not used.
    When several positions share the maximum value, one of them is chosen
    uniformly at random. The result is a ``(row, col)`` tuple built from the
    first two index arrays of ``np.where``.
    """
    is_best = avail == np.amax(avail)
    hits = np.where(is_best)
    candidates = [(row, col) for row, col in zip(hits[0], hits[1])]
    return random.choice(candidates)


def self_play_opponent(board, avail):
    """Opponent policy that lets the module-level ``agent`` play the other side.

    ``board`` and ``avail`` are ignored; the state and the valid actions are
    queried from the module-level ``env`` for the player ``-env.AI_Player``,
    and the agent's chosen action index is converted to a board position.
    """
    other_side = -env.AI_Player
    observation = env.get_state(other_side)
    legal = env.get_actions(other_side)
    chosen = agent.act(observation, legal)
    return env.action_ind_to_board_pos(chosen)


# Build the Reversi environment with the scripted opponent and the learner
# playing as player 1, then create the agent for it.
env = gym.make(
    "reversi-v0",
    opponent=opponent,
    AI_Player=1,
)
agent = DQNAgent(env)




Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 6, 6, 64)          1792      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4, 4, 64)          36928     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 3, 3, 128)         32896     
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 1, 1, 128)         147584    
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 60)                7740      
Total params: 226,940
Trainable params: 226,940
Non-trainable para

# Smoke test: run one agent move through the environment and store the
# resulting transition in the replay buffer.
state = env.reset()
valid_actions = env.get_actions(env.AI_Player)

a = agent.act(state, valid_actions)
next_state, r, done, _ = env.step(a)
valid_actions_next = env.get_actions(env.AI_Player)
# Store the terminal flag reported by env.step rather than a hard-coded True,
# so this transition is not mislabelled as terminal in the replay buffer.
agent.memorize(state, a, r, next_state, done, valid_actions_next)

In [3]:
# Display the sum over all cells of the current board.
env.board.sum()

0.0

In [4]:
# Evaluate by playing against the scripted `opponent` policy (it breaks ties between its best moves at random)
def test(num_games):
    global env, agent
    winner_counter = {
        "AI":0,
        "Tie":0,
        "Opponent":0
    }
    
    tmp = env.opponent
    env.opponent = opponent
    
    for g in range(num_games):
        done = False
        state = env.reset()
        r = 0
        while(not done):
            # env.render()
            valid_actions = env.get_actions(env.AI_Player)
            action = agent.act(state, valid_actions, eps_greedy=False)
            next_state, reward, done, _ = env.step(action)        
            state = next_state
            r+=reward
        count = np.sum(env.board)
        if count == 0:
            winner = "Tie"
        elif env.AI_Player * count > 0:
            winner = "AI"
        else:
            winner = "Opponent"
        winner_counter[winner] += 1 / num_games
    env.opponent = tmp # reset opponent
    return winner_counter
# Baseline: outcome rates over 50 evaluation games, run before the training loop below.
test(50)





{'AI': 0.5000000000000001, 'Tie': 0, 'Opponent': 0.5000000000000001}

In [5]:
import time
import pickle

# --- Training configuration -------------------------------------------------
batch_size = 32            # transitions per replay minibatch
GAMES_PER_EPISODE = 100    # training games per episode
EPISODES = 1000
EVAL_EVERY = 10            # evaluate after every EVAL_EVERY-th episode
EVAL_GAMES = 50            # games per evaluation
CHECKPOINT_EVERY = 100     # save weights every CHECKPOINT_EVERY-th episode

# To resume an earlier run, keep the previous curves before re-allocating:
# win_rate_ = win_rate
# tie_rate_ = tie_rate

# Only the entries of evaluated episodes are filled in; all others stay 0.
win_rate = np.zeros(EPISODES)
tie_rate = np.zeros(EPISODES)

# win_rate[:len(win_rate_)] = win_rate_
# tie_rate[:len(tie_rate_)] = tie_rate_

# Train against the scripted opponent (alternative: self-play).
# env.opponent = self_play_opponent
env.opponent = opponent

for e in range(EPISODES):
    e_start = time.time()
    for g in range(GAMES_PER_EPISODE):
        done = False
        state = env.reset()
        while not done:
            # env.render()
            valid_actions = env.get_actions(env.AI_Player)
            action = agent.act(state, valid_actions)

            next_state, reward, done, _ = env.step(action)
            valid_actions_next = env.get_actions(env.AI_Player)
            agent.memorize(state, action, reward, next_state, done, valid_actions_next)
            state = next_state
            # Start learning once the buffer holds more than one minibatch.
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
    # Training time only; the evaluation below is not included.
    e_duration = time.time() - e_start

    if e % EVAL_EVERY == 0:
        winner_counter = test(EVAL_GAMES)

        print("episode: {}/{}, Time:{}: AI: {}, Tie: {}, Opponent: {}".format(e+1, EPISODES,e_duration,
            winner_counter["AI"], winner_counter["Tie"], winner_counter["Opponent"]))
        win_rate[e] = winner_counter["AI"]
        tie_rate[e] = winner_counter["Tie"]

        if e % CHECKPOINT_EVERY == 0 and e > 0:
            agent.save("DQN_FINAL{}.h5".format(e))
    else:
        print("episode: {}/{}, Time:{}".format(e+1, EPISODES,e_duration))

    # Swap the side the agent plays for the next episode.
    env.AI_Player *= -1

agent.save("DQN_FINAL{}.h5".format(EPISODES))
# Use a context manager so the file is flushed and closed deterministically.
with open("DQN_FINAL{}.p".format(EPISODES), "wb") as memory_file:
    pickle.dump(agent.memory, memory_file)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
episode: 1/1000, Time:144.3906922340393: AI: 0.4000000000000001, Tie: 0.02, Opponent: 0.5800000000000002
episode: 2/1000, Time:152.61040496826172
episode: 3/1000, Time:141.42814803123474
episode: 4/1000, Time:133.20092296600342
episode: 5/1000, Time:144.37718963623047
episode: 6/1000, Time:144.17212510108948
episode: 7/1000, Time:145.688152551651
episode: 8/1000, Time:129.90899515151978
episode: 9/1000, Time:140.38760566711426
episode: 10/1000, Time:130.71697998046875
episode: 11/1000, Time:137.3698308467865: AI: 0.36000000000000004, Tie: 0.04, Opponent: 0.6000000000000002
episode: 12/1000, Time:127.22597575187683
episode: 13/1000, Time:142.76701712608337
episode: 14/1000, Time:136.8526554107666
episode: 15/1000, Time:142.80982542037964
episode: 16/1000, Time:125.18704795837402
episode: 17/1000, Time:142.77976059913635
episode: 18/1000, Time:138.90682458877563
episode: 

episode: 162/1000, Time:136.08498811721802
episode: 163/1000, Time:141.17709350585938
episode: 164/1000, Time:141.78203630447388
episode: 165/1000, Time:141.24773979187012
episode: 166/1000, Time:131.3527810573578
episode: 167/1000, Time:139.8048403263092
episode: 168/1000, Time:139.05701661109924
episode: 169/1000, Time:141.25951647758484
episode: 170/1000, Time:139.2341742515564
episode: 171/1000, Time:144.54000115394592: AI: 0.4000000000000001, Tie: 0.06, Opponent: 0.5400000000000001
episode: 172/1000, Time:138.91344594955444
episode: 173/1000, Time:141.98491644859314
episode: 174/1000, Time:136.9497411251068
episode: 175/1000, Time:145.0505223274231
episode: 176/1000, Time:139.02203917503357
episode: 177/1000, Time:141.71966409683228
episode: 178/1000, Time:139.7070016860962
episode: 179/1000, Time:135.04539918899536
episode: 180/1000, Time:141.25584864616394
episode: 181/1000, Time:141.10532355308533: AI: 0.4200000000000001, Tie: 0.02, Opponent: 0.5600000000000002
episode: 182/100

episode: 331/1000, Time:146.12916040420532: AI: 0.6600000000000003, Tie: 0, Opponent: 0.34
episode: 332/1000, Time:140.62949967384338
episode: 333/1000, Time:145.5591380596161
episode: 334/1000, Time:142.15820479393005
episode: 335/1000, Time:143.8147327899933
episode: 336/1000, Time:140.82082796096802
episode: 337/1000, Time:142.84606671333313
episode: 338/1000, Time:143.56077980995178
episode: 339/1000, Time:142.01883959770203
episode: 340/1000, Time:141.84033036231995
episode: 341/1000, Time:141.4345052242279: AI: 0.6400000000000002, Tie: 0, Opponent: 0.36000000000000004
episode: 342/1000, Time:137.02667355537415
episode: 343/1000, Time:144.4146318435669
episode: 344/1000, Time:141.45099639892578
episode: 345/1000, Time:142.46276831626892
episode: 346/1000, Time:140.76434803009033
episode: 347/1000, Time:144.5444176197052
episode: 348/1000, Time:141.39466166496277
episode: 349/1000, Time:145.20867705345154
episode: 350/1000, Time:140.50961565971375
episode: 351/1000, Time:147.066262

episode: 499/1000, Time:129.66523241996765
episode: 500/1000, Time:124.20511770248413
episode: 501/1000, Time:128.8485541343689: AI: 0.5000000000000001, Tie: 0.02, Opponent: 0.48000000000000015
episode: 502/1000, Time:124.12205123901367
episode: 503/1000, Time:129.56820726394653
episode: 504/1000, Time:127.17364954948425
episode: 505/1000, Time:128.82888317108154
episode: 506/1000, Time:124.56945252418518
episode: 507/1000, Time:121.83358120918274
episode: 508/1000, Time:122.7141923904419
episode: 509/1000, Time:127.4548990726471
episode: 510/1000, Time:123.96226334571838
episode: 511/1000, Time:127.79007697105408: AI: 0.6800000000000003, Tie: 0, Opponent: 0.32
episode: 512/1000, Time:123.93478178977966
episode: 513/1000, Time:130.90564584732056
episode: 514/1000, Time:126.44949889183044
episode: 515/1000, Time:128.85438656806946
episode: 516/1000, Time:125.4986412525177
episode: 517/1000, Time:128.20688199996948
episode: 518/1000, Time:123.88852858543396
episode: 519/1000, Time:129.15

episode: 667/1000, Time:139.4533350467682
episode: 668/1000, Time:132.44217014312744
episode: 669/1000, Time:140.89601016044617
episode: 670/1000, Time:141.08935713768005
episode: 671/1000, Time:140.3746361732483: AI: 0.6600000000000003, Tie: 0, Opponent: 0.34
episode: 672/1000, Time:130.48087859153748
episode: 673/1000, Time:137.445307970047
episode: 674/1000, Time:135.54874348640442
episode: 675/1000, Time:137.37059688568115
episode: 676/1000, Time:134.57597851753235
episode: 677/1000, Time:144.0725438594818
episode: 678/1000, Time:132.70052886009216
episode: 679/1000, Time:136.7272846698761
episode: 680/1000, Time:141.94962239265442
episode: 681/1000, Time:142.38071584701538: AI: 0.6400000000000002, Tie: 0.04, Opponent: 0.32
episode: 682/1000, Time:127.4327404499054
episode: 683/1000, Time:132.9022934436798
episode: 684/1000, Time:131.25139164924622
episode: 685/1000, Time:133.56389212608337
episode: 686/1000, Time:125.6257050037384
episode: 687/1000, Time:128.5031111240387
episode:

episode: 835/1000, Time:130.36507105827332
episode: 836/1000, Time:126.1692328453064
episode: 837/1000, Time:129.52220940589905
episode: 838/1000, Time:118.8768560886383
episode: 839/1000, Time:127.49510765075684
episode: 840/1000, Time:125.58364295959473
episode: 841/1000, Time:125.50643587112427: AI: 0.6000000000000002, Tie: 0, Opponent: 0.4000000000000001
episode: 842/1000, Time:126.9010591506958
episode: 843/1000, Time:128.8206923007965
episode: 844/1000, Time:126.12725448608398
episode: 845/1000, Time:129.64079976081848
episode: 846/1000, Time:120.10728454589844
episode: 847/1000, Time:129.06932377815247
episode: 848/1000, Time:127.50994396209717
episode: 849/1000, Time:128.2559335231781
episode: 850/1000, Time:123.12796354293823
episode: 851/1000, Time:128.84751296043396: AI: 0.6800000000000003, Tie: 0.02, Opponent: 0.3
episode: 852/1000, Time:123.94675183296204
episode: 853/1000, Time:127.02466821670532
episode: 854/1000, Time:125.00849890708923
episode: 855/1000, Time:130.52241

In [None]:
import matplotlib.pyplot as plt

# win_rate is only filled in for evaluated episodes (every 10th, starting at
# index 0); every other entry is still the 0 it was initialised with. Plot the
# evaluated points only, so the curve does not drop to zero in between.
evaluated = np.arange(0, EPISODES, 10)

fig, ax = plt.subplots()
ax.plot(evaluated + 1, win_rate[evaluated], label="DQN")  # 1-based episode numbers
ax.set_xlabel("Episode")
ax.set_ylabel("Win-rate")
ax.set_ylim([0, 1])
ax.set_title("DQN")
ax.legend()
fig.savefig("tmp4_dis1_1000__.png")

In [28]:
# Persist the evaluation curves. Context managers make sure each file is
# flushed and closed even if pickling fails.
with open("DQNwin_rateFINAL{}.p".format(EPISODES), "wb") as win_file:
    pickle.dump(win_rate, win_file)
with open("DQNtie_rateFINAL{}.p".format(EPISODES), "wb") as tie_file:
    pickle.dump(tie_rate, tie_file)

In [10]:
import matplotlib.pyplot as plt
# win_rate is only filled in for evaluated episodes (every 10th, starting at
# index 0); every other entry is still the 0 it was initialised with. Plot the
# evaluated points only, so the curve does not drop to zero in between.
evaluated = np.arange(0, EPISODES, 10)

fig, ax = plt.subplots()
ax.plot(evaluated, win_rate[evaluated])
ax.set_ylim([0, 1])
ax.set_xlabel("Episode")
ax.set_ylabel("Win Rate")
fig.savefig("DQN_WinRateFinal.eps", format="eps")
plt.show()