#### Display related

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

def show_state(observation, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(observation, cmap='gray')
    plt.title("%s | Step: %d %s" % (env.spec.id, step, info))
    plt.axis('off')
    
    display.clear_output(wait=True)
    display.display(plt.gcf())
    plt.close()

#### Preprocessing

In [3]:
# import cv2
from skimage.transform import resize
import numpy as np

def downsize(img_arry):
    img_arry = resize(img_arry, (84, 84), anti_aliasing=True)
    return img_arry # [:,:84]

def rgb2gray(img_arr):
    return np.dot(img_arr[...,:3], [0.299, 0.587, 0.114])

def preprocess_image(img_arr):
    downsized = downsize(img_arr)
    return rgb2gray(downsized)

def transform_reward(reward):
        return np.sign(reward)

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

def atari_model(state_size, action_size):
    model = Sequential()
    model.add(Conv2D(32,
                            8,
                            strides=(4, 4),
                            padding="valid",
                            activation="relu",
                            input_shape=state_size,
                            data_format="channels_first"))
    model.add(Conv2D(64,
                            4,
                            strides=(2, 2),
                            padding="valid",
                            activation="relu",
                            input_shape=state_size,
                            data_format="channels_first"))
    model.add(Flatten())
    model.add(Dense(512, activation="relu"))
    model.add(Dense(action_size))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

Using TensorFlow backend.


In [4]:
def fit_batch(model, gamma, memory, batch_size):
    if len(memory) < batch_size: 
        return

    samples = random.sample(memory, batch_size)
    for sample in samples:
        state, action, reward, new_state, done = sample
        target = model.predict(state)
        if done:
            target[0][action] = reward
        else:
            Q_future = max(model.predict(new_state)[0])
            target[0][action] = reward + Q_future * gamma
        model.fit(state, target, epochs=1, verbose=0)
    return model 

In [5]:
 def choose_best_action(model, state):
    actions = model.predict(state)
    action = np.argmax(actions)
    return action

#### Main loop

In [7]:
import gym
import random
import numpy as np
from collections import deque

def getPreviousFourImages(images, counter):
    arr = [
        images[counter-4],
        images[counter-3],
        images[counter-2],
        images[counter-1],
    ]
    arr = np.expand_dims(arr, axis=0)
    return arr

env = gym.make("Breakout-v0")
state = env.reset()

# agent = Agent((4, 105, 80), env.action_space.n)

alpha = 0.1
gamma = 0.95
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
learning_rate = 0.001
# itertion = 0
model = atari_model((4,84,84), env.action_space.n)
memory = deque(maxlen=2000)

for episode in range(2500):
#reset env, get first four images
    state = env.reset()

    #retrieve first four images
    counter = 0
    tmp_images = []

    for i in range(0, 5):
        action = env.action_space.sample()
        state, reward, done, _ = env.step(action)
        state = preprocess_image(state)
        tmp_images.append(state)
        counter += 1

    done = False
    tot_reward = 0
    time = 0
    while not done:
        previous_state = getPreviousFourImages(tmp_images, counter)
        e = random.random()
        if e < epsilon:
            action = env.action_space.sample()
        else:
            action = choose_best_action(previous_state)

        state, reward, done, _ = env.step(action)
        state = preprocess_image(state)
        tmp_images.append(state)
        counter += 1
        current_state = getPreviousFourImages(tmp_images, counter)
        memory.append((previous_state, action, reward, current_state, done))
        tot_reward += reward
        fit_batch(model, gamma, memory, 32)
    #Print score
    print("episode: {}/2500, score: {}"
            .format(episode, tot_reward))

episode: 0/2500, score: 1.0


KeyboardInterrupt: 

In [11]:
# model = None
import gym
import random
import numpy as np
from collections import deque

def getPreviousFourImages(images, counter):
    arr = [
        images[counter-4],
        images[counter-3],
        images[counter-2],
        images[counter-1],
    ]
    arr = np.expand_dims(arr, axis=0)
    return arr

env = gym.make("Breakout-v0")
state = env.reset()

# agent = Agent((4, 105, 80), env.action_space.n)

alpha = 0.1
gamma = 0.95
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
learning_rate = 0.001
# itertion = 0
model_xgb = None
memory = deque(maxlen=2000)

def choose_best_action2(previous_state, isFit, action_space, model):
    if isFit:
        actions = model.predict(state)
    else:
        actions = np.zeros(action_space).reshape(1,-1)
    action = np.argmax(actions)
    return action
    
for episode in range(2500):
#reset env, get first four images
    state = env.reset()

    #retrieve first four images
    counter = 0
    tmp_images = []

    for i in range(0, 5):
        action = env.action_space.sample()
        state, reward, done, _ = env.step(action)
        state = preprocess_image(state)
        tmp_images.append(state)
        counter += 1

    done = False
    tot_reward = 0
    time = 0
    isFit = False
    while not done:
        previous_state = getPreviousFourImages(tmp_images, counter)
        e = random.random()
        if e < epsilon:
            action = env.action_space.sample()
        else:
            action = choose_best_action2(previous_state, isFit, env.action_space.n, model)

        state, reward, done, _ = env.step(action)
        state = preprocess_image(state)
        tmp_images.append(state)
        counter += 1
        current_state = getPreviousFourImages(tmp_images, counter)
        memory.append((previous_state, action, reward, current_state, done))
        tot_reward += reward
        train_xgboost(model_xgb, gamma, memory, 32, isFit, env.action_space.n)
    #Print score
    print("episode: {}/2500, score: {}"
            .format(episode, tot_reward))

(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]
(1, 28224) [[0. 0. 0. 0.]]


TypeError: only size-1 arrays can be converted to Python scalars

In [10]:
import xgboost as xgb

def train_xgboost(model, gamma, memory, batch_size, isFit, action_space):
    if len(memory) < batch_size: 
        return

    samples = random.sample(memory, batch_size)
    states = []
    targets = []
    for sample in samples:
        state, action, reward, new_state, done = sample
        if isFit:
            target = model.predict(state)
        else:
            target = np.zeros(action_space).reshape(1,-1)
        if done:
            target[0][action] = reward
        else:
            if isFit:
                Q_future = max(model.predict(new_state)[0])
            else:
                Q_future = 0
            target[0][action] = reward + Q_future * gamma
        state = state.reshape(*state.shape[:2], -1)
        state = state.reshape(*state.shape[:1], -1)
#         print(state.shape, target)
        states.append(state[0])
        targets.append(target)

    model = xgb.train({
        'learning_rate': gamma,
        'update':'refresh',
        'process_type': 'update',
        'refresh_leaf': True,
        #'reg_lambda': 3,  # L2
        'reg_alpha': 0.1 ,# L1
        'silent': False
    }, dtrain=xgb.DMatrix(states, targets), xgb_model=model)
    return model 