In [1]:
import math
import numpy as np
import pandas as pd

np.random.seed(1)

import keras
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D, AveragePooling2D
from keras.utils import to_categorical

import gym

Using TensorFlow backend.


In [2]:
env = gym.make('Phoenix-v0')

[2018-02-10 00:16:15,051] Making new env: Phoenix-v0


In [3]:
print (env.unwrapped.get_action_meanings())

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'DOWN', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE']


In [4]:
def preprocess_observation(obs):
    img = obs[:]
    img = img[30:180:2, ::2]
    img = np.sum(img, axis=2)
    img[img > 0] = 1.
    img[img != 1.] = 2.
    img[img == 1.] = 0.
    img[img == 2.] = 1.
    return img.reshape(75, 80,1)

In [5]:
def discount_rewards(rewards, discount_rate):
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    std = np.std(discounted_rewards)
    discounted_rewards -= np.mean(discounted_rewards)
    if std != 0:
        discounted_rewards /= std
    return discounted_rewards

In [14]:
model = Sequential()
 
model.add(Convolution2D(8, (3, 3), activation='sigmoid', kernel_initializer='glorot_uniform', use_bias=True,
                        kernel_regularizer=keras.regularizers.l2(0.001), data_format='channels_last', input_shape=(75,80,1)))
model.add(Convolution2D(8, (3, 3), activation='relu', kernel_initializer='glorot_uniform', use_bias=True,
                        kernel_regularizer=keras.regularizers.l2(0.001)))
model.add(Flatten())
model.add(Dense(10, activation='relu', kernel_initializer='glorot_uniform', use_bias=True, 
                kernel_regularizer=keras.regularizers.l2(0.001)))
model.add(Dense(4, activation='softmax'))

In [15]:
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.adagrad(lr=1e-3), metrics=['accuracy'])

In [16]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

In [9]:
model.load_weights("model.h5")

OSError: Unable to open file (Unable to open file: name = 'model.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

In [None]:
batch_size = 100
iters = 0
k = 0
discount_rate = 0.8
possible_actions = np.array([0, 1, 2, 3])
rewards = []
actions = []
frames = []

while True:
    observation = env.reset()
    r = 0
    while True:
        """if iters == batch_size:
            iters = 0
            k += 1
            discounted = discount_rewards(rewards, discount_rate)
            actions = actions * discounted.reshape(-1, 1)
            model.train_on_batch(np.array(frames), np.array(actions))
            rewards = []
            actions = []
            frames = []
            if k % 20 == 0:
                # serialize weights to HDF5
                model.save_weights("model.h5", overwrite=True)"""
        #iters += 1
        img = preprocess_observation(observation)
        frames.append(img)
        img = np.expand_dims(img, axis=0)
        probs = model.predict(img).ravel()
        probs = probs / np.sum(probs)
        action = np.random.choice(4, p = probs)
        actions.append(probs)
        observation, reward, done, info = env.step(action)
        rewards.append(reward)
        r += reward
        if done:
            break;
    
    print(r)
    k += 1
    print("f", len(frames))
    if len(frames) > 5000:
        continue
    discounted = discount_rewards(rewards, discount_rate)
    actions = actions * discounted.reshape(-1, 1)
    model.train_on_batch(np.array(frames), np.array(actions))
    rewards = []
    actions = []
    frames = []
    if k % 20 == 0:
        # serialize weights to HDF5
        model.save_weights("model.h5", overwrite=True)
    """iters += 1;
    if iters % batch_size == 0:
        discounted = discount_rewards(rewards, discount_rate)
        actions = actions * discounted.reshape(-1, 1)
        model.train_on_batch(np.array(frames), np.array(actions))
        rewards = []
        actions = []
        frames = []"""

380.0
f 1718


In [23]:
observation = env.reset()
while True:
    env.render()
    img = preprocess_observation(observation)
    img = np.expand_dims(img, axis=0)
    probs = model.predict(img).ravel()
    probs = probs / np.sum(probs)
    action = np.random.choice(4, p = probs)
    observation, reward, done, info = env.step(action)
    if done:
        env.render(close=True)
        break;