In [None]:
import gym
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy import linalg
from copy import deepcopy

In [None]:
def int2onehot(n, max_n):
    """Return a one-hot float vector of length ``max_n``.

    n     -- index that receives the value 1.0 (used directly as a NumPy index).
    max_n -- length of the returned vector; every other entry is 0.0.
    """
    one_hot = np.zeros(max_n)
    one_hot[n] = 1.
    return one_hot

In [None]:
# Number of consecutive observations concatenated into one stacked observation
# (the play functions keep the last `num_frames * 128` values).
num_frames = 2
# Cap on environment steps per game; the random-play collection uses 2000.
max_steps = 2000
# Number of games played per call to play_random / red_play.
num_games = 100

In [None]:
def dim_reduce(a, V, keep_dim):
    """Project the vector ``a`` onto the first ``keep_dim`` rows of ``V``.

    a        -- 1-D vector (array or sequence) to project.
    V        -- 2-D array (or sequence of row vectors); row ``i`` is the i-th
                projection direction.
    keep_dim -- number of leading rows of ``V`` to use.

    Returns a float64 array of length ``keep_dim`` whose i-th entry is the
    dot product of ``a`` with ``V[i]``.

    Raises IndexError when ``V`` has fewer than ``keep_dim`` rows.
    """
    basis = np.asarray(V)
    if keep_dim > len(basis):
        # Slicing would silently return fewer components; keep the error that
        # indexing a missing row would give.
        raise IndexError("keep_dim=%d exceeds the %d rows available in V"
                         % (keep_dim, len(basis)))
    if keep_dim == 0:
        return np.zeros([0])
    # One matrix-vector product replaces keep_dim separate dot products.
    return np.asarray(np.matmul(basis[:keep_dim], a), dtype=np.float64)

In [None]:
def red_train_agent2(training_epochs,red_games, commulative_rewards_np):
    """Apply reward-weighted gradient updates to the reduced-input policy network.

    For every recorded game the gradients of the loss (``red_grads``) are
    evaluated on that game's observations/actions, scaled by the game's
    relative advantage ``(reward - mean) / mean`` and applied through
    ``red_apply_grads``.

    training_epochs        -- number of compute/apply passes per game.
    red_games              -- iterable of games; each game is a sequence of
                              (observation, one-hot action, reward) tuples as
                              recorded by red_play.
    commulative_rewards_np -- NumPy array with the total reward of each game,
                              in the same order as ``red_games``.

    Uses the module-level ``sess`` and graph objects (``red_x``, ``red_y_``,
    ``red_grads``, ``red_apply_grads``, ``red_grad*_ph``). Returns None.
    """
    if len(commulative_rewards_np) == 0:
        return
    mean = commulative_rewards_np.mean()
    if mean == 0:
        # The advantage is normalised by the mean reward; with a zero mean it
        # would be NaN/inf and applying it would corrupt every weight.
        return
    for red_game, reward in zip(red_games, commulative_rewards_np):
        n_steps = len(red_game)
        if n_steps == 0:
            # No recorded steps: the mean loss over an empty batch is NaN.
            continue
        advantage = (reward - mean) / mean
        observations = np.array([step[0] for step in red_game]) \
            .reshape([n_steps, red_Din]).astype(dtype=np.float32)
        actions = np.array([step[1] for step in red_game]) \
            .reshape([n_steps, Dout]).astype(dtype=np.float32)
        for _ in range(training_epochs):
            # Each fetched entry is a (gradient, variable) pair; index 0 is the gradient.
            grads_buff = sess.run(red_grads,
                                  feed_dict={red_x: observations, red_y_: actions})
            sess.run(red_apply_grads,
                     feed_dict={red_grad1_ph: advantage * grads_buff[0][0],
                                red_grad2_ph: advantage * grads_buff[1][0],
                                red_grad3_ph: advantage * grads_buff[2][0],
                                red_grad4_ph: advantage * grads_buff[3][0]})

In [None]:
def play_random(num_games, max_steps, render):
    """Play DemonAttack with uniformly random actions and record every game.

    num_games -- number of episodes to play.
    max_steps -- maximum number of environment steps per episode.
    render    -- when true, env.render() is called before every step.

    Returns ``(games, commulative_rewards)``:
      games               -- deque with one deque per episode; each entry is a
                             (stacked_observation, one_hot_action, reward)
                             tuple, where stacked_observation is the
                             concatenation of the last ``num_frames``
                             observations up to the one the action was taken in
                             (the first entry repeats the reset observation).
      commulative_rewards -- deque with the total reward of each episode.

    Reads the module-level ``num_frames``. The environment is closed before
    returning, also when an error interrupts play.
    """
    # assumes each observation holds 128 values -- TODO confirm against env.observation_space
    frame_window = num_frames * 128
    games = deque()
    commulative_rewards = deque()
    env = gym.make("DemonAttack-ram-v3")
    try:
        num_actions = env.action_space.n
        for _ in range(num_games):
            observation = np.array(env.reset())
            comp_observation = observation
            current_game = deque()
            commulative_reward = 0
            for _ in range(max_steps):
                if render:
                    env.render()
                old_observation = observation
                action = env.action_space.sample()
                observation, reward, done, info = env.step(action)
                # hstack builds a new array, so no extra copy of the inputs is needed.
                comp_observation = np.hstack((comp_observation, old_observation))[-frame_window:]
                commulative_reward += reward
                if len(comp_observation) == frame_window:
                    current_game.append((comp_observation, int2onehot(action, num_actions), reward))
                if done:
                    break
            games.append(current_game)
            commulative_rewards.append(commulative_reward)
    finally:
        env.close()
    return games, commulative_rewards

In [None]:
def red_play(num_games, max_steps, render, V):
    """Play DemonAttack with the policy network acting on reduced observations.

    num_games -- number of episodes to play.
    max_steps -- maximum number of environment steps per episode.
    render    -- when true, env.render() is called before every step.
    V         -- projection matrix handed to dim_reduce; its first ``red_Din``
                 rows are used.

    While fewer than ``num_frames`` observations have been stacked, a random
    action is taken and nothing is recorded. Afterwards the stacked
    observation is reduced with dim_reduce, fed to the network (``red_y``) and
    the action is sampled from the returned probabilities.

    Returns ``(games, commulative_rewards)``:
      games               -- deque with one deque per episode of
                             (reduced_observation, one_hot_action, reward) tuples.
      commulative_rewards -- deque with the total reward of each episode.

    Reads the module-level ``num_frames``, ``red_Din``, ``sess``, ``red_x`` and
    ``red_y``. The environment is closed before returning, also on error.
    """
    # assumes each observation holds 128 values -- TODO confirm against env.observation_space
    frame_window = num_frames * 128
    games = deque()
    commulative_rewards = deque()
    env = gym.make("DemonAttack-ram-v3")
    try:
        num_actions = env.action_space.n
        for _ in range(num_games):
            comp_observation = deepcopy(env.reset())
            current_game = deque()
            commulative_reward = 0
            for _ in range(max_steps):
                if render:
                    env.render()
                if len(comp_observation) == frame_window:
                    red_observation = dim_reduce(comp_observation, V, red_Din)
                    network_input = red_observation.reshape(
                        [1, len(red_observation)]).astype(dtype=np.float32)
                    probabilities = sess.run(red_y, feed_dict={red_x: network_input})[0]
                    # Sample an action index; the number of choices follows the
                    # network's output size instead of a hard-coded 6.
                    action = np.random.choice(len(probabilities), p=probabilities)
                    observation, reward, done, info = env.step(action)
                    current_game.append((red_observation, int2onehot(action, num_actions), reward))
                else:
                    action = env.action_space.sample()
                    observation, reward, done, info = env.step(action)
                commulative_reward += reward

                comp_observation = np.hstack((comp_observation, observation))[-frame_window:]
                if done:
                    break
            games.append(current_game)
            commulative_rewards.append(commulative_reward)
    finally:
        env.close()
    return games, commulative_rewards

In [None]:
# Collect the random-play baseline games. The step cap comes from the
# configured `max_steps` rather than a repeated literal.
games, commulative_rewards = play_random(num_games=num_games, max_steps=max_steps, render=False)
# Total reward per game as a NumPy array, for argmax and statistics.
commulative_rewards_np = np.array(commulative_rewards)

In [None]:
# Distribution of the total reward per random-play game.
rewards_np = np.array(commulative_rewards, dtype=float)
print(rewards_np.mean())
counts, bin_edges = np.histogram(rewards_np)
# Plot each count at the real centre of its bin; a fixed offset is only
# correct for one particular bin width.
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.0
plt.plot(bin_centers, counts)
plt.show()

In [None]:
# Select the random-play game with the highest total reward and show its
# first recorded step (stacked observation, one-hot action, reward).
best_game_index = commulative_rewards_np.argmax()
game = games[best_game_index]
commulative_reward = commulative_rewards[best_game_index]
first_step = game[0]
print("%s %s" % (first_step, commulative_reward))
print("%s %s %s" % (first_step[0], first_step[1], first_step[2]))

In [None]:
# One row per recorded step of the best game, holding that step's stacked
# observation. np.float64 is used because the `np.float` alias no longer
# exists in current NumPy releases.
observations = np.array([recorded_step[0] for recorded_step in game], dtype=np.float64)

In [None]:
# Singular value decomposition of the best game's observation matrix (the
# first recorded row is dropped). All options are left at SciPy's defaults
# (full matrices, finite check, 'gesdd' driver). The third result holds the
# right singular vectors as rows, which is how dim_reduce indexes V.
U, s, V = linalg.svd(observations[1:])

In [None]:
# Show the singular values returned by the decomposition.
print(s)

In [None]:
# Singular values on a log10 scale, to see how quickly they fall off.
log_singular_values = np.log10(s)
plt.plot(log_singular_values)
plt.show()

In [None]:
# Learning rate handed to the Adam optimizer.
lr = 1e-5
# Gradient passes per game; presumably meant for red_train_agent2's first argument -- verify against the caller.
training_epochs = 1
# Width of the hidden layer (second dimension of red_W1).
D_hid1 = 150
# NOTE(review): not referenced by any other visible cell.
D_hid2 = 60
# Network input size: number of rows of V kept by dim_reduce in red_play.
red_Din = 160
# NOTE(review): not referenced by any other visible cell; equals the literal 128 used for the frame length.
Din = 128
# Number of network outputs (length of the one-hot action vectors fed as red_y_).
Dout = 6

In [None]:
# Single TensorFlow session, used as the global `sess` by red_play,
# red_train_agent2 and the checkpoint save/restore cells.
sess = tf.Session()

In [None]:
# Network input: a batch of reduced observations, shape [batch, red_Din].
red_x = tf.placeholder(tf.float32, [None, red_Din], name="red_x")

# Placeholders for externally scaled gradients. red_train_agent2 feeds them and
# the optimizer cell pairs them, in this order, with red_W, red_b, red_W1, red_b1.
red_grad1_ph = tf.placeholder(tf.float32, name="red_grad1_ph")
red_grad2_ph = tf.placeholder(tf.float32, name="red_grad2_ph")
red_grad3_ph = tf.placeholder(tf.float32, name="red_grad3_ph")
red_grad4_ph = tf.placeholder(tf.float32, name="red_grad4_ph")


# Hidden layer parameters (red_Din -> D_hid1), drawn from a normal distribution with stddev 1e-4.
red_W1 = tf.Variable(tf.random_normal([red_Din, D_hid1], stddev=0.0001), name="red_W1")
red_b1 = tf.Variable(tf.random_normal([D_hid1], stddev=0.0001), name="red_b1") 

# Output layer parameters (D_hid1 -> Dout), same initialisation.
red_W = tf.Variable(tf.random_normal([D_hid1, Dout], stddev=0.0001), name="red_W")
red_b = tf.Variable(tf.random_normal([Dout], stddev=0.0001), name="red_b")

# Forward pass: ReLU hidden layer followed by a softmax over the Dout outputs.
# red_play samples its actions from red_y.
red_H = tf.nn.relu(tf.matmul(red_x, red_W1) + red_b1)
red_y = tf.nn.softmax(tf.matmul(red_H, red_W) + red_b)
# Disabled variant: softmax applied directly to red_x, without the hidden layer.
#red_y = tf.nn.softmax(tf.matmul(red_x, red_W) + red_b, name = "red_y")
# Training target: the one-hot vectors of the actions taken, shape [batch, Dout].
red_y_ = tf.placeholder(tf.float32, [None, Dout], name="red_y_")

In [None]:
# Disabled alternative: cross-entropy between the taken actions and the predicted probabilities.
#red_cross_entropy = tf.reduce_mean(-tf.reduce_sum(red_y_ * tf.log(red_y), reduction_indices=[1]))
# Loss in use: mean squared difference between the one-hot actions (red_y_)
# and the predicted action probabilities (red_y).
red_square_loss = tf.reduce_mean(tf.pow(red_y_ - red_y, 2))

In [None]:
# Adam optimizer using the configured learning rate `lr`.
red_adam = tf.train.AdamOptimizer(learning_rate=lr)

#tvars = tf.trainable_variables()
# Variables to train. This order fixes both the order of the entries in
# red_grads and which gradient placeholder belongs to which variable.
red_tvars = [red_W, red_b, red_W1, red_b1]

# Gradient computation and application are separate ops so the caller can
# rescale the gradients in between: red_grads yields (gradient, variable)
# pairs, and red_apply_grads applies whatever is fed into the placeholders.
red_grads = red_adam.compute_gradients(loss=red_square_loss, var_list=red_tvars)
red_apply_grads = red_adam.apply_gradients(zip([red_grad1_ph, red_grad2_ph, red_grad3_ph, red_grad4_ph],red_tvars))

In [None]:
# Run the initializer for all global variables defined in the graph so far.
sess.run(tf.global_variables_initializer())

In [None]:
# Empty float32 array; the training loop appends each iteration's per-game rewards to it.
rewards = np.empty([0], dtype=np.float32)

In [None]:
# Alternate between playing a batch of games with the current policy and
# training on them, plotting the reward history after every batch.
num_iterations = 20
for i in range(num_iterations):
    print("Iteration: %d / %d" % (i + 1, num_iterations))
    # Policy games run with a step cap of 3000.
    red_games, commulative_rewards = red_play(num_games=num_games, max_steps=3000, render=False, V=V)
    commulative_rewards_np = np.array(commulative_rewards)
    # Append this batch's per-game rewards to the running history.
    rewards = np.hstack((rewards, commulative_rewards_np))
    x = np.arange(1, len(rewards) + 1)
    plt.plot(x, rewards)
    plt.xlabel("game")
    plt.ylabel("cumulative reward")
    plt.show()
    # Use the configured epoch count so the hyperparameter cell takes effect.
    red_train_agent2(training_epochs, red_games, commulative_rewards_np)

In [None]:
# Persist the reward history. np.save writes binary .npy data, so the file is
# opened in binary mode; passing an open file object (not a path) keeps the
# exact file name, and the with-block closes the handle even if saving fails.
with open("rewards-2.dat", "wb") as f:
    np.save(f, rewards)

In [None]:
# Persist the projection matrix V. np.save writes binary .npy data, so the
# file is opened in binary mode; passing an open file object (not a path) keeps
# the exact file name, and the with-block closes the handle even if saving fails.
with open("V-matrix-2.dat", "wb") as f:
    np.save(f, V)

In [None]:
# Saver object used by the following cells to write and restore checkpoints of the session's variables.
saver = tf.train.Saver()


In [None]:
# Write a checkpoint of the current session under this prefix, relative to the working directory.
saver.save(sess, "OpenAI-DemonAttack-v3-a7-2")

In [None]:
# Load a previously saved projection matrix. The data is binary .npy content,
# so the file is opened in binary mode; the with-block closes the handle even
# if loading fails.
with open("checkpoints/V-matrix.dat", "rb") as f:
    V = np.load(f)

In [None]:
# Show the loaded projection matrix.
print(V)

In [None]:
# Restore previously trained weights into the live session. The checkpoint
# prefix is relative to the working directory, like the V-matrix file, instead
# of a user-specific absolute path.
saver.restore(sess=sess, save_path="checkpoints/OpenAI-DemonAttack-v3-a7")