# Play Cart Pole

In [None]:
import gym
# Create the CartPole-v0 environment that the following cells inspect and play.
env = gym.make("CartPole-v0") #https://gym.openai.com/envs/#board_game

In [None]:
# Show the lower and upper bound arrays of the observation space.
print(env.observation_space.low, env.observation_space.high)
# Observation components: cart position, cart velocity, pole angle,
# pole velocity at tip.

In [None]:
# Size of the action space (number of actions the agent can pick from).
env.action_space.n  #http://gym.openai.com/docs/

In [None]:
# Play randomly: render 1000 steps of uniformly sampled actions.
env.reset()
try:
    for _ in range(1000):
        env.render()
        # take a random action (index 2 of the step result is the done flag)
        done = env.step(env.action_space.sample())[2]
        if done:
            # Stepping an environment that has already reported ``done`` is
            # undefined behaviour in gym, so start a new episode instead.
            env.reset()
finally:
    # Release the render window even if rendering or stepping fails.
    env.close()

# Q Training

In [None]:
import importlib
import qlearning
# Re-import the qlearning module so changes to its source take effect in the
# current session.
importlib.reload(qlearning)
import numpy as np
import gym
# Environment instance used for tabular Q-learning below.
env = gym.make("CartPole-v0") #https://gym.openai.com/envs/#board_game

def state_function(state):
    """Map a raw observation to a coarse tuple used as the learner's state.

    The first component (cart position) is collapsed to 0 unless its
    magnitude is at least 2.2, in which case it is kept rounded to 2
    decimals. ``state[1]`` is rounded to 1 decimal, ``state[2]`` to 2
    decimals, and ``state[3]`` is passed through
    ``qlearning.round_to(value, 1)``.
    """
    cart_x = state[0]
    # Only distinguish cart positions once the cart is far from the centre.
    x_pos = round(cart_x, 2) if abs(cart_x) >= 2.2 else 0
    coarse_1 = round(state[1], 1)
    coarse_2 = round(state[2], 2)
    coarse_3 = qlearning.round_to(state[3], 1)
    return (x_pos, coarse_1, coarse_2, coarse_3)
        
# Build the player from the environment and the discretizing state function.
game = qlearning.GamePlayer(env, state_function)

In [None]:
# Hyperparameters passed to game.train below.
total_episodes = 1000
alpha = 0.3                 # presumably the Q-learning step size -- confirm in qlearning
gamma = 0.9                 # Discounting rate
decay_rate = 0.0          # Exponential decay rate for exploration prob
epsilon = 0.3                 # Exploration rate
# Left disabled; presumably resets previously learned values -- confirm in qlearning.
#game.erase_training()
rewards = game.train(total_episodes, alpha, gamma, epsilon, decay_rate, logEvery=100)
# Mean of the values returned by game.train.
print("Total reward average:", np.mean(rewards))
# Number of entries currently stored in the Q-table.
print(len(game.qtable))

In [None]:
# Render the environment driven by game.q_trained_action
# (first argument 5 is presumably the number of episodes -- confirm in qlearning).
qlearning.visualize_computer_playing(5, env, game.q_trained_action)

In [None]:
# Display the learned Q-table for inspection.
game.qtable

# Double Q training

In [None]:
import importlib
import qlearning
# Re-import the qlearning module so changes to its source take effect in the
# current session.
importlib.reload(qlearning)
import numpy as np
import gym
# Environment instance used for double Q-learning below.
env = gym.make("CartPole-v0") #https://gym.openai.com/envs/#board_game

def state_function(state):
    """Discretize a raw observation into the tuple used as the learner's state.

    The first component (cart position) becomes 0 while its magnitude is
    below 2.2 and is otherwise rounded to 2 decimals. ``state[1]`` is
    rounded to 1 decimal, ``state[2]`` to 2 decimals, and ``state[3]`` goes
    through ``qlearning.round_to(value, 1)``.
    """
    position = state[0]
    # The cart position only matters once the cart is far from the centre.
    x_pos = round(position, 2) if abs(position) >= 2.2 else 0
    second = round(state[1], 1)
    third = round(state[2], 2)
    fourth = qlearning.round_to(state[3], 1)
    return (x_pos, second, third, fourth)
        
# Build the player from the environment and the discretizing state function.
game = qlearning.GamePlayer(env, state_function)

In [None]:
# Hyperparameters passed to game.double_q_train below.
total_episodes = 50
alpha = 0.3                 # presumably the Q-learning step size -- confirm in qlearning
gamma = 0.9                 # Discounting rate
decay_rate = 0.0          # Exponential decay rate for exploration prob
epsilon = 0.3                 # Exploration rate
# Left disabled; presumably resets previously learned values -- confirm in qlearning.
#game.erase_training()
rewards = game.double_q_train(total_episodes, alpha, gamma, epsilon, decay_rate, logEvery=10)
# Mean of the values returned by game.double_q_train.
print("Total reward average:", np.mean(rewards))
# NOTE(review): this reports the size of game.qtable, while a later cell
# inspects game.Q2 -- confirm that double_q_train actually populates qtable.
print(len(game.qtable))

In [None]:
# Render the environment driven by game.double_trained_action
# (first argument 5 is presumably the number of episodes -- confirm in qlearning).
qlearning.visualize_computer_playing(5, env, game.double_trained_action)

In [None]:
# Print game.Q2 (presumably the second table kept by double Q-learning --
# confirm in qlearning).
print(game.Q2)

# Using tensorflow

In [None]:
import tensorflow as tf
import gym
import numpy as np
from qlearning import round_to
# Environment instance used by the TensorFlow Q-network cells below.
env = gym.make("CartPole-v0") #https://gym.openai.com/envs/#board_game

In [None]:
# Length of the observation vector (first dimension of the observation shape).
env.observation_space.shape[0]

In [None]:
# Width of the observation vector; fixes the input size of the network.
state_dim = env.observation_space.shape[0]

tf.reset_default_graph()

# Hyperparameters.
alpha = 0.001           # learning rate given to the Adam optimizer
epsilon = 0.2
gamma = 0.9
N = 50                  # number of rows in one training batch
total_episodes = 4000

# Placeholders: a single state for acting, a batch of N states for training.
n_actions = env.action_space.n
inputs = tf.placeholder(dtype=tf.float32, shape=[1, state_dim])
training_inputs = tf.placeholder(dtype=tf.float32, shape=[N, state_dim])
states = tf.placeholder(dtype=tf.float32, shape=[N, state_dim])
ytarget = tf.placeholder(dtype=tf.float32, shape=[N, n_actions])

# Weights of the two hidden layers (24 units each) and of the output layer,
# drawn uniformly from [0, 0.01).
W1 = tf.Variable(tf.random_uniform([state_dim, 24], minval=0, maxval=0.01))
W2 = tf.Variable(tf.random_uniform([24, 24], minval=0, maxval=0.01))
W = tf.Variable(tf.random_uniform([24, n_actions], minval=0, maxval=0.01))


def _q_values(layer_input):
    """Apply the shared network (ReLU, ReLU, linear; no biases) to a tensor."""
    hidden_1 = tf.nn.relu(tf.matmul(layer_input, W1))
    hidden_2 = tf.nn.relu(tf.matmul(hidden_1, W2))
    return tf.matmul(hidden_2, W)


# Same weights, two entry points: batched for training, single-row for acting.
Qs = _q_values(states)
Q = _q_values(inputs)

# Sum of squared differences between the targets and the batched predictions.
squared_error = tf.square(ytarget - Qs)
loss = tf.reduce_sum(squared_error)
trainer = tf.train.AdamOptimizer(learning_rate=alpha)
updateModel = trainer.minimize(loss)

In [None]:
# Train the Q-network with epsilon-greedy play, fitting on batches of N
# collected transitions, and log the mean episode reward every `logEvery`
# episodes.
init = tf.global_variables_initializer()
# The session below is closed when its ``with`` block exits, which discards
# every variable value; write the trained weights to disk before that happens.
saver = tf.train.Saver()
checkpoint_path = "./cartpole_q_network.ckpt"
reward_list = []        # total reward of each episode since the last log line
tot_reward_list = []    # one averaged value per log line
logEvery = N * 2

with tf.Session() as sess:
    sess.run(init)
    nstep = 0   # transitions collected for the current batch
    Y = []      # training targets, one Q-vector per collected state
    S = []      # collected states
    for episode in range(total_episodes):
        state = env.reset()
        done = False
        tot_reward = 0
        # Truthiness instead of ``done is False``: an identity test silently
        # ends the loop after one step if the flag is not the ``False``
        # singleton (e.g. a numpy bool).
        while not done:
            if nstep == N:
                # A full batch has been collected: fit on it and start over.
                S = np.array(S).reshape(N, state_dim)
                sess.run([updateModel], feed_dict={states: S, ytarget: Y})
                nstep = 0
                Y = []
                S = []

            S.append([state])

            # Start from the current estimates Q(s, :); only the entry of the
            # action actually taken is overwritten with its target below, so
            # the other entries contribute zero error.
            Y.append(sess.run([Q], feed_dict={inputs: [state]})[0][0])
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Y[nstep])

            next_state, reward, done, _ = env.step(action)
            if done:
                # NOTE(review): if the environment also reports done at a
                # step limit, a full-length episode receives this same
                # penalty -- confirm that is intended.
                Y[nstep][action] = -10
            else:
                Qnext = sess.run([Q], feed_dict={inputs: [next_state]})[0][0]
                Y[nstep][action] = reward + gamma * np.max(Qnext)

            state = next_state
            tot_reward += reward
            nstep += 1

        # Record the episode total once per episode; appending inside the
        # step loop would average running partial sums instead.
        reward_list.append(tot_reward)

        if logEvery > 0 and (episode+1) % logEvery == 0:
            ave_reward = np.mean(reward_list)
            tot_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}, alpha: {}, e: {}'.format(episode+1, ave_reward, alpha, epsilon))

    saved_path = saver.save(sess, checkpoint_path)
    print("Saved trained weights to:", saved_path)

print("Total reward average:", np.mean(tot_reward_list))

In [None]:
from qlearning import visualize_computer_playing

# Checkpoint prefix from which trained weights are loaded when available.
checkpoint_path = "./cartpole_q_network.ckpt"


def action_function(state):
    """Return the greedy action: the argmax of the network's Q(state, :).

    ``sess`` is looked up when the function is called, so it uses the
    session opened below.
    """
    return np.argmax(sess.run([Q], feed_dict={inputs: [state]})[0][0])


init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
    # Running only the initializer in a fresh session yields random weights,
    # i.e. an untrained policy. Load saved weights when a checkpoint exists.
    if tf.train.checkpoint_exists(checkpoint_path):
        saver.restore(sess, checkpoint_path)
    else:
        print("WARNING: no checkpoint at {}; playing with untrained weights".format(checkpoint_path))
        sess.run(init)
    visualize_computer_playing(15, env, action_function)

# Using keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import gym
import numpy as np

In [None]:
# Environment instance used by the Keras Q-network cells below.
env = gym.make("CartPole-v0") #https://gym.openai.com/envs/#board_game

In [None]:
# Problem dimensions taken from the environment.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Hyperparameters.
alpha = 0.001           # learning rate given to the Adam optimizer
epsilon = 0.2
gamma = 0.9
N = 50
total_episodes = 400

# Q-network: two ReLU hidden layers of 24 units and a linear output with one
# value per action, trained with mean squared error.
model = Sequential([
    Dense(24, input_dim=state_size, activation='relu'),
    Dense(24, activation='relu'),
    Dense(action_size, activation='linear'),
])
model.compile(loss='mse', optimizer=Adam(lr=alpha))

In [None]:
# Train the Keras Q-network with epsilon-greedy play, fitting on batches of N
# collected transitions, and log the mean episode reward every `logEvery`
# episodes.
reward_list = []        # total reward of each episode since the last log line
tot_reward_list = []    # one averaged value per log line
logEvery = N * 2

nstep = 0   # transitions collected for the current batch
Y = []      # training targets, one Q-vector per collected state
S = []      # collected states, each shaped (1, state_size)
for episode in range(total_episodes):
    state = env.reset()
    done = False
    tot_reward = 0
    # Truthiness instead of ``done is False``: an identity test silently ends
    # the loop after one step if the flag is not the ``False`` singleton
    # (e.g. a numpy bool).
    while not done:
        if nstep == N:
            # A full batch has been collected: fit on it and start over.
            S = np.stack(S, axis=0).reshape(N, state_size)
            Y = np.stack(Y, axis=0)
            model.fit(S, Y, epochs=1, verbose=0)
            nstep = 0
            Y = []
            S = []

        state = np.array(state).reshape(1, state_size)
        S.append(state)
        # Start from the current estimates Q(s, :); only the entry of the
        # action actually taken is overwritten with its target below.
        Y.append(model.predict(state)[0])
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Y[nstep])

        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state).reshape(1, state_size)
        if done:
            # NOTE(review): if the environment also reports done at a step
            # limit, a full-length episode receives this same penalty --
            # confirm that is intended.
            Y[nstep][action] = -10
        else:
            Qnext = model.predict(next_state)[0]
            Y[nstep][action] = reward + gamma * np.max(Qnext)

        state = next_state
        tot_reward += reward
        nstep += 1

    # Record the episode total once per episode; appending inside the step
    # loop would average running partial sums instead.
    reward_list.append(tot_reward)

    if logEvery > 0 and (episode+1) % logEvery == 0:
        ave_reward = np.mean(reward_list)
        tot_reward_list.append(ave_reward)
        reward_list = []
        print('Episode {} Average Reward: {}, alpha: {}, e: {}'.format(episode+1, ave_reward, alpha, epsilon))

print("Total reward average:", np.mean(tot_reward_list))

In [None]:
from qlearning import visualize_computer_playing


def action_function(state):
    """Return the greedy action: the argmax of the model's Q-values for `state`."""
    batch_of_one = np.array(state).reshape(1, state_size)
    q_values = model.predict(batch_of_one)[0]
    return np.argmax(q_values)


# Render the environment driven by the Keras model's greedy policy.
visualize_computer_playing(15, env, action_function)