# Play Cart Pole

In [None]:
import gym
# Create the CartPole-v0 environment through Gym's registry.
env = gym.make("CartPole-v0") #https://gym.openai.com/envs/#board_game

In [None]:
# Show the lower and upper bound of every observation component.
print(env.observation_space.low, env.observation_space.high)
# Components: cart position, cart velocity, pole angle, pole velocity at tip.

In [None]:
# Number of discrete actions the environment accepts.
env.action_space.n  #http://gym.openai.com/docs/

In [None]:
# Play randomly: render 1000 steps of uniformly sampled actions.
env.reset()
try:
    for _ in range(1000):
        env.render()
        # Index 2 of the tuple returned by step() is the end-of-episode flag.
        done = env.step(env.action_space.sample())[2]
        if done:
            # Stepping an episode that has already ended is undefined in Gym,
            # so begin a new episode before taking further actions.
            env.reset()
finally:
    # Release the render window even if the loop is interrupted.
    env.close()

# Q Training

In [None]:
import importlib

import gym
import numpy as np

import qlearning

# Re-execute the qlearning module so edits made since it was first imported
# are picked up by this session.
importlib.reload(qlearning)

# Fresh CartPole-v0 environment for the tabular Q-learning experiment.
env = gym.make("CartPole-v0")  # https://gym.openai.com/envs/#board_game

def state_function(state):
    """Discretize a CartPole observation into a 4-tuple.

    The cart position is collapsed to 0 unless its magnitude is at least 2.2,
    in which case it is kept to two decimals. The cart velocity is rounded to
    one decimal, the pole angle to two decimals, and the pole tip velocity is
    discretized with ``qlearning.round_to(value, 1)``.

    NOTE(review): presumably the returned tuple is used as the Q-table key by
    ``qlearning.GamePlayer`` -- confirm in the qlearning module.
    """
    cart_x = state[0]
    x_pos = round(cart_x, 2) if abs(cart_x) >= 2.2 else 0
    return (
        x_pos,
        round(state[1], 1),
        round(state[2], 2),
        qlearning.round_to(state[3], 1),
    )
        
# Build the player from the environment and the state discretizer above.
game = qlearning.GamePlayer(env, state_function)

In [None]:
# Hyperparameters for the tabular Q-learning run.
total_episodes = 1000
alpha = 0.3        # presumably the learning rate -- confirm in qlearning.train
gamma = 0.9        # discounting rate
epsilon = 0.3      # exploration rate
decay_rate = 0.0   # exponential decay rate for the exploration probability

# game.erase_training() is intentionally left disabled here, so previously
# learned values are kept when this cell is re-run.
rewards = game.train(
    total_episodes,
    alpha,
    gamma,
    epsilon,
    decay_rate,
    logEvery=100,
)

print(f"Total reward average: {np.mean(rewards)}")
# Number of entries currently stored in the Q-table.
print(len(game.qtable))

In [None]:
# Watch the trained player; the first argument is presumably the number of
# episodes to render -- confirm in qlearning.visualize_computer_playing.
qlearning.visualize_computer_playing(5, game)

In [None]:
# Display the Q-table held by the player.
game.qtable

# Double Q training

In [None]:
import importlib

import gym
import numpy as np

import qlearning

# Re-execute the qlearning module so edits made since it was first imported
# are picked up by this session.
importlib.reload(qlearning)

# Fresh CartPole-v0 environment for the double Q-learning experiment.
env = gym.make("CartPole-v0")  # https://gym.openai.com/envs/#board_game

def state_function(state):
    """Discretize a CartPole observation into a 4-tuple.

    The cart position is collapsed to 0 unless its magnitude is at least 2.2,
    in which case it is kept to two decimals. The cart velocity is rounded to
    one decimal, the pole angle to two decimals, and the pole tip velocity is
    discretized with ``qlearning.round_to(value, 1)``.

    NOTE(review): presumably the returned tuple is used as the Q-table key by
    ``qlearning.GamePlayer`` -- confirm in the qlearning module.
    """
    cart_x = state[0]
    x_pos = round(cart_x, 2) if abs(cart_x) >= 2.2 else 0
    return (
        x_pos,
        round(state[1], 1),
        round(state[2], 2),
        qlearning.round_to(state[3], 1),
    )
        
# Build the player from the environment and the state discretizer above.
game = qlearning.GamePlayer(env, state_function)

In [None]:
# Hyperparameters for the double Q-learning run.
total_episodes = 50
alpha = 0.3        # presumably the learning rate -- confirm in qlearning.double_q_train
gamma = 0.9        # discounting rate
epsilon = 0.3      # exploration rate
decay_rate = 0.0   # exponential decay rate for the exploration probability

# game.erase_training() is intentionally left disabled here, so previously
# learned values are kept when this cell is re-run.
rewards = game.double_q_train(
    total_episodes,
    alpha,
    gamma,
    epsilon,
    decay_rate,
    logEvery=10,
)

print(f"Total reward average: {np.mean(rewards)}")
# NOTE(review): a later cell prints game.Q2; confirm that qtable is the table
# whose size should be reported after double-Q training.
print(len(game.qtable))

In [None]:
# Watch the trained player; the trailing True presumably selects the double-Q
# tables -- confirm in qlearning.visualize_computer_playing.
qlearning.visualize_computer_playing(5, game, True)

In [None]:
# Dump the player's Q2 attribute (presumably the second table of the
# double-Q pair -- confirm in qlearning.GamePlayer).
print(game.Q2)

# Using TensorFlow

In [None]:
import tensorflow as tf
import gym
import numpy as np
from qlearning import round_to
# Create the CartPole-v0 environment used by the neural-network agent.
env = gym.make("CartPole-v0") #https://gym.openai.com/envs/#board_game

In [None]:
# Length of the observation vector along its first axis.
env.observation_space.shape[0]

In [None]:
# Width of one observation vector; it sizes every state placeholder below.
state_dim = env.observation_space.shape[0]

# Start from an empty default graph so re-running this cell does not pile up
# duplicate ops and variables.
tf.reset_default_graph()

# --- Hyperparameters -------------------------------------------------------
alpha = 0.8            # learning rate handed to the gradient-descent optimizer
epsilon = 0.2          # read by the training loop for random action selection
gamma = 0.9            # discount applied to the next state's best Q value
N = 50                 # batch size; 10 is about how many steps an untrained network survives
total_episodes = 4000

# --- Placeholders ----------------------------------------------------------
inputs = tf.placeholder(dtype=tf.float32, shape=[1, state_dim])
# training_inputs is not referenced by any op built in this cell.
training_inputs = tf.placeholder(dtype=tf.float32, shape=[N, state_dim])
states = tf.placeholder(dtype=tf.float32, shape=[N, state_dim])
ytarget = tf.placeholder(dtype=tf.float32, shape=[N, env.action_space.n])

# --- Weights (no biases), drawn uniformly from [0, 0.01) ---------------------
W1 = tf.Variable(tf.random_uniform([state_dim, 24], 0, 0.01))
W2 = tf.Variable(tf.random_uniform([24, 24], 0, 0.01))
W = tf.Variable(tf.random_uniform([24, env.action_space.n], 0, 0.01))

# Empty containers; nothing in this cell fills or reads them.
theta_target = {}
theta = {}


def _q_values(layer_input):
    """Run `layer_input` through the shared relu-relu-sigmoid network."""
    hidden_1 = tf.nn.relu(tf.matmul(layer_input, W1))
    hidden_2 = tf.nn.relu(tf.matmul(hidden_1, W2))
    return tf.nn.sigmoid(tf.matmul(hidden_2, W))


# Both heads share W1/W2/W: Qs scores a batch of N states for training, and
# Q scores a single state for action selection.
# NOTE(review): the sigmoid bounds outputs to (0, 1) while the training loop
# builds targets as reward + gamma * max(Q); confirm the targets can stay in
# that range.
Qs = _q_values(states)
Q = _q_values(inputs)

# Sum of squared errors between the targets and the batch predictions.
loss = tf.reduce_sum(tf.square(ytarget - Qs))
trainer = tf.train.GradientDescentOptimizer(learning_rate=alpha)
updateModel = trainer.minimize(loss)

In [None]:
# Train the Q network with epsilon-greedy play, taking one gradient step per
# batch of N collected transitions, and log the mean episode reward.
init = tf.global_variables_initializer()
reward_list = []       # total reward of each episode in the current logging window
tot_reward_list = []   # mean episode reward of each completed logging window
logEvery = N * 2

with tf.Session() as sess:
    sess.run(init)
    nstep = 0   # transitions accumulated in the current batch
    Y = []      # target Q rows, one per collected transition
    S = []      # states, one per collected transition
    for episode in range(total_episodes):
        state = env.reset()
        done = False
        tot_reward = 0
        # `not done` instead of `done is False`: the identity test ends the
        # episode after one step if the environment returns a NumPy boolean.
        while not done:
            if nstep == N:
                # A full batch is ready: take one gradient step, then start
                # collecting the next batch. Batches may span episodes.
                S = np.array(S).reshape(N, state_dim)
                sess.run([updateModel], feed_dict={states: S, ytarget: Y})
                nstep = 0
                Y = []
                S = []

            S.append([state])

            # Seed the target row with the current prediction; only the entry
            # of the action actually taken is overwritten below.
            Y.append(sess.run([Q], feed_dict={inputs: [state]})[0][0])
            if np.random.rand(1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Y[nstep])

            next_state, reward, done, _ = env.step(action)
            if done:
                # No successor state to bootstrap from at the end of an episode.
                Y[nstep][action] = reward
            else:
                Qnext = sess.run([Q], feed_dict={inputs: [next_state]})[0][0]
                Y[nstep][action] = reward + gamma * np.max(Qnext)

            state = next_state
            tot_reward += reward
            nstep += 1

        # Record exactly one total per episode. Appending inside the step loop
        # would average running totals instead of episode rewards.
        reward_list.append(tot_reward)

        if logEvery > 0 and (episode+1) % logEvery == 0:
            ave_reward = np.mean(reward_list)
            tot_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}, alpha: {}, e: {}'.format(episode+1, ave_reward, alpha, epsilon))

    # Greedy playback of the trained network. range(0) keeps it disabled;
    # raise the count to watch episodes being rendered.
    for episode in range(0):
        state = env.reset()
        print("****************************************************")
        print("EPISODE ", episode)
        done = False
        tot_reward = 0
        while not done:
            action = np.argmax(sess.run([Q], feed_dict={inputs: [state]})[0][0])
            new_state, reward, done, info = env.step(action)
            env.render()
            state = new_state
            tot_reward += reward
        print("Reward:", tot_reward)
    env.close()

# Guard the empty case (fewer episodes than one logging window) so np.mean
# does not emit a warning; the printed value is nan either way.
print("Total reward average:", np.mean(tot_reward_list) if tot_reward_list else float("nan"))