In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Environment the Q-network is trained on and evaluated against.
env = gym.make('CartPole-v0')

# Start from an empty default graph so this cell can be re-run in the same
# kernel. Without the reset, a second run fails inside tf.get_variable because
# a variable named "W1" already exists. A session created against the previous
# graph is stale after this call and must be re-created.
tf.reset_default_graph()

# Input and output size based on the Env
input_size = env.observation_space.shape[0]  # -> 4
output_size = env.action_space.n  # -> 2 (left, right)

# Set learning parameters
learning_rate = .1

# State input. The leading dimension is None so the batch size is not fixed;
# the loops in this notebook feed a single state at a time.
X = tf.placeholder(tf.float32, [None, input_size], name="input_x")

# The whole network is one weight matrix (no bias, no activation): the
# predicted Q-values are a linear function of the state.
W1 = tf.get_variable("W1", shape=[input_size, output_size],
                     initializer=tf.contrib.layers.xavier_initializer())
Qpred = tf.matmul(X, W1)

# Target Q-values, one per action, fed in during training.
Y = tf.placeholder(shape=[None, output_size], dtype=tf.float32)

# Loss function: sum of squared errors between target and predicted Q-values
loss = tf.reduce_sum(tf.square(Y - Qpred))

# Learning
train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Set Q-learning related parameters
dis = .99  # discount factor applied to the best next-state Q-value
num_episodes = 2000

# Number of steps each training episode lasted (appended once per episode)
rList = []


[2017-08-04 13:59:43,349] Making new env: CartPole-v0


In [2]:
# Create a session and initialise the network weights exactly once.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

# Training restarts from freshly initialised weights every time this cell
# runs, so drop any step history left over from an earlier run; otherwise the
# early-stop check below would average over stale episodes.
del rList[:]

# CartPole-v0 ends every episode after at most 200 steps, so an average above
# 500 can never be observed. 195 is the average score at which the
# environment is documented as solved.
solved_avg_steps = 195

for i in range(num_episodes):
    # Exploration rate: starts at 1 and decays as the episode index grows.
    e = 1. / ((i / 10) + 1)
    step_count = 0
    # Reset environment and get first new observation
    s = env.reset()
    done = False

    # The Q-Network training
    while not done:
        step_count += 1
        x = np.reshape(s, [1, input_size])

        # Choose an action greedily from the Q-network, with probability e
        # of taking a random action instead
        Qs = sess.run(Qpred, feed_dict={X: x})
        if np.random.rand() < e:
            a = env.action_space.sample()
        else:
            a = np.argmax(Qs)

        # Get new state and reward from environment
        s1, reward, done, _ = env.step(a)
        if done:
            # Terminal state: there is no next-state value to bootstrap from,
            # so the taken action gets a fixed penalty as its target.
            # NOTE(review): an episode that ends by reaching the step cap is
            # penalised the same way as a failure - confirm this is intended.
            Qs[0, a] = -100
        else:
            # Obtain the Q_s1 values by feeding the new state through our network
            x1 = np.reshape(s1, [1, input_size])
            Qs1 = sess.run(Qpred, feed_dict={X: x1})

            # Update Q: reward plus discounted best next-state value
            Qs[0, a] = reward + dis * np.max(Qs1)

        # Train our network using target (Y) and predicted Q (Qpred) values.
        # Only the entry for the taken action differs from the prediction, so
        # only that action contributes to the loss.
        sess.run(train, feed_dict={X: x, Y: Qs})
        s = s1

    rList.append(step_count)
    print("Episode: {} steps: {}".format(i, step_count))

    # Stop early once the average over the last 10 episodes is good enough
    if len(rList) > 10 and np.mean(rList[-10:]) > solved_avg_steps:
        break


Episode: 0 steps: 14
Episode: 1 steps: 11
Episode: 2 steps: 30
Episode: 3 steps: 38
Episode: 4 steps: 19
Episode: 5 steps: 25
Episode: 6 steps: 17
Episode: 7 steps: 16
Episode: 8 steps: 30
Episode: 9 steps: 12
Episode: 10 steps: 23
Episode: 11 steps: 21
Episode: 12 steps: 100
Episode: 13 steps: 44
Episode: 14 steps: 30
Episode: 15 steps: 78
Episode: 16 steps: 33
Episode: 17 steps: 21
Episode: 18 steps: 25
Episode: 19 steps: 35
Episode: 20 steps: 23
Episode: 21 steps: 39
Episode: 22 steps: 108
Episode: 23 steps: 40
Episode: 24 steps: 34
Episode: 25 steps: 26
Episode: 26 steps: 17
Episode: 27 steps: 29
Episode: 28 steps: 34
Episode: 29 steps: 34
Episode: 30 steps: 22
Episode: 31 steps: 14
Episode: 32 steps: 25
Episode: 33 steps: 25
Episode: 34 steps: 28
Episode: 35 steps: 29
Episode: 36 steps: 27
Episode: 37 steps: 37
Episode: 38 steps: 24
Episode: 39 steps: 23
Episode: 40 steps: 25
Episode: 41 steps: 92
Episode: 42 steps: 25
Episode: 43 steps: 21
Episode: 44 steps: 13
Episode: 45 steps:

Episode: 364 steps: 44
Episode: 365 steps: 20
Episode: 366 steps: 27
Episode: 367 steps: 38
Episode: 368 steps: 37
Episode: 369 steps: 30
Episode: 370 steps: 28
Episode: 371 steps: 30
Episode: 372 steps: 45
Episode: 373 steps: 26
Episode: 374 steps: 36
Episode: 375 steps: 16
Episode: 376 steps: 26
Episode: 377 steps: 32
Episode: 378 steps: 27
Episode: 379 steps: 20
Episode: 380 steps: 42
Episode: 381 steps: 29
Episode: 382 steps: 13
Episode: 383 steps: 19
Episode: 384 steps: 46
Episode: 385 steps: 14
Episode: 386 steps: 9
Episode: 387 steps: 18
Episode: 388 steps: 27
Episode: 389 steps: 13
Episode: 390 steps: 19
Episode: 391 steps: 8
Episode: 392 steps: 22
Episode: 393 steps: 11
Episode: 394 steps: 8
Episode: 395 steps: 12
Episode: 396 steps: 8
Episode: 397 steps: 16
Episode: 398 steps: 18
Episode: 399 steps: 33
Episode: 400 steps: 29
Episode: 401 steps: 17
Episode: 402 steps: 15
Episode: 403 steps: 17
Episode: 404 steps: 19
Episode: 405 steps: 21
Episode: 406 steps: 16
Episode: 407 st

Episode: 723 steps: 22
Episode: 724 steps: 35
Episode: 725 steps: 21
Episode: 726 steps: 31
Episode: 727 steps: 23
Episode: 728 steps: 31
Episode: 729 steps: 36
Episode: 730 steps: 22
Episode: 731 steps: 8
Episode: 732 steps: 9
Episode: 733 steps: 9
Episode: 734 steps: 10
Episode: 735 steps: 48
Episode: 736 steps: 15
Episode: 737 steps: 10
Episode: 738 steps: 14
Episode: 739 steps: 13
Episode: 740 steps: 20
Episode: 741 steps: 24
Episode: 742 steps: 22
Episode: 743 steps: 28
Episode: 744 steps: 27
Episode: 745 steps: 30
Episode: 746 steps: 36
Episode: 747 steps: 57
Episode: 748 steps: 14
Episode: 749 steps: 34
Episode: 750 steps: 19
Episode: 751 steps: 34
Episode: 752 steps: 27
Episode: 753 steps: 22
Episode: 754 steps: 41
Episode: 755 steps: 18
Episode: 756 steps: 13
Episode: 757 steps: 20
Episode: 758 steps: 10
Episode: 759 steps: 9
Episode: 760 steps: 11
Episode: 761 steps: 17
Episode: 762 steps: 26
Episode: 763 steps: 18
Episode: 764 steps: 12
Episode: 765 steps: 9
Episode: 766 ste

Episode: 1077 steps: 31
Episode: 1078 steps: 21
Episode: 1079 steps: 10
Episode: 1080 steps: 28
Episode: 1081 steps: 9
Episode: 1082 steps: 29
Episode: 1083 steps: 23
Episode: 1084 steps: 12
Episode: 1085 steps: 8
Episode: 1086 steps: 31
Episode: 1087 steps: 14
Episode: 1088 steps: 42
Episode: 1089 steps: 39
Episode: 1090 steps: 22
Episode: 1091 steps: 28
Episode: 1092 steps: 26
Episode: 1093 steps: 38
Episode: 1094 steps: 18
Episode: 1095 steps: 26
Episode: 1096 steps: 25
Episode: 1097 steps: 29
Episode: 1098 steps: 43
Episode: 1099 steps: 22
Episode: 1100 steps: 14
Episode: 1101 steps: 54
Episode: 1102 steps: 23
Episode: 1103 steps: 25
Episode: 1104 steps: 22
Episode: 1105 steps: 20
Episode: 1106 steps: 41
Episode: 1107 steps: 68
Episode: 1108 steps: 55
Episode: 1109 steps: 30
Episode: 1110 steps: 28
Episode: 1111 steps: 24
Episode: 1112 steps: 19
Episode: 1113 steps: 61
Episode: 1114 steps: 37
Episode: 1115 steps: 19
Episode: 1116 steps: 46
Episode: 1117 steps: 32
Episode: 1118 step

Episode: 1421 steps: 38
Episode: 1422 steps: 15
Episode: 1423 steps: 38
Episode: 1424 steps: 33
Episode: 1425 steps: 53
Episode: 1426 steps: 46
Episode: 1427 steps: 69
Episode: 1428 steps: 44
Episode: 1429 steps: 19
Episode: 1430 steps: 76
Episode: 1431 steps: 33
Episode: 1432 steps: 36
Episode: 1433 steps: 34
Episode: 1434 steps: 27
Episode: 1435 steps: 37
Episode: 1436 steps: 67
Episode: 1437 steps: 64
Episode: 1438 steps: 27
Episode: 1439 steps: 28
Episode: 1440 steps: 38
Episode: 1441 steps: 75
Episode: 1442 steps: 31
Episode: 1443 steps: 27
Episode: 1444 steps: 51
Episode: 1445 steps: 25
Episode: 1446 steps: 60
Episode: 1447 steps: 43
Episode: 1448 steps: 65
Episode: 1449 steps: 25
Episode: 1450 steps: 40
Episode: 1451 steps: 81
Episode: 1452 steps: 22
Episode: 1453 steps: 38
Episode: 1454 steps: 74
Episode: 1455 steps: 57
Episode: 1456 steps: 51
Episode: 1457 steps: 35
Episode: 1458 steps: 34
Episode: 1459 steps: 42
Episode: 1460 steps: 53
Episode: 1461 steps: 59
Episode: 1462 st

Episode: 1767 steps: 37
Episode: 1768 steps: 21
Episode: 1769 steps: 34
Episode: 1770 steps: 10
Episode: 1771 steps: 9
Episode: 1772 steps: 11
Episode: 1773 steps: 10
Episode: 1774 steps: 8
Episode: 1775 steps: 23
Episode: 1776 steps: 24
Episode: 1777 steps: 20
Episode: 1778 steps: 28
Episode: 1779 steps: 24
Episode: 1780 steps: 20
Episode: 1781 steps: 36
Episode: 1782 steps: 12
Episode: 1783 steps: 25
Episode: 1784 steps: 25
Episode: 1785 steps: 25
Episode: 1786 steps: 51
Episode: 1787 steps: 21
Episode: 1788 steps: 17
Episode: 1789 steps: 17
Episode: 1790 steps: 24
Episode: 1791 steps: 20
Episode: 1792 steps: 24
Episode: 1793 steps: 28
Episode: 1794 steps: 28
Episode: 1795 steps: 30
Episode: 1796 steps: 35
Episode: 1797 steps: 17
Episode: 1798 steps: 21
Episode: 1799 steps: 34
Episode: 1800 steps: 27
Episode: 1801 steps: 31
Episode: 1802 steps: 19
Episode: 1803 steps: 31
Episode: 1804 steps: 22
Episode: 1805 steps: 29
Episode: 1806 steps: 28
Episode: 1807 steps: 17
Episode: 1808 step

In [3]:
# Play a single rendered episode, always taking the action with the highest
# predicted Q-value (no exploration), and report the accumulated reward.
observation = env.reset()
reward_sum = 0
done = False
while not done:
    env.render()

    # The network expects a batch dimension, so feed the state as one row
    x = np.reshape(observation, (1, input_size))
    Qs = sess.run(Qpred, feed_dict={X: x})
    a = np.argmax(Qs)

    observation, reward, done, _ = env.step(a)
    reward_sum += reward

print("Total score: {}".format(reward_sum))

Total score: 50.0
