# policy gradient
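A policy gradient method is a gradient-based approach to searching for the parameters $\theta$ of a policy $\pi_\theta$, which is a function $f: s_i \mapsto a_i$ from states to actions. It is guided by the loss $-\mathbb{E}\left[\log p(a_i) \times a_{chosen} \times \mathrm{reward}\right]$, where $a_{chosen}$ is the one-hot indicator of the action that was actually taken.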

In [5]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from Tictactoe import Tictactoe
%matplotlib inline

def discount_rewards(rewards, gamma):
    """Return the discounted return-to-go for every step of an episode.

    Element ``i`` of the result equals
    ``sum(gamma**t * rewards[i + t] for t in range(len(rewards) - i))``.

    Args:
        rewards: sequence of per-step scalar rewards, in time order.
        gamma: discount factor applied once for each step into the future.

    Returns:
        1-D float64 ``np.ndarray`` with the same length as ``rewards``
        (an empty array when ``rewards`` is empty).
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    # Walk backwards so each return reuses the one that follows it:
    # G[i] = r[i] + gamma * G[i + 1]. This is a single pass instead of
    # re-summing every suffix of the episode.
    for i in range(len(rewards) - 1, -1, -1):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns

In [6]:
# Network and training hyperparameters.
# The policy network reads n_features inputs per observation and scores
# n_actions possible moves; both are 9 here.
n_features = n_actions = 9
# Width of each hidden layer of the policy network.
n_hidden = 256
# Per-step discount applied to future rewards when computing returns.
discount_factor = 0.995

In [7]:
# Start from an empty graph so this cell can be re-run without
# tf.get_variable raising on already-existing variable names.
tf.reset_default_graph()

# Inputs fed once per episode (one row per time step):
#   obs_ - the observation at each step
#   a_   - one-hot encoding of the action taken at each step
#   r_   - the per-step weight applied to the loss (the normalised return)
obs_ = tf.placeholder(name="observations", shape=[None, n_features], dtype=tf.float32)
a_ = tf.placeholder(name="actions", shape=[None, n_actions], dtype=tf.float32)
r_ = tf.placeholder(name="rewards", shape=[None, 1], dtype=tf.float32)

# Parameters of a three-layer fully connected network. No initializer is
# passed, so tf.get_variable falls back to its default initializer.
w1 = tf.get_variable("w1",[n_features, n_hidden])
b1 = tf.get_variable("b1",[n_hidden])
w2 = tf.get_variable("w2",[n_hidden, n_hidden])
b2 = tf.get_variable("b2",[n_hidden])
w3 = tf.get_variable("w3",[n_hidden, n_actions])
b3 = tf.get_variable("b3",[n_actions])

# Forward pass: two ReLU hidden layers followed by a linear output layer.
z1 = tf.matmul(obs_, w1) + b1
fc1 = tf.nn.relu(z1)
z2 = tf.matmul(fc1, w2) + b2
fc2 = tf.nn.relu(z2)
z3 = tf.matmul(fc2, w3) + b3
probs = tf.nn.softmax(z3)  # apply softmax on logits

# Reward-weighted log loss between the one-hot taken action and the policy
# output; r_ has shape [None, 1], so it weights every action column of a step.
# NOTE(review): tf.losses.log_loss is an element-wise *binary* cross-entropy,
# so besides -a*log(p) it also contains a -(1-a)*log(1-p) term for the actions
# that were not taken. The loss described in the notebook intro only involves
# the log-probability of the chosen action - confirm this is intended.
loss = tf.losses.log_loss(labels=a_, predictions=probs, weights=r_)
op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

In [8]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

n_episodes = 50000
# Print progress about ten times per run. Integer division keeps the modulo
# below exact, and the floor of 1 avoids a zero/fractional interval when
# n_episodes is smaller than 10.
log_every = max(1, n_episodes // 10)
# Row i of the identity matrix is the one-hot encoding of action i.
one_hot = np.eye(n_actions)

for episode in range(n_episodes):
    env = Tictactoe(move_penalty=-0.2)
    s1 = env.board.copy()
    d = False
    actions, rewards, states = [], [], []

    # Play one episode with the current policy, recording each step.
    while not d:
        action_probs = sess.run(probs, feed_dict={obs_: s1.reshape(1, -1)})[0]
        available = env.available_actions()
        # Keep only the probabilities of the available actions and
        # renormalise them so they sum to 1 before sampling.
        a = np.random.choice(available, p=action_probs[available]/np.sum(action_probs[available]))
        s2, r, d = env.step(a)

        actions.append(one_hot[a])
        rewards.append(r)
        states.append(s1)

        s1 = s2.copy()

    # Stack the episode into (steps, 1), (steps, n_features), (steps, n_actions).
    epr = np.vstack(discount_rewards(rewards, discount_factor))
    eps = np.vstack(states)
    epl = np.vstack(actions)

    # Standardise the discounted returns within the episode. When every
    # return is identical the standard deviation is 0; dividing would fill
    # epr with NaN and corrupt the weights on the update below, so the
    # (already zero-centred) returns are left as they are in that case.
    epr -= np.mean(epr)
    epr_std = np.std(epr)
    if epr_std > 0:
        epr /= epr_std

    # One optimiser step on the whole episode.
    sess.run(op, {obs_: eps, a_: epl, r_: epr})

    if episode % log_every == 0:
        print (episode)

0
5000
10000
15000
20000
25000
30000
35000
40000
45000


In [9]:
# Evaluate the trained policy over a fixed number of games and report the
# fraction that end with each outcome.
rollouts = 100
# Float counters so the ratios below are true divisions.
wins = losses = draws = 0.0

for _ in range(rollouts):

    env = Tictactoe()
    s, d = env.board, False

    # Play until the environment reports the game is done, sampling each
    # move from the policy restricted to the currently available actions.
    while not d:
        feed = {obs_: s.reshape(1, -1)}
        action_probs = sess.run(probs, feed_dict=feed)[0]
        available = env.available_actions()
        legal_probs = action_probs[available]
        a = np.random.choice(available, p=legal_probs/np.sum(legal_probs))
        s, r, d = env.step(a)

    # Classify the game by the reward returned from its last step:
    # 1 is tallied as a win, -1 as a loss, anything else as a draw.
    if r == 1:
        wins += 1
    elif r == -1:
        losses += 1
    else:
        draws += 1

for label, count in (("wins", wins), ("losses", losses), ("draws", draws)):
    print (label, count/rollouts)

('wins', 0.94)
('losses', 0.06)
('draws', 0.0)
