<a href="https://colab.research.google.com/github/nomomon/drl-js/blob/main/tic-tac-toe/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np

import random

# Game (Environment)

In [164]:
# 0 1 2
# 3 4 5
# 6 7 8

def gameStatus(board, playerParity):
    lines = [
        [0, 1, 2],
        [3, 4, 5],
        [6, 7, 8],
        [0, 3, 6],
        [1, 4, 7],
        [2, 5, 8],
        [0, 4, 8],
        [2, 4, 6],
    ]

    # there is a winner
    for line in lines:
        if (board[line[1]] == board[line[0]] and 
            board[line[1]] == board[line[2]] and 
            board[line[1]] != 0):
            return ((np.sum(np.array(board) > 0) - playerParity) % 2 == 0) * 2 - 1

    # tie
    if(np.all(np.array(board) != 0)):
        return 0.5

    # game is not finished
    return 0

# Model

In [137]:
### Agent ###

def createPolicy():
    model = tf.keras.Sequential([
        layers.InputLayer((9,)),
        layers.Dense(20, activation = "relu"),
        layers.Dropout(0.01),
        layers.Dense(20, activation = "relu"),
        layers.Dropout(0.01),
        layers.Dense(9, activation = None)
    ])

    return model

In [26]:
def chooseAction(policy, board):
    while True:
        logits = policy.predict([board])
        action = tf.random.categorical(logits, num_samples = 1)
        action = action.numpy().flatten()[0]

        # free cell
        if(board[action] == 0):
            break

    return action

In [20]:
### Agent Memory ###

class Memory:
    def __init__(self): 
        self.clear()

    def clear(self): 
        self.observations = []
        self.actions = []
        self.rewards = []

    def add_to_memory(self, new_observation, new_action, new_reward): 
        self.observations.append(new_observation)
        self.actions.append(new_action)
        self.rewards.append(new_reward)

    # Helper function to combine a list of Memory objects into a single Memory.
    # This will be very useful for batching.
    def aggregate_memories(memories):
        batch_memory = Memory()

        for memory in memories:
            for step in zip(memory.observations, memory.actions, memory.rewards):
                batch_memory.add_to_memory(*step)

        return batch_memory

memory = Memory()

In [138]:
def normalize(x):
    x -= np.mean(x)
    x /= np.std(x)
    return x.astype(np.float32)

# Compute normalized, discounted, cumulative rewards (i.e., return)
# Arguments:
#   rewards: reward at timesteps in episode
#   gamma: discounting factor
# Returns:
#   normalized discounted reward
def discount_rewards(rewards, gamma = 0.95): 
    discounted_rewards = np.zeros_like(rewards)
    
    R = 0
    for t in reversed(range(0, len(rewards))):
        R = R * gamma + rewards[t]
        discounted_rewards[t] = R
        
    return normalize(discounted_rewards)

In [69]:
def compute_loss(logits, actions, rewards):
    neg_logprob = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, 
        labels=actions
    )
    loss = tf.reduce_mean(
        neg_logprob * rewards
    )
    
    return loss

In [70]:
### Training step (forward and backpropagation) ###

def train_step(model, optimizer, observations, actions, discounted_rewards):
  with tf.GradientTape() as tape:
      logits = model(observations)
      loss = compute_loss(logits, actions, discounted_rewards)

  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))


# Plot

In [90]:
from IPython.display import clear_output
import matplotlib.pyplot as plt

points = np.array([])

In [98]:
def addToPlot(points, point):
    i = points.shape[0] + 1
    points = np.append(points, point)

    plt.clf()
    plt.plot(points, range(i))

# Training

In [166]:
points = np.array([])

learning_rate = 1e-3
optimizer = tf.keras.optimizers.Adam(learning_rate)

policy = createPolicy()

for episode in range(500):
    board = [0] * 9
    memory.clear()

    main_player = episode % 2

    while True:
        if(main_player == 1 and gameStatus(next_board, main_player) == 0):
            opponent_action = chooseAction(policy, (np.array(board) * -1).tolist())
            board[opponent_action] = -1

        action = chooseAction(policy, board)

        next_board = board
        next_board[action] = 1
        
        if(main_player == 0 and gameStatus(next_board, main_player) == 0):
            opponent_action = chooseAction(policy, (np.array(next_board) * -1).tolist())
            next_board[opponent_action] = -1

        status = gameStatus(next_board, main_player)

        reward = status - 0.1
        memory.add_to_memory(board, action, reward)

        if(status != 0):
            total_reward = sum(memory.rewards)
            if(episode % 1 == 0):
                print(f"{episode} reward: {total_reward}")

            train_step(
                policy, 
                optimizer, 
                observations = np.vstack(memory.observations),
                actions = np.array(memory.actions),
                discounted_rewards = discount_rewards(memory.rewards)
            )

            memory.clear()
            board = [0] * 9

            break

        board = next_board

0 reward: 0.6
1 reward: -1.4000000000000001
2 reward: -1.5
3 reward: 0.0
4 reward: -1.5
5 reward: -1.4000000000000001
6 reward: 0.6
7 reward: -1.4000000000000001
8 reward: 0.0
9 reward: -1.4000000000000001
10 reward: 0.6
11 reward: 0.5
12 reward: 0.6
13 reward: 0.5
14 reward: -1.5
15 reward: -1.4000000000000001
16 reward: 0.6
17 reward: 0.5
18 reward: -1.3
19 reward: 0.7
20 reward: 0.6
21 reward: -1.4000000000000001
22 reward: -1.3
23 reward: -1.4000000000000001
24 reward: -1.5
25 reward: 0.0
26 reward: 0.6
27 reward: -1.4000000000000001
28 reward: 0.6
29 reward: -1.4000000000000001
30 reward: -1.3
31 reward: -1.4000000000000001
32 reward: 0.6
33 reward: 0.5
34 reward: 0.6
35 reward: 0.5
36 reward: -1.5
37 reward: 0.5
38 reward: 0.6
39 reward: -1.4000000000000001
40 reward: -1.5
41 reward: -1.4000000000000001
42 reward: 0.6
43 reward: 0.5
44 reward: 0.0
45 reward: -1.4000000000000001
46 reward: 0.0
47 reward: 0.5
48 reward: -1.5
49 reward: -1.4000000000000001
50 reward: -1.5
51 reward:

# Web demo

Play against the AI

In [176]:
from IPython.display import clear_output 

def symbol(x):
    if x == 1:
        return "X"
    elif x == -1:
        return "O"
    else:
        return "?"

def printBoard(board):
    clear_output()
    cBoard = list(map(symbol, board))
    for i in range(0, 3):
        row = ""
        for j in range(0, 3):
            row += (cBoard[j + i * 3]) if (cBoard[j + i * 3] != "?") else str(j + i * 3 + 1)
            if j != 2:
                row += " | "
        print(row)
        if i != 2:
            print("---------")

def play(policy):
    player = (int(input("which player you want to be? (1 or 2) ")) + 1) % 2 

    board = [0, 0, 0,
             0, 0, 0,
             0, 0, 0]

    winner = 0

    for i in range(9):
        if (i % 2 == player):
            printBoard(board)
            action = int(input("what cell? ")) - 1
        else:
            action = chooseAction(policy, board.copy())

        board[action] = 1

        if(gameStatus(board, player) != 0):
            winner = gameStatus(board, player)
            break

        board = (np.array(board) * -1).tolist()
    
    printBoard(board)
    if(gameStatus(board) != 0):
        print("\nwinner is the", "humen" if winner else "ai")
    else:
        print("\nit's a tie!")

In [178]:
play(policy)

X | 2 | O
---------
4 | X | O
---------
7 | O | X


TypeError: ignored

# Deploy to TF.js

In [172]:
%%capture

!pip install tensorflowjs[wizard]
!pip install -U ipython

In [173]:
policy.save("./model/")

INFO:tensorflow:Assets written to: ./model/assets


In [174]:
!tensorflowjs_converter --input_format=keras_saved_model /content/model /content/tfjs_model

2021-11-12 22:01:36.565801: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
