In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import numpy as np
from tic_env import TictactoeEnv, OptimalPlayer
from rl_algorithms import *
from plots import *
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import pickle
import seaborn as sns

# 3.2 Learning from experts

### Question 11

In [6]:
# Question 11: train a deep Q-learning agent against OptimalPlayer(epsilon=0.5),
# recording the mean reward and mean training loss of every window of games.
n_episodes = 20000      # number of training games
averaging_steps = 250   # window size (in games) for the reward / loss averages
update_steps = 500      # number of games between two calls to q_player.update_target()
turns = np.array(['X','O'])

q_player = DeepQPlayer(0.15)  # NOTE(review): 0.15 is presumably the exploration rate - confirm in rl_algorithms
env = TictactoeEnv()

rewards = []            # mean reward of each completed window
losses = []             # mean loss of each completed window
average_reward = 0.     # running reward sum of the current window
average_loss = []       # loss values collected during the current window

other_player = OptimalPlayer(epsilon=0.5, player=turns[1])
x = []                  # episode index at which each window closes (x-axis of the plots)

for episode in tqdm(range(n_episodes)):
    # Update players: the agent takes turns[0], the opponent turns[1]
    other_player.set_player(turns[1])
    q_player.reset_attributes()
    q_player.set_player(turns[0])

    # Play game
    loss, reward = play_deep_game(env, q_player, other_player, turns, testing=False)
    average_reward += reward
    average_loss.extend(loss)

    # Refresh the target network exactly every `update_steps` games
    # (a modulus of `update_steps - 1` would give a 499-game period).
    if episode % update_steps == 0:
        q_player.update_target()

    # Close the window on its last game and start a new one
    if episode % averaging_steps == averaging_steps - 1:
        average_reward /= averaging_steps
        rewards.append(average_reward)
        losses.append(np.array(average_loss).mean())
        average_reward = 0.
        average_loss = []
        x.append(episode)

    # Change first player
    turns = turns[::-1]

100%|██████████| 20000/20000 [05:39<00:00, 58.94it/s]


In [7]:
# Mean training reward per averaging window (Question 11).
fig = go.Figure(data=[go.Scatter(x=x, y=rewards, mode='lines')])

fig.update_layout(
    xaxis_title=r'$game$',
    yaxis_title=r'$average(reward)\text{- %d games}$' % (averaging_steps),
    title=r'$\text{Average reward for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
)

fig.show()

In [8]:
# Mean training loss per averaging window (Question 11).
fig = go.Figure(data=[go.Scatter(x=x, y=losses, mode='lines')])

fig.update_layout(
    xaxis_title=r'$game$',
    yaxis_title=r'$average(loss)\text{- %d games}$' % (averaging_steps),
    title=r'$\text{Average loss for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
)

fig.show()

From the two plots, we can see that the Q-player learns how to play tic-tac-toe: the average reward reaches a plateau around 0.2. The training loss has, as expected, the opposite trend: as the average reward gets higher, the loss gets lower, since the network is learning. The final plateau is an indication that the network has learned how to play the game. We chose this value of $\epsilon$ because we want the agent to explore new states while, at the same time, playing reasonably well.

### Question 12

In [9]:
# Question 12: same training loop as before, but with the replay buffer disabled
# (capacity = 1, batch size = 1), against OptimalPlayer(epsilon=0.5).
n_episodes = 20000      # number of training games
averaging_steps = 250   # window size (in games) for the reward / loss averages
update_steps = 500      # number of games between two calls to q_player.update_target()
turns = np.array(['X','O'])

q_player = DeepQPlayer(0.2, capacity=1, batch_size=1) # Capacity = 1 and batch size = 1 => no memory, use only last action
env = TictactoeEnv()

rewards = []            # mean reward of each completed window
losses = []             # mean loss of each completed window
average_reward = 0.     # running reward sum of the current window
average_loss = []       # loss values collected during the current window

other_player = OptimalPlayer(epsilon=0.5, player=turns[1])
x = []                  # episode index at which each window closes (x-axis of the plots)

for episode in tqdm(range(n_episodes)):
    # Update players: the agent takes turns[0], the opponent turns[1]
    other_player.set_player(turns[1])
    q_player.reset_attributes()
    q_player.set_player(turns[0])

    # Play game
    loss, reward = play_deep_game(env, q_player, other_player, turns, testing=False)
    average_reward += reward
    average_loss.extend(loss)

    # Refresh the target network exactly every `update_steps` games
    # (a modulus of `update_steps - 1` would give a 499-game period).
    if episode % update_steps == 0:
        q_player.update_target()

    # Close the window on its last game and start a new one
    if episode % averaging_steps == averaging_steps - 1:
        average_reward /= averaging_steps
        rewards.append(average_reward)
        losses.append(np.array(average_loss).mean())
        average_reward = 0.
        average_loss = []
        x.append(episode)

    # Change first player
    turns = turns[::-1]

100%|██████████| 20000/20000 [05:02<00:00, 66.21it/s] 


In [10]:
# Mean training reward per averaging window when no replay buffer is used (Question 12).
fig = go.Figure(data=[go.Scatter(x=x, y=rewards, mode='lines')])

fig.update_layout(
    xaxis_title=r'$game$',
    yaxis_title=r'$average(reward)\text{- %d games}$' % (averaging_steps),
    title=r'$\text{Average reward without replay buffer for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
)

fig.show()

In [11]:
# Mean training loss per averaging window when no replay buffer is used (Question 12).
fig = go.Figure()

fig.add_trace(go.Scatter(x=x, y=losses, mode='lines'))

fig.update_layout(
    # Keep the words inside \text{...}: outside of it they are typeset as math
    # (italic, spaces dropped). Wording mirrors the reward plot of this question.
    title=r'$\text{Average loss without replay buffer for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
    xaxis_title=r'$game$',
    yaxis_title=r'$average(loss)\text{- %d games}$' % (averaging_steps),
)

fig.show()

In this case, the average reward is almost always close to the minimum: the network loses all the time. In addition, after an initial

### Question 13

In [12]:
def deep_run_episodes(n_episodes, q_player, other_player, update_epsilon=False, update_other_epsilon=False,
                      update_steps=500, game_env=None, window=None):
    """Train `q_player` against `other_player` and track its progress.

    The two players swap the 'X' / 'O' roles after every game. At the end of
    every window of `window` games the mean training reward is stored and
    `compute_measures` is called to obtain M_opt and M_rand.

    Parameters
    ----------
    n_episodes : int
        Number of training games to play.
    q_player
        Learning agent; must provide `set_player`, `reset_attributes`,
        `update_target` and, if `update_epsilon` is set, `update_epsilon`.
    other_player
        Opponent; must provide `set_player` and, if `update_other_epsilon`
        is set, `update_epsilon`.
    update_epsilon : bool
        If True, call `q_player.update_epsilon(episode)` before each game.
    update_other_epsilon : bool
        If True, call `other_player.update_epsilon(episode)` before each game.
    update_steps : int
        Number of games between two calls to `q_player.update_target()`.
    game_env
        Environment passed to `play_deep_game` / `compute_measures`.
        Defaults to the notebook-level `env`.
    window : int or None
        Number of games per averaging window. Defaults to the notebook-level
        `averaging_steps`.

    Returns
    -------
    rewards : list of float
        Mean training reward of each completed window.
    M_opts, M_rands : list
        Values returned by `compute_measures` at the end of each window.
    x : list of int
        Episode index at which each window was closed.
    """
    # Fall back to the notebook globals so that existing calls keep working.
    game_env = game_env if game_env is not None else env
    window = window if window is not None else averaging_steps

    M_opts = []
    M_rands = []
    x = []
    turns = np.array(['X','O'])

    rewards = []
    average_reward = 0.  # running reward sum of the current window

    for episode in tqdm(range(n_episodes)):
        # Update players: the agent takes turns[0], the opponent turns[1]
        q_player.set_player(turns[0])
        q_player.reset_attributes()
        other_player.set_player(turns[1])

        if update_epsilon:
            q_player.update_epsilon(episode)
        if update_other_epsilon:
            other_player.update_epsilon(episode)

        # Play game (the per-step losses are not part of the returned values)
        _, reward = play_deep_game(game_env, q_player, other_player, turns, testing=False)
        average_reward += reward

        # Refresh the target network exactly every `update_steps` games
        # (a modulus of `update_steps - 1` would give a 499-game period).
        if episode % update_steps == 0:
            q_player.update_target()

        # Close the window on its last game and start a new one
        if episode % window == window - 1:
            rewards.append(average_reward / window)
            average_reward = 0.

            # Compute M_opt and M_rand
            M_opt, M_rand = compute_measures(game_env, q_player, deep=True)
            M_opts.append(M_opt)
            M_rands.append(M_rand)
            x.append(episode)

        # Change first player
        turns = turns[::-1]

    return rewards, M_opts, M_rands, x

In [13]:
# Question 13: train one agent per value of n_star and collect, for each of them,
# the training rewards and the M_opt / M_rand curves.
averaging_steps = 250   # also read by deep_run_episodes and by the plotting cells
n_episodes = 20000

turns = np.array(['X','O'])

env = TictactoeEnv()

n_star_list = [500]
print(n_star_list)

# One entry per value in n_star_list, in the same order
n_star_rewards, n_star_M_opts, n_star_M_rands = [], [], []

other_player = OptimalPlayer(epsilon=0.5, player=turns[1])

for n_star in n_star_list:
    print('Current n_star: %d' % n_star)
    q_player = DeepVariableEpsilonQPlayer(0.8, 0.1, n_star, None, capacity=10000, batch_size=64)

    rewards, M_opts, M_rands, x = deep_run_episodes(
        n_episodes, q_player, other_player, update_epsilon=True
    )

    n_star_M_rands.append(M_rands)
    n_star_M_opts.append(M_opts)
    n_star_rewards.append(rewards)

# Save data
with open('deep_n_stars.pickle', 'wb') as f:
    pickle.dump(
        (n_star_list, n_star_rewards, n_star_M_opts, n_star_M_rands),
        f,
    )

[500]
Current n_star: 500


100%|██████████| 20000/20000 [14:12<00:00, 23.47it/s]  


In [14]:
# Load data (uncomment to restore the results from 'deep_n_stars.pickle' instead of retraining)

# with open('deep_n_stars.pickle', 'rb') as f:
#      n_star_list, n_star_rewards, n_star_M_opts, n_star_M_rands =  pickle.load(f)

In [15]:
# M_opt over training, one curve per value of n_star.
fig = go.Figure(data=[
    go.Scatter(x=x, y=curve, mode='lines', name=r'$n^* = %d$' % n_star_list[idx])
    for idx, curve in enumerate(n_star_M_opts)
])

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{opt}$',
    title=r'$M_{opt}\text{ for every %d games for different values of }n^{∗}$' % averaging_steps,
)

fig.show()

In [16]:
# M_rand over training, one curve per value of n_star.
fig = go.Figure(data=[
    go.Scatter(x=x, y=curve, mode='lines', name=r'$n^* = %d$' % n_star_list[idx])
    for idx, curve in enumerate(n_star_M_rands)
])

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{rand}$',
    title=r'$M_{rand}\text{ for every %d games for different values of }n^{∗}$' % averaging_steps,
)

fig.show()