In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# NOTE: the original import order is kept on purpose: the explicit imports that
# follow the two star imports must keep overriding any same-named symbols that
# `rl_algorithms` / `plots` may re-export.
import numpy as np
from tic_env import TictactoeEnv, OptimalPlayer
from rl_algorithms import *
from plots import *
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import torch
from tqdm import tqdm
import pickle
import random

In [3]:
# Seed every random number generator used by the notebook (PyTorch, NumPy and
# Python's `random`) from one shared constant, so the three can never drift
# apart when the seed is changed.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# 3.2 Learning from experts

### Question 11

In [19]:
# Question 11: train a DeepQPlayer against OptimalPlayer(epsilon=0.5) and record
# the average reward and average training loss over windows of games.
n_episodes = 20000
averaging_steps = 250  # window size (in games) for the reported averages
update_steps = 500     # q_player.update_target() is called once per this many games
turns = np.array(['X','O'])

q_player = DeepQPlayer(0.15)  # presumably the exploration rate epsilon - confirm against rl_algorithms
env = TictactoeEnv()

rewards = []
losses = []
average_reward = 0.
average_loss = []

other_player = OptimalPlayer(epsilon=0.5, player=turns[1])
x = []  # index of the last game of each averaging window (x-axis of the plots)

for episode in tqdm(range(n_episodes)):
    # Play game
    loss, reward = play_deep_game(env, q_player, other_player, turns, testing=False)
    average_reward += reward
    average_loss.extend(loss)

    # Update once every `update_steps` completed games. The previous condition,
    # `episode % (update_steps-1) == 0`, fired at game 0 and then every 499
    # games, which did not match the schedule used by deep_run_episodes.
    if episode % update_steps == update_steps - 1:
        q_player.update_target()

    # Close the current averaging window and start a new one.
    if episode % averaging_steps == averaging_steps - 1:
        average_reward /= averaging_steps
        rewards.append(average_reward)
        losses.append(np.array(average_loss).mean())
        average_reward = 0.
        average_loss = []
        x.append(episode)

    # Change first player
    turns = turns[::-1]

100%|██████████| 20000/20000 [03:30<00:00, 94.93it/s] 


In [20]:
# Question 11: average reward per window of games while training against Opt(0.5).
fig = go.Figure(data=[go.Scatter(x=x, y=rewards, mode='lines')])

fig.update_layout(
    xaxis_title=r'$game$',
    yaxis_title=r'$average(reward)\text{- %d games}$' % (averaging_steps),
    title=r'$\text{Average reward for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
)

fig.show()

In [21]:
# Question 11: average training loss per window of games.
fig = go.Figure()
fig.add_traces([go.Scatter(x=x, y=losses, mode='lines')])

fig.update_layout(
    xaxis_title=r'$game$',
    yaxis_title=r'$average(loss)\text{- %d games}$' % (averaging_steps),
    title=r'$\text{Average loss for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
)

fig.show()

From the two plots, we can see that the Q-player learns how to play tic-tac-toe: the average reward reaches a plateau around 0.2. The training loss shows, as expected, the opposite trend: as the average reward increases, the loss decreases, because the network is learning. The final plateau indicates that the network has learned how to play the game. We chose this value of $\epsilon$ because we want the agent to explore new states while still playing reasonably well.

### Question 12

In [22]:
# Question 12: same experiment as Question 11, but with the replay buffer
# effectively disabled (capacity=1, batch_size=1).
n_episodes = 20000
averaging_steps = 250  # window size (in games) for the reported averages
update_steps = 500     # q_player.update_target() is called once per this many games
turns = np.array(['X','O'])

# Capacity = 1 and batch size = 1 => no memory, use only last action
q_player = DeepQPlayer(0.2, capacity=1, batch_size=1)
env = TictactoeEnv()

rewards = []
losses = []
average_reward = 0.
average_loss = []

other_player = OptimalPlayer(epsilon=0.5, player=turns[1])
x = []  # index of the last game of each averaging window (x-axis of the plots)

for episode in tqdm(range(n_episodes)):
    # Play game
    loss, reward = play_deep_game(env, q_player, other_player, turns, testing=False)
    average_reward += reward
    average_loss.extend(loss)

    # Update once every `update_steps` completed games. The previous condition,
    # `episode % (update_steps-1) == 0`, fired at game 0 and then every 499
    # games, which did not match the schedule used by deep_run_episodes.
    if episode % update_steps == update_steps - 1:
        q_player.update_target()

    # Close the current averaging window and start a new one.
    if episode % averaging_steps == averaging_steps - 1:
        average_reward /= averaging_steps
        rewards.append(average_reward)
        losses.append(np.array(average_loss).mean())
        average_reward = 0.
        average_loss = []
        x.append(episode)

    # Change first player
    turns = turns[::-1]

100%|██████████| 20000/20000 [02:48<00:00, 118.54it/s]


In [23]:
# Question 12: average reward per window of games, trained without a replay buffer.
fig = go.Figure(data=[go.Scatter(x=x, y=rewards, mode='lines')])

fig.update_layout(
    yaxis_title=r'$average(reward)\text{- %d games}$' % (averaging_steps),
    xaxis_title=r'$game$',
    title=r'$\text{Average reward without replay buffer for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
)

fig.show()

In [24]:
# Question 12: average training loss per window of games, trained without a
# replay buffer.
fig = go.Figure()

fig.add_trace(go.Scatter(x=x, y=losses, mode='lines'))

fig.update_layout(
    # The whole title must sit inside a single $...$ pair to be typeset as
    # LaTeX; the "without replay buffer" note used to trail after the closing
    # `$`. It now lives inside \text{}, matching the reward plot title.
    title=r'$\text{Average loss without replay buffer for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
    xaxis_title=r'$game$',
    yaxis_title=r'$average(loss)\text{- %d games}$' % (averaging_steps),
)

fig.show()

In this case, the average reward is almost always close to the minimum: the network loses all the time. In addition, after an initial … (**TODO**: this sentence is incomplete and still needs to be finished.)

### Question 13

In [29]:
def deep_run_episodes(n_episodes, q_player, other_player, update_epsilon=False, other_learning=False,
                      update_steps=500, averaging_steps=250):
    """Train ``q_player`` against ``other_player`` and collect periodic statistics.

    Games are played with ``play_deep_game`` on the notebook-level ``env``,
    which must be defined before this function is called. The starting order
    ``['X', 'O']`` is reversed after every game.

    Args:
        n_episodes: number of training games to play.
        q_player: learning agent; must provide ``update_target()`` and, when
            ``update_epsilon`` is true, ``update_epsilon(episode)``.
        other_player: opponent passed to ``play_deep_game``.
        update_epsilon: if true, call ``q_player.update_epsilon(episode)``
            before each game (and the same on ``other_player`` when
            ``other_learning`` is also true).
        other_learning: forwarded to ``play_deep_game``.
        update_steps: ``q_player.update_target()`` is called after every
            ``update_steps`` games.
        averaging_steps: number of games per statistics window.

    Returns:
        Tuple ``(rewards, M_opts, M_rands, x, losses)`` with one entry per
        completed window: the mean reward, the two values returned by
        ``compute_measures(env, q_player, deep=True)``, the index of the
        window's last game, and the mean of the losses collected in the
        window (``nan`` if no loss was collected).
    """
    M_opts = []
    M_rands = []
    x = []
    turns = ['X','O']

    rewards = []
    losses = []
    average_reward = 0.
    average_loss = []

    for episode in tqdm(range(n_episodes)):
        if update_epsilon:
            q_player.update_epsilon(episode)
            if other_learning:
                other_player.update_epsilon(episode)

        # Play game
        loss, reward = play_deep_game(env, q_player, other_player, turns, testing=False, other_learning=other_learning)
        average_reward += reward
        average_loss.extend(loss)

        if episode % update_steps == update_steps - 1:
            q_player.update_target()

        if episode % averaging_steps == averaging_steps - 1:
            rewards.append(average_reward / averaging_steps)
            # An empty window yields nan explicitly rather than through NumPy's
            # "mean of empty slice" warning path.
            losses.append(np.array(average_loss).mean() if average_loss else float('nan'))
            average_reward = 0.
            average_loss = []

            # Compute M_opt and M_rand
            M_opt, M_rand = compute_measures(env, q_player, deep=True)
            M_opts.append(M_opt)
            M_rands.append(M_rand)
            x.append(episode)

        # Change first player
        turns = turns[::-1]

    return rewards, M_opts, M_rands, x, losses

In [30]:
# Question 13: sweep n_star (third argument of DeepVariableEpsilonQPlayer) while
# training against OptimalPlayer(epsilon=0.5), then store the curves on disk.
n_episodes = 20000
averaging_steps = 250

turns = np.array(['X','O'])

env = TictactoeEnv()
other_player = OptimalPlayer(epsilon=0.5, player=turns[1])

# Eleven n_star values between 1 and 40000, truncated to integers.
n_star_list = [int(x) for x in np.linspace(1, 40000, 11)]
print(n_star_list)

# One list of per-window curves for every n_star value.
n_star_rewards = []
n_star_M_opts = []
n_star_M_rands = []
n_star_losses = []

for n_star in n_star_list:
    print('Current n_star: %d' % n_star)
    q_player = DeepVariableEpsilonQPlayer(
        0.8, 0.1, n_star, None,
        capacity=10000, batch_size=64, lr=5e-5,
    )

    rewards, M_opts, M_rands, x, losses = deep_run_episodes(
        n_episodes, q_player, other_player, update_epsilon=True
    )

    n_star_rewards.append(rewards)
    n_star_M_opts.append(M_opts)
    n_star_M_rands.append(M_rands)
    n_star_losses.append(losses)


# Save data
with open('question_13.pickle', 'wb') as f:
    pickle.dump(
        (n_star_list, n_star_rewards, n_star_M_opts, n_star_M_rands, n_star_losses),
        f,
    )

[1, 4000, 8000, 12000, 16000, 20000, 24000, 28000, 32000, 36000, 40000]
Current n_star: 1


100%|██████████| 20000/20000 [08:05<00:00, 41.17it/s] 


Current n_star: 4000


100%|██████████| 20000/20000 [07:53<00:00, 42.22it/s] 


Current n_star: 8000


100%|██████████| 20000/20000 [07:48<00:00, 42.69it/s] 


Current n_star: 12000


100%|██████████| 20000/20000 [07:50<00:00, 42.53it/s] 


Current n_star: 16000


100%|██████████| 20000/20000 [07:46<00:00, 42.92it/s] 


Current n_star: 20000


100%|██████████| 20000/20000 [07:45<00:00, 43.00it/s] 


Current n_star: 24000


100%|██████████| 20000/20000 [07:45<00:00, 42.96it/s] 


Current n_star: 28000


100%|██████████| 20000/20000 [07:52<00:00, 42.34it/s] 


Current n_star: 32000


100%|██████████| 20000/20000 [07:49<00:00, 42.59it/s] 


Current n_star: 36000


100%|██████████| 20000/20000 [07:50<00:00, 42.52it/s] 


Current n_star: 40000


100%|██████████| 20000/20000 [07:41<00:00, 43.36it/s] 


In [31]:
# Load data
# The fifth field is the per-n_star loss curves. It used to be unpacked into
# `losses`, so `n_star_losses` (read by the loss plot) was never restored from
# the file. Only load pickle files produced by this notebook: unpickling
# untrusted data can execute arbitrary code.
with open('question_13.pickle', 'rb') as f:
    n_star_list, n_star_rewards, n_star_M_opts, n_star_M_rands, n_star_losses = pickle.load(f)

In [32]:
# Question 13: M_opt over training, one curve per n_star value.
fig = go.Figure()

for i in range(len(n_star_M_opts)):
    y_i = n_star_M_opts[i]
    curve_label = r'$n^* = %d$' % n_star_list[i]
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines', name=curve_label))

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{opt}$',
    title=r'$M_{opt}\text{ for every %d games for different values of }n^{∗}$' % averaging_steps,
)

fig.show()

In [33]:
# Question 13: average training loss per window, one curve per n_star value.
fig = go.Figure()

for i in range(len(n_star_losses)):
    y_i = n_star_losses[i]
    curve_label = r'$n^* = %d$' % n_star_list[i]
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines', name=curve_label))

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$average(loss)\text{- %d games}$' % (averaging_steps),
    title=r'$\text{Average loss for every %d games - }\epsilon_{opt}=0.5$' % averaging_steps,
)

fig.show()

In [34]:
# Question 13: M_rand over training, one curve per n_star value.
fig = go.Figure()

for i in range(len(n_star_M_rands)):
    y_i = n_star_M_rands[i]
    curve_label = r'$n^* = %d$' % n_star_list[i]
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines', name=curve_label))

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{rand}$',
    title=r'$M_{rand}\text{ for every %d games for different values of }n^{∗}$' % averaging_steps,
)

fig.show()

### Question 14

In [35]:
# Question 14: fix n_star and sweep the epsilon of the OptimalPlayer opponent,
# then store the resulting curves on disk.
n_star = 4000
n_episodes = 20000
averaging_steps = 250

env = TictactoeEnv()

# Eleven opponent epsilon values, evenly spaced from 0 to 1.
eps_list = list(np.linspace(0, 1, 11))
print([round(ep,2) for ep in eps_list])

# One list of per-window curves for every opponent epsilon.
eps_rewards = []
eps_M_opts = []
eps_M_rands = []
eps_losses = []

for eps in eps_list:
    print('Current eps: %.2f' % eps)
    q_player = DeepVariableEpsilonQPlayer(
        0.8, 0.1, n_star, None,
        capacity=10000, batch_size=64, lr=5e-5,
    )
    other_player = OptimalPlayer(epsilon=eps)

    rewards, M_opts, M_rands, x, losses = deep_run_episodes(
        n_episodes, q_player, other_player, update_epsilon=True
    )

    eps_rewards.append(rewards)
    eps_M_opts.append(M_opts)
    eps_M_rands.append(M_rands)
    eps_losses.append(losses)

# Save data
with open('question_14.pickle', 'wb') as f:
    pickle.dump(
        (eps_list, eps_rewards, eps_M_opts, eps_M_rands, eps_losses),
        f,
    )

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
Current eps: 0.00


100%|██████████| 20000/20000 [09:35<00:00, 34.75it/s]


Current eps: 0.10


100%|██████████| 20000/20000 [08:57<00:00, 37.21it/s]


Current eps: 0.20


100%|██████████| 20000/20000 [08:34<00:00, 38.84it/s]


Current eps: 0.30


100%|██████████| 20000/20000 [08:34<00:00, 38.87it/s]


Current eps: 0.40


100%|██████████| 20000/20000 [08:01<00:00, 41.54it/s]


Current eps: 0.50


100%|██████████| 20000/20000 [07:48<00:00, 42.70it/s] 


Current eps: 0.60


100%|██████████| 20000/20000 [07:51<00:00, 42.42it/s] 


Current eps: 0.70


100%|██████████| 20000/20000 [07:28<00:00, 44.55it/s] 


Current eps: 0.80


100%|██████████| 20000/20000 [07:05<00:00, 46.99it/s] 


Current eps: 0.90


100%|██████████| 20000/20000 [06:52<00:00, 48.43it/s] 


Current eps: 1.00


100%|██████████| 20000/20000 [06:43<00:00, 49.60it/s] 


In [36]:
# Restore the Question 14 sweep results written by the training cell.
with open('question_14.pickle', 'rb') as f:
    (
        eps_list,
        eps_rewards,
        eps_M_opts,
        eps_M_rands,
        eps_losses,
    ) = pickle.load(f)

In [37]:
# Question 14: M_opt over training, one curve per opponent epsilon.
fig = go.Figure()

for i in range(len(eps_M_opts)):
    y_i = eps_M_opts[i]
    curve_label = r'$\epsilon_{opt} = %.2f$' % eps_list[i]
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines', name=curve_label))

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{opt}$',
    title=r'$M_{opt}\text{ for every %d games for different values of }\epsilon_{opt}$' % averaging_steps,
)

fig.show()

In [38]:
# Question 14: M_rand over training, one curve per opponent epsilon.
fig = go.Figure()

for i in range(len(eps_M_rands)):
    y_i = eps_M_rands[i]
    curve_label = r'$\epsilon_{opt} = %.2f$' % eps_list[i]
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines', name=curve_label))

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{rand}$',
    title=r'$M_{rand}\text{ for every %d games for different values of }\epsilon_{opt}$' % averaging_steps,
)

fig.show()

# 3.3 Learning by self-practice

### Question 16

In [61]:
# Question 16: self-practice. Two DeepQPlayer instances built on the same
# networks (via shared_networks) play each other with a fixed epsilon; the
# experiment is repeated for several epsilon values and the results are saved.
n_episodes = 20000
averaging_steps = 250

env = TictactoeEnv()

# Eleven epsilon values, evenly spaced from 0 to 1.
eps_list = list(np.linspace(0, 1, 11))
print([round(ep,2) for ep in eps_list])

# One entry per epsilon value.
eps_rewards = []
eps_M_opts = []
eps_M_rands = []
eps_losses = []
q_players = []

for eps in eps_list:
    print('Current eps: %.2f' % eps)
    q_player = DeepQPlayer(epsilon=eps, lr=5e-5)
    other_player = DeepQPlayer(
        epsilon=eps,
        shared_networks=q_player.get_networks(),
        lr=5e-5,
    )

    rewards, M_opts, M_rands, x, losses = deep_run_episodes(
        n_episodes, q_player, other_player, other_learning=True
    )

    eps_rewards.append(rewards)
    eps_M_opts.append(M_opts)
    eps_M_rands.append(M_rands)
    eps_losses.append(losses)
    q_players.append(q_player.get_networks())

# Save data
with open('question_16.pickle', 'wb') as f:
    pickle.dump(
        (eps_list, eps_rewards, eps_M_opts, eps_M_rands, eps_losses, q_players),
        f,
    )

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
Current eps: 0.00


100%|██████████| 20000/20000 [07:36<00:00, 43.79it/s] 


Current eps: 0.10


100%|██████████| 20000/20000 [10:17<00:00, 32.39it/s]


Current eps: 0.20


100%|██████████| 20000/20000 [10:10<00:00, 32.79it/s]


Current eps: 0.30


100%|██████████| 20000/20000 [09:51<00:00, 33.82it/s]


Current eps: 0.40


100%|██████████| 20000/20000 [09:48<00:00, 33.97it/s]


Current eps: 0.50


100%|██████████| 20000/20000 [10:31<00:00, 31.67it/s]


Current eps: 0.60


100%|██████████| 20000/20000 [10:22<00:00, 32.12it/s]


Current eps: 0.70


100%|██████████| 20000/20000 [10:08<00:00, 32.87it/s]


Current eps: 0.80


100%|██████████| 20000/20000 [09:50<00:00, 33.85it/s]


Current eps: 0.90


100%|██████████| 20000/20000 [09:39<00:00, 34.49it/s]


Current eps: 1.00


100%|██████████| 20000/20000 [07:54<00:00, 42.15it/s]


In [62]:
# Restore the Question 16 sweep results written by the training cell.
with open('question_16.pickle', 'rb') as f:
    (
        eps_list,
        eps_rewards,
        eps_M_opts,
        eps_M_rands,
        eps_losses,
        q_players,
    ) = pickle.load(f)

In [63]:
# Question 16: M_opt over self-practice training, one curve per epsilon value.
fig = go.Figure()

# In this experiment the swept value is the epsilon of the two self-playing
# DeepQPlayer agents; no OptimalPlayer takes part in training, so the curves are
# labelled with a plain epsilon rather than epsilon_opt.
for i, y_i in enumerate(eps_M_opts):
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines',name=r'$\epsilon = %.2f$' % eps_list[i]))

fig.update_layout(
    title=r'$M_{opt}\text{ for every %d games for different values of }\epsilon$' % averaging_steps,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{opt}$',
    width=1200, height=400
)

fig.show()

In [64]:
# Question 16: M_rand over self-practice training, one curve per epsilon value.
fig = go.Figure()

# In this experiment the swept value is the epsilon of the two self-playing
# DeepQPlayer agents; no OptimalPlayer takes part in training, so the curves are
# labelled with a plain epsilon rather than epsilon_opt.
for i, y_i in enumerate(eps_M_rands):
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines',name=r'$\epsilon = %.2f$' % eps_list[i]))

fig.update_layout(
    title=r'$M_{rand}\text{ for every %d games for different values of }\epsilon$' % averaging_steps,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{rand}$',
    width=1200, height=400
)

fig.show()

### Question 17-18

In [65]:
# Questions 17-18: self-practice with two DeepVariableEpsilonQPlayer instances
# built on the same networks, repeated for several n_star values; results are
# saved to disk.
n_episodes = 20000
averaging_steps = 250

env = TictactoeEnv()

# Eleven n_star values between 1 and 40000, truncated to integers.
n_star_list = [int(x) for x in np.linspace(1, 40000, 11)]
print(n_star_list)

# One entry per n_star value.
n_star_rewards = []
n_star_M_opts = []
n_star_M_rands = []
n_star_losses = []
q_players = []

for n_star in n_star_list:
    print('Current n_star: %d' % n_star)
    q_player = DeepVariableEpsilonQPlayer(
        0.8, 0.1, n_star, None,
        capacity=10000, batch_size=64, lr=5e-5,
    )
    other_player = DeepVariableEpsilonQPlayer(
        0.8, 0.1, n_star, None,
        capacity=10000, batch_size=64,
        shared_networks=q_player.get_networks(), lr=5e-5,
    )

    rewards, M_opts, M_rands, x, losses = deep_run_episodes(
        n_episodes, q_player, other_player,
        update_epsilon=True, other_learning=True,
    )

    n_star_rewards.append(rewards)
    n_star_M_opts.append(M_opts)
    n_star_M_rands.append(M_rands)
    n_star_losses.append(losses)
    q_players.append(q_player.get_networks())

# Save data
with open('question_17_18.pickle', 'wb') as f:
    pickle.dump(
        (n_star_list, n_star_rewards, n_star_M_opts, n_star_M_rands, n_star_losses, q_players),
        f,
    )

[1, 4000, 8000, 12000, 16000, 20000, 24000, 28000, 32000, 36000, 40000]
Current n_star: 1


100%|██████████| 20000/20000 [10:30<00:00, 31.73it/s] 


Current n_star: 4000


100%|██████████| 20000/20000 [11:06<00:00, 30.00it/s] 


Current n_star: 8000


100%|██████████| 20000/20000 [10:43<00:00, 31.08it/s]


Current n_star: 12000


100%|██████████| 20000/20000 [10:16<00:00, 32.42it/s]


Current n_star: 16000


100%|██████████| 20000/20000 [10:21<00:00, 32.19it/s]


Current n_star: 20000


100%|██████████| 20000/20000 [10:33<00:00, 31.55it/s] 


Current n_star: 24000


100%|██████████| 20000/20000 [10:52<00:00, 30.63it/s]


Current n_star: 28000


100%|██████████| 20000/20000 [10:48<00:00, 30.85it/s] 


Current n_star: 32000


100%|██████████| 20000/20000 [11:12<00:00, 29.73it/s] 


Current n_star: 36000


100%|██████████| 20000/20000 [10:48<00:00, 30.86it/s] 


Current n_star: 40000


100%|██████████| 20000/20000 [10:07<00:00, 32.95it/s]


In [66]:
# Restore the Question 17-18 sweep results written by the training cell.
with open('question_17_18.pickle', 'rb') as f:
    (
        n_star_list,
        n_star_rewards,
        n_star_M_opts,
        n_star_M_rands,
        n_star_losses,
        q_players,
    ) = pickle.load(f)

In [67]:
# Questions 17-18: M_opt over self-practice training, one curve per n_star value.
fig = go.Figure()

for i in range(len(n_star_M_opts)):
    y_i = n_star_M_opts[i]
    curve_label = r'$n^* = %d$' % n_star_list[i]
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines', name=curve_label))

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{opt}$',
    title=r'$M_{opt}\text{ for every %d games for different values of }n^{∗}$' % averaging_steps,
)

fig.show()

In [68]:
# Questions 17-18: M_rand over self-practice training, one curve per n_star value.
fig = go.Figure()

for i in range(len(n_star_M_rands)):
    y_i = n_star_M_rands[i]
    curve_label = r'$n^* = %d$' % n_star_list[i]
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines', name=curve_label))

fig.update_layout(
    width=1200, height=400,
    xaxis_title=r'$game$',
    yaxis_title=r'$M_{rand}$',
    title=r'$M_{rand}\text{ for every %d games for different values of }n^{∗}$' % averaging_steps,
)

fig.show()

### Question 19

In [None]:
# Scratch check of torch.max (torch is imported in the notebook's import cell).
# Without `dim`, torch.max returns a single 0-d tensor, which cannot be unpacked
# into two names; reducing over dim=0 returns the (values, indices) pair.
boh1, boh2 = torch.max(torch.ones(10), dim=0)
print(boh1, boh2)