**IMPLEMENTATION:** The implementation of the QPlayer can be found in rl_algorithms.py


In [4]:
%load_ext autoreload
%autoreload 2
import numpy as np
from tic_env import TictactoeEnv, OptimalPlayer
from rl_algorithms import *
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import pickle
import dill

In [13]:
# When True, the training cells below overwrite the cached *.pickle result files.
write = False

# 2.1 Learning from experts


### Question 1

In [153]:
# Question 1: train a QPlayer (third argument 0.2, reported as epsilon in the text below)
# against OptimalPlayer(epsilon=0.5) and record the mean reward of each window of games.
n_episodes = 20000
averaging_steps = 250
turns = np.array(['X','O'])

q_player = QPlayer(0.25, 0.99, 0.2)
env = TictactoeEnv()
other_player = OptimalPlayer(epsilon=0.5, player=turns[1])

x = []
rewards = []
average_reward = 0.

for episode in range(n_episodes):
    if not episode % 5000:
        print('Current episode: %d' % episode)

    # The opponent takes whichever symbol is currently second in `turns`.
    other_player.set_player(turns[1])

    reward = play_game(env, q_player, other_player, turns, episode, testing=False)
    average_reward += reward

    # A window closes after every `averaging_steps` games: store its mean and start over.
    if (episode + 1) % averaging_steps == 0:
        average_reward /= averaging_steps
        rewards.append(average_reward)
        x.append(episode)
        average_reward = 0.

    # Reverse the symbol order for the next game.
    turns = turns[::-1]

Current episode: 0
Current episode: 5000
Current episode: 10000
Current episode: 15000


In [170]:
# Question 1 figure: mean reward of each window of `averaging_steps` games.
reward_trace = go.Scatter(x=x, y=rewards, mode='lines')

fig = go.Figure()
fig.add_trace(reward_trace)

# Titles and font size are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{\text{Average reward for every %d games - learner }}\epsilon=0.2$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{average(reward)\text{- %d games}}$' % (averaging_steps),
    font=dict(size=18),
)

fig.show()

In this case we set $\epsilon=0.2$. We can observe that the average reward computed every 250 games increases over time, going from a value of $-0.4$ to a value around $0.3$. Therefore the Q-player is able to learn the game. In particular, it is able to win more often than the Optimal Player with $\epsilon_{opt} = 0.5$.

## 2.1.1 Decreasing exploration

In [30]:
def run_episodes(n_episodes, q_player, other_player, update_epsilon=False, update_other_epsilon=False, set_player=True,
                 game_env=None, window=None):
    """Play `n_episodes` training games and collect per-window statistics.

    Every game is delegated to `play_game`; after each complete window of
    games the mean of the returned rewards is stored and `compute_measures`
    is called on `q_player`.

    Args:
        n_episodes: number of games to play.
        q_player: the agent being trained; passed to `play_game` and
            `compute_measures`. Must provide `update_epsilon(episode)` when
            `update_epsilon` is True.
        other_player: the opponent passed to `play_game`. Must provide
            `update_epsilon(episode)` / `set_player(symbol)` when the
            corresponding flag is True.
        update_epsilon: call `q_player.update_epsilon(episode)` before each game.
        update_other_epsilon: call `other_player.update_epsilon(episode)`
            before each game.
        set_player: call `other_player.set_player(turns[1])` before each game.
        game_env: environment handed to `play_game` and `compute_measures`.
            Defaults to the notebook-level global `env`.
        window: number of games per averaging window. Defaults to the
            notebook-level global `averaging_steps`.

    Returns:
        Tuple `(rewards, M_opts, M_rands, x)` of lists with one entry per
        completed window; `x` holds the index of the last episode of each
        window. Games after the last complete window are played but not
        reported.
    """
    # Fall back to the notebook globals so that existing calls keep working unchanged.
    if game_env is None:
        game_env = env
    if window is None:
        window = averaging_steps

    rewards = []
    M_opts = []
    M_rands = []
    x = []
    turns = np.array(['X','O'])

    average_reward = 0.

    for episode in range(n_episodes):
        if episode % 5000 == 0:
            print('\tCurrent episode: %d' % episode)

        # Update players
        if update_epsilon:
            q_player.update_epsilon(episode)
        if update_other_epsilon:
            other_player.update_epsilon(episode)
        if set_player:
            other_player.set_player(turns[1])

        # Play game
        reward = play_game(game_env, q_player, other_player, turns, episode, testing=False)
        average_reward += reward

        # The window is complete: store its mean reward and evaluate the learner.
        if episode % window == window - 1:
            average_reward /= window
            rewards.append(average_reward)
            average_reward = 0.

            # Compute M_opt and M_rand
            M_opt, M_rand = compute_measures(game_env, q_player)
            M_opts.append(M_opt)
            M_rands.append(M_rand)
            x.append(episode)

        # Reverse the symbol order for the next game.
        turns = turns[::-1]

    return rewards, M_opts, M_rands, x

### Question 2

In [None]:
# Question 2: train decreasing-epsilon learners for several values of n* against
# OptimalPlayer(epsilon=0.5) and collect the reward / M_opt / M_rand curves.
n_episodes = 20000
averaging_steps = 250

env = TictactoeEnv()

n_star_rewards = []
n_star_list = [int(x) for x in np.linspace(1, 40000, 11)]
print(n_star_list)
n_star_M_opts = []
n_star_M_rands = []
# The symbol is given literally instead of reading the global `turns` left behind by the
# Question 1 cell: this cell now runs on a fresh kernel, and run_episodes re-assigns the
# opponent's symbol before every game anyway (set_player defaults to True).
other_player = OptimalPlayer(epsilon=0.5, player='O')

for n_star in n_star_list:
    print('Current n_star: %d' % n_star)
    # A fresh learner is created for each n*, so the runs do not share learner objects.
    q_player = VariableEpsilonQPlayer(0.25, 0.99, 0.8, 0.1, n_star)

    rewards, M_opts, M_rands, x = run_episodes(n_episodes, q_player, other_player, update_epsilon=True)

    n_star_rewards.append(rewards)
    n_star_M_opts.append(M_opts)
    n_star_M_rands.append(M_rands)

# Save data
# NOTE: `x` and `averaging_steps` are not part of the saved tuple.
if write:
    with open('question_2.pickle', 'wb') as f:
        pickle.dump((n_star_list, n_star_rewards, n_star_M_opts, n_star_M_rands), f)

[1, 4000, 8000, 12000, 16000, 20000, 24000, 28000, 32000, 36000, 40000]
Current n_star: 1
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 4000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 8000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 12000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 16000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 20000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 24000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 28000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 32000
	Current

In [171]:
# Restore the cached Question 2 curves (the file written by the training cell when `write` is True).
with open('question_2.pickle', 'rb') as cache_file:
    (n_star_list,
     n_star_rewards,
     n_star_M_opts,
     n_star_M_rands) = pickle.load(cache_file)

In [175]:
# Question 2 figure: mean training reward per window, one curve per n*.
# `x` and `averaging_steps` are not stored in question_2.pickle; an earlier cell must have defined them.
fig = go.Figure()

for i,  y_i in enumerate(n_star_rewards):
    fig.add_trace(go.Scatter(x=x, y=y_i, mode='lines',name=r'$n^* = %d$' % n_star_list[i]))

fig.update_layout(
    title=r'$\Large{\text{Average reward every %d games for different values of }}n^{∗}$' % averaging_steps,
    # Fixed: the closing `$` used to sit inside the braces (`{game$}`), which is invalid LaTeX.
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{average(reward)\text{- %d games}}$' % (averaging_steps),
    width=1200, height=400
)

fig.update_layout(
    font=dict(
        size=18,
    )
)

# Give the last curve (largest n*) a fixed colour.
fig.data[-1].line.color = "#98267A"

fig.show()

Values in the interval $[4000,24000]$ perform similarly to the fixed-$\epsilon$ case (i.e. $n^* = 1$), since they all reach the same average reward of around 0.5 in the end.
However, $n^* = 4000$ gives the best results, since after about 4000 games its average reward grows faster than with the fixed $\epsilon$. Values in the interval $[28000,40000]$ perform worse.

### Question 3

In [176]:
# Question 3 figure: M_opt per window, one curve per n*.
fig = go.Figure()

for n_star_value, curve in zip(n_star_list, n_star_M_opts):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$n^* = %d$' % n_star_value))

# Titles, size and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{M_{opt}\text{ every %d games for different values of }}n^{∗}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{M_{opt}}$',
    width=1200,
    height=400,
    font=dict(size=18),
)

# Give the last curve (largest n*) a fixed colour.
fig.data[-1].line.color = "#98267A"

fig.show()

In [179]:
# Question 3 figure: M_rand per window, one curve per n*.
fig = go.Figure()

for n_star_value, curve in zip(n_star_list, n_star_M_rands):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$n^* = %d$' % n_star_value))

# Titles, size and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{M_{rand}\text{ every %d games for different values of }n^{∗}}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{M_{rand}}$',
    width=1200,
    height=400,
    font=dict(size=18),
)

# Fixed colours for the last curve and for the second curve (index 1).
fig.data[-1].line.color = "#98267A"
fig.data[1].line.color = "#000000"

fig.show()

We can observe that in both plots $M_{opt}$ and $M_{rand}$ tend, as expected, towards $0$ and $1$ respectively.
As before, $n^*=4000$ gives the fastest learning, whereas larger values such as $n^*=40000$ give slower and unstable convergence.

## 2.1.2 Good experts and bad experts

### Question 4

In [None]:
# Question 4: train a decreasing-epsilon learner (n* = 4000) against OptimalPlayer
# opponents with epsilon from 0 to 1 and collect the reward / M_opt / M_rand curves.
n_star = 4000
n_episodes = 20000
averaging_steps = 250

env = TictactoeEnv()

eps_rewards = []
eps_list = [round(x,2) for x in np.linspace(0, 1, 11)]
# The values are already rounded above, so they are printed as they are.
print(eps_list)
eps_M_opts = []
eps_M_rands = []
Q_values = []

for eps in eps_list:
    print('Current eps: %.2f' % eps)
    q_player = VariableEpsilonQPlayer(0.25, 0.99, 0.8, 0.1, n_star)
    # The symbol is given literally instead of reading the global `turns` left behind by the
    # Question 1 cell: this cell now runs on a fresh kernel, and run_episodes re-assigns the
    # opponent's symbol before every game anyway (set_player defaults to True).
    other_player = OptimalPlayer(epsilon=eps, player='O')

    rewards, M_opts, M_rands, x = run_episodes(n_episodes, q_player, other_player, update_epsilon=True)

    # Keep the learned Q-values for the two extreme opponents (eps 0 and eps 1), in that order.
    if eps==0 or eps==1:
        Q_values.append(q_player.get_Q_values())

    eps_rewards.append(rewards)
    eps_M_opts.append(M_opts)
    eps_M_rands.append(M_rands)

# Save data
if write:
    with open('question_4.pickle', 'wb') as f:
        pickle.dump((eps_list, eps_rewards, eps_M_opts, eps_M_rands), f)

    # The Q-values are serialised with dill first and the resulting bytes are then pickled;
    # the loading cell reverses this with dill.loads(pickle.load(f)).
    with open('question_6.pickle', 'wb') as f:
        pickle.dump(dill.dumps(Q_values), f)


[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
Current eps: 0.00
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current eps: 0.10
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current eps: 0.20
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current eps: 0.30
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current eps: 0.40
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current eps: 0.50
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current eps: 0.60
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current eps: 0.70
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current eps: 0.80
	Current episode: 0
	Current episode: 5000
	Current ep

AttributeError: Can't pickle local object 'QPlayer.__init__.<locals>.<lambda>'

In [180]:
# Restore the cached Question 4 curves (the file written by the training cell when `write` is True).
with open('question_4.pickle', 'rb') as cache_file:
    (eps_list,
     eps_rewards,
     eps_M_opts,
     eps_M_rands) = pickle.load(cache_file)

In [185]:
# Question 4 figure: mean training reward per window, one curve per opponent epsilon.
fig = go.Figure()

for eps_value, curve in zip(eps_list, eps_rewards):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$\epsilon_{opt} = %.2f$' % eps_value))

# Titles and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{\text{Average reward every %d games for different values of }\epsilon_{opt}}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{average(reward)\text{- %d games}}$' % (averaging_steps),
    font=dict(size=18),
)

fig.show()

In [191]:
# Question 4 figure: M_opt per window, one curve per opponent epsilon.
fig = go.Figure()

for eps_value, curve in zip(eps_list, eps_M_opts):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$\epsilon_{opt} = %.2f$' % eps_value))

# Titles, size and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{M_{opt}\text{ for every %d games for different values of }\epsilon_{opt}}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{M_{opt}}$',
    width=1300,
    height=415,
    font=dict(size=18),
)

# Give the last curve (largest epsilon) a fixed colour.
fig.data[-1].line.color = "#98267A"

fig.show()

In [192]:
# Question 4 figure: M_rand per window, one curve per opponent epsilon.
fig = go.Figure()

for eps_value, curve in zip(eps_list, eps_M_rands):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$\epsilon_{opt} = %.2f$' % eps_value))

# Titles, size and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{M_{rand}\text{ for every %d games for different values of }\epsilon_{opt}}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{M_{rand}}$',
    width=1200,
    height=415,
    font=dict(size=18),
)

# Give the last curve (largest epsilon) a fixed colour.
fig.data[-1].line.color = "#98267A"

fig.show()

We can see that for $M_{opt}$ the learners which perform better are the ones that trained with good experts (for small values of $\epsilon_{opt}$ the curves converge faster to $0$), whereas the ones performing worse are those trained with bad experts (in particular, for $\epsilon_{opt} = 0.9$ and $\epsilon_{opt} = 1.0$ they don't even reach the value $0$). On the other hand, the learners performing better in $M_{rand}$ are those trained with bad experts, and the ones performing worse are those trained with good experts (in particular, for $\epsilon_{opt} \in \{0.0, 0.1, 0.2, 0.3\}$ the curves stay under the value $0.8$).

### Question 5

In [193]:
# TODO (original author's note): to rerun
# Largest M_opt / M_rand over all opponents and all windows; `initial=-1` is the floor of the maximum.
M_opt_max = np.max(np.array(eps_M_opts), initial=-1)
M_rand_max = np.max(np.array(eps_M_rands), initial=-1)

for label, best in (("M_opt", M_opt_max), ("M_rand", M_rand_max)):
    print("The highest %s value is %.2f" % (label, best))

The highest M_opt value is 0.00
The highest M_rand value is 0.93


### Question 6

In [7]:
# Restore the Q-values saved for Question 6: the file holds pickled bytes that were produced by dill.
with open('question_6.pickle', 'rb') as cache_file:
    dill_payload = pickle.load(cache_file)
    Q_values = dill.loads(dill_payload)

In [36]:
# Heatmap of the first saved Q-table: one row per stored entry, colour scale clamped to [-1, 1].
Q_values_1 = [*Q_values[0].values()]

fig = px.imshow(
    Q_values_1,
    text_auto=True,
    labels=dict(x="Available actions", y="State - log scale", color="$Q_{value}$"),
    title="$Q_1(s,a)$",
    zmin=-1,
    zmax=1,
)

# Logarithmic row axis, as announced in the axis label.
fig.update_yaxes(type="log")
fig.update_layout(width=900, height=800, font=dict(size=20))

fig.show()

In [35]:
# Heatmap of the second saved Q-table: one row per stored entry, colour scale clamped to [-1, 1].
Q_values_2 = [*Q_values[1].values()]

fig = px.imshow(
    Q_values_2,
    text_auto=True,
    labels=dict(x="Available actions", y="State - log scale", color="$Q_{value}$"),
    title="$Q_2(s,a)$",
    zmin=-1,
    zmax=1,
)

# Logarithmic row axis, as announced in the axis label.
fig.update_yaxes(type="log")
fig.update_layout(width=900, height=800, font=dict(size=20))

fig.show()

From the heatmaps we can immediately observe that the $Q$-values of the two agents are different. In particular, Agent 2, which played against a random player, explored state-action pairs which Agent 1 didn't (e.g. the column associated with action 4).
In addition, Agent 2 has many positive values, whereas most of Agent 1's values are negative. This can be explained by considering that, in order to win against a random player, it is sufficient to know the good moves, whereas against an optimal player it is necessary to avoid all bad moves, since the optimal player would exploit them to win the match.

# 2.2 Learning by self-practice

In [None]:
# Self-practice with a fixed exploration rate: two QPlayers train against each other
# for each epsilon in eps_list.
n_episodes = 20000
averaging_steps = 250

env = TictactoeEnv()

eps_list = [round(value, 2) for value in np.linspace(0, 1, 11)]
print(eps_list)
eps_rewards, eps_M_opts, eps_M_rands = [], [], []

for eps in eps_list:
    print('Current eps: %.2f' % eps)
    q_player_1 = QPlayer(0.25, 0.99, eps)
    # The second player is built from the first player's Q-values
    # (presumably so that both update the same table — verify in rl_algorithms.py).
    shared_q_values = q_player_1.get_Q_values()
    q_player_2 = QPlayer(0.25, 0.99, eps, shared_q_values)

    # set_player=False: run_episodes does not call set_player on the second agent.
    rewards, M_opts, M_rands, x = run_episodes(n_episodes, q_player_1, q_player_2, set_player=False)

    eps_rewards.append(rewards)
    eps_M_opts.append(M_opts)
    eps_M_rands.append(M_rands)

# Persist the curves only when explicitly requested.
if write:
    with open('question_7.pickle', 'wb') as f:
        pickle.dump((eps_list, eps_rewards, eps_M_opts, eps_M_rands), f)

In [194]:
# Restore the cached Question 7 curves (the file written by the training cell when `write` is True).
with open('question_7.pickle', 'rb') as cache_file:
    (eps_list,
     eps_rewards,
     eps_M_opts,
     eps_M_rands) = pickle.load(cache_file)

In [198]:
# Question 7 figure: M_opt per window for self-practice; the last epsilon in the list is left out.
fig = go.Figure()

for eps_value, curve in zip(eps_list, eps_M_opts[:-1]):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$\epsilon = %.2f$' % eps_value))

# Titles, size and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{M_{opt}\text{ for every %d games for different values of }\epsilon}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{M_{opt}}$',
    width=1200,
    height=415,
    font=dict(size=18),
)

# Give the last plotted curve a fixed colour.
fig.data[-1].line.color = "#98267A"

fig.show()

In [199]:
# Question 7 figure: M_rand per window for self-practice; the last epsilon in the list is left out.
fig = go.Figure()

for eps_value, curve in zip(eps_list, eps_M_rands[:-1]):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$\epsilon = %.2f$' % eps_value))

# Titles, size and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{M_{rand}\text{ for every %d games for different values of }\epsilon}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{M_{rand}}$',
    width=1200,
    height=415,
    font=dict(size=18),
)

# Give the last plotted curve a fixed colour.
fig.data[-1].line.color = "#98267A"

fig.show()

From the figures we can see that the agent is almost always able to learn, even if learning is noisier and slower in the case of $M_{opt}$. The only exception is $\epsilon = 0$: in this case the agent performs poorly in both $M_{opt}$ and $M_{rand}$.

### Questions 8-9


In [93]:
# Questions 8-9: self-practice with decreasing exploration, one run per n*.
n_episodes = 20000
averaging_steps = 250

env = TictactoeEnv()

n_star_list = [int(value) for value in np.linspace(1, 40000, 11)]
print(n_star_list)
n_star_rewards, n_star_M_opts, n_star_M_rands = [], [], []

for n_star in n_star_list:
    print('Current n_star: %d' % n_star)
    q_player = VariableEpsilonQPlayer(0.25, 0.99, 0.8, 0.1, n_star)
    # The second player is built from the first player's Q-values
    # (presumably so that both update the same table — verify in rl_algorithms.py).
    shared_q_values = q_player.get_Q_values()
    q_player_2 = VariableEpsilonQPlayer(0.25, 0.99, 0.8, 0.1, n_star, shared_q_values)

    # Both agents update their epsilon each game; set_player is not called on the second agent.
    rewards, M_opts, M_rands, x = run_episodes(
        n_episodes,
        q_player,
        q_player_2,
        update_epsilon=True,
        update_other_epsilon=True,
        set_player=False,
    )

    n_star_rewards.append(rewards)
    n_star_M_opts.append(M_opts)
    n_star_M_rands.append(M_rands)

# Only the Q-values of the last run (the final n* in the list) are kept.
Q_values = q_player.get_Q_values()

# Persist the curves and the Q-values only when explicitly requested.
if write:
    with open('question_8_9.pickle', 'wb') as f:
        pickle.dump((n_star_list, n_star_rewards, n_star_M_opts, n_star_M_rands), f)

    with open('question_10.pickle', 'wb') as f:
        pickle.dump(dill.dumps(Q_values), f)

[1, 4000, 8000, 12000, 16000, 20000, 24000, 28000, 32000, 36000, 40000]
Current n_star: 1
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 4000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 8000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 12000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 16000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 20000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 24000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 28000
	Current episode: 0
	Current episode: 5000
	Current episode: 10000
	Current episode: 15000
Current n_star: 32000
	Current

In [201]:
# Restore the cached Questions 8-9 curves (the file written by the training cell when `write` is True).
with open('question_8_9.pickle', 'rb') as cache_file:
    (n_star_list,
     n_star_rewards,
     n_star_M_opts,
     n_star_M_rands) = pickle.load(cache_file)

In [203]:
# Questions 8-9 figure: M_opt per window for self-practice, one curve per n*.
fig = go.Figure()

for n_star_value, curve in zip(n_star_list, n_star_M_opts):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$n^* = %d$' % n_star_value))

# Titles, size and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{M_{opt}\text{ for every %d games for different values of }n^{∗}}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{M_{opt}}$',
    width=1200,
    height=415,
    font=dict(size=18),
)

# Give the last curve (largest n*) a fixed colour.
fig.data[-1].line.color = "#98267A"

fig.show()

In [204]:
# Questions 8-9 figure: M_rand per window for self-practice, one curve per n*.
fig = go.Figure()

for n_star_value, curve in zip(n_star_list, n_star_M_rands):
    fig.add_trace(go.Scatter(x=x, y=curve, mode='lines',name=r'$n^* = %d$' % n_star_value))

# Titles, size and font are applied in a single layout update.
fig.update_layout(
    title=r'$\Large{M_{rand}\text{ for every %d games for different values of }n^{∗}}$' % averaging_steps,
    xaxis_title=r'$\Large{game}$',
    yaxis_title=r'$\Large{M_{rand}}$',
    width=1200,
    height=415,
    font=dict(size=18),
)

# Give the last curve (largest n*) a fixed colour.
fig.data[-1].line.color = "#98267A"

fig.show()

Decreasing $\epsilon$ seems to help learning, especially in the case of $M_{opt}$. Indeed, the plots show that, unlike before, the curves are generally less noisy and all converge together towards $0$. However, only for $n^*=28000$ do we actually reach the value $0$, whereas for the other values of $n^*$ the maximum is around $-0.1$.

In [81]:
# Largest M_opt / M_rand over all n* and all windows; `initial=-1` is the floor of the maximum.
M_opt_max = np.max(np.array(n_star_M_opts), initial=-1)
M_rand_max = np.max(np.array(n_star_M_rands), initial=-1)

for label, best in (("M_opt", M_opt_max), ("M_rand", M_rand_max)):
    print("The highest %s value is %.2f" % (label, best))

The highest M_opt value is 0.00
The highest M_rand value is 0.93


### Question 10

In [24]:
# Restore the Q-values saved for Question 10: the file holds pickled bytes that were produced by dill.
with open('question_10.pickle', 'rb') as cache_file:
    dill_payload = pickle.load(cache_file)
    Q_values = dill.loads(dill_payload)

In [27]:
# Keys of three hand-picked board states, used below to look up entries of Q_values.
# NOTE(review): each key looks like the string form of a 3x3 float array with entries
# in {-1, 0, 1} — confirm against how rl_algorithms.py builds its state keys.
key_state_1='[[ 1. -1.  0.]\n [-1. -1.  1.]\n [-1.  1.  1.]]'
key_state_2='[[-1. -1.  0.]\n [ 0.  1.  1.]\n [ 1. -1.  1.]]'
key_state_3='[[ 0.  0. -1.]\n [-1.  1.  1.]\n [-1.  0.  1.]]'

In [28]:
# Fetch the stored Q_values entry for each of the three selected states.
# NOTE(review): `.get` presumably yields None for a state that was never stored — confirm before plotting.
available_actions_state_1, available_actions_state_2, available_actions_state_3 = (
    Q_values.get(state_key) for state_key in (key_state_1, key_state_2, key_state_3)
)

In [29]:
import plotly.express as px

# Heatmap with one row per selected state and one column per entry of its Q_values lookup.
state_rows = [available_actions_state_1, available_actions_state_2, available_actions_state_3]

fig = px.imshow(
    state_rows,
    text_auto=True,
    labels=dict(x="Available actions", y="State", color="$Q_{value}$"),
    y=['state_1', 'state_2', 'state_3'],
)
fig.update_layout(font=dict(size=20))
fig.show()