# Q-learning

In [3]:
import random
import numpy as np

from tic_env import TictactoeEnv, OptimalPlayer


st2int = lambda st: int(((st.flatten() + 1) * (3**np.arange(9))).sum())

# https://xkcd.com/832/
class Player:
    def __init__(self, learning_rate, discount_factor, exp_rate):
        self.states = []
        self.states_value = {}  # state -> value
        self.lr = learning_rate
        self.decay_gamma = discount_factor
        self.exp_rate = exp_rate
        self.env = TictactoeEnv()
                
    def act(self, grid, symbol):
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = self.randomMove(grid)
        else:
            positions = self.availablePositions(grid)
            value_max = -999
            if symbol == 'X':
                symb = 1
            elif symbol == 'O':
                symb = -1
            else:
                print("ERROR: wrong symbol")
            for p in positions:
                next_board = grid.copy()
                next_board[p] = symb
#                 print(next_board)
                state_key = st2int(next_board)
                value = 0 if self.states_value.get(state_key) is None else self.states_value[state_key]
                if value >= value_max:
                    value_max = value
                    action = p
            # print("{} takes action {}".format(self.name, action))
        return action
    
    def randomMove(self, grid):
        """ Chose a random move from the available options. """
        return random.choice(self.availablePositions(grid))
         
    def availablePositions(self, grid): #nommé empty dans env
        '''return all empty positions'''
        return [(i // 3, i % 3) for i in range(9) if not grid[(i // 3, i % 3)]]
    
    def update_qtable(self, reward):
        for state in reversed(self.states):
            if self.states_value.get(state) is None:
                self.states_value[state] = 0
            self.states_value[state] += self.lr*(self.decay_gamma*reward - self.states_value[state])
            reward = self.states_value[state]
            
    def addState(self, state):
        self.states.append(st2int(state))
            
    def train(self, N, epsilon = 0., print_every = 100):
        Turns = np.array(['X','O'])
        avg_reward = 0
        for i in range(1, N+1):
            self.exp = i
            self.env.reset()
            grid, _, __ = self.env.observe()
            Turns = Turns[::-1]
            player_opt = OptimalPlayer(epsilon, player=Turns[0])
                
            for j in range(9):
                if self.env.current_player == player_opt.player:
                    move = player_opt.act(grid)
                else:
                    move = self.act(grid, Turns[1])

                grid, end, winner = self.env.step(move, print_grid=False)
                self.addState(self.env.grid.reshape(9))
            
                if end:
                    if i % print_every == 0:
                        print("Game n°:", i, "exp :", self.exp)
                        print('-------------------------------------------')
                        print('Game end, winner is player ' + str(winner))
                        print('Optimal player = ' +  Turns[0])
                        print('Player = ' +  Turns[1])
#                         self.env.render()
                        print("AVERGAE REWARD :", avg_reward/print_every)
                        avg_reward = 0
                    reward = self.env.reward(Turns[1])
                    avg_reward += reward
                    self.update_qtable(reward)
                    self.env.reset()
                    break
                    
    def test_policy(self, N_test, epsilon=0.):
        Turns = np.array(['X','O'])
        n_wins = 0
        n_loss = 0
        
        for i in range(N_test):
            self.env.reset()
            grid, _, __ = self.env.observe()
            Turns = Turns[::-1]
            player_opt = OptimalPlayer(epsilon, player=Turns[0])
                
            for j in range(9):
                if self.env.current_player == player_opt.player:
                    move = player_opt.act(grid)
                else:
                    move = self.act(grid, Turns[1] )

                grid, end, winner = self.env.step(move, print_grid=False)
            
                if end:
                    if winner == Turns[1]:
                        n_wins +=1
                    if winner == Turns[0]:
                        n_loss +=1
                    self.env.reset()
#                     print("Your win is ", winner == Turns[1])
                    break
                    
        return (n_wins-n_loss)/N_test

In [7]:
#TRAIN PARAMETERS
learning_rate = 0.05
discount_factor = 0.99
expl_level = 0.5
optimal_eps_train = 0.5

my_player = Player(learning_rate, discount_factor, expl_level)

In [8]:
#TRAIN
games_to_train = 1000
print_every = 1000
my_player.train(games_to_train, optimal_eps_train, print_every)


Game n°: 0 exp : 0
-------------------------------------------
Game end, winner is player None
Optimal player = O
Player = X
AVERGAE REWARD : 0.0


In [11]:
#TEST
optimal_eps_test = 1
games_to_test = 500
M = my_player.test_policy(games_to_test, optimal_eps_test)
print(M)

0.016


##  Learning from experts

**Question 1**. Plot average reward for every 250 games during training – i.e. after the 50th game, plot
the average reward of the first 250 games, after the 100th game, plot the average reward of games 51 to
100, etc. Does the agent learn to play Tic Tac Toe? \
*Expected answer*: A figure of average reward over time (caption length < 50 words). Specify your choice
of $\epsilon$.

###  Decreasing exploration

###  Good experts and bad experts

##  Learning by self-practice

#  Deep Q-Learning

In [47]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor


class DQLNN(nn.Module):
    """ DQLNN, expects input shape (3, 3, 2) """
    def __init__(self):
        super(DQLNN, self).__init__()

        self.fc1 = nn.Linear(3*3*2, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 9)
        
    def forward(self, x):
        return self.fc3(F.relu(self.fc2(F.relu(self.fc1(x.flatten())))))
    
model_dql = DQLNN()

In [None]:
# Player reinitializing

## Learning from experts

##      Learning by self-practice

# Comparing Q-Learning with Deep Q-Learning

I expect DQL working better than QL