In [2]:
import random
import matplotlib 
import numpy as np 
from collections import defaultdict 

In [3]:
class BoardEnvironment:
    """Edge-claiming board game environment that two agents interact with.

    The board is a list of 23 single-character cells:

    * indices 0-16 are the edges a player may claim ('-' while free);
    * indices 17-22 are the six boxes; a box cell is set to the mark of the
      player who placed the last free edge of that box.

    Index layout (inferred from ``BOXES``; two rows of three boxes)::

         *  0  *  1  *  2  *
         3 17  4 18  5 19  6
         *  7  *  8  *  9  *
        10 20 11 21 12 22 13
         * 14  * 15  * 16  *

    Players must provide ``select_action()``, ``reward(value)`` and
    ``reset_past()``.
    """

    EMPTY = '-'
    NUM_EDGES = 17  # board[0:NUM_EDGES] are the playable edges

    # Edge indices surrounding each box; box k is stored at board[NUM_EDGES + k].
    BOXES = (
        (0, 3, 4, 7),
        (1, 4, 5, 8),
        (2, 5, 9, 6),
        (7, 10, 11, 14),
        (8, 11, 12, 15),
        (9, 12, 13, 16),
    )

    def __init__(self):
        "initialize an empty board with no players attached"
        # Define the player attributes up front so reset() and the query
        # methods work before set_players() has been called.
        self.playerA = None
        self.playerB = None
        self.current_player = None
        self.reset()

    def set_players(self, playerA, playerB):
        " connects players with the environment "
        self.playerA = playerA
        self.playerB = playerB
        self.reset()  # defines current_player and clears board and scores

    def reset(self):
        """Clear the board and scores and, if players are set, pick who starts.

        The first mark is always 'X'; which player holds it is chosen at
        random with equal probability.
        """
        self.turn = 'X'  # the board always starts with X, regardless of which player

        # 17 edge cells followed by 6 box cells, all initially empty.
        self.board = [self.EMPTY] * (self.NUM_EDGES + len(self.BOXES))
        self.score_board = {'X': 0, 'O': 0}
        if self.playerA is not None and self.playerB is not None:
            self.playerA.reset_past()
            self.playerB.reset_past()
            if random.random() < 0.5:  # randomly pick the player to start
                self.current_player = self.playerA
            else:
                self.current_player = self.playerB

    def print_board(self, board_string=None):
        """Print a readable board from ``board_string`` or the current board.

        ``board_string`` must be indexable with the same 23-cell layout as
        ``self.board`` (for example a value returned by ``get_state()``).
        """
        B = board_string if board_string else self.board

        print('*', B[0], '*', B[1], '*', B[2], '*')
        print(B[3], B[17], B[4], B[18], B[5], B[19], B[6])
        print('*', B[7], '*', B[8], '*', B[9], '*')
        print(B[10], B[20], B[11], B[21], B[12], B[22], B[13])
        print('*', B[14], '*', B[15], '*', B[16], '*')

    def get_state(self):
        "return the board as a 23-character string"
        return "".join(self.board)

    def other_player(self):
        "return the player who is not the current player"
        # note, returns other player even if playerA is playing itself
        if self.current_player == self.playerA:
            return self.playerB
        return self.playerA

    def available_actions(self):
        "return the indices of the edges that are still free"
        return [ind for ind, val in enumerate(self.board[:self.NUM_EDGES])
                if val == self.EMPTY]

    def other_turn(self):
        "return the mark ('X' or 'O') that is not currently moving"
        return 'X' if self.turn == 'O' else 'O'

    def play_game(self):
        """Play one full game and return the winning player, or None on a tie.

        A player who completes at least one box scores one point per box and
        moves again; otherwise the turn passes to the other player. When all
        edges are taken, the player with more boxes receives ``reward(100)``,
        the other ``reward(-100)``; on a tie both receive ``reward(0)``.

        Raises:
            RuntimeError: if no players have been set.
            ValueError: if a player selects an edge that is not free.
        """
        self.reset()
        if self.current_player is None:
            raise RuntimeError('set_players() must be called before play_game()')

        # NOTE(review): reward() is only called once per player, after the
        # final move; no per-move feedback is given during the game. Confirm
        # this is intended for the learning agents used with this board.
        while not self.is_full():
            choice = self.current_player.select_action()
            if choice not in self.available_actions():
                # An illegal move would overwrite an edge and could keep the
                # board from ever filling up.
                raise ValueError('invalid move: {!r}'.format(choice))
            self.board[choice] = self.turn

            completed = self.winner(choice)
            self.score_board[self.turn] += len(completed)
            if completed:
                # Mark the boxes for the mover, who keeps the turn.
                for box_index in completed:
                    self.board[box_index] = self.turn
            else:
                # switch players
                self.turn = self.other_turn()
                self.current_player = self.other_player()

        own_score = self.score_board[self.turn]
        opponent_score = self.score_board[self.other_turn()]
        if own_score > opponent_score:
            self.current_player.reward(100)
            self.other_player().reward(-100)
            return self.current_player
        if own_score < opponent_score:
            self.other_player().reward(100)
            self.current_player.reward(-100)
            return self.other_player()
        # it's a tie
        self.current_player.reward(0)
        self.other_player().reward(0)
        return None

    def winner(self, choice):
        """Return the board indices of the boxes completed by edge ``choice``.

        A box counts when ``choice`` is one of its edges and all four of its
        edges are taken. The list is empty when no box was completed.
        """
        completed = []
        for offset, box in enumerate(self.BOXES):
            if choice in box and all(self.board[i] != self.EMPTY for i in box):
                completed.append(self.NUM_EDGES + offset)
        return completed

    def is_full(self):
        "return True when every edge has been claimed"
        return self.EMPTY not in self.board[:self.NUM_EDGES]
    # %%

In [4]:
class Agent:
    """Generic tabular Q-learning agent for discrete states and fixed actions.

    States are whatever ``environment.get_state()`` returns and actions are the
    items of ``environment.available_actions()``. Learned values live in
    ``self.Q``, keyed by ``(state, action)`` tuples, with 0.0 for unseen keys.

    Args:
        environment: object providing ``available_actions()`` and ``get_state()``.
        policy: 'max' (greedy), 'random', or 'epsilon' (random with probability
            ``epsilon``, greedy otherwise).
        learning_rate: step size applied to the prediction error in ``reward``.
        discount_factor: weight of the best next-state value in the target.
        epsilon: fraction of random choices under the 'epsilon' policy.

    Raises:
        ValueError: if ``policy`` is not one of the supported names.
    """

    POLICIES = ('max', 'random', 'epsilon')

    def __init__(self, environment, policy = 'max', learning_rate = 0.5, discount_factor = 0.7, epsilon = 0.6):
        if policy not in self.POLICIES:
            raise ValueError('{!r} is not an available policy'.format(policy))
        self.policy = policy
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.Q = defaultdict(lambda: 0.0) # stores (state, action) value tuples as keys
        self.environment = environment
        self.epsilon = epsilon # Fraction of time making a random choice for epsilon policy
        self.reset_past()

    def reset_past(self):
        "forget the last state/action pair (called at the start of a game)"
        self.past_action = None
        self.past_state = None

    def select_action(self):
        """Choose an available action according to the policy and remember it.

        Greedy choices break ties between equally valued actions at random.
        The state and chosen action are stored so a later ``reward`` call can
        update their Q-value.
        """
        available_actions = self.environment.available_actions()
        state = self.environment.get_state()
        explore = (self.policy == 'random') or (
            self.policy == 'epsilon' and random.random() < self.epsilon)
        if explore:
            choice = random.choice(available_actions)
        else:  # 'max', or an epsilon policy that decided to exploit
            Q_vals = [self.Q[(state, action)] for action in available_actions]
            max_val = max(Q_vals)  # will often be 0 in the beginning
            # randomly pick one of the actions sharing the maximum value
            best_actions = [action for action, value in zip(available_actions, Q_vals)
                            if value == max_val]
            choice = random.choice(best_actions)
        self.past_state = state
        self.past_action = choice
        return choice

    def reward(self, reward_value):
        """Update the Q-value of the last selected action.

        The target is ``reward_value`` plus ``discount_factor`` times the best
        Q-value among the actions available in the environment's current state
        (0 when none are available). Does nothing if no action has been
        selected since the last ``reset_past``.
        """
        # Check first: looking up Q[(None, None)] would insert a junk key into
        # the defaultdict even though no update is performed.
        if self.past_state is None or self.past_action is None:
            return

        state = self.environment.get_state()
        next_Q_vals = [self.Q[(state, action)]
                       for action in self.environment.available_actions()]
        max_next_Q = max(next_Q_vals) if next_Q_vals else 0  # will often be 0 in the beginning
        td_target = reward_value + self.discount_factor * max_next_Q

        key = (self.past_state, self.past_action)
        reward_pred_error = td_target - self.Q[key]
        self.Q[key] += self.learning_rate * reward_pred_error


In [4]:
import sys
class RepeatedGames:
    """Play many games between two players and keep a win/loss/tie history.

    ``history`` holds one entry per game: 'A' when ``playerA`` won, 'B' when
    ``playerB`` won and '-' for anything else (a tie).

    Args:
        environment: object whose ``play_game()`` returns the winning player
            or None.
        playerA: player reported as 'A'.
        playerB: player reported as 'B'.
    """

    def __init__(self, environment, playerA, playerB):
        self.environment = environment
        self.playerA = playerA
        self.playerB = playerB
        self.reset_history()

    def reset_history(self):
        "discard all recorded results"
        self.history = []

    def play_game(self):
        "play one game and append its outcome to the history"
        winner = self.environment.play_game()
        if winner == self.playerA:
            self.history.append('A')
        elif winner == self.playerB:
            self.history.append('B')
        else:
            self.history.append('-')

    def play_games(self, games_to_play):
        """Play ``games_to_play`` games and print a summary of just those games.

        The winning rate is player A's wins as a percentage of the games played
        in this call (ties included in the total); it is 0.0 when no games were
        played.
        """
        for _ in range(games_to_play):
            self.play_game()

        # history[-0:] would be the WHOLE history, so handle zero explicitly.
        recent = self.history[-games_to_play:] if games_to_play > 0 else []
        a_wins = recent.count('A')
        print(a_wins, 'games won by player A')
        print(recent.count('B'), 'games won by player B')
        print(recent.count('-'), 'ties')
        win_rate = a_wins / len(recent) * 100 if recent else 0.0
        print("Winning rate: {}".format(win_rate))

# Training

In [5]:
# Attach two agents to one shared board: A follows the epsilon policy,
# B always picks a random available edge.
board = BoardEnvironment()
A = Agent(board, policy='epsilon')
B = Agent(board, policy='random')
board.set_players(A, B)

tournament = RepeatedGames(board, A, B)

# Short sample before the long runs.
tournament.play_games(100)
print()

# Four long runs of one million games each, with a summary after every run.
for training_round in range(4):
    tournament.play_games(1000000)
    print()

# Short sample after the long runs, for comparison with the first one.
tournament.play_games(100)

50 games won by player A
38 games won by player B
12 ties
Winning rate: 50.0

441072 games won by player A
439776 games won by player B
119152 ties
Winning rate: 44.1072

441323 games won by player A
439010 games won by player B
119667 ties
Winning rate: 44.1323

441882 games won by player A
438963 games won by player B
119155 ties
Winning rate: 44.1882

443514 games won by player A
436969 games won by player B
119517 ties
Winning rate: 44.351400000000005

44 games won by player A
47 games won by player B
9 ties
Winning rate: 44.0


# Export q table

In [None]:
# Dump agent A's Q-table as the text form of a plain dict, followed by a newline.
q_table = dict(A.Q)
with open('hard1.txt', 'w') as q_file:
    q_file.write(str(q_table) + '\n')

## Plot the reward history

In [None]:
# plot the history
import numpy as np
import pylab as py
import matplotlib
%matplotlib inline 

WINDOW = 1000  # number of consecutive games averaged per plotted point

# Encode every game from player A's point of view: win 100, loss -100, tie 0.
history = np.array(tournament.history, dtype=str)
rewards = np.zeros(len(history))
rewards[history == 'A'] = 100
rewards[history == 'B'] = -100

def running_mean(x, N):
    """Return the mean of every full window of N consecutive values of x.

    The result has len(x) - N + 1 entries; it is empty when x holds fewer
    than N values, because no full window exists.
    """
    if len(x) < N:
        return np.array([])
    return np.convolve(x, np.ones((N,))/N, mode='valid')

r_mean = running_mean(rewards, WINDOW)
py.plot(r_mean)
py.xlabel('games played')
py.ylabel('average reward')
# Build the title from WINDOW so it cannot drift from the averaging window.
py.title('Average rewards over {} games (win/loss is 100/-100)'.format(WINDOW));